2019年3月17日 星期日

Python #crawling爬蟲 #save as a file as csv or html 存檔 #sqllite3 #dataframe.astype(str) #rename #remove column which is NaN# set index #lambda無名氏函數 no name function #pandas type to number轉數字 #date.strftime #list(filter)

import requests #think it as a browser
import pandas as pd

res = requests.get("website address") #inside the () you can put the website link

res.text # it will show the word you can not understand

res.encoding = 'utf-8' #utf-8 ,so we can read chinese word

res.text
# save the file
f = open('andrew_file.html','w',encoding = 'utf-8')
f.write(res.text)
f.close()

#we need to use pandas to read html as dataframe

df = pd.read_html('andrew_file.html')

print(df[1])

***************************************************************************
import requests #think it as a browser
import pandas as pd

res = requests.get("website address") #inside the () you can put the website link

res.text # it will show the word you can not understand

res.encoding = 'utf-8' #utf-8 ,so we can read chinese word

res.text
# save the file
f = open('andrew_file.html','w',encoding = 'utf-8')
f.write(res.text)
f.close()

#we need to use pandas to read html as dataframe

df = pd.read_html('andrew_file.html')

print(df[1])
*********************************************************************
#We also can directly let it be as dataframe , and no need to save
from io import StringIO
#StringIo make res.text like a file
dfs = pd.read_html(StringIO(res.text))
dfs[3]

*********************************************************************
import requests
import pandas as pd
from io import StringIO

response = requests.get('website address')

#df1 = pd.read_csv(StringIO(response.text))
#want to save as df , but fail , it show ParserError: Error tokenizing data. C error: Expected 6 fields in line 159, saw 17
#different row on the file


lines = response.text.split('\n')
lines[159]

newlines = []
#use for loop to organize the data
#we use ", to seperate ,the reason is we are afraid we meet the string like this '  "你",   ",好,",   "嗎",'
for line in lines:
    if len(line.split('",')) == 17:
        newlines.append(line)
     
print("original line count = ")
print(len(lines))
print("after process line count = " )
print(len(newlines))

j = '\n'
s = j.join(newlines)
s.replace('=','')

df = pd.read_csv(StringIO(s))
#only show first 5 row
df.head()

#but now ,they looks like number , but they are string
#and we dont want ","
#we dont want Unnamed16

#change all the content into string
df = df.astype(str)
#lambda is like a function ,but it make definition simple
#applymap is to excute funtion
df = df.applymap(lambda s:s.replace(',',''))

df = df.set_index('證券代號')
#change sting into number
df = df.apply(lambda s: pd.to_numeric(s,errors = 'coerce' ))

# to delect no use columns
# axis=1 is to check every columns
# how='all'  means if all is nan ,then it will act
# (this also works) df = df[df.columns[df.isnull().sum() != len(df)]]
df.dropna(axis = 1 ,how = 'all',inplace = True)

df.head()

*********************************************************************
#how to write red bar of stock
#length of red bar,1 means no up no down
#<1 means stock closing price is lower(stock price fall)
#>1 means stock closing price is larger(stock price rise)
close_open = df['收盤價']/df['開盤價']
close_open.head(5)
*********************************************************************
#select stock which closing price is larger 5% than opening price
df[close_open > 1.05]
*********************************************************************
#save as csv file , so that we can open by excel
df.to_csv('daily_price.csv',encoding = "utf_8_sig")

#read file ,and assign "證卷代號" as index
df = pd.read_csv('daily_price.csv',index_col = ['證券代號'])

df.head()

*********************************************************************
#save to sqllite3

import sqlite3
conn = sqlite3.connect('test.sqlite3')
df.to_sql('daily_price',conn,if_exists='replace')

#read the file

df = pd.read_sql('Select * from daily_price',conn,index_col = ["證券代號"])

df.head()

*********************************************************************

*********************************************************************
import pandas as pd
import requests
from io import StringIO

def crawl_price(date):
    #change date into string
    datestr = date.strftime('%y%m%d')
   
    r = requests.post('website part1' + datestr + 'website part2')
   
    content = r.text.replace('=','')
   
    lines = content.split('\n')
    #delect row which columns <=10
    lines = list(filter(lambda l:len(l.split('",')) > 10, lines))
    #connect all the lines with \n
    content = "\n".join(lines)
   
    if content == '':
        return None
   
    df = pd.read_csv(StringIO(content))
   
    df = df.astype(str)
    df = df.apply(lambda s:s.str.replace(',',''))
   
    df['date'] = pd.to_datetime(date)
    #change the name of columns
    df = df.rename(columns={'證券代號':'stock_id'})
    df = df.set_index(['stock_id','date'])
    #df = df.set_index('stock_id')
    # turn all the content of table into number , coerce means the content will be NaN if it is fail to translate
    df = df.apply(lambda s:pd.to_numeric(s,errors = 'coerce'))
    #delect the columns which is no need
    df = df[df.columns[df.isnull().all()==False]]
   
    return df
   
   
import datetime
crawl_price(datetime.datetime(2019,3,18))


*********************************************************************

)

沒有留言:

張貼留言