import requests  # think of requests as a browser
import pandas as pd
res = requests.get("website address")  # put the target URL inside the parentheses
res.text  # without the right encoding, the text shows up as garbled characters
res.encoding = 'utf-8'  # set UTF-8 so Chinese characters decode correctly
res.text  # now the text is readable
# save the page to a file
f = open('andrew_file.html', 'w', encoding='utf-8')
f.write(res.text)
f.close()
# use pandas to parse the HTML tables into DataFrames
# read_html returns a list of DataFrames, one per <table> tag
dfs = pd.read_html('andrew_file.html')
print(dfs[1])  # the second table on the page
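# a quick way to find the table you want: read_html returns one DataFrame
# per <table> tag, so print each table's position and shape and pick the
# index you need (the [1] above assumes the second table is the target)
for i, t in enumerate(dfs):
    print(i, t.shape)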
***************************************************************************
# we can also parse the response into DataFrames directly, without saving a file
from io import StringIO
# StringIO wraps res.text in a file-like object
dfs = pd.read_html(StringIO(res.text))
dfs[3]  # pick the table you want from the list (here, the fourth)
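# StringIO in isolation (a minimal sketch): it wraps a plain string in a
# file-like object, so anything that expects f.read() can consume it
demo = StringIO('a,b\n1,2\n3,4')
print(pd.read_csv(demo))  # parses the in-memory string like a CSV file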
*********************************************************************
import requests
import pandas as pd
from io import StringIO
response = requests.get('website address')
#df1 = pd.read_csv(StringIO(response.text))
# reading it directly fails with "ParserError: Error tokenizing data.
# C error: Expected 6 fields in line 159, saw 17"
# because rows in this file have different numbers of fields
lines = response.text.split('\n')
lines[159]  # inspect a row near the one the parser complained about
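# a quick way to see the inconsistency (a small sketch): tally how many
# fields each line has when split on '",'
from collections import Counter
print(Counter(len(l.split('",')) for l in lines).most_common())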
newlines = []
# use a for loop to keep only the well-formed rows
# split on '",' rather than a bare ',' because a quoted field may itself
# contain commas, e.g. '"你", ",好,", "嗎",' (demonstrated below)
for line in lines:
    if len(line.split('",')) == 17:
        newlines.append(line)
print("original line count = ")
print(len(lines))
print("after process line count = " )
print(len(newlines))
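# why split on '",' and not a bare ',': a quoted field can itself contain
# commas (hypothetical sample row, not taken from the real file)
sample = '"2330","台積電","1,000","500.0",'
print(len(sample.split(',')))   # 6: the bare comma over-splits "1,000"
print(len(sample.split('",')))  # 5: '",' splits only at field boundaries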
s = '\n'.join(newlines)
# note: str.replace returns a new string, so the result must be assigned;
# the file prefixes some quoted fields with '=' (an Excel hint), strip it
s = s.replace('=', '')
df = pd.read_csv(StringIO(s))
df.head()  # show only the first 5 rows
# the values look like numbers but are still strings,
# they contain thousands separators (",") that we need to remove,
# and there is a junk "Unnamed: 16" column to drop
# first cast every cell to string
df = df.astype(str)
# lambda defines a small anonymous function inline
# applymap applies that function to every cell of the DataFrame
# (applymap was renamed to DataFrame.map in pandas >= 2.1)
df = df.applymap(lambda s: s.replace(',', ''))
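# lambda in isolation: an anonymous one-expression function, equivalent to
# def strip_commas(s): return s.replace(',', '')  (the name is illustrative)
strip_commas = lambda s: s.replace(',', '')
print(strip_commas('1,234'))  # prints 1234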
df = df.set_index('證券代號')  # use the stock-id column as the index
# convert the strings back into numbers; errors='coerce' turns values that
# fail to parse into NaN
df = df.apply(lambda s: pd.to_numeric(s, errors='coerce'))
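# pd.to_numeric with errors='coerce' in isolation (a minimal sketch):
# values that cannot be parsed become NaN instead of raising an error
print(pd.to_numeric(pd.Series(['123', '45.6', 'abc']), errors='coerce'))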
# drop the useless columns
# axis=1 means operate on columns
# how='all' means drop a column only when every value in it is NaN
# (this also works) df = df[df.columns[df.isnull().sum() != len(df)]]
df.dropna(axis=1, how='all', inplace=True)
df.head()
*********************************************************************
# how to measure the "red bar" (bullish candle) of a stock
# close/open ratio: 1 means the closing price equals the opening price,
# < 1 means the close is below the open (the price fell),
# > 1 means the close is above the open (the price rose);
# e.g. a close of 105 against an open of 100 gives 1.05, a 5% rise
close_open = df['收盤價'] / df['開盤價']
close_open.head(5)
*********************************************************************
# select stocks whose closing price is more than 5% above the opening price
df[close_open > 1.05]
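# the same mask applies to the ratio Series itself; sorting it ranks the
# day's strongest risers (a small usage sketch)
close_open[close_open > 1.05].sort_values(ascending=False).head()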
*********************************************************************
# save as a CSV file so it can be opened in Excel
# (utf_8_sig writes a BOM so Excel detects the UTF-8 encoding)
df.to_csv('daily_price.csv', encoding='utf_8_sig')
# read the file back, using "證券代號" as the index
df = pd.read_csv('daily_price.csv', index_col=['證券代號'])
df.head()
*********************************************************************
# save to SQLite
import sqlite3
conn = sqlite3.connect('test.sqlite3')
df.to_sql('daily_price', conn, if_exists='replace')
# read the table back
df = pd.read_sql('SELECT * FROM daily_price', conn, index_col=['證券代號'])
df.head()
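# the connection also serves filtered queries, pushing the WHERE clause
# into SQLite instead of loading the whole table (a sketch; assumes the
# 收盤價 column survived the cleaning above)
cheap = pd.read_sql('SELECT * FROM daily_price WHERE "收盤價" < 100', conn,
                    index_col=['證券代號'])
print(cheap.head())
conn.close()  # close the connection when done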
*********************************************************************
import pandas as pd
import requests
from io import StringIO

def crawl_price(date):
    # format the date as a string for the URL, e.g. '20190318'
    datestr = date.strftime('%Y%m%d')
    r = requests.post('website part1' + datestr + 'website part2')
    # strip the '=' Excel hints before parsing
    content = r.text.replace('=', '')
    lines = content.split('\n')
    # drop the rows that have 10 or fewer fields
    lines = list(filter(lambda l: len(l.split('",')) > 10, lines))
    # join the surviving lines back together with \n
    content = "\n".join(lines)
    if content == '':
        return None
    df = pd.read_csv(StringIO(content))
    df = df.astype(str)
    df = df.apply(lambda s: s.str.replace(',', ''))
    df['date'] = pd.to_datetime(date)
    # rename the stock-id column
    df = df.rename(columns={'證券代號': 'stock_id'})
    df = df.set_index(['stock_id', 'date'])
    #df = df.set_index('stock_id')
    # convert the whole table to numbers; errors='coerce' turns values that
    # fail to parse into NaN
    df = df.apply(lambda s: pd.to_numeric(s, errors='coerce'))
    # drop the columns that are entirely NaN
    # (equivalent to df.dropna(axis=1, how='all'))
    df = df[df.columns[df.isnull().all() == False]]
    return df

import datetime
crawl_price(datetime.datetime(2019, 3, 18))
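# a minimal sketch of collecting several days with the function above:
# walk forward day by day, skip dates where crawl_price returns None
# (holidays), and pause between requests (the names here are illustrative)
import time
collected = {}
day = datetime.datetime(2019, 3, 18)
for _ in range(5):
    result = crawl_price(day)
    if result is not None:
        collected[day] = result
    day += datetime.timedelta(days=1)
    time.sleep(5)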
*********************************************************************