import requests
from lxml import etree
from tqdm import tqdm
import pandas as pd
from finnlp.data_sources.news._base import News_Downloader

class Eastmoney_Streaming(News_Downloader):
    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()
    def download_streaming_stock(self, stock="600519", rounds=3):
        print("Getting pages: ", end="")
        if rounds > 0:
            # Fetch a fixed number of pages.
            for r in range(rounds):
                br = self._gather_pages(stock, r)
                if br == "break":
                    break
        else:
            # Keep fetching pages until an empty page or too many errors.
            r = 1
            error_count = 0
            while True:
                br = self._gather_pages(stock, r)
                if br == "break":
                    break
                elif br == "Error":
                    error_count += 1
                    if error_count > 10:
                        print("Connection Error")
                        break
                r += 1
        print(f"Got {r + 1} pages in total.")
        self.dataframe = self.dataframe.reset_index(drop=True)
    def _gather_pages(self, stock, page):
        print(page, end=" ")
        url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        }
        requests.DEFAULT_RETRIES = 5  # increase the number of connection retries
        s = requests.session()
        s.keep_alive = False  # close redundant connections
        response = self._request_get(url, headers=headers)
        if response.status_code != 200:
            return "Error"

        # Parse the page and gather every post row in the forum list table.
        page = etree.HTML(response.text)
        trs = page.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')
        have_one = False
        for item in trs:
            have_one = True
            read_amount = item.xpath("./td[1]//text()")[0]
            comments = item.xpath("./td[2]//text()")[0]
            title = item.xpath("./td[3]/div/a//text()")[0]
            content_link = item.xpath("./td[3]/div/a/@href")[0]
            author = item.xpath("./td[4]//text()")[0]
            time = item.xpath("./td[5]//text()")[0]
            tmp = pd.DataFrame([read_amount, comments, title, content_link, author, time]).T
            columns = ["read amount", "comments", "title", "content link", "author", "create time"]
            tmp.columns = columns
            self.dataframe = pd.concat([self.dataframe, tmp])
            # print(title)
        # An empty table means we ran past the last page, so signal the caller to stop.
        if not have_one:
            return "break"