import requests
from lxml import etree
import pandas as pd

from finnlp.data_sources.news._base import News_Downloader


class Eastmoney_Streaming(News_Downloader):
    """Download streaming stock-bar posts from Eastmoney Guba (guba.eastmoney.com)."""

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_streaming_stock(self, stock="600519", rounds=3):
        print("Getting pages: ", end="")
        if rounds > 0:
            # Fetch a fixed number of pages.
            for r in range(rounds):
                br = self._gather_pages(stock, r)
                if br == "break":
                    break
        else:
            # rounds <= 0: keep fetching until an empty page or too many errors.
            r = 1
            error_count = 0
            while True:
                br = self._gather_pages(stock, r)
                if br == "break":
                    break
                elif br == "Error":
                    error_count += 1
                    if error_count > 10:
                        print("Connection Error")
                        break  # give up instead of retrying forever
                r += 1
        print(f"Got {r + 1} pages in total.")
        self.dataframe = self.dataframe.reset_index(drop=True)

    def _gather_pages(self, stock, page):
        print(page, end=" ")
        url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        }
        requests.DEFAULT_RETRIES = 5  # increase the number of connection retries
        s = requests.session()
        s.keep_alive = False  # close redundant connections
        response = self._request_get(url, headers=headers)
        # The base downloader may return None on a failed request.
        if response is None or response.status_code != 200:
            return "Error"

        # Parse the post table on this page.
        html = etree.HTML(response.text)
        trs = html.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')
        have_one = False
        for item in trs:
            have_one = True
            read_amount = item.xpath("./td[1]//text()")[0]
            comments = item.xpath("./td[2]//text()")[0]
            title = item.xpath("./td[3]/div/a//text()")[0]
            content_link = item.xpath("./td[3]/div/a/@href")[0]
            author = item.xpath("./td[4]//text()")[0]
            time = item.xpath("./td[5]//text()")[0]
            tmp = pd.DataFrame(
                [read_amount, comments, title, content_link, author, time]
            ).T
            tmp.columns = [
                "read amount",
                "comments",
                "title",
                "content link",
                "author",
                "create time",
            ]
            self.dataframe = pd.concat([self.dataframe, tmp])
        if not have_one:
            # An empty page means we have reached the end of the list.
            return "break"
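
# Minimal usage sketch, assuming network access to guba.eastmoney.com and that
# the page layout matched by the XPath above is unchanged; the stock code and
# round count below are just the class defaults.
if __name__ == "__main__":
    downloader = Eastmoney_Streaming()
    # Fetch the first 3 pages of the stock-bar post list for stock 600519.
    downloader.download_streaming_stock(stock="600519", rounds=3)
    # Columns: read amount, comments, title, content link, author, create time.
    print(downloader.dataframe.head())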