import requests
from lxml import etree
from tqdm import tqdm
import pandas as pd

from finnlp.data_sources.news._base import News_Downloader

class Eastmoney_Streaming(News_Downloader):
    """Downloader for streaming stock-forum posts from Eastmoney Guba."""

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()
    def download_streaming_stock(self, stock="600519", rounds=3):
        # rounds > 0 fetches a fixed number of pages; otherwise keep paging
        # until an empty page or repeated request errors.
        print("Getting pages: ", end="")
        if rounds > 0:
            for r in range(rounds):
                br = self._gather_pages(stock, r)
                if br == "break":
                    break
        else:
            r = 1
            error_count = 0
            while True:
                br = self._gather_pages(stock, r)
                if br == "break":
                    break
                elif br == "Error":
                    error_count += 1
                    if error_count > 10:
                        print("Connection Error")
                        break
                r += 1
        print(f"Got {r + 1} pages in total.")
        self.dataframe = self.dataframe.reset_index(drop=True)
    def _gather_pages(self, stock, page):
        print(page, end=" ")
        url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        }
        requests.DEFAULT_RETRIES = 5  # increase the number of connection retries
        s = requests.session()
        s.keep_alive = False  # close redundant keep-alive connections
        response = self._request_get(url, headers=headers)
        if response is None or response.status_code != 200:
            return "Error"
        # gather the posts listed on this page
        doc = etree.HTML(response.text)
        trs = doc.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')
        have_one = False
        for item in trs:
            have_one = True
            read_amount = item.xpath("./td[1]//text()")[0]
            comments = item.xpath("./td[2]//text()")[0]
            title = item.xpath("./td[3]/div/a//text()")[0]
            content_link = item.xpath("./td[3]/div/a/@href")[0]
            author = item.xpath("./td[4]//text()")[0]
            time = item.xpath("./td[5]//text()")[0]
            tmp = pd.DataFrame([read_amount, comments, title, content_link, author, time]).T
            tmp.columns = ["read amount", "comments", "title", "content link", "author", "create time"]
            self.dataframe = pd.concat([self.dataframe, tmp])
        # an empty page means there are no more posts, so tell the caller to stop
        if not have_one:
            return "break"
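
# Example usage (a minimal sketch, not part of the original module): assumes
# this file is importable and that guba.eastmoney.com is reachable. The stock
# code and number of rounds below are just the method defaults, repeated here
# for illustration.
if __name__ == "__main__":
    downloader = Eastmoney_Streaming()
    downloader.download_streaming_stock(stock="600519", rounds=3)
    # Collected posts accumulate in `downloader.dataframe` with the columns
    # "read amount", "comments", "title", "content link", "author", "create time".
    print(downloader.dataframe.head())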