Spaces:

sigridveronica
/

ai-news-analyzer

Running

ai-news-analyzer / external /FinNLP /finnlp /data_sources /social_media /weibo_streaming.py

Sigrid De los Santos

Remove remaining binary file for Hugging Face

9df4cc0 11 days ago

2.79 kB

	from finnlp.data_sources.social_media._base import Social_Media_Downloader

	from tqdm import tqdm
	from lxml import etree
	import pandas as pd
	import requests
	import time
	import json
	import re

	class Weibo_Streaming(Social_Media_Downloader):
	    """Collect Weibo posts matching a keyword through the m.weibo.cn mobile API.

	    Each fetched result page is appended to ``self.dataframe``.  A row is one
	    result card carrying an ``mblog`` entry, extended with two text columns:
	    ``content_short`` (text of the snippet embedded in the card) and
	    ``content`` (text of the linked status page when it could be fetched and
	    parsed, otherwise the same value as ``content_short``).
	    """

	    _USER_AGENT = (
	        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) "
	        "Gecko/20100101 Firefox/112.0"
	    )
	    _BASE_URL = "https://m.weibo.cn/"
	    # Compiled once instead of once per processed row.  The opening quote of
	    # the value is consumed by the pattern, the closing one is captured.
	    _FULL_TEXT_PATTERN = re.compile(r'"text": "(.+),\n')

	    def __init__(self, args=None):
	        """Initialise the downloader with an empty result frame.

	        Args:
	            args: Configuration forwarded to ``Social_Media_Downloader``.
	                ``None`` (the default) is replaced by a fresh empty dict so
	                instances never share one mutable default object.
	        """
	        super().__init__({} if args is None else args)
	        self.dataframe = pd.DataFrame()

	    def download_streaming_stock(self, stock="茅台", rounds=3):
	        """Fetch result pages ``1..rounds`` for ``stock`` into ``self.dataframe``.

	        Args:
	            stock: Search keyword sent to the API.
	            rounds: Number of consecutive pages to request, starting at 1.
	        """
	        for page in tqdm(range(1, rounds + 1), desc="Downloading by page.."):
	            self._gather_one_page(page, stock)

	    def _gather_one_page(self, page, stock="茅台", delay=0.01):
	        """Download one result page and append its posts to ``self.dataframe``.

	        Args:
	            page: 1-based page number passed to the API.
	            stock: Search keyword.
	            delay: Seconds to sleep after each processed post.

	        Returns:
	            ``"Error"`` when the request failed or the payload does not have
	            the expected ``data -> cards`` list; ``None`` otherwise
	            (including when the page holds no posts).
	        """
	        headers = {"User-Agent": self._USER_AGENT}
	        params = {
	            "containerid": f"100103type=61&q={stock}&t=",
	            "page_type": "searchall",
	            "page": page,
	        }
	        url = "https://m.weibo.cn/api/container/getIndex"
	        resp = self._request_get(url, headers=headers, params=params)
	        if resp is None:
	            return "Error"

	        # A payload without the expected structure is reported with the same
	        # sentinel as a failed request instead of aborting the whole run.
	        try:
	            cards = json.loads(resp.text)["data"]["cards"]
	        except (ValueError, KeyError, TypeError):
	            return "Error"
	        if not isinstance(cards, list):
	            return "Error"

	        # Only cards that carry an "mblog" dict can be processed; any other
	        # card would make the per-row text extraction fail.
	        posts = [
	            card
	            for card in cards
	            if isinstance(card, dict) and isinstance(card.get("mblog"), dict)
	        ]
	        if not posts:
	            # Nothing to add; also avoids assigning two columns from the
	            # empty result of ``apply`` on an empty frame.
	            return None

	        res = pd.DataFrame(posts)
	        # The context manager closes the progress bar even if a row fails.
	        with tqdm(
	            total=res.shape[0],
	            desc="Processing the text content and downloading the full passage...",
	        ) as pbar:
	            res[["content_short", "content"]] = res.apply(
	                lambda row: self._process_text(row, pbar, delay),
	                axis=1,
	                result_type="expand",
	            )

	        self.dataframe = pd.concat([self.dataframe, res])
	        return None

	    def _process_text(self, x, pbar, delay=0.01):
	        """Extract the short and, when available, the full text of one post.

	        Args:
	            x: Row (or mapping) whose ``"mblog"`` entry holds the post's HTML
	                snippet under ``"text"``.
	            pbar: Progress bar advanced by one step per call.
	            delay: Seconds to sleep before returning.

	        Returns:
	            Tuple ``(content_short, content)``.  ``content`` equals
	            ``content_short`` whenever the snippet has no link containing
	            ``"status"``, the linked page cannot be fetched, or the page
	            cannot be parsed.
	        """
	        from urllib.parse import urljoin

	        snippet = x["mblog"].get("text") or ""
	        tree = self._parse_html(snippet)
	        if tree is None:
	            content_short = snippet
	            status_links = []
	        else:
	            content_short = "".join(tree.xpath(".//text()"))
	            status_links = [
	                href for href in tree.xpath(".//a/@href") if "status" in href
	            ]

	        content = content_short
	        if status_links:
	            # urljoin copes with root-relative ("/status/1") and absolute
	            # hrefs; plain concatenation produced "//status/1" or a broken URL.
	            url_new = urljoin(self._BASE_URL, status_links[0])
	            resp = self._request_get(
	                url_new, headers={"User-Agent": self._USER_AGENT}
	            )
	            if resp is not None:
	                full_text = self._extract_full_text(resp.content)
	                if full_text is not None:
	                    content = full_text

	        pbar.update(1)
	        time.sleep(delay)

	        return content_short, content

	    def _extract_full_text(self, page):
	        """Return the post text embedded in a status page, or ``None``.

	        The text is looked up in the third ``<script>`` element of the page
	        with ``_FULL_TEXT_PATTERN``.  ``None`` is returned when any step of
	        that lookup does not find what it expects.
	        """
	        document = self._parse_html(page)
	        if document is None:
	            return None

	        scripts = document.xpath("//script")
	        if len(scripts) < 3:
	            return None
	        script_text = scripts[2].xpath("text()")
	        if not script_text:
	            return None

	        match = self._FULL_TEXT_PATTERN.search(script_text[0])
	        if match is None:
	            return None

	        literal = match.group(1)
	        # The pattern stops at the final comma of the line, so the closing
	        # quote of the value is part of the capture; drop it.
	        if literal.endswith('"'):
	            literal = literal[:-1]
	        # NOTE(review): the value is presumably a JSON string literal, so
	        # decode its escapes; keep the raw capture if it is not valid JSON.
	        # Confirm against a live status page.
	        try:
	            markup = json.loads(f'"{literal}"')
	        except ValueError:
	            markup = literal

	        body = self._parse_html(markup)
	        if body is None:
	            return None
	        return "".join(body.xpath("//text()"))

	    @staticmethod
	    def _parse_html(markup):
	        """Parse ``markup`` with lxml; return ``None`` if empty or unparsable."""
	        if not markup:
	            return None
	        try:
	            return etree.HTML(markup)
	        except (etree.LxmlError, ValueError):
	            return None