Spaces:
Running
Running
from finnlp.data_sources.social_media._base import Social_Media_Downloader | |
from tqdm import tqdm | |
from lxml import etree | |
import pandas as pd | |
import requests | |
import time | |
import json | |
import re | |
class Weibo_Streaming(Social_Media_Downloader):
    """Stream Weibo posts for a stock keyword via the m.weibo.cn search API.

    Downloaded posts accumulate in ``self.dataframe`` with two extra columns:
    ``content_short`` (the text shown in the feed card) and ``content``
    (the full passage, fetched from the post's detail page when linked).
    """

    # One shared browser UA for both the search call and the detail-page call
    # (previously duplicated inline in two methods).
    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
    }

    def __init__(self, args=None):
        """args: optional config dict forwarded to the base downloader."""
        # `None` sentinel avoids the shared-mutable-default pitfall of the
        # previous `args={}` signature; behavior for callers is unchanged.
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_stock(self, stock="θ ε°", rounds=3):
        """Fetch `rounds` pages of search results for `stock`.

        NOTE(review): the default keyword looks mojibake-garbled; it is
        presumably a Chinese stock name — confirm against upstream source.
        """
        for r in tqdm(range(rounds), desc="Downloading by page.."):
            self._gather_one_page(r + 1, stock)

    def _gather_one_page(self, page, stock="θ ε°", delay=0.01):
        """Download one search-result page and append it to ``self.dataframe``.

        Returns "Error" on a failed request or unparsable payload (sentinel
        kept for backward compatibility); otherwise returns None.
        """
        params = {
            "containerid": f"100103type=61&q={stock}&t=",
            "page_type": "searchall",
            "page": page,
        }
        # Constant URL — no interpolation, so a plain literal replaces the f-string.
        url = "https://m.weibo.cn/api/container/getIndex"
        resp = self._request_get(url, headers=self._HEADERS, params=params)
        if resp is None:
            return "Error"
        try:
            cards = json.loads(resp.text)["data"]["cards"]
        except (json.JSONDecodeError, KeyError, TypeError):
            # Rate limiting or an API schema change: previously an uncaught
            # crash; now skip this page like a failed request.
            return "Error"
        res = pd.DataFrame(cards)
        if res.empty:
            return
        pbar = tqdm(
            total=res.shape[0],
            desc="Processing the text content and downloading the full passage...",
        )
        try:
            res[["content_short", "content"]] = res.apply(
                lambda row: self._process_text(row, pbar, delay),
                axis=1,
                result_type="expand",
            )
        finally:
            pbar.close()  # previously leaked: the bar was never closed
        self.dataframe = pd.concat([self.dataframe, res])

    def _process_text(self, x, pbar, delay=0.01):
        """Extract one card's short text and, when the card links to a full
        post (a "status" link), download and extract the full passage.

        Returns (content_short, content); falls back to the short text when
        the detail page cannot be fetched or parsed.
        """
        card_html = etree.HTML(x["mblog"]["text"])
        content_short = "".join(card_html.xpath(".//text()"))
        content = content_short  # default: no link, failed fetch, or bad parse
        links = [l for l in card_html.xpath(".//a/@href") if "status" in l]
        if links:
            url_new = "https://m.weibo.cn/" + links[0]
            resp = self._request_get(url_new, headers=self._HEADERS)
            if resp is not None:
                try:
                    page = etree.HTML(resp.content)
                    scripts = page.xpath("//script")
                    # NOTE(review): the render data is assumed to live in the
                    # third <script> block — brittle against layout changes,
                    # hence the fallback below.
                    raw = scripts[2].xpath("text()")[0]
                    # Raw string for the regex; the post body is embedded as
                    # `"text": "..."` inside the script's JSON-ish payload.
                    matches = re.findall(r'"text": "(.+),\n', raw)
                    full_html = etree.HTML(matches[0])
                    content = "".join(full_html.xpath("//text()"))
                except (IndexError, etree.ParserError):
                    # Layout changed / no embedded text: previously an uncaught
                    # IndexError; keep the short content instead.
                    content = content_short
        pbar.update(1)
        time.sleep(delay)
        return content_short, content