from finnlp.data_sources.social_media._base import Social_Media_Downloader

from tqdm import tqdm
from lxml import etree
import pandas as pd
import time
import json
import re

class Weibo_Streaming(Social_Media_Downloader):
    """Download streaming Weibo posts for a stock keyword from the
    m.weibo.cn search API and collect them into ``self.dataframe``."""

    def __init__(self, args=None):
        # Avoid a mutable default argument; fall back to an empty config dict.
        super().__init__(args if args is not None else {})
        self.dataframe = pd.DataFrame()

    def download_streaming_stock(self, stock="茅台", rounds=3):
        # Fetch `rounds` consecutive pages of search results for the keyword.
        for r in tqdm(range(rounds), desc="Downloading by page..."):
            page = r + 1
            self._gather_one_page(page, stock)

    def _gather_one_page(self, page, stock="茅台", delay=0.01):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
        }
        # `containerid` encodes the search type and the query keyword.
        params = {
            "containerid": f"100103type=61&q={stock}&t=",
            "page_type": "searchall",
            "page": page,
        }
        url = "https://m.weibo.cn/api/container/getIndex"
        resp = self._request_get(url, headers=headers, params=params)

        if resp is None:
            # Request failed; skip this page.
            return

        # Each post is one entry of data["cards"] in the JSON response.
        res = json.loads(resp.text)
        res = res["data"]["cards"]
        res = pd.DataFrame(res)

        pbar = tqdm(total=res.shape[0], desc="Processing the text content and downloading the full passage...")
        res[["content_short", "content"]] = res.apply(
            lambda x: self._process_text(x, pbar, delay), axis=1, result_type="expand"
        )

        # ignore_index avoids duplicate row indices as pages accumulate.
        self.dataframe = pd.concat([self.dataframe, res], ignore_index=True)
    
    def _process_text(self, x, pbar, delay=0.01):
        # The card text is an HTML snippet; strip the tags to get plain text.
        # This assumes every returned card carries an `mblog` post payload.
        text = etree.HTML(x["mblog"]["text"])
        content_short = "".join(text.xpath(".//text()"))

        # Truncated posts link to their status page, where the full text lives.
        link = [l for l in text.xpath(".//a/@href") if "status" in l]
        if len(link) > 0:
            url_new = "https://m.weibo.cn/" + link[0]
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
            }
            resp = self._request_get(url_new, headers=headers)
            if resp is None:
                # Could not fetch the full passage; fall back to the short text.
                content = content_short
            else:
                # The full text sits in an inline <script> (assumed to be the
                # third one) as a JSON-like payload; pull out its "text" field
                # with a regex, then strip the HTML tags from it as well.
                res = etree.HTML(resp.content)
                scripts = res.xpath("//script")
                script_text = scripts[2].xpath("text()")
                pattern = re.compile('"text": "(.+),\n')
                result = pattern.findall(script_text[0])
                content = etree.HTML(result[0])
                content = "".join(content.xpath("//text()"))
        else:
            # No full-text link: the card already shows the whole post.
            content = content_short

        pbar.update(1)
        time.sleep(delay)

        return content_short, content
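
# Minimal usage sketch (an illustration, not part of the library's documented
# API): it assumes network access to m.weibo.cn and relies on `_request_get`
# being provided by the Social_Media_Downloader base class imported above.
if __name__ == "__main__":
    downloader = Weibo_Streaming()
    downloader.download_streaming_stock(stock="茅台", rounds=1)
    # Each row keeps the raw API card plus the two extracted text columns.
    print(downloader.dataframe[["content_short", "content"]].head())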