import requests
from lxml import etree
import pandas as pd
from finnlp.data_sources.news._base import News_Downloader


class Eastmoney_Streaming(News_Downloader):
    """Download streaming forum posts for a stock from Eastmoney Guba."""

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_streaming_stock(self, stock="600519", rounds=3):
        # Fetch `rounds` pages, or keep paging until an empty page when
        # rounds <= 0.
        print("Getting pages: ", end="")
        if rounds > 0:
            for r in range(rounds):
                br = self._gather_pages(stock, r)
                if br == "break":
                    break
        else:
            r = 1
            error_count = 0
            while True:
                br = self._gather_pages(stock, r)
                if br == "break":
                    break
                elif br == "Error":
                    error_count += 1
                if error_count > 10:
                    print("Connection Error")
                    break
                r += 1
        print(f"Got {r + 1} pages in total.")
        self.dataframe = self.dataframe.reset_index(drop=True)
    
    def _gather_pages(self, stock, page):
        print(page, end=" ")
        url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        }

        # Requests are issued through the base class helper; for a session
        # with an explicit retry policy, see the make_session_with_retries
        # sketch below.
        response = self._request_get(url, headers=headers)
        if response is None or response.status_code != 200:
            return "Error"

        # Parse the post list of the current page.
        html = etree.HTML(response.text)
        trs = html.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')
        have_one = False
        columns = ["read amount", "comments", "title", "content link", "author", "create time"]
        for item in trs:
            have_one = True
            read_amount = item.xpath("./td[1]//text()")[0]
            comments = item.xpath("./td[2]//text()")[0]
            title = item.xpath("./td[3]/div/a//text()")[0]
            content_link = item.xpath("./td[3]/div/a/@href")[0]
            author = item.xpath("./td[4]//text()")[0]
            create_time = item.xpath("./td[5]//text()")[0]
            tmp = pd.DataFrame(
                [[read_amount, comments, title, content_link, author, create_time]],
                columns=columns,
            )
            self.dataframe = pd.concat([self.dataframe, tmp])
        # An empty page means we have walked past the last one.
        if not have_one:
            return "break"
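

# A minimal sketch of a requests session with an explicit retry policy,
# mounted where it actually takes effect. make_session_with_retries is a
# hypothetical helper, not part of FinNLP; adjust the parameters as needed.
def make_session_with_retries(total=5, backoff_factor=0.5):
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retry = Retry(
        total=total,                             # retry up to `total` times
        backoff_factor=backoff_factor,           # exponential backoff between tries
        status_forcelist=[500, 502, 503, 504],   # retry on these status codes
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    # Ask the server to close the connection after each request instead of
    # keeping it alive.
    session.headers["Connection"] = "close"
    return session


# Example usage: a minimal sketch that fetches the first three pages of posts
# for stock 600519 (Kweichow Moutai). Assumes network access to
# guba.eastmoney.com; the site's markup may change and break the XPath
# selectors above.
if __name__ == "__main__":
    downloader = Eastmoney_Streaming()
    downloader.download_streaming_stock(stock="600519", rounds=3)
    print(downloader.dataframe.head())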