import time

import requests
import pandas as pd
from lxml import etree

from finnlp.data_sources.news._base import News_Downloader

# TODO:
# 1. More Pages
# 2. Contents

class PennyStocks_Streaming(News_Downloader):

    def __init__(self, args=None):
        # avoid the mutable-default-argument pitfall; fall back to an empty config
        super().__init__(args if args is not None else {})
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword="apple", rounds=3, delay=2):
        # `rounds` is reserved for multi-page support (see the TODO above)
        # and is currently unused.

        # establish a session so the search request carries valid cookies
        self._connect_session()

        # download the first page of search results
        self._download_first_page(keyword, delay=delay)

        # download the following pages (not implemented yet)
        # self._download_other_pages(keyword)
        print("Only the first page is supported for now!")


    def _connect_session(self):
        # The server checks cookies, so we first request the main site
        # without any cookies to obtain a session, then run the search
        # for the stock information we want.
        self.session = requests.Session()
        first_url = "https://pennystocks.com/"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }
        print("Requesting https://pennystocks.com ...", end=" ")
        res = self.session.get(url=first_url, headers=headers)
        if res.status_code != 200:
            raise ConnectionError("Can't request https://pennystocks.com. Please check your connection or report this issue on GitHub.")

        print("succeeded!")

    def _download_first_page(self, keyword="apple", max_retry=5, delay=2):
        url = f"https://pennystocks.com/?s={keyword}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        }
        res = self.session.get(url=url, headers=headers)
        page = etree.HTML(res.text)
        # absolute XPath copied from the rendered page; brittle, but it works
        articles = page.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article")

        # The first response sometimes comes back without any articles (the
        # session cookies seem to need a moment to take effect), so retry a
        # few times with a delay before giving up.
        while max_retry and len(articles) == 0:
            time.sleep(delay)
            print("Gathering again ..", end=" ")
            res = requests.get(url=url, headers=headers, cookies=self.session.cookies)
            page = etree.HTML(res.text)
            articles = page.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article")
            max_retry -= 1
            print(f"Remaining retries: {max_retry}")


        # extract the fields of each article card into the result frame
        columns = ["title", "time", "brief", "reading_time"]
        for article in articles:
            title = article.xpath("./header/h2/a//text()")[0]
            publish_time = article.xpath("./div[3]/div/div/ul/li[1]/text()")[0]
            brief = article.xpath("./div[3]/div/div/text()")[0]
            reading_time = article.xpath("./div[3]/div/div/ul/li[2]/text()")[0]
            tmp = pd.DataFrame([[title, publish_time, brief, reading_time]], columns=columns)
            self.dataframe = pd.concat([self.dataframe, tmp], ignore_index=True)


    def _download_other_pages(self, keyword="apple"):
        # TODO: fetch result pages beyond the first (see the TODO above)
        pass
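

# A minimal usage sketch, added for illustration and not part of the original
# module: instantiate the downloader, run a first-page search, and inspect the
# collected DataFrame. "apple" is just an example keyword.
if __name__ == "__main__":
    downloader = PennyStocks_Streaming()
    downloader.download_streaming_search(keyword="apple", delay=2)
    print(downloader.dataframe.head())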