File size: 2,493 Bytes
9df4cc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import warnings
warnings.filterwarnings("ignore")
import requests
from lxml import etree
from tqdm import tqdm
import pandas as pd
import json
import time
from finnlp.data_sources.news._base import News_Downloader

# TODO:
# 1. Contents

class InvestorPlace_Streaming(News_Downloader):
    """Streaming news downloader for InvestorPlace search results.

    Scrapes paginated search pages at https://investorplace.com/search/
    and accumulates one row per result (title, time, author, summary)
    into ``self.dataframe``.
    """

    def __init__(self, args=None):
        # `args=None` avoids the mutable-default-argument pitfall while
        # preserving the original behavior of passing an empty dict.
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5):
        """Download up to `rounds` pages of search results for `keyword`.

        Parameters
        ----------
        keyword : str
            Search query sent as the `q` parameter.
        rounds : int
            Maximum number of result pages to fetch (pages 0..rounds-1).
        delay : float
            Seconds to sleep between page requests (politeness delay).

        Appends rows to ``self.dataframe``; stops early on any non-200
        HTTP response.
        """
        url = 'https://investorplace.com/search/'

        headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
        }
        print("Downloading ...", end = ' ')
        for page in range(rounds):
            params = {
                'q': keyword,
                "pg": page,
            }
            res = requests.get(url = url, params=params, headers=headers)
            if res.status_code != 200:
                break

            tree = etree.HTML(res.text)
            div_list = tree.xpath("/html/body/main/section/div/div/div/div[2]/div[1]/div[1]/div")
            divs = []
            for div in div_list:
                divs += div.xpath("./div")

            # BUG FIX: the original appended each title twice, which made
            # the `titles` list twice as long as the other columns and
            # produced NaN-padded garbage rows in the transposed frame.
            rows = [self._parse_result_div(div) for div in divs]

            tmp = pd.DataFrame(rows, columns=['title', 'time', 'author', 'summary'])
            # ignore_index keeps the accumulated frame's index unique
            # instead of repeating 0..k for every page.
            self.dataframe = pd.concat([self.dataframe, tmp], ignore_index=True)

            print(page, end = ' ')

            time.sleep(delay)

    @staticmethod
    def _parse_result_div(div):
        """Extract (title, time, author, summary) from one result <div>.

        Missing fields default to '' — result cards don't always carry
        every element, so absent xpath matches are expected, not errors.
        """
        def _first(xpath_expr):
            # First matching text node with layout whitespace stripped,
            # or '' when the xpath matches nothing.
            nodes = div.xpath(xpath_expr)
            return nodes[0].replace('\n', '').replace('\t', '') if nodes else ''

        try:
            # Title is taken verbatim (no whitespace stripping), matching
            # the original behavior.
            title = div.xpath('./h2/a//text()')[0]
        except IndexError:
            title = ''
        time_ = _first('div/time//text()')
        author = _first('div/span/a/text()')
        summary = _first('p/text()')
        return title, time_, author, summary