File size: 2,493 Bytes
9df4cc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import warnings
warnings.filterwarnings("ignore")
import requests
from lxml import etree
from tqdm import tqdm
import pandas as pd
import json
import time
from finnlp.data_sources.news._base import News_Downloader

# TODO:
# 1. Contents

class InvestorPlace_Streaming(News_Downloader):
    """Streaming news downloader for InvestorPlace search results.

    Scrapes paginated search pages at https://investorplace.com/search/
    and accumulates one row per result (title, time, author, summary)
    into ``self.dataframe``.
    """

    def __init__(self, args=None):
        # `args=None` avoids the mutable-default-argument pitfall while
        # preserving the original behavior of passing an empty dict.
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5):
        """Download up to `rounds` pages of search results for `keyword`.

        Parameters
        ----------
        keyword : str
            Search query sent as the `q` parameter.
        rounds : int
            Maximum number of result pages to fetch (pages 0..rounds-1).
        delay : float
            Seconds to sleep between page requests (politeness delay).

        Appends rows to ``self.dataframe``; stops early on any non-200
        HTTP response.
        """
        url = 'https://investorplace.com/search/'

        headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
        }
        print("Downloading ...", end = ' ')
        for page in range(rounds):
            params = {
                'q': keyword,
                "pg": page,
            }
            res = requests.get(url = url, params=params, headers=headers)
            if res.status_code != 200:
                break

            tree = etree.HTML(res.text)
            div_list = tree.xpath("/html/body/main/section/div/div/div/div[2]/div[1]/div[1]/div")
            divs = []
            for div in div_list:
                divs += div.xpath("./div")

            # BUG FIX: the original appended each title twice, which made
            # the `titles` list twice as long as the other columns and
            # produced NaN-padded garbage rows in the transposed frame.
            rows = [self._parse_result_div(div) for div in divs]

            tmp = pd.DataFrame(rows, columns=['title', 'time', 'author', 'summary'])
            # ignore_index keeps the accumulated frame's index unique
            # instead of repeating 0..k for every page.
            self.dataframe = pd.concat([self.dataframe, tmp], ignore_index=True)

            print(page, end = ' ')

            time.sleep(delay)

    @staticmethod
    def _parse_result_div(div):
        """Extract (title, time, author, summary) from one result <div>.

        Missing fields default to '' — result cards don't always carry
        every element, so absent xpath matches are expected, not errors.
        """
        def _first(xpath_expr):
            # First matching text node with layout whitespace stripped,
            # or '' when the xpath matches nothing.
            nodes = div.xpath(xpath_expr)
            return nodes[0].replace('\n', '').replace('\t', '') if nodes else ''

        try:
            # Title is taken verbatim (no whitespace stripping), matching
            # the original behavior.
            title = div.xpath('./h2/a//text()')[0]
        except IndexError:
            title = ''
        time_ = _first('div/time//text()')
        author = _first('div/span/a/text()')
        summary = _first('p/text()')
        return title, time_, author, summary