Spaces:

sigridveronica
/

ai-news-analyzer

Running

ai-news-analyzer / external /FinNLP /finnlp /data_sources /news /pennystocks_streaming.py

Sigrid De los Santos

Remove remaining binary file for Hugging Face

9df4cc0 11 days ago

3.14 kB

	import requests
	from lxml import etree
	from tqdm import tqdm
	import pandas as pd
	import json
	import time as time
	from finnlp.data_sources.news._base import News_Downloader

	# TODO:
	# 1. More Pages
	# 2. Contents

	class PennyStocks_Streaming(News_Downloader):

	def __init__(self, args={}):
	super().__init__(args)
	self.dataframe = pd.DataFrame()

	def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 2):
	# establish session
	self._connect_session()

	# download first page
	self._download_first_page(keyword, delay = delay)

	# download the following pages
	# self._download_other_pages(keyword)
	print("Only support the first page now!")


	def _connect_session(self):
	# since the server will check cookies, we need first
	# request the main site withour cookies, then finish
	# searching for the stock information we want.
	self.session = requests.session()
	first_url = "https://pennystocks.com/"
	headers = {
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
	}
	print("Requesting https://pennystocks.com ...", end = " ")
	res = self.session.get(headers = headers, url = first_url)
	if res.status_code !=200:
	raise ConnectionError("Can't request https://pennystocks.com. Please check your connection or report this issue on Github")

	print("succeed!")

	def _download_first_page(self, keyword = "apple", max_retry = 5, delay = 2):
	url = f"https://pennystocks.com/?s={keyword}"
	headers = {
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
	}
	res = self.session.get(url = url, headers = headers)
	res = etree.HTML(res.text)
	articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article")
	# not sure why but this really works

	while max_retry and len(articles) == 0:
	import time
	time.sleep(delay)
	print("Gathering again ..", end = ' ')
	res = requests.get(url = url, headers = headers, cookies=self.session.cookies)
	res = etree.HTML(res.text)
	articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article")
	max_retry -= 1
	print(f"Remaining Retry: {max_retry}")


	for a in articles:
	title = a.xpath("./header/h2/a//text()")[0]
	time = a.xpath("./div[3]/div/div/ul/li[1]/text()")[0]
	brief = a.xpath("./div[3]/div/div/text()")[0]
	reading_time = a.xpath("./div[3]/div/div/ul/li[2]/text()")[0]
	columns = ["title", "time", "brief", "reading_time"]
	tmp = pd.DataFrame([[title, time, brief, reading_time]], columns=columns)
	self.dataframe = pd.concat([self.dataframe, tmp])


	def _download_other_pages(self, keyword = "apple"):
	pass