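"""Seeking Alpha news downloader for FinNLP.

Fetches news titles for a ticker over a date range from Seeking Alpha's
JSON API, then optionally scrapes each article's full text.
"""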
import warnings
warnings.filterwarnings("ignore")  # silence noisy warnings from dependencies
import json
import requests
import pandas as pd
from lxml import etree
from tqdm import tqdm
from datetime import datetime
from finnlp.data_sources.news._base import News_Downloader


class SeekingAlpha_Date_Range(News_Downloader):

    def __init__(self, args=None):
        # Avoid a mutable default argument; fall back to an empty config dict.
        super().__init__(args if args is not None else {})

    def download_date_range_stock(self, start_date, end_date, stock="AAPL", proxies=None):
        self.dataframe = pd.DataFrame()
        # Anchor both dates at 13:00 so the timestamps fall inside the day.
        start_timestamp = int(datetime.strptime(start_date + "-13", "%Y-%m-%d-%H").timestamp())
        end_timestamp = int(datetime.strptime(end_date + "-13", "%Y-%m-%d-%H").timestamp())

        # Download the first page to learn the total page count.
        data, totalpages = self._gather_by_page(start_timestamp, end_timestamp, stock, 1, proxies)
        self.dataframe = pd.concat([self.dataframe, data])

        # Download the remaining pages.
        with tqdm(total=totalpages, desc="Downloading Titles") as bar:
            bar.update(1)
            for page in range(2, totalpages + 1):
                data, _ = self._gather_by_page(start_timestamp, end_timestamp, stock, page, proxies)
                self.dataframe = pd.concat([self.dataframe, data])
                bar.update(1)

        self.dataframe = self.dataframe.reset_index(drop=True)

    def _gather_by_page(self, start_timestamp, end_timestamp, stock, page=1, proxies=None):
        url = (
            f"https://seekingalpha.com/api/v3/symbols/{stock}/news"
            f"?filter[since]={start_timestamp}&filter[until]={end_timestamp}"
            f"&id={stock}&include=author%2CprimaryTickers%2CsecondaryTickers%2Csentiments"
            f"&isMounting=true&page[size]=40&page[number]={page}"
        )
        # The Referer mimics a browser visit to the symbol's news page.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0',
            'Referer': f'https://seekingalpha.com/symbol/{stock.lower()}/news?from=2009-12-31T16%3A00%3A00.000Z&to=2022-01-01T15%3A59%3A59.999Z',
        }
        response = requests.get(url, headers=headers, proxies=proxies)
        if response.status_code != 200:
            print(f"stock: {stock}, page: {page} went wrong!")
            return pd.DataFrame(), 1

        res = json.loads(response.text)
        data = pd.DataFrame(res["data"])
        totalpages = res["meta"]["page"]["totalPages"]
        if data.empty:
            return data, totalpages

        # Flatten the nested "attributes" and "relationships" dicts into columns.
        # This relies on the API returning the keys in a fixed order.
        new_columns = ["publishOn", "isLockedPro", "commentCount", "gettyImageUrl",
                       "videoPreviewUrl", "themes", "title", "isPaywalled"]
        data[new_columns] = data.apply(lambda x: list(x.attributes.values()), axis=1, result_type="expand")
        new_columns = ["author", "sentiments", "primaryTickers", "secondaryTickers", "otherTags"]
        data[new_columns] = data.apply(lambda x: list(x.relationships.values()), axis=1, result_type="expand")
        return data, totalpages

    def obtain_content(self, parallel=False, proxies=None):
        # Fetch the full article text for every row, optionally in parallel
        # via pandarallel (imported lazily so it stays an optional dependency).
        if parallel:
            import os
            from pandarallel import pandarallel
            pandarallel.initialize(nb_workers=os.cpu_count())
            self.dataframe['content'] = self.dataframe.parallel_apply(
                lambda x: self._obtain_content(x, proxies=proxies), axis=1)
        else:
            self.dataframe['content'] = self.dataframe.apply(
                lambda x: self._obtain_content(x, proxies=proxies), axis=1)

    def _obtain_content(self, x, proxies=None):
        url = f"https://seekingalpha.com{x['links']['self']}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0'
        }
        res = requests.get(url, headers=headers, proxies=proxies)
        if res.status_code != 200:
            return ''

        # The article payload is embedded in the page as JSON assigned to
        # window.SSR_DATA inside the fifth <script> tag; this is brittle and
        # will break if the page layout changes.
        resp = etree.HTML(res.text)
        resp = resp.xpath('//script[5]//text()')
        resp = resp[0].split('window.SSR_DATA = ')[1]
        resp = resp[:-1]  # drop the trailing ";"
        resp = json.loads(resp)
        content = resp['article']['response']['data']['attributes']['content']

        # Strip HTML tags, turning bare-space text nodes into newlines.
        content = etree.HTML(content)
        content = content.xpath('//text()')
        content = [c if c != ' ' else '\n' for c in content]
        return ''.join(content).strip()
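

if __name__ == "__main__":
    # Usage sketch, not from the original module: assumes network access and
    # that Seeking Alpha's unauthenticated JSON API still responds (the site
    # may rate-limit or block scrapers, so expect empty results at times).
    downloader = SeekingAlpha_Date_Range()
    downloader.download_date_range_stock("2023-01-01", "2023-03-31", stock="AAPL")
    downloader.obtain_content()
    print(downloader.dataframe[["publishOn", "title", "content"]].head())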