import warnings
warnings.filterwarnings("ignore")
import json
import requests
import pandas as pd
from lxml import etree
from tqdm import tqdm
from datetime import datetime
from finnlp.data_sources.news._base import News_Downloader


class SeekingAlpha_Date_Range(News_Downloader):

    def __init__(self, args={}):
        super().__init__(args)

    def download_date_range_stock(self, start_date, end_date, stock="AAPL", proxies=None):
        self.dataframe = pd.DataFrame()

        # Append an hour (13:00) so strptime can build a full timestamp inside each requested day.
        start_timestamp = int(datetime.strptime(start_date + '-13', "%Y-%m-%d-%H").timestamp())
        end_timestamp = int(datetime.strptime(end_date + '-13', "%Y-%m-%d-%H").timestamp())

        # Downloading First Page
        data, totalpages = self._gather_by_page(start_timestamp, end_timestamp, stock, 1, proxies)
        self.dataframe = pd.concat([self.dataframe, data])

        # Downloading Other Pages
        with tqdm(total=totalpages, desc="Downloading Titles") as bar:
            bar.update(1)
            for page in range(2, totalpages + 1):
                data, _ = self._gather_by_page(start_timestamp, end_timestamp, stock, page, proxies)
                self.dataframe = pd.concat([self.dataframe, data])
                bar.update(1)

        self.dataframe = self.dataframe.reset_index(drop=True)
    def _gather_by_page(self, start_timestamp, end_timestamp, stock, page=1, proxies=None):
        # Query Seeking Alpha's symbol news API for one page of results (40 items per page).
        url = f"https://seekingalpha.com/api/v3/symbols/{stock}/news?filter[since]={start_timestamp}&filter[until]={end_timestamp}&id={stock}&include=author%2CprimaryTickers%2CsecondaryTickers%2Csentiments&isMounting=true&page[size]=40&page[number]={page}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0',
            'Referer': 'https://seekingalpha.com/symbol/aapl/news?from=2009-12-31T16%3A00%3A00.000Z&to=2022-01-01T15%3A59%3A59.999Z'
        }
        response = requests.get(url, headers=headers, proxies=proxies)
        if response.status_code != 200:
            print(f"stock: {stock}, page: {page} went wrong!")
            return pd.DataFrame(), 1
        else:
            res = json.loads(response.text)
            data = pd.DataFrame(res["data"])
            # Expand the nested "attributes" and "relationships" dicts into flat columns.
            new_columns = ["publishOn", "isLockedPro", "commentCount", "gettyImageUrl", "videoPreviewUrl", "themes", "title", "isPaywalled"]
            data[new_columns] = data.apply(lambda x: list(x.attributes.values()), axis=1, result_type="expand")
            new_columns = ["author", "sentiments", "primaryTickers", "secondaryTickers", "otherTags"]
            data[new_columns] = data.apply(lambda x: list(x.relationships.values()), axis=1, result_type="expand")
            # total pages
            totalpages = res["meta"]["page"]["totalPages"]
            return data, totalpages
    def obtain_content(self, parallel=False, proxies=None):
        # Download the full article body for every row in self.dataframe.
        if parallel:
            import os
            from pandarallel import pandarallel
            pandarallel.initialize(nb_workers=os.cpu_count())
            self.dataframe['content'] = self.dataframe.parallel_apply(lambda x: self._obtain_content(x, proxies=proxies), axis=1)
        else:
            self.dataframe['content'] = self.dataframe.apply(lambda x: self._obtain_content(x, proxies=proxies), axis=1)
    def _obtain_content(self, x, proxies=None):
        url = x['links']['self']
        url = f"https://seekingalpha.com{url}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0'
        }
        res = requests.get(url, headers=headers, proxies=proxies)
        if res.status_code != 200:
            return ''
        else:
            # The article body is embedded as a window.SSR_DATA JSON blob inside a <script> tag.
            resp = etree.HTML(res.text)
            resp = resp.xpath('//script[5]//text()')
            resp = resp[0].split('window.SSR_DATA = ')[1]
            resp = resp[:-1]  # drop the trailing character so the blob parses as JSON
            resp = json.loads(resp)
            content = resp['article']['response']['data']['attributes']['content']
            # Strip the HTML markup, keep the text nodes, and turn bare spaces into newlines.
            content = etree.HTML(content)
            content = content.xpath('//text()')
            content = [c if c != ' ' else '\n' for c in content]
            content = ''.join(content)
            content = content.strip()
            return content
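

# Usage sketch (not part of the upstream module): a minimal, hedged example of how this
# downloader is typically driven. The ticker, date range, and printed columns are
# illustrative assumptions; the dates must follow the "%Y-%m-%d" layout that
# download_date_range_stock parses above, and proxies can be supplied if needed.
if __name__ == "__main__":
    downloader = SeekingAlpha_Date_Range()
    downloader.download_date_range_stock("2023-01-01", "2023-06-30", stock="AAPL")
    downloader.obtain_content(parallel=False)
    print(downloader.dataframe[["title", "publishOn", "content"]].head())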