import json
import time

import pandas as pd
import requests
from lxml import etree
from tqdm import tqdm

from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader
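
# SEC_Announcement pulls company filings from the SEC EDGAR full-text search
# API (efts.sec.gov), downloads each filing document, extracts its visible
# text, and appends one row per document to `self.dataframe`.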
class SEC_Announcement(Company_Announcement_Downloader):

    def __init__(self, args=None):
        # Avoid the mutable-default-argument pitfall; treat None as "no config".
        super().__init__(args if args is not None else {})
        self.dataframe = pd.DataFrame()

    def download_date_range_stock(self, start_date, end_date, stock="AAPL", delay=0.1):
        entityName = self._get_entity_name(stock)
        # The first page also reports how many result pages exist in total.
        total_pages = self._gather_one_page(start_date, end_date, 1, entityName, delay)
        # Fetch the remaining pages, if any.
        if total_pages > 1:
            for page in tqdm(range(1, total_pages), desc="Downloading other pages..."):
                self._gather_one_page(start_date, end_date, page + 1, entityName, delay)
        self.dataframe = self.dataframe.reset_index(drop=True)
    def _get_entity_name(self, stock="AAPL"):
        # Resolve a ticker into the "<name> (CIK <10-digit CIK>)" string that
        # the EDGAR full-text search API expects as `entityName`.
        url = "https://efts.sec.gov/LATEST/search-index"
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
        }
        params = {
            "keysTyped": stock,
        }
        resp = self._request_get(url=url, headers=headers, params=params)
        if resp is None:
            raise ConnectionError("Can't get entity name")
        res = json.loads(resp.text)
        item_list = res["hits"]["hits"]

        entityName_list = []
        for item in item_list:
            # "entity_words" is the display name; "_id" is the CIK, which
            # EDGAR formats as a zero-padded ten-digit string.
            c_name_one = item["_source"]["entity_words"]
            c_name_two = item["_id"].zfill(10)
            entityName = f"{c_name_one} (CIK {c_name_two})"
            entityName_list.append(entityName)

        if not entityName_list:
            raise ValueError(f"No entity found for stock {stock!r}")
        # Take the best (first) suggestion.
        return entityName_list[0]
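
    # Illustrative result (the exact display name comes from EDGAR's
    # "entity_words" field, so treat this value as an example only):
    #   _get_entity_name("AAPL")  ->  "Apple Inc. (AAPL) (CIK 0000320193)"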
    def _gather_one_page(self, start_date, end_date, page, entityName="Apple Inc. (AAPL) (CIK 0000320193)", delay=0.01):
        # Each results page holds at most 100 filings.
        from_ = (page - 1) * 100
        url = "https://efts.sec.gov/LATEST/search-index"
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
        }
        params = {
            "dateRange": "all",
            "entityName": entityName,
            "startdt": start_date,
            "enddt": end_date,
            "from": from_,
            "page": page,
        }
        resp = self._request_get(url=url, headers=headers, params=params)
        if resp is None:
            # Return 0 pages instead of a string so the caller's
            # `total_pages > 1` comparison stays valid.
            return 0
        res = json.loads(resp.text)

        # Total number of result pages (ceiling of total_items / 100).
        total_items = res["hits"]["total"]["value"]
        if total_items % 100 == 0:
            total_pages = total_items // 100
        else:
            total_pages = total_items // 100 + 1
        items = res["hits"]["hits"]
        url_base = "https://www.sec.gov/Archives/edgar/data"
        for item in tqdm(items, desc="Downloading by item..."):
            # Build the document URL from the hit id, which has the form
            # "<accession-number>:<file-name>", e.g.
            # "0000320193-23-000077:aapl-20230701.htm".
            url_third = item["_source"]["xsl"]
            url_second, url_fourth = item["_id"].split(":")
            url_second = url_second.split("-")
            # The CIK, with leading (but not trailing) zeros removed.
            url_first = url_second[0].lstrip("0")
            url_second = "".join(url_second)
            if url_third is not None:
                url_new = f"{url_base}/{url_first}/{url_second}/{url_third}/{url_fourth}"
            else:
                url_new = f"{url_base}/{url_first}/{url_second}/{url_fourth}"
            respn = self._request_get(url=url_new, headers=headers)
            if respn is None:
                continue
            try:
                # Keep only the visible text of the filing document.
                html = etree.HTML(respn.text)
                content = html.xpath("/html/body//text()")
                content = [c for c in content if c != "\n"]
                content = "".join(content)
                # Filing metadata, taken straight from the search hit.
                source = item["_source"]
                data = [
                    item["_id"], source["ciks"], source["period_ending"], source["root_form"],
                    source["file_num"], source["display_names"], source["xsl"], source["sequence"],
                    source["file_date"], source["biz_states"], source["sics"], source["form"],
                    source["adsh"], source["film_num"], source["biz_locations"], source["file_type"],
                    source["file_description"], source["inc_states"], source["items"], content,
                ]
                # The "items" field is stored under the column name "ite",
                # matching the original output schema.
                columns = [
                    "_id", "ciks", "period_ending", "root_form", "file_num", "display_names", "xsl", "sequence",
                    "file_date", "biz_states", "sics", "form", "adsh", "film_num", "biz_locations", "file_type",
                    "file_description", "inc_states", "ite", "content",
                ]
                # One row per filing document.
                tmp = pd.DataFrame([data], columns=columns)
                self.dataframe = pd.concat([self.dataframe, tmp])
                time.sleep(delay)
            except Exception:
                # Skip documents that fail to download or parse.
                continue

        return total_pages
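
# A minimal usage sketch (assumes network access to efts.sec.gov and
# www.sec.gov; the ticker, date range, and delay are illustrative):
if __name__ == "__main__":
    downloader = SEC_Announcement()
    downloader.download_date_range_stock("2020-01-01", "2020-06-01", stock="AAPL", delay=0.5)
    # Each row is one filing document; "content" holds its extracted text.
    print(downloader.dataframe.shape)
    print(downloader.dataframe[["file_date", "form", "display_names"]].head())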