import time from collections import namedtuple from pathlib import Path from typing import List import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from faker import Faker fake = Faker() MAX_RETRIES = 10 SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL = 0.1 FILING_DETAILS_FILENAME_STEM = "filing-details" SEC_EDGAR_SEARCH_API_ENDPOINT = "https://efts.sec.gov/LATEST/search-index" SEC_EDGAR_ARCHIVES_BASE_URL = "https://www.sec.gov/Archives/edgar/data" retries = Retry( total=MAX_RETRIES, backoff_factor=SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL, status_forcelist=[403, 500, 502, 503, 504], ) FilingMetadata = namedtuple( "FilingMetadata", [ "accession_number", "full_submission_url", "filing_details_url", "filing_details_filename", ], ) class EdgarSearchApiError(Exception): pass def form_request_payload( ticker_or_cik: str, filing_types: List[str], start_date: str, end_date: str, start_index: int, query: str, ) -> dict: payload = { "dateRange": "custom", "startdt": start_date, "enddt": end_date, "entityName": ticker_or_cik, "forms": filing_types, "from": start_index, "q": query, } return payload def build_filing_metadata_from_hit(hit: dict) -> FilingMetadata: accession_number, filing_details_filename = hit["_id"].split(":", 1) # Company CIK should be last in the CIK list. This list may also include # the CIKs of executives carrying out insider transactions like in form 4. cik = hit["_source"]["ciks"][-1] accession_number_no_dashes = accession_number.replace("-", "", 2) submission_base_url = ( f"{SEC_EDGAR_ARCHIVES_BASE_URL}/{cik}/{accession_number_no_dashes}" ) full_submission_url = f"{submission_base_url}/{accession_number}.txt" # Get XSL if human readable is wanted # XSL is required to download the human-readable # and styled version of XML documents like form 4 # SEC_EDGAR_ARCHIVES_BASE_URL + /320193/000032019320000066/wf-form4_159839550969947.xml # SEC_EDGAR_ARCHIVES_BASE_URL + # /320193/000032019320000066/xslF345X03/wf-form4_159839550969947.xml # xsl = hit["_source"]["xsl"] # if xsl is not None: # filing_details_url = f"{submission_base_url}/{xsl}/{filing_details_filename}" # else: # filing_details_url = f"{submission_base_url}/{filing_details_filename}" filing_details_url = f"{submission_base_url}/{filing_details_filename}" filing_details_filename_extension = Path(filing_details_filename).suffix.replace( "htm", "html" ) filing_details_filename = ( f"{FILING_DETAILS_FILENAME_STEM}{filing_details_filename_extension}" ) return FilingMetadata( accession_number=accession_number, full_submission_url=full_submission_url, filing_details_url=filing_details_url, filing_details_filename=filing_details_filename, ) def generate_random_user_agent() -> str: return f"{fake.first_name()} {fake.last_name()} {fake.email()}" def get_filing_urls_to_download( filing_type: str, ticker_or_cik: str, num_filings_to_download: int, after_date: str, before_date: str, include_amends: bool, query: str = "", ) -> List[FilingMetadata]: """Get the filings URL to download the data Returns: List[FilingMetadata]: Filing metadata from SEC """ filings_to_fetch: List[FilingMetadata] = [] start_index = 0 client = requests.Session() client.mount("http://", HTTPAdapter(max_retries=retries)) client.mount("https://", HTTPAdapter(max_retries=retries)) try: while len(filings_to_fetch) < num_filings_to_download: payload = form_request_payload( ticker_or_cik, [filing_type], after_date, before_date, start_index, query, ) headers = { "User-Agent": generate_random_user_agent(), "Accept-Encoding": "gzip, deflate", "Host": "efts.sec.gov", } resp = client.post( SEC_EDGAR_SEARCH_API_ENDPOINT, json=payload, headers=headers ) resp.raise_for_status() search_query_results = resp.json() if "error" in search_query_results: try: root_cause = search_query_results["error"]["root_cause"] if not root_cause: # pragma: no cover raise ValueError error_reason = root_cause[0]["reason"] raise EdgarSearchApiError( f"Edgar Search API encountered an error: {error_reason}. " f"Request payload:\n{payload}" ) except (ValueError, KeyError): # pragma: no cover raise EdgarSearchApiError( "Edgar Search API encountered an unknown error. " f"Request payload:\n{payload}" ) from None query_hits = search_query_results["hits"]["hits"] # No more results to process if not query_hits: break for hit in query_hits: hit_filing_type = hit["_source"]["file_type"] is_amend = hit_filing_type[-2:] == "/A" if not include_amends and is_amend: continue if is_amend: num_filings_to_download += 1 # Work around bug where incorrect filings are sometimes included. # For example, AAPL 8-K searches include N-Q entries. if not is_amend and hit_filing_type != filing_type: continue metadata = build_filing_metadata_from_hit(hit) filings_to_fetch.append(metadata) if len(filings_to_fetch) == num_filings_to_download: return filings_to_fetch # Edgar queries 100 entries at a time, but it is best to set this # from the response payload in case it changes in the future query_size = search_query_results["query"]["size"] start_index += query_size # Prevent rate limiting time.sleep(SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL) finally: client.close() return filings_to_fetch