import os
from time import sleep
from urllib.parse import quote, urljoin  # quote/urljoin fix URL building for names like "L&T" and for relative links

import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article

from categorize_text import classify_text_domain
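
# Assumption: categorize_text is a project-local module that is not shown here;
# classify_text_domain(text) is expected to return a short domain/category label
# (e.g. a string) for the given article text.
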
# Dictionary to track visited links
visited_links = {}


def get_article_metadata(url, company_name):
    """Fetches metadata from a given article URL."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()

        # Filter by company name
        if company_name.lower() not in article.text.lower():
            return None  # Skip articles that do not mention the company

        return {
            "title": article.title,
            "summary": article.summary,
            "url": url,
            "publish_date": article.publish_date,
            "domain": classify_text_domain(article.text),
        }
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None
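
# Note: article.nlp() (newspaper3k's keyword/summary extraction) relies on NLTK
# data such as the "punkt" tokenizer; if summaries fail with a missing-resource
# error, running nltk.download("punkt") once should resolve it.
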

def extract_news(company_name, max_articles=10):
    """Extracts news articles for the given company."""
    # URL-encode the company name so characters like "&" or spaces do not break the request URLs
    query = quote(company_name)
    all_links = [
        f"https://timesofindia.indiatimes.com/topic/{query}/news",
        f"https://economictimes.indiatimes.com/topic/{query}",
        f"https://www.hindustantimes.com/search?q={query}",
    ]

    articles = []
    for base_url in all_links:
        try:
            response = requests.get(base_url, timeout=10)
            if response.status_code != 200:
                print(f"Failed to access {base_url}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract article links
            for a_tag in soup.find_all('a', href=True):
                link = a_tag['href']
                # urljoin resolves relative hrefs against the listing URL instead of
                # naively concatenating strings
                full_link = link if link.startswith("http") else urljoin(base_url, link)

                # Filter for valid TOI, ET, and HT articles
                if ("timesofindia.indiatimes.com" in full_link and "articleshow" in full_link) or \
                   ("economictimes.indiatimes.com" in full_link) or \
                   ("hindustantimes.com" in full_link):
                    if full_link not in visited_links:
                        sleep(1)  # Add delay to prevent rate limiting
                        article_data = get_article_metadata(full_link, company_name)
                        if article_data:
                            visited_links[full_link] = article_data["domain"]
                            articles.append(article_data)
                            if len(articles) >= max_articles:
                                break

            # Stop scanning further sources once enough articles have been collected
            if len(articles) >= max_articles:
                break
        except Exception as e:
            print(f"Error scraping {base_url}: {e}")
            continue

    # Store results in a DataFrame
    df = pd.DataFrame(articles)
    if df.empty:
        print(f"No relevant articles found for {company_name}.")
    else:
        print(f"\nExtracted {len(articles)} articles for {company_name}")
        print(df)
    return df
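
# Quick manual check (illustrative; the batch loop below covers the full company list):
# df = extract_news("Infosys", max_articles=5)
# if not df.empty:
#     print(df[["title", "publish_date", "domain"]])
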

# ✅ List of 10 Companies to Extract News For
companies = [
    "Reliance", "Tata", "Infosys", "Wipro", "HDFC",
    "ICICI", "L&T", "Adani", "Bharti Airtel", "Bajaj"
]

# ✅ Loop through each company and extract articles
output_dir = "company_news"
os.makedirs(output_dir, exist_ok=True)

for company in companies:
    print(f"\n🔍 Extracting news for {company}...")
    result_df = extract_news(company, max_articles=10)

    # Save results to CSV
    if not result_df.empty:
        csv_filename = os.path.join(output_dir, f"{company}_news.csv")
        result_df.to_csv(csv_filename, index=False)
        print(f"✅ Saved {company} news articles to {csv_filename}")
    else:
        print(f"⚠️ No articles found for {company}")

print("\n🎯 Extraction completed for all companies!")