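"""Scrape recent news articles that mention a set of Indian companies.

For each company, topic/search pages on Times of India, Economic Times and
Hindustan Times are crawled for article links. Each article is downloaded and
parsed with newspaper3k, kept only if it actually mentions the company,
classified with the local categorize_text helper, and saved to
company_news/<company>_news.csv.

Note: newspaper3k's Article.nlp() (used here for the summary) generally needs
the NLTK 'punkt' tokenizer, e.g. installed once via nltk.download('punkt').
"""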
import os
from time import sleep
from urllib.parse import quote, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article

from categorize_text import classify_text_domain  # local helper for topic/domain classification


# Article URLs already processed (shared across companies so each link is
# downloaded at most once), mapped to their classified domain
visited_links = {}


def get_article_metadata(url, company_name):
    """Fetches metadata from a given article URL."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()

        # Filter by company name
        if company_name.lower() not in article.text.lower():
            return None  # Skip articles that do not mention the company

        return {
            "title": article.title,
            "summary": article.summary,
            "url": url,
            "publish_date": article.publish_date,
            "domain": classify_text_domain(article.text)
        }

    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None


def extract_news(company_name, max_articles=10):
    """Extracts news articles for the given company."""
    
    # Search/topic pages to crawl; the company name is URL-encoded so
    # multi-word names (e.g. "Bharti Airtel") form valid URLs
    query = quote(company_name)
    all_links = [
        f"https://timesofindia.indiatimes.com/topic/{query}/news",
        f"https://economictimes.indiatimes.com/topic/{query}",
        f"https://www.hindustantimes.com/search?q={query}"
    ]

    articles = []
    
    for base_url in all_links:
        # Stop once enough articles have been collected for this company
        if len(articles) >= max_articles:
            break

        try:
            # Some news sites reject the default requests User-Agent, so send a browser-like one
            response = requests.get(base_url, timeout=10,
                                    headers={"User-Agent": "Mozilla/5.0"})
            if response.status_code != 200:
                print(f"Failed to access {base_url}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract article links, resolving relative hrefs against the page URL
            for a_tag in soup.find_all('a', href=True):
                full_link = urljoin(base_url, a_tag['href'])

                # Filter for valid TOI, ET, and HT articles
                if ("timesofindia.indiatimes.com" in full_link and "articleshow" in full_link) or \
                   ("economictimes.indiatimes.com" in full_link) or \
                   ("hindustantimes.com" in full_link):
                    
                    if full_link not in visited_links:
                        # Respect the per-company limit before downloading more pages
                        if len(articles) >= max_articles:
                            break

                        sleep(1)  # Add delay to prevent rate limiting
                        article_data = get_article_metadata(full_link, company_name)

                        if article_data:
                            visited_links[full_link] = article_data["domain"]
                            articles.append(article_data)
        except Exception as e:
            print(f"Error scraping {base_url}: {e}")
            continue

    # Store results in a DataFrame
    df = pd.DataFrame(articles)
    
    if df.empty:
        print(f"No relevant articles found for {company_name}.")
    else:
        print(f"\nExtracted {len(articles)} articles for {company_name}")
        print(df)

    return df


# βœ… List of 10 Companies to Extract News For
companies = [
    "Reliance", "Tata", "Infosys", "Wipro", "HDFC", 
    "ICICI", "L&T", "Adani", "Bharti Airtel", "Bajaj"
]

# βœ… Loop through each company and extract articles
output_dir = "company_news"
os.makedirs(output_dir, exist_ok=True)

for company in companies:
    print(f"\nπŸ” Extracting news for {company}...")
    
    result_df = extract_news(company, max_articles=10)

    # Save results to CSV
    if not result_df.empty:
        csv_filename = os.path.join(output_dir, f"{company}_news.csv")
        result_df.to_csv(csv_filename, index=False)
        print(f"βœ… Saved {company} news articles to {csv_filename}")
    else:
        print(f"⚠️ No articles found for {company}")

print("\n🎯 Extraction completed for all companies!")