File size: 6,372 Bytes
3bf50ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import asyncio
import nltk
import matplotlib.pyplot as plt
from scrapes import get_valid_news_urls, extract_article_content
from sentiV_v2 import analyze_sentiment
from newspaper import Article, Config
from deep_translator import GoogleTranslator  # Replaced googletrans with deep-translator

def chunk_text_by_words(text, chunk_size=100):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Returns a list of strings; an empty/whitespace-only input yields [].
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(' '.join(tokens[start:start + chunk_size]))
    return chunks

def process_articles(company_name):
    """Extract articles with metadata from news URLs and only keep those relevant to the company.

    Args:
        company_name: Name used both to find news URLs and to filter article
            bodies (case-insensitive substring match).

    Returns:
        A list of dicts, one per relevant article, with keys:
        title, url, date, content, sentiment, score.
    """
    urls = get_valid_news_urls(company_name)
    articles = []
    # Set up a custom config with a browser user-agent to help avoid 403 errors
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.159 Safari/537.36')
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 10

    needle = company_name.lower()  # hoisted: invariant across the loop
    for url in urls:
        try:
            art = Article(url, config=config)
            art.download()
            art.parse()
            # art.text may be None for some pages; guard against
            # AttributeError (which would discard the article entirely) and
            # fall back to the project scraper when newspaper extracts nothing.
            text = (art.text or "").strip()
            content = text or extract_article_content(url)
            # Filter out articles that do not mention the company (case-insensitive)
            if not content or needle not in content.lower():
                continue
            sentiment, score = analyze_sentiment(content)
            articles.append({
                "title": art.title if art.title else "No Title",
                "url": url,
                "date": str(art.publish_date) if art.publish_date else "N/A",
                "content": content,
                "sentiment": sentiment,
                "score": score,
            })
        except Exception as e:
            # Best-effort pipeline: log and skip articles that fail to
            # download, parse, or analyze rather than aborting the run.
            print(f"Error processing article {url}: {e}")
    return articles

def generate_combined_summary(articles):
    """Summarize the concatenated content of *articles*.

    Attempts an abstractive summary with a transformers pipeline first; on
    any failure falls back to Sumy's extractive LexRank, and as a last
    resort returns the first 500 characters of the combined text.
    Returns "" when there is no content to summarize.
    """
    combined_text = " ".join(article["content"] for article in articles)
    if not combined_text.strip():
        return ""
    # Preferred path: abstractive summarization via transformers.
    try:
        from transformers import pipeline
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        result = summarizer(combined_text, max_length=150, min_length=50, do_sample=False)
        return result[0]["summary_text"]
    except Exception as e:
        print(f"Transformers summarization failed: {e}")
    # Fallback path: extractive summarization via Sumy's LexRank.
    try:
        from sumy.parsers.plaintext import PlaintextParser
        from sumy.nlp.tokenizers import Tokenizer
        from sumy.summarizers.lex_rank import LexRankSummarizer
        parser = PlaintextParser.from_string(combined_text, Tokenizer("english"))
        lex_rank = LexRankSummarizer()
        picked = lex_rank(parser.document, sentences_count=5)
        joined = " ".join(str(sentence) for sentence in picked)
        return joined if joined else combined_text[:500]
    except Exception as e2:
        print(f"Sumy summarization failed: {e2}")
        return combined_text[:500]

def translate_to_hindi(text):
    """Translate *text* to Hindi using deep-translator's Google backend.

    On any failure the original text is returned unchanged, so callers
    always get a usable string back.
    """
    try:
        return GoogleTranslator(source='auto', target='hi').translate(text)
    except Exception as e:
        print(f"Translation failed: {e}")
        return text

def comparative_analysis(articles):
    """Tally article sentiments and save a bar chart of the distribution.

    Any sentiment label other than "Positive"/"Negative" counts as Neutral.
    Returns a dict with the three counts plus the chart path under "graph".
    """
    tally = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for article in articles:
        label = article.get("sentiment", "Neutral")
        if label not in ("Positive", "Negative"):
            label = "Neutral"
        tally[label] += 1

    # Render the distribution as a bar chart with matplotlib.
    labels = ['Positive', 'Negative', 'Neutral']
    values = [tally[label] for label in labels]
    plt.figure(figsize=(6, 4))
    bars = plt.bar(labels, values, color=['green', 'red', 'gray'])
    plt.title("Comparative Sentiment Analysis")
    plt.xlabel("Sentiment")
    plt.ylabel("Number of Articles")
    # Annotate each bar with its count just above its top edge.
    for rect, value in zip(bars, values):
        plt.text(rect.get_x() + rect.get_width() / 2., rect.get_height(),
                 str(value), ha='center', va='bottom')
    image_path = "sentiment_analysis.png"
    plt.savefig(image_path)
    plt.close()
    return {"Positive": tally["Positive"], "Negative": tally["Negative"],
            "Neutral": tally["Neutral"], "graph": image_path}

def generate_tts_audio(text, output_file="news_summary.mp3"):
    """Generate a TTS audio file from *text* via Edge TTS (tts_hindi_edgetts).

    Returns the value of the project TTS helper (the output path), or None
    when the helper is unavailable or synthesis fails.
    """
    try:
        from tts_hindi_edgetts import text_to_speech_hindi
    except Exception as e:
        print(f"TTS generation failed: {e}")
        return None
    try:
        return asyncio.run(text_to_speech_hindi(text, output_file))
    except Exception as e:
        print(f"TTS generation failed: {e}")
        return None

def process_news(company_name):
    """Run the full news pipeline for *company_name*.

    Steps, in order:
      1. Extract company-relevant articles with metadata and sentiment.
      2. Build a combined English summary of their contents.
      3. Translate the summary to Hindi.
      4. Synthesize a Hindi TTS audio file.
      5. Run comparative sentiment analysis (saves a bar chart).

    Returns a dict bundling all of the above for serialization.
    """
    articles = process_articles(company_name)
    english_summary = generate_combined_summary(articles)
    hindi_summary = translate_to_hindi(english_summary)
    audio_path = generate_tts_audio(hindi_summary)  # file path, or None on failure
    distribution = comparative_analysis(articles)
    return {
        "company": company_name,
        "articles": articles,
        "comparative_sentiment": distribution,
        "final_summary": english_summary,
        "hindi_summary": hindi_summary,
        "tts_audio": audio_path,
    }

if __name__ == "__main__":
    # CLI entry point: prompt for a company, run the pipeline, and dump the
    # full report as pretty-printed JSON (Hindi text kept readable).
    company_query = input("Enter company name: ")
    import json
    report = process_news(company_query)
    print(json.dumps(report, indent=4, ensure_ascii=False))