import asyncio
import json
import nltk  # sumy's Tokenizer("english") in the summarization fallback needs nltk's 'punkt' data
import matplotlib.pyplot as plt
from scrapes import get_valid_news_urls, extract_article_content
from sentiV_v2 import analyze_sentiment
from newspaper import Article, Config
from deep_translator import GoogleTranslator  # replaced googletrans with deep-translator


# Helper: chunk text into smaller parts based on a fixed word count
def chunk_text_by_words(text, chunk_size=100):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
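
# Quick illustration of the chunking behaviour (hypothetical input):
#   chunk_text_by_words("a b c d e", chunk_size=2) -> ['a b', 'c d', 'e']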


def process_articles(company_name):
    """Extract articles with metadata from news URLs and only keep those relevant to the company."""
    urls = get_valid_news_urls(company_name)
    articles = []
    # Set up a custom config with a browser user-agent to help avoid 403 errors
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.159 Safari/537.36')
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 10
    for url in urls:
        try:
            art = Article(url, config=config)
            art.download()
            art.parse()
            # Fall back to the custom scraper if newspaper extracted no text
            content = art.text.strip() if art.text.strip() else extract_article_content(url)
            # Filter out articles that do not mention the company (case-insensitive)
            if not content or company_name.lower() not in content.lower():
                continue
            article_data = {
                "title": art.title if art.title else "No Title",
                "url": url,
                "date": str(art.publish_date) if art.publish_date else "N/A",
                "content": content
            }
            sentiment, score = analyze_sentiment(content)
            article_data["sentiment"] = sentiment
            article_data["score"] = score
            articles.append(article_data)
        except Exception as e:
            print(f"Error processing article {url}: {e}")
    return articles
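
# For reference, each entry in the returned list has the shape:
#   {"title": ..., "url": ..., "date": ..., "content": ..., "sentiment": ..., "score": ...}
# where "sentiment" and "score" come from sentiV_v2.analyze_sentiment.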


def generate_combined_summary(articles):
    """Generate a combined summary from articles.

    First attempts to use a transformers pipeline; if that fails, falls back to Sumy.
    """
    combined_text = " ".join(article["content"] for article in articles)
    if not combined_text.strip():
        return ""
    # Try using the transformers summarizer
    try:
        from transformers import pipeline
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        # truncation=True keeps long combined texts within the model's token
        # limit (~1024 for BART) instead of erroring out
        summary = summarizer(combined_text, max_length=150, min_length=50,
                             do_sample=False, truncation=True)
        return summary[0]["summary_text"]
    except Exception as e:
        print(f"Transformers summarization failed: {e}")
    # Fallback: extractive summarization with Sumy
    try:
        from sumy.parsers.plaintext import PlaintextParser
        from sumy.nlp.tokenizers import Tokenizer
        from sumy.summarizers.lex_rank import LexRankSummarizer
        parser = PlaintextParser.from_string(combined_text, Tokenizer("english"))
        summarizer_sumy = LexRankSummarizer()
        summary_sentences = summarizer_sumy(parser.document, sentences_count=5)
        summarized_text = " ".join(str(sentence) for sentence in summary_sentences)
        return summarized_text if summarized_text else combined_text[:500]
    except Exception as e2:
        print(f"Sumy summarization failed: {e2}")
        return combined_text[:500]
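
# Sketch (untested, assumes the summarizer above): a map-reduce style alternative
# for inputs far beyond the model's context, using the chunk_text_by_words helper
# instead of truncation:
#   def summarize_long_text(text, summarizer):
#       partials = [summarizer(chunk, max_length=60, min_length=20, do_sample=False,
#                              truncation=True)[0]["summary_text"]
#                   for chunk in chunk_text_by_words(text, chunk_size=400)]
#       return " ".join(partials)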


def translate_to_hindi(text):
    """Translate English text to Hindi using deep-translator for better quality."""
    try:
        translator = GoogleTranslator(source='auto', target='hi')
        return translator.translate(text)
    except Exception as e:
        print(f"Translation failed: {e}")
        return text
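
# deep-translator's GoogleTranslator rejects inputs over ~5000 characters; if the
# summary ever grows past that, chunk_text_by_words could split it first, e.g.:
#   " ".join(GoogleTranslator(source='auto', target='hi').translate(c)
#            for c in chunk_text_by_words(long_text, chunk_size=500))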


def comparative_analysis(articles):
    """Perform comparative sentiment analysis across articles and generate a bar chart."""
    pos, neg, neu = 0, 0, 0
    for article in articles:
        sentiment = article.get("sentiment", "Neutral")
        if sentiment == "Positive":
            pos += 1
        elif sentiment == "Negative":
            neg += 1
        else:
            neu += 1
    # Create a bar chart using matplotlib
    labels = ['Positive', 'Negative', 'Neutral']
    counts = [pos, neg, neu]
    plt.figure(figsize=(6, 4))
    bars = plt.bar(labels, counts, color=['green', 'red', 'gray'])
    plt.title("Comparative Sentiment Analysis")
    plt.xlabel("Sentiment")
    plt.ylabel("Number of Articles")
    # Annotate each bar with its count
    for bar, count in zip(bars, counts):
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2., height, str(count), ha='center', va='bottom')
    image_path = "sentiment_analysis.png"
    plt.savefig(image_path)
    plt.close()
    return {"Positive": pos, "Negative": neg, "Neutral": neu, "graph": image_path}


def generate_tts_audio(text, output_file="news_summary.mp3"):
    """Generate a TTS audio file from text using Edge TTS (via tts_hindi_edgetts.py)."""
    try:
        from tts_hindi_edgetts import text_to_speech_hindi
        return asyncio.run(text_to_speech_hindi(text, output_file))
    except Exception as e:
        print(f"TTS generation failed: {e}")
        return None
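
# For reference, a minimal edge-tts coroutine that tts_hindi_edgetts is assumed to
# resemble (the real module is not shown here; the voice name is an assumption):
#   import edge_tts
#   async def text_to_speech_hindi(text, output_file):
#       await edge_tts.Communicate(text, voice="hi-IN-SwaraNeural").save(output_file)
#       return output_file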


def process_news(company_name):
    """
    Process news by:
      • Extracting articles and metadata (only those relevant to the company)
      • Generating a combined summary of article contents
      • Translating the summary to Hindi
      • Generating a Hindi TTS audio file
      • Performing comparative sentiment analysis with visual output
    """
    articles = process_articles(company_name)
    summary = generate_combined_summary(articles)
    hindi_summary = translate_to_hindi(summary)
    tts_audio = generate_tts_audio(hindi_summary)
    sentiment_distribution = comparative_analysis(articles)
    result = {
        "company": company_name,
        "articles": articles,
        "comparative_sentiment": sentiment_distribution,
        "final_summary": summary,
        "hindi_summary": hindi_summary,
        "tts_audio": tts_audio  # file path for the generated audio, or None on failure
    }
    return result


if __name__ == "__main__":
    company = input("Enter company name: ")
    data = process_news(company)
    print(json.dumps(data, indent=4, ensure_ascii=False))