import asyncio
import json
import nltk  # used by Sumy's English tokenizer in the summarization fallback
import matplotlib.pyplot as plt
from scrapes import get_valid_news_urls, extract_article_content
from sentiV_v2 import analyze_sentiment
from newspaper import Article, Config
from deep_translator import GoogleTranslator  # replaces googletrans for more reliable translation


# Helper: chunk text into smaller parts based on a fixed word count.
def chunk_text_by_words(text, chunk_size=100):
    """Split text into chunks of at most chunk_size words each."""
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
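
# Example (hypothetical input): with the default chunk_size of 100, a 250-word
# text yields three chunks of 100, 100 and 50 words:
#     chunks = chunk_text_by_words(long_text)
#     len(chunks)  # -> 3 for a 250-word long_text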


def process_articles(company_name):
    """Extract articles with metadata from news URLs and keep only those relevant to the company."""
    urls = get_valid_news_urls(company_name)
    articles = []

    # Use a browser-like user agent to reduce the chance of 403 responses.
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.159 Safari/537.36')
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 10

    for url in urls:
        try:
            art = Article(url, config=config)
            art.download()
            art.parse()
            # Fall back to the custom extractor if newspaper returns empty text.
            content = art.text.strip() or extract_article_content(url)
            # Skip articles that do not mention the company (case-insensitive).
            if not content or company_name.lower() not in content.lower():
                continue
            article_data = {
                "title": art.title if art.title else "No Title",
                "url": url,
                "date": str(art.publish_date) if art.publish_date else "N/A",
                "content": content,
            }
            sentiment, score = analyze_sentiment(content)
            article_data["sentiment"] = sentiment
            article_data["score"] = score
            articles.append(article_data)
        except Exception as e:
            print(f"Error processing article {url}: {e}")
    return articles


def generate_combined_summary(articles):
    """Generate a combined summary from articles.

    First attempts a transformers summarization pipeline; if that fails,
    falls back to Sumy's extractive LexRank summarizer.
    """
    combined_text = " ".join(article["content"] for article in articles)
    if not combined_text.strip():
        return ""
    # Try the abstractive transformers summarizer first. Note that
    # facebook/bart-large-cnn accepts only about 1024 tokens of input, so very
    # long combined text may be truncated (see the chunking sketch below).
    try:
        from transformers import pipeline
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        summary = summarizer(combined_text, max_length=150, min_length=50, do_sample=False)
        return summary[0]["summary_text"]
    except Exception as e:
        print(f"Transformers summarization failed: {e}")
    # Fallback: extractive summarization with Sumy's LexRank.
    try:
        from sumy.parsers.plaintext import PlaintextParser
        from sumy.nlp.tokenizers import Tokenizer
        from sumy.summarizers.lex_rank import LexRankSummarizer

        parser = PlaintextParser.from_string(combined_text, Tokenizer("english"))
        summarizer_sumy = LexRankSummarizer()
        summary_sentences = summarizer_sumy(parser.document, sentences_count=5)
        summarized_text = " ".join(str(sentence) for sentence in summary_sentences)
        return summarized_text if summarized_text else combined_text[:500]
    except Exception as e2:
        print(f"Sumy summarization failed: {e2}")
        return combined_text[:500]
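

# A minimal sketch of one possible workaround for BART's input limit, using the
# chunk_text_by_words helper above. This function is an illustration and is not
# called by the pipeline; the 500-word chunk size is an assumption chosen to
# stay within the model's ~1024-token limit:
def summarize_in_chunks(text, summarizer, chunk_size=500):
    """Summarize each word chunk separately and join the partial summaries."""
    partial_summaries = []
    for chunk in chunk_text_by_words(text, chunk_size=chunk_size):
        result = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
        partial_summaries.append(result[0]["summary_text"])
    return " ".join(partial_summaries)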


def translate_to_hindi(text):
    """Translate English text to Hindi using deep-translator for better quality."""
    try:
        translator = GoogleTranslator(source='auto', target='hi')
        return translator.translate(text)
    except Exception as e:
        print(f"Translation failed: {e}")
        return text
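

# deep-translator's GoogleTranslator rejects inputs longer than about 5,000
# characters. A minimal sketch of chunked translation via chunk_text_by_words
# (an illustration, not called above; the 400-word chunk size is an assumption
# chosen to stay under the character limit):
def translate_long_text_to_hindi(text, chunk_size=400):
    """Translate long text to Hindi chunk by chunk and rejoin the results."""
    translator = GoogleTranslator(source='auto', target='hi')
    chunks = chunk_text_by_words(text, chunk_size=chunk_size)
    return " ".join(translator.translate(chunk) for chunk in chunks)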


def comparative_analysis(articles):
    """Perform comparative sentiment analysis across articles and generate a bar chart."""
    pos, neg, neu = 0, 0, 0
    for article in articles:
        sentiment = article.get("sentiment", "Neutral")
        if sentiment == "Positive":
            pos += 1
        elif sentiment == "Negative":
            neg += 1
        else:
            neu += 1
    # Create a bar chart with matplotlib and annotate each bar with its count.
    labels = ['Positive', 'Negative', 'Neutral']
    counts = [pos, neg, neu]
    plt.figure(figsize=(6, 4))
    bars = plt.bar(labels, counts, color=['green', 'red', 'gray'])
    plt.title("Comparative Sentiment Analysis")
    plt.xlabel("Sentiment")
    plt.ylabel("Number of Articles")
    for bar, count in zip(bars, counts):
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2., height, str(count), ha='center', va='bottom')
    image_path = "sentiment_analysis.png"
    plt.savefig(image_path)
    plt.close()
    return {"Positive": pos, "Negative": neg, "Neutral": neu, "graph": image_path}


def generate_tts_audio(text, output_file="news_summary.mp3"):
    """Generate a TTS audio file from text using Edge TTS (via tts_hindi_edgetts.py)."""
    try:
        from tts_hindi_edgetts import text_to_speech_hindi
        # text_to_speech_hindi is an async coroutine, so drive it with asyncio.run.
        return asyncio.run(text_to_speech_hindi(text, output_file))
    except Exception as e:
        print(f"TTS generation failed: {e}")
        return None
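

# tts_hindi_edgetts.py is a local module not shown here. For reference, a
# minimal sketch of what its text_to_speech_hindi coroutine might look like
# with the edge-tts package (the voice name and exact signature are
# assumptions, not the module's confirmed implementation):
#
#     import edge_tts
#
#     async def text_to_speech_hindi(text, output_file="news_summary.mp3"):
#         # "hi-IN-SwaraNeural" is one of the Hindi neural voices in edge-tts.
#         communicate = edge_tts.Communicate(text, voice="hi-IN-SwaraNeural")
#         await communicate.save(output_file)
#         return output_file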


def process_news(company_name):
    """
    Process news by:
      • Extracting articles and metadata (only those relevant to the company)
      • Generating a combined summary of article contents
      • Translating the summary to Hindi
      • Generating a Hindi TTS audio file
      • Performing comparative sentiment analysis with visual output
    """
    articles = process_articles(company_name)
    summary = generate_combined_summary(articles)
    hindi_summary = translate_to_hindi(summary)
    tts_audio = generate_tts_audio(hindi_summary)
    sentiment_distribution = comparative_analysis(articles)
    result = {
        "company": company_name,
        "articles": articles,
        "comparative_sentiment": sentiment_distribution,
        "final_summary": summary,
        "hindi_summary": hindi_summary,
        "tts_audio": tts_audio,  # file path of the generated audio, or None on failure
    }
    return result


if __name__ == "__main__":
    company = input("Enter company name: ")
    data = process_news(company)
    print(json.dumps(data, indent=4, ensure_ascii=False))
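
# Example shape of the printed JSON (values are illustrative, not real output):
# {
#     "company": "Tesla",
#     "articles": [
#         {"title": "...", "url": "...", "date": "...", "content": "...",
#          "sentiment": "Positive", "score": 0.87}
#     ],
#     "comparative_sentiment": {"Positive": 3, "Negative": 1, "Neutral": 2,
#                               "graph": "sentiment_analysis.png"},
#     "final_summary": "...",
#     "hindi_summary": "...",
#     "tts_audio": "news_summary.mp3"
# }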