import os
from dotenv import load_dotenv
load_dotenv()
import requests
from bs4 import BeautifulSoup
from newsapi import NewsApiClient
import pandas as pd
import torch
import soundfile as sf
import gradio as gr
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer, pipeline,
    BartTokenizer, BartForConditionalGeneration,
    MarianMTModel, MarianTokenizer,
    BarkModel, AutoProcessor
)
import librosa
import re

# -------------------------
# Global Setup and Environment Variables
# -------------------------
NEWS_API_KEY = os.getenv("NEWS_API_KEY")  # Set this in your .env file
device = "cpu"  # Force CPU since no GPU is available in Hugging Face Spaces

# -------------------------
# News Extraction Functions
# -------------------------
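# fetch_and_scrape_news queries NewsAPI for recent English articles about the given
# company, scrapes each result's page with scrape_news, and saves the collected
# headline/content/url rows to an Excel file for the downstream pipeline.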
def fetch_and_scrape_news(company, api_key, count=1, output_file='news_articles.xlsx'):
    print("Starting news fetch from NewsAPI...")
    newsapi = NewsApiClient(api_key=api_key)
    all_articles = newsapi.get_everything(q=company, language='en', sort_by='relevancy', page_size=count)
    articles = all_articles.get('articles', [])
    scraped_data = []
    print(f"Found {len(articles)} articles. Starting scraping individual articles...")
    for i, article in enumerate(articles):
        url = article.get('url')
        if url:
            print(f"Scraping article {i+1}: {url}")
            scraped_article = scrape_news(url)
            if scraped_article:
                scraped_article['url'] = url
                scraped_data.append(scraped_article)
    df = pd.DataFrame(scraped_data)
    df.to_excel(output_file, index=False, header=True)
    print(f"News scraping complete. Data saved to {output_file}")
    return df

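# scrape_news fetches a single article page, extracts the first <h1> as the headline
# and the concatenated <p> text as the body; returns None if the request fails.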
def scrape_news(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to fetch the page: {url} ({e})")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    headline = soup.find("h1").get_text(strip=True) if soup.find("h1") else "No headline found"
    paragraphs = soup.find_all("p")
    article_text = " ".join(p.get_text(strip=True) for p in paragraphs)
    return {"headline": headline, "content": article_text}

# -------------------------
# Sentiment Analysis Setup
# -------------------------
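# Sentiment is obtained via zero-shot classification: an NLI cross-encoder scores each
# article summary against the candidate labels "positive", "negative", and "neutral",
# and the top-ranked label is taken as the article's sentiment.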
print("Loading sentiment analysis model...") | |
sentiment_model_name = "cross-encoder/nli-distilroberta-base" | |
sentiment_model = AutoModelForSequenceClassification.from_pretrained( | |
sentiment_model_name, | |
torch_dtype=torch.float32 | |
) | |
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name) | |
classifier = pipeline("zero-shot-classification", model=sentiment_model, tokenizer=sentiment_tokenizer, device=-1) | |
labels = ["positive", "negative", "neutral"] | |
# -------------------------
# Summarization Setup
# -------------------------
print("Loading summarization model (BART)...")
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

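# split_into_chunks breaks an article into word chunks whose tokenized length stays
# within BART's 1024-token input limit, so each chunk can be summarized separately
# and the partial summaries joined afterwards.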
def split_into_chunks(text, tokenizer, max_tokens=1024):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        tokenized_word = tokenizer.encode(word, add_special_tokens=False)
        if current_length + len(tokenized_word) <= max_tokens:
            current_chunk.append(word)
            current_length += len(tokenized_word)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(tokenized_word)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# -------------------------
# Translation Setup (English to Hindi)
# -------------------------
print("Loading translation model (MarianMT)...")
translation_model_name = 'Helsinki-NLP/opus-mt-en-hi'
trans_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
trans_model = MarianMTModel.from_pretrained(translation_model_name)

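# translate_text runs a single English string through the MarianMT en-hi checkpoint
# and returns the decoded Hindi translation.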
def translate_text(text):
    tokens = trans_tokenizer(text, return_tensors="pt", padding=True)
    translated = trans_model.generate(**tokens)
    return trans_tokenizer.decode(translated[0], skip_special_tokens=True)

# -------------------------
# Bark TTS Setup (Hindi)
# -------------------------
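# Bark handles the Hindi speech synthesis. The smaller "suno/bark-small" checkpoint is
# loaded, presumably to keep CPU-only generation in a Space tractable.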
print("Loading Bark TTS model...") | |
bark_model = BarkModel.from_pretrained("suno/bark-small") | |
bark_model.to(device) | |
processor = AutoProcessor.from_pretrained("suno/bark-small")  # processor matched to the bark-small checkpoint

# -------------------------
# Helper Functions for Audio and Text Preprocessing
# -------------------------
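# normalize_text lowercases a string and strips punctuation; resample_audio converts an
# audio array to a target sample rate with librosa. Neither helper is called by
# process_company below; they are available for optional preprocessing.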
def normalize_text(text):
    return re.sub(r"[^\w\s]", "", text.lower()).strip()

def resample_audio(audio_array, orig_sr, target_sr=16000):
    if orig_sr != target_sr:
        audio_array = librosa.resample(audio_array, orig_sr=orig_sr, target_sr=target_sr)
    return audio_array

# -------------------------
# Main Pipeline Function
# -------------------------
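# process_company runs the full pipeline for one company name:
#   1. fetch and scrape news articles, persisting them to news_articles.xlsx
#   2. summarize each article with BART (chunked to respect the 1024-token limit)
#   3. classify each summary's sentiment via zero-shot classification
#   4. translate the summaries to Hindi and synthesize speech with Bark
#   5. assemble a JSON-style report plus the path to the generated audio file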
def process_company(company):
    print(f"Processing company: {company}")
    # Step 1: Fetch and scrape news
    print("Fetching and scraping news...")
    fetch_and_scrape_news(company, NEWS_API_KEY)
    df = pd.read_excel('news_articles.xlsx')
    print("Scraped Articles:")
    print(df)
    articles_data = []
    for index, row in df.iterrows():
        print(f"Processing article {index+1}...")
        article_text = row.get("content", "")
        title = row.get("headline", "No title")
        url = row.get("url", "")
        chunks = split_into_chunks(article_text, bart_tokenizer)
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            print(f"Summarizing chunk {i+1}/{len(chunks)}...")
            inputs = bart_tokenizer([chunk], max_length=1024, return_tensors='pt', truncation=True)
            summary_ids = bart_model.generate(inputs.input_ids, num_beams=2, max_length=50, min_length=30, early_stopping=True)
            chunk_summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            chunk_summaries.append(chunk_summary)
        final_summary = ' '.join(chunk_summaries)
        print("Performing sentiment analysis...")
        sentiment_result = classifier(final_summary, labels)
        sentiment = sentiment_result["labels"][0]
        articles_data.append({
            "Title": title,
            "Summary": final_summary,
            "Sentiment": sentiment,
            "URL": url
        })
    # Comparative Analysis: Build a simple sentiment distribution
    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for article in articles_data:
        key = article["Sentiment"].capitalize()
        sentiment_distribution[key] += 1
    print("Sentiment distribution computed.")
    # Step 2: Translate summaries and generate Hindi speech
    print("Translating summaries to Hindi...")
    translated_summaries = [translate_text(article["Summary"]) for article in articles_data]
    final_translated_text = "\n\n".join(translated_summaries)
    print("Generating Hindi speech with Bark TTS...")
    inputs = processor(final_translated_text, return_tensors="pt")
    speech_output = bark_model.generate(**inputs)
    audio_path = "final_summary.wav"
    sf.write(audio_path, speech_output[0].cpu().numpy(), bark_model.generation_config.sample_rate)
    print("Audio generated and saved.")
    # Build final report
    report = {
        "Company": company,
        "Articles": articles_data,
        "Comparative Sentiment Score": {
            "Sentiment Distribution": sentiment_distribution,
            "Coverage Differences": "Detailed comparative analysis not implemented",
            "Topic Overlap": "Topic extraction not implemented"
        },
        "Final Sentiment Analysis": "Overall sentiment analysis not fully computed",
        "Audio": audio_path
    }
    print("Final report prepared.")
    return report, audio_path

# -------------------------
# Gradio Interface Function
# -------------------------
def gradio_interface(company):
    print(f"Received input: {company}")
    report, audio_path = process_company(company)
    return report, audio_path

# -------------------------
# Gradio UI Setup
# -------------------------
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="Enter Company Name"),
    outputs=[
        gr.JSON(label="News Sentiment Report"),
        gr.Audio(type="filepath", label="Hindi Summary Audio")
    ],
    title="News Summarization & Text-to-Speech",
    description="Enter a company name to fetch news articles, perform sentiment analysis, and listen to a Hindi TTS summary."
)

if __name__ == "__main__":
    iface.launch()