Upload 8 files
- api.py +66 -0
- app.py +60 -0
- categorize_text.py +62 -0
- clearCache.py +12 -0
- main2.py +118 -0
- requirements.txt +9 -0
- sentiment_analysis.py +55 -0
- tts_hindi.py +27 -0
api.py
ADDED
@@ -0,0 +1,66 @@
from flask import Flask, request, jsonify
import os
from sentiment_analysis import perform_sentiment_analysis, comparative_analysis
from tts_hindi import generate_hindi_coqui_tts
import pandas as pd

app = Flask(__name__)


@app.route('/analyze', methods=['POST'])
def analyze():
    """Perform news sentiment analysis and TTS."""
    try:
        company_name = request.json.get('company_name')

        if not company_name:
            return jsonify({"error": "Company name is required"}), 400

        # CSV file with extracted articles
        csv_file = f"company_news/{company_name}_news.csv"

        if not os.path.exists(csv_file):
            return jsonify({"error": f"No data found for {company_name}"}), 404

        # Perform sentiment analysis
        sentiment_df = perform_sentiment_analysis(csv_file)
        sentiment_summary = comparative_analysis(sentiment_df)

        # ✅ Generate Hindi TTS audio
        summary_text = ". ".join(sentiment_df['summary'].tolist())
        audio_file = generate_hindi_coqui_tts(summary_text, company_name)

        # Extract article details
        articles = sentiment_df[['title', 'summary', 'url']].to_dict(orient='records')

        return jsonify({
            "company": company_name,
            "sentiment_summary": sentiment_summary,
            "articles": articles,
            "audio_file": audio_file
        })

    except Exception as e:
        print(f"API Error: {e}")
        return jsonify({"error": "Internal server error"}), 500


@app.route('/generate-tts', methods=['POST'])
def generate_tts_api():
    data = request.get_json()

    text = data.get('text')
    company_name = data.get('company_name', 'default_company')

    if not text:
        return jsonify({"error": "Text is required"}), 400

    audio_file = generate_hindi_coqui_tts(text, company_name)

    if audio_file and os.path.exists(audio_file):
        return jsonify({
            "message": "✅ TTS generated successfully",
            "audio_file": audio_file
        })
    else:
        return jsonify({"error": "Failed to generate TTS"}), 500


if __name__ == '__main__':
    app.run(debug=True)
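The two endpoints above can be exercised with a plain requests client once the Flask app is running (app.py assumes the default development address http://127.0.0.1:5000). The snippet below is a minimal sketch under that assumption; "Reliance" is only an example company name and presumes main2.py has already produced company_news/Reliance_news.csv.

import requests

BASE_URL = "http://127.0.0.1:5000"  # default Flask dev server, as used by app.py

# Analyse previously scraped news for one company
resp = requests.post(f"{BASE_URL}/analyze", json={"company_name": "Reliance"})
if resp.ok:
    payload = resp.json()
    print(payload["sentiment_summary"])  # {"positive": ..., "negative": ..., "neutral": ...}
    print(payload["audio_file"])         # path of the generated Hindi TTS file
else:
    print("Request failed:", resp.status_code, resp.json().get("error"))

# Generate Hindi TTS for arbitrary text via the second endpoint
resp = requests.post(f"{BASE_URL}/generate-tts",
                     json={"text": "यह एक परीक्षण है", "company_name": "Demo"})
print(resp.json())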
app.py
ADDED
@@ -0,0 +1,60 @@
import streamlit as st
import requests
import os
import pandas as pd

BACKEND_URL = "http://127.0.0.1:5000/analyze"

st.title("📊 News Sentiment Analysis & TTS in Hindi")

# Input field for company name
company_name = st.text_input("Enter Company Name", "")

if st.button("Analyze"):
    if not company_name:
        st.warning("⚠️ Please enter a company name.")
    else:
        st.info(f"Analyzing news for {company_name}...")

        response = requests.post(
            BACKEND_URL,
            json={"company_name": company_name}
        )

        if response.status_code == 200:
            data = response.json()

            st.success("✅ Analysis Complete!")

            # ✅ Display Sentiment Summary
            st.subheader("📊 Sentiment Summary")
            st.json(data["sentiment_summary"])

            # ✅ Display Articles
            st.subheader("📰 Extracted Articles")

            df = pd.DataFrame(data["articles"])
            for _, article in df.iterrows():
                st.markdown(f"### [{article['title']}]({article['url']})")
                st.write(f"**Summary:** {article['summary']}")
                st.write("---")

            # ✅ Display Hindi TTS Audio
            st.subheader("🔊 Hindi TTS Audio Output")

            # Use the audio file path returned by the backend instead of a hardcoded test file
            audio_file = data.get("audio_file", "")
            if audio_file and os.path.exists(audio_file):
                with open(audio_file, "rb") as audio:
                    st.download_button(
                        label="🔊 Download Hindi TTS Audio",
                        data=audio,
                        file_name=os.path.basename(audio_file),
                        mime="audio/wav"
                    )
                st.audio(audio_file, format="audio/wav")
                st.success("✅ Hindi TTS audio displayed successfully!")
            else:
                st.error("❌ TTS file not found.")

        else:
            st.error("❌ Error analyzing news. Please try again.")
categorize_text.py
ADDED
@@ -0,0 +1,62 @@
import pandas as pd
from nltk.corpus import stopwords


# Preprocessing function
def preprocess_text(text):
    """Tokenize and clean the input text"""
    tokens = text.lower().split()

    # Remove punctuation and stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

    return tokens


# Function to calculate similarity score between text and bag of words
def similarity_score(text, bow):
    """Calculate similarity score between text and BoW"""
    tokens = preprocess_text(text)

    # Ensure the BoW contains the 'Word' and 'Frequency' columns
    if 'Word' not in bow.columns or 'Frequency' not in bow.columns:
        print("Invalid BoW format. Ensure it contains 'Word' and 'Frequency' columns.")
        return 0

    # Calculate similarity score
    common_words = set(tokens) & set(bow['Word'])

    # Sum the frequencies of matching words
    score = sum(bow[bow['Word'] == word]['Frequency'].values[0] for word in common_words)

    return score


# Function to classify text domain using bag of words
def classify_text_domain(text):
    """Classify text domain based on similarity score with BoW files"""

    # Load BoW CSV files for different domains
    try:
        reliance_bow = pd.read_csv("reliance_bow.csv")
    except FileNotFoundError:
        print("BoW file not found.")
        return "Unknown"

    # Ensure CSV files are not empty
    if reliance_bow.empty:
        print("BoW file is empty.")
        return "Unknown"

    # Calculate similarity scores
    scores = {
        "Reliance": similarity_score(text, reliance_bow)
    }

    # Determine the domain with the highest similarity score
    domain = max(scores, key=scores.get)

    print(f"Scores: {scores}")  # Display the computed scores

    # Return the best-matching domain (main2.py stores this value per article)
    return domain
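similarity_score expects a bag-of-words DataFrame with Word and Frequency columns, and classify_text_domain loads that frame from reliance_bow.csv, which is not part of this upload. A minimal sketch of the expected format, using a hypothetical in-memory frame:

import pandas as pd
from categorize_text import similarity_score

# Hypothetical BoW frame in the format the functions check for
reliance_bow = pd.DataFrame({
    "Word": ["reliance", "jio", "retail", "energy"],
    "Frequency": [120, 85, 60, 40],
})

# Requires the NLTK stopwords corpus: nltk.download('stopwords')
text = "Reliance Jio expands its retail footprint"
print(similarity_score(text, reliance_bow))  # sums the frequencies of overlapping words

# classify_text_domain(text) would additionally need reliance_bow.csv on disk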
clearCache.py
ADDED
@@ -0,0 +1,12 @@
# import nltk
# nltk.download('all')
from gtts import gTTS

# Sample Hindi text ("Hello, this is a test message.")
text = "नमस्ते, यह एक परीक्षण संदेश है।"

# Generate TTS in Hindi
tts = gTTS(text=text, lang='hi')
tts.save("test_hindi.mp3")

print("✅ Hindi TTS audio saved successfully!")
main2.py
ADDED
@@ -0,0 +1,118 @@
import os
import pandas as pd
from bs4 import BeautifulSoup
from newspaper import Article
import requests
from categorize_text import classify_text_domain
from time import sleep


# Dictionary to track visited links
visited_links = {}


def get_article_metadata(url, company_name):
    """Fetches metadata from a given article URL."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()

        # Filter by company name
        if company_name.lower() not in article.text.lower():
            return None  # Skip articles that do not mention the company

        return {
            "title": article.title,
            "summary": article.summary,
            "url": url,
            "publish_date": article.publish_date,
            "domain": classify_text_domain(article.text)
        }

    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None


def extract_news(company_name, max_articles=10):
    """Extracts news articles for the given company."""

    all_links = [
        f"https://timesofindia.indiatimes.com/topic/{company_name}/news",
        f"https://economictimes.indiatimes.com/topic/{company_name}",
        f"https://www.hindustantimes.com/search?q={company_name}"
    ]

    articles = []

    for base_url in all_links:
        try:
            response = requests.get(base_url, timeout=10)
            if response.status_code != 200:
                print(f"Failed to access {base_url}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract article links
            for a_tag in soup.find_all('a', href=True):
                link = a_tag['href']
                full_link = link if link.startswith("http") else f"{base_url}{link}"

                # Filter for valid TOI, ET, and HT articles
                if ("timesofindia.indiatimes.com" in full_link and "articleshow" in full_link) or \
                   ("economictimes.indiatimes.com" in full_link) or \
                   ("hindustantimes.com" in full_link):

                    if full_link not in visited_links:
                        sleep(1)  # Add delay to prevent rate limiting
                        article_data = get_article_metadata(full_link, company_name)

                        if article_data:
                            visited_links[full_link] = article_data["domain"]
                            articles.append(article_data)

                        if len(articles) >= max_articles:
                            break
        except Exception as e:
            print(f"Error scraping {base_url}: {e}")
            continue

    # Store results in a DataFrame
    df = pd.DataFrame(articles)

    if df.empty:
        print(f"No relevant articles found for {company_name}.")
    else:
        print(f"\nExtracted {len(articles)} articles for {company_name}")
        print(df)

    return df


# ✅ List of 10 companies to extract news for
companies = [
    "Reliance", "Tata", "Infosys", "Wipro", "HDFC",
    "ICICI", "L&T", "Adani", "Bharti Airtel", "Bajaj"
]

# ✅ Loop through each company and extract articles
output_dir = "company_news"
os.makedirs(output_dir, exist_ok=True)

for company in companies:
    print(f"\n🔍 Extracting news for {company}...")

    result_df = extract_news(company, max_articles=10)

    # Save results to CSV
    if not result_df.empty:
        csv_filename = os.path.join(output_dir, f"{company}_news.csv")
        result_df.to_csv(csv_filename, index=False)
        print(f"✅ Saved {company} news articles to {csv_filename}")
    else:
        print(f"⚠️ No articles found for {company}")

print("\n🎯 Extraction completed for all companies!")
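Each row that main2.py writes to company_news/<Company>_news.csv carries the fields returned by get_article_metadata, which is what the /analyze endpoint and sentiment_analysis.py consume. A hypothetical single-row sketch of that schema (all field values are placeholders):

import pandas as pd

# Hypothetical row in the shape produced by get_article_metadata
row = {
    "title": "Example headline about Reliance",
    "summary": "Short auto-generated summary of the article.",
    "url": "https://example.com/article",
    "publish_date": None,          # newspaper3k may not find a date
    "domain": "Reliance",          # label assigned by classify_text_domain
}
pd.DataFrame([row]).to_csv("company_news/Reliance_news.csv", index=False)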
requirements.txt
ADDED
@@ -0,0 +1,9 @@
flask
streamlit
gtts
nltk
pandas
textblob
beautifulsoup4
newspaper3k
requests
sentiment_analysis.py
ADDED
@@ -0,0 +1,55 @@
import pandas as pd
from textblob import TextBlob


def analyze_sentiment(text):
    """Perform sentiment analysis on the given text."""
    try:
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity

        if polarity > 0:
            sentiment = "Positive"
        elif polarity < 0:
            sentiment = "Negative"
        else:
            sentiment = "Neutral"

        return sentiment, round(polarity, 2)

    except Exception as e:
        print(f"Error in sentiment analysis: {e}")
        return "Neutral", 0.0


def perform_sentiment_analysis(csv_file):
    """Analyze sentiment for all articles in the CSV."""
    df = pd.read_csv(csv_file)

    if 'summary' not in df.columns:
        print("No 'summary' column found in CSV.")
        return None

    df['sentiment'], df['polarity'] = zip(*df['summary'].apply(analyze_sentiment))

    # Save the result with sentiment analysis
    output_csv = csv_file.replace('.csv', '_sentiment.csv')
    df.to_csv(output_csv, index=False)
    print(f"✅ Sentiment analysis saved to {output_csv}")

    return df


def comparative_analysis(df):
    """Perform comparative sentiment analysis across multiple articles."""
    sentiment_counts = df['sentiment'].value_counts(normalize=True) * 100

    print("\n📊 Sentiment Distribution:")
    print(sentiment_counts)

    summary = {
        "positive": sentiment_counts.get("Positive", 0),
        "negative": sentiment_counts.get("Negative", 0),
        "neutral": sentiment_counts.get("Neutral", 0)
    }

    return summary
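A small, self-contained sketch of how these helpers combine, using an in-memory DataFrame instead of a scraped CSV (the summary column matches what perform_sentiment_analysis expects; the example sentences are placeholders):

import pandas as pd
from sentiment_analysis import analyze_sentiment, comparative_analysis

# Each call returns a (sentiment, polarity) tuple
print(analyze_sentiment("The company reported record profits."))
print(analyze_sentiment("Shares fell sharply after the announcement."))

# comparative_analysis only needs a 'sentiment' column
df = pd.DataFrame({"summary": [
    "The company reported record profits.",
    "Shares fell sharply after the announcement.",
    "The quarterly report was released today.",
]})
df["sentiment"], df["polarity"] = zip(*df["summary"].apply(analyze_sentiment))
print(comparative_analysis(df))  # percentage of Positive / Negative / Neutral articles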
tts_hindi.py
ADDED
@@ -0,0 +1,27 @@
from TTS.api import TTS
import os


def generate_hindi_coqui_tts(text, company_name):
    """
    Generate high-quality Hindi TTS using Coqui TTS.
    """
    try:
        output_dir = "output"
        os.makedirs(output_dir, exist_ok=True)

        audio_file = os.path.join(output_dir, f"{company_name}_tts_hindi.wav")

        # ✅ Use pre-trained Hindi model
        model_name = "tts_models/hi/ek1/vits"
        tts = TTS(model_name)

        # ✅ Generate and save Hindi TTS
        tts.tts_to_file(text=text, file_path=audio_file)

        print(f"✅ High-quality Hindi TTS saved: {audio_file}")
        return audio_file

    except Exception as e:
        print(f"❌ Error generating Coqui Hindi TTS: {e}")
        return None
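Calling the helper directly is straightforward; this sketch assumes the Coqui TTS package is installed (it is imported here but not listed in requirements.txt) and that the referenced Hindi model can be downloaded on first use:

from tts_hindi import generate_hindi_coqui_tts

# Writes output/Demo_tts_hindi.wav and returns its path, or None on failure
audio_path = generate_hindi_coqui_tts("नमस्ते, यह एक परीक्षण संदेश है।", "Demo")
print(audio_path)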