Manishkumaryadav committed
Commit 706ec74 · verified · 1 Parent(s): b4afece

Upload 8 files

Files changed (8)
  1. api.py +69 -0
  2. app.py +60 -0
  3. categorize_text.py +62 -0
  4. clearCache.py +12 -0
  5. main2.py +121 -0
  6. requirements.txt +10 -0
  7. sentiment_analysis.py +55 -0
  8. tts_hindi.py +27 -0
api.py ADDED
@@ -0,0 +1,69 @@
+ from flask import Flask, request, jsonify
+ import os
+ from sentiment_analysis import perform_sentiment_analysis, comparative_analysis
+ from tts_hindi import generate_hindi_coqui_tts
+
+ app = Flask(__name__)
+
+ @app.route('/analyze', methods=['POST'])
+ def analyze():
+     """Perform news sentiment analysis and Hindi TTS for a company."""
+     try:
+         company_name = request.json.get('company_name')
+
+         if not company_name:
+             return jsonify({"error": "Company name is required"}), 400
+
+         # CSV file with extracted articles
+         csv_file = f"company_news/{company_name}_news.csv"
+
+         if not os.path.exists(csv_file):
+             return jsonify({"error": f"No data found for {company_name}"}), 404
+
+         # Perform sentiment analysis
+         sentiment_df = perform_sentiment_analysis(csv_file)
+         sentiment_summary = comparative_analysis(sentiment_df)
+
+         # ✅ Generate Hindi TTS audio from the concatenated article summaries
+         summary_text = ". ".join(sentiment_df['summary'].tolist())
+         audio_file = generate_hindi_coqui_tts(summary_text, company_name)
+
+         # Extract article details
+         articles = sentiment_df[['title', 'summary', 'url']].to_dict(orient='records')
+
+         return jsonify({
+             "company": company_name,
+             "sentiment_summary": sentiment_summary,
+             "articles": articles,
+             "audio_file": audio_file
+         })
+
+     except Exception as e:
+         print(f"API Error: {e}")
+         return jsonify({"error": "Internal server error"}), 500
+
+
+ @app.route('/generate-tts', methods=['POST'])
+ def generate_tts_api():
+     """Generate Hindi TTS for arbitrary text."""
+     data = request.get_json()
+
+     text = data.get('text')
+     company_name = data.get('company_name', 'default_company')
+
+     if not text:
+         return jsonify({"error": "Text is required"}), 400
+
+     audio_file = generate_hindi_coqui_tts(text, company_name)
+
+     if audio_file and os.path.exists(audio_file):
+         return jsonify({
+             "message": "✅ TTS generated successfully",
+             "audio_file": audio_file
+         })
+     else:
+         return jsonify({"error": "Failed to generate TTS"}), 500
+
+
+ if __name__ == '__main__':
+     app.run(debug=True)
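
For reference, the /analyze route above can be exercised with a short client script. This is a minimal sketch, assuming the Flask server is running locally on its default port 5000 and that company_news/Reliance_news.csv already exists (e.g., produced by main2.py below):

import requests

# POST a company name to the /analyze endpoint
resp = requests.post(
    "http://127.0.0.1:5000/analyze",
    json={"company_name": "Reliance"},
    timeout=300,  # first-time TTS model download can be slow
)
resp.raise_for_status()
data = resp.json()

print(data["sentiment_summary"])  # e.g. {'positive': 60.0, 'negative': 20.0, 'neutral': 20.0}
print(data["audio_file"])         # path to the generated Hindi .wav file
for article in data["articles"]:
    print(article["title"], "->", article["url"])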
app.py ADDED
@@ -0,0 +1,60 @@
+ import streamlit as st
+ import requests
+ import os
+ import pandas as pd
+
+ BACKEND_URL = "http://127.0.0.1:5000/analyze"
+
+ st.title("📊 News Sentiment Analysis & TTS in Hindi")
+
+ # Input field for company name
+ company_name = st.text_input("Enter Company Name", "")
+
+ if st.button("Analyze"):
+     if not company_name:
+         st.warning("⚠️ Please enter a company name.")
+     else:
+         st.info(f"Analyzing news for {company_name}...")
+
+         response = requests.post(
+             BACKEND_URL,
+             json={"company_name": company_name}
+         )
+
+         if response.status_code == 200:
+             data = response.json()
+
+             st.success("✅ Analysis Complete!")
+
+             # ✅ Display Sentiment Summary
+             st.subheader("📊 Sentiment Summary")
+             st.json(data["sentiment_summary"])
+
+             # ✅ Display Articles
+             st.subheader("📰 Extracted Articles")
+
+             df = pd.DataFrame(data["articles"])
+             for _, article in df.iterrows():
+                 st.markdown(f"### [{article['title']}]({article['url']})")
+                 st.write(f"**Summary:** {article['summary']}")
+                 st.write("---")
+
+             # ✅ Display Hindi TTS Audio, using the file path returned by the backend
+             st.subheader("🔊 Hindi TTS Audio Output")
+
+             audio_file = data.get("audio_file")
+             if audio_file and os.path.exists(audio_file):
+                 with open(audio_file, "rb") as audio:
+                     st.download_button(
+                         label="🔊 Download Hindi TTS Audio",
+                         data=audio,
+                         file_name=os.path.basename(audio_file),
+                         mime="audio/wav"
+                     )
+                 st.audio(audio_file, format="audio/wav")
+                 st.success("✅ Hindi TTS audio displayed successfully!")
+             else:
+                 st.error("❌ TTS file not found.")
+
+         else:
+             st.error("❌ Error analyzing news. Please try again.")
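
Since app.py only posts to BACKEND_URL, the Flask backend from api.py must be started before the Streamlit app. For reference, a sketch of the JSON contract the frontend consumes, with field names taken from api.py above and purely illustrative values:

# Illustrative shape of the /analyze response handled by app.py
data = {
    "company": "Reliance",
    "sentiment_summary": {"positive": 60.0, "negative": 20.0, "neutral": 20.0},
    "articles": [
        {"title": "...", "summary": "...", "url": "https://..."},
    ],
    "audio_file": "output/Reliance_tts_hindi.wav",
}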
categorize_text.py ADDED
@@ -0,0 +1,62 @@
+ import pandas as pd
+ from nltk.corpus import stopwords
+
+
+ # Preprocessing function
+ def preprocess_text(text):
+     """Tokenize and clean the input text."""
+     tokens = text.lower().split()
+
+     # Remove punctuation and stopwords
+     stop_words = set(stopwords.words('english'))
+     tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
+
+     return tokens
+
+
+ # Function to calculate similarity score between text and bag of words
+ def similarity_score(text, bow):
+     """Calculate similarity score between text and a bag-of-words DataFrame."""
+     tokens = preprocess_text(text)
+
+     # Ensure the BoW contains the 'Word' and 'Frequency' columns
+     if 'Word' not in bow.columns or 'Frequency' not in bow.columns:
+         print("Invalid BoW format. Ensure it contains 'Word' and 'Frequency' columns.")
+         return 0
+
+     # Words shared between the text and the BoW vocabulary
+     common_words = set(tokens) & set(bow['Word'])
+
+     # Sum the frequencies of matching words
+     score = sum(bow[bow['Word'] == word]['Frequency'].values[0] for word in common_words)
+
+     return score
+
+
+ # Function to classify text domain using bag of words
+ def classify_text_domain(text):
+     """Classify text domain based on similarity score with BoW files."""
+     # Load BoW CSV files for different domains
+     try:
+         reliance_bow = pd.read_csv("reliance_bow.csv")
+     except FileNotFoundError:
+         print("BoW file not found.")
+         return "Unknown"
+
+     # Ensure CSV files are not empty
+     if reliance_bow.empty:
+         print("BoW file is empty.")
+         return "Unknown"
+
+     # Calculate similarity scores
+     scores = {
+         "Reliance": similarity_score(text, reliance_bow)
+     }
+
+     # Determine the domain with the highest similarity score
+     domain = max(scores, key=scores.get)
+
+     print(f"Scores: {scores}")  # Display scores
+
+     # Return the best-matching domain so callers receive a value instead of None
+     return domain
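
A quick sketch of how these functions compose, using a hypothetical in-memory bag of words with the required 'Word' and 'Frequency' columns instead of reliance_bow.csv (the NLTK stopwords corpus must have been downloaded once via nltk.download('stopwords')):

import pandas as pd
from categorize_text import similarity_score

# Hypothetical BoW table; the real one is loaded from reliance_bow.csv
bow = pd.DataFrame({
    "Word": ["reliance", "energy", "retail"],
    "Frequency": [42, 17, 9],
})

text = "Reliance expands its retail and energy businesses"
print(similarity_score(text, bow))  # 68 = 42 + 17 + 9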
clearCache.py ADDED
@@ -0,0 +1,12 @@
+ # import nltk
+ # nltk.download('all')
+ from gtts import gTTS
+
+ # Sample Hindi text ("Hello, this is a test message.")
+ text = "नमस्ते, यह एक परीक्षण संदेश है।"
+
+ # Generate TTS in Hindi
+ tts = gTTS(text=text, lang='hi')
+ tts.save("test_hindi.mp3")
+
+ print("✅ Hindi TTS audio saved successfully!")
main2.py ADDED
@@ -0,0 +1,121 @@
+ import os
+ import pandas as pd
+ from bs4 import BeautifulSoup
+ from newspaper import Article
+ import requests
+ from urllib.parse import urljoin
+ from categorize_text import classify_text_domain
+ from time import sleep
+
+
+ # Dictionary to track visited links
+ visited_links = {}
+
+
+ def get_article_metadata(url, company_name):
+     """Fetches metadata from a given article URL."""
+     try:
+         article = Article(url)
+         article.download()
+         article.parse()
+         article.nlp()
+
+         # Filter by company name
+         if company_name.lower() not in article.text.lower():
+             return None  # Skip articles that do not mention the company
+
+         return {
+             "title": article.title,
+             "summary": article.summary,
+             "url": url,
+             "publish_date": article.publish_date,
+             "domain": classify_text_domain(article.text)
+         }
+
+     except Exception as e:
+         print(f"Error processing {url}: {e}")
+         return None
+
+
+ def extract_news(company_name, max_articles=10):
+     """Extracts news articles for the given company."""
+     all_links = [
+         f"https://timesofindia.indiatimes.com/topic/{company_name}/news",
+         f"https://economictimes.indiatimes.com/topic/{company_name}",
+         f"https://www.hindustantimes.com/search?q={company_name}"
+     ]
+
+     articles = []
+
+     for base_url in all_links:
+         # Stop once enough articles have been collected across sources
+         if len(articles) >= max_articles:
+             break
+         try:
+             response = requests.get(base_url, timeout=10)
+             if response.status_code != 200:
+                 print(f"Failed to access {base_url}")
+                 continue
+
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             # Extract article links, resolving relative URLs against the page URL
+             for a_tag in soup.find_all('a', href=True):
+                 link = a_tag['href']
+                 full_link = link if link.startswith("http") else urljoin(base_url, link)
+
+                 # Filter for valid TOI, ET, and HT articles
+                 if ("timesofindia.indiatimes.com" in full_link and "articleshow" in full_link) or \
+                    ("economictimes.indiatimes.com" in full_link) or \
+                    ("hindustantimes.com" in full_link):
+
+                     if full_link not in visited_links:
+                         sleep(1)  # Add delay to prevent rate limiting
+                         article_data = get_article_metadata(full_link, company_name)
+
+                         if article_data:
+                             visited_links[full_link] = article_data["domain"]
+                             articles.append(article_data)
+
+                         if len(articles) >= max_articles:
+                             break
+         except Exception as e:
+             print(f"Error scraping {base_url}: {e}")
+             continue
+
+     # Store results in a DataFrame
+     df = pd.DataFrame(articles)
+
+     if df.empty:
+         print(f"No relevant articles found for {company_name}.")
+     else:
+         print(f"\nExtracted {len(articles)} articles for {company_name}")
+         print(df)
+
+     return df
+
+
+ # ✅ List of 10 Companies to Extract News For
+ companies = [
+     "Reliance", "Tata", "Infosys", "Wipro", "HDFC",
+     "ICICI", "L&T", "Adani", "Bharti Airtel", "Bajaj"
+ ]
+
+ # ✅ Loop through each company and extract articles
+ output_dir = "company_news"
+ os.makedirs(output_dir, exist_ok=True)
+
+ for company in companies:
+     print(f"\n🔍 Extracting news for {company}...")
+
+     result_df = extract_news(company, max_articles=10)
+
+     # Save results to CSV
+     if not result_df.empty:
+         csv_filename = os.path.join(output_dir, f"{company}_news.csv")
+         result_df.to_csv(csv_filename, index=False)
+         print(f"✅ Saved {company} news articles to {csv_filename}")
+     else:
+         print(f"⚠️ No articles found for {company}")
+
+ print("\n🎯 Extraction completed for all companies!")
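
One environment note, stated as an assumption about the intended setup: Article.nlp() from newspaper3k needs NLTK's punkt tokenizer, and preprocess_text in categorize_text.py needs the NLTK stopwords corpus, yet no download is performed anywhere in this upload (the nltk.download call in clearCache.py is commented out). A one-time setup snippet:

import nltk

# One-time downloads used by Article.nlp() and categorize_text.preprocess_text()
nltk.download('punkt')
nltk.download('stopwords')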
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ flask
+ streamlit
+ gtts
+ nltk
+ pandas
+ textblob
+ beautifulsoup4
+ newspaper3k
+ requests
+ TTS  # Coqui TTS, imported in tts_hindi.py
sentiment_analysis.py ADDED
@@ -0,0 +1,55 @@
+ import pandas as pd
+ from textblob import TextBlob
+
+ def analyze_sentiment(text):
+     """Perform sentiment analysis on the given text."""
+     try:
+         blob = TextBlob(text)
+         polarity = blob.sentiment.polarity
+
+         if polarity > 0:
+             sentiment = "Positive"
+         elif polarity < 0:
+             sentiment = "Negative"
+         else:
+             sentiment = "Neutral"
+
+         return sentiment, round(polarity, 2)
+
+     except Exception as e:
+         print(f"Error in sentiment analysis: {e}")
+         return "Neutral", 0.0
+
+
+ def perform_sentiment_analysis(csv_file):
+     """Analyze sentiment for all articles in the CSV."""
+     df = pd.read_csv(csv_file)
+
+     if 'summary' not in df.columns:
+         print("No 'summary' column found in CSV.")
+         return None
+
+     df['sentiment'], df['polarity'] = zip(*df['summary'].apply(analyze_sentiment))
+
+     # Save the result with sentiment analysis
+     output_csv = csv_file.replace('.csv', '_sentiment.csv')
+     df.to_csv(output_csv, index=False)
+     print(f"✅ Sentiment analysis saved to {output_csv}")
+
+     return df
+
+
+ def comparative_analysis(df):
+     """Perform comparative sentiment analysis across multiple articles."""
+     sentiment_counts = df['sentiment'].value_counts(normalize=True) * 100
+
+     print("\n📊 Sentiment Distribution:")
+     print(sentiment_counts)
+
+     summary = {
+         "positive": sentiment_counts.get("Positive", 0),
+         "negative": sentiment_counts.get("Negative", 0),
+         "neutral": sentiment_counts.get("Neutral", 0)
+     }
+
+     return summary
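
A small usage sketch of analyze_sentiment (polarity comes from TextBlob's lexicon, so the exact values may differ across TextBlob versions; the labels follow the thresholds above):

from sentiment_analysis import analyze_sentiment

print(analyze_sentiment("The company reported great quarterly results"))
# expected: ('Positive', 0.8), since "great" carries positive polarity
print(analyze_sentiment("The stock suffered a terrible crash"))
# expected: ('Negative', -1.0), since "terrible" carries negative polarity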
tts_hindi.py ADDED
@@ -0,0 +1,27 @@
+ from TTS.api import TTS
+ import os
+
+
+ def generate_hindi_coqui_tts(text, company_name):
+     """
+     Generate high-quality Hindi TTS using Coqui TTS.
+     """
+     try:
+         output_dir = "output"
+         os.makedirs(output_dir, exist_ok=True)
+
+         audio_file = os.path.join(output_dir, f"{company_name}_tts_hindi.wav")
+
+         # ✅ Use pre-trained Hindi model
+         model_name = "tts_models/hi/ek1/vits"
+         tts = TTS(model_name)
+
+         # ✅ Generate and save Hindi TTS
+         tts.tts_to_file(text=text, file_path=audio_file)
+
+         print(f"✅ High-quality Hindi TTS saved: {audio_file}")
+         return audio_file
+
+     except Exception as e:
+         print(f"❌ Error generating Coqui Hindi TTS: {e}")
+         return None
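
A minimal sketch of calling this helper directly. The first call downloads the pre-trained model named above, so it needs a network connection; the output path follows the pattern in the function:

from tts_hindi import generate_hindi_coqui_tts

# "Hello, this is a test message." in Hindi
path = generate_hindi_coqui_tts("नमस्ते, यह एक परीक्षण संदेश है।", "TestCompany")
if path:
    print(f"Audio written to {path}")  # output/TestCompany_tts_hindi.wav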