Upload 8 files
- api.py +66 -0
- app.py +60 -0
- categorize_text.py +62 -0
- clearCache.py +12 -0
- main2.py +118 -0
- requirements.txt +9 -0
- sentiment_analysis.py +55 -0
- tts_hindi.py +27 -0
api.py
ADDED
@@ -0,0 +1,66 @@
from flask import Flask, request, jsonify
import os
from sentiment_analysis import perform_sentiment_analysis, comparative_analysis
from tts_hindi import generate_hindi_coqui_tts
import pandas as pd

app = Flask(__name__)


@app.route('/analyze', methods=['POST'])
def analyze():
    """Perform news sentiment analysis and TTS."""
    try:
        company_name = request.json.get('company_name')

        if not company_name:
            return jsonify({"error": "Company name is required"}), 400

        # CSV file with extracted articles
        csv_file = f"company_news/{company_name}_news.csv"

        if not os.path.exists(csv_file):
            return jsonify({"error": f"No data found for {company_name}"}), 404

        # Perform sentiment analysis
        sentiment_df = perform_sentiment_analysis(csv_file)
        sentiment_summary = comparative_analysis(sentiment_df)

        # ✅ Generate Hindi TTS audio
        summary_text = ". ".join(sentiment_df['summary'].tolist())
        audio_file = generate_hindi_coqui_tts(summary_text, company_name)

        # Extract article details
        articles = sentiment_df[['title', 'summary', 'url']].to_dict(orient='records')

        return jsonify({
            "company": company_name,
            "sentiment_summary": sentiment_summary,
            "articles": articles,
            "audio_file": audio_file
        })

    except Exception as e:
        print(f"API Error: {e}")
        return jsonify({"error": "Internal server error"}), 500


@app.route('/generate-tts', methods=['POST'])
def generate_tts_api():
    data = request.get_json()

    text = data.get('text')
    company_name = data.get('company_name', 'default_company')

    if not text:
        return jsonify({"error": "Text is required"}), 400

    audio_file = generate_hindi_coqui_tts(text, company_name)

    if audio_file and os.path.exists(audio_file):
        return jsonify({
            "message": "✅ TTS generated successfully",
            "audio_file": audio_file
        })
    else:
        return jsonify({"error": "Failed to generate TTS"}), 500


if __name__ == '__main__':
    app.run(debug=True)
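The two endpoints above can be exercised with a plain requests client once the Flask app is running (app.py assumes the default development address http://127.0.0.1:5000). The snippet below is a minimal sketch under that assumption; "Reliance" is only an example company name and presumes main2.py has already produced company_news/Reliance_news.csv.

import requests

BASE_URL = "http://127.0.0.1:5000"  # default Flask dev server, as used by app.py

# Analyse previously scraped news for one company
resp = requests.post(f"{BASE_URL}/analyze", json={"company_name": "Reliance"})
if resp.ok:
    payload = resp.json()
    print(payload["sentiment_summary"])  # {"positive": ..., "negative": ..., "neutral": ...}
    print(payload["audio_file"])         # path of the generated Hindi TTS file
else:
    print("Request failed:", resp.status_code, resp.json().get("error"))

# Generate Hindi TTS for arbitrary text via the second endpoint
resp = requests.post(f"{BASE_URL}/generate-tts",
                     json={"text": "यह एक परीक्षण है", "company_name": "Demo"})
print(resp.json())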
app.py
ADDED
@@ -0,0 +1,60 @@
import streamlit as st
import requests
import os
import pandas as pd

BACKEND_URL = "http://127.0.0.1:5000/analyze"

st.title("📊 News Sentiment Analysis & TTS in Hindi")

# Input field for company name
company_name = st.text_input("Enter Company Name", "")

if st.button("Analyze"):
    if not company_name:
        st.warning("⚠️ Please enter a company name.")
    else:
        st.info(f"Analyzing news for {company_name}...")

        response = requests.post(
            BACKEND_URL,
            json={"company_name": company_name}
        )

        if response.status_code == 200:
            data = response.json()

            st.success("✅ Analysis Complete!")

            # ✅ Display Sentiment Summary
            st.subheader("📊 Sentiment Summary")
            st.json(data["sentiment_summary"])

            # ✅ Display Articles
            st.subheader("📰 Extracted Articles")

            df = pd.DataFrame(data["articles"])
            for _, article in df.iterrows():
                st.markdown(f"### [{article['title']}]({article['url']})")
                st.write(f"**Summary:** {article['summary']}")
                st.write("---")

            # ✅ Display Hindi TTS Audio
            st.subheader("🔊 Hindi TTS Audio Output")

            # Use the audio file path returned by the backend instead of a hardcoded test file
            audio_file = data.get("audio_file", "")
            if audio_file and os.path.exists(audio_file):
                with open(audio_file, "rb") as audio:
                    st.download_button(
                        label="🔊 Download Hindi TTS Audio",
                        data=audio,
                        file_name=os.path.basename(audio_file),
                        mime="audio/wav"
                    )
                st.audio(audio_file, format="audio/wav")
                st.success("✅ Hindi TTS audio displayed successfully!")
            else:
                st.error("❌ TTS file not found.")

        else:
            st.error("❌ Error analyzing news. Please try again.")
categorize_text.py
ADDED
@@ -0,0 +1,62 @@
import pandas as pd
from nltk.corpus import stopwords


# Preprocessing function
def preprocess_text(text):
    """Tokenize and clean the input text"""
    tokens = text.lower().split()

    # Remove punctuation and stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

    return tokens


# Function to calculate similarity score between text and bag of words
def similarity_score(text, bow):
    """Calculate similarity score between text and BoW"""
    tokens = preprocess_text(text)

    # Ensure the BoW contains the 'Word' and 'Frequency' columns
    if 'Word' not in bow.columns or 'Frequency' not in bow.columns:
        print("Invalid BoW format. Ensure it contains 'Word' and 'Frequency' columns.")
        return 0

    # Calculate similarity score
    common_words = set(tokens) & set(bow['Word'])

    # Sum the frequencies of matching words
    score = sum(bow[bow['Word'] == word]['Frequency'].values[0] for word in common_words)

    return score


# Function to classify text domain using bag of words
def classify_text_domain(text):
    """Classify text domain based on similarity score with BoW files"""

    # Load BoW CSV files for different domains
    try:
        reliance_bow = pd.read_csv("reliance_bow.csv")
    except FileNotFoundError:
        print("BoW file not found.")
        return "Unknown"

    # Ensure CSV files are not empty
    if reliance_bow.empty:
        print("BoW file is empty.")
        return "Unknown"

    # Calculate similarity scores
    scores = {
        "Reliance": similarity_score(text, reliance_bow)
    }

    # Determine the domain with the highest similarity score
    domain = max(scores, key=scores.get)

    print(f"Scores: {scores}")  # Display the computed scores

    # Return the best-matching domain (main2.py stores this value per article)
    return domain
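similarity_score expects a bag-of-words DataFrame with Word and Frequency columns, and classify_text_domain loads that frame from reliance_bow.csv, which is not part of this upload. A minimal sketch of the expected format, using a hypothetical in-memory frame:

import pandas as pd
from categorize_text import similarity_score

# Hypothetical BoW frame in the format the functions check for
reliance_bow = pd.DataFrame({
    "Word": ["reliance", "jio", "retail", "energy"],
    "Frequency": [120, 85, 60, 40],
})

# Requires the NLTK stopwords corpus: nltk.download('stopwords')
text = "Reliance Jio expands its retail footprint"
print(similarity_score(text, reliance_bow))  # sums the frequencies of overlapping words

# classify_text_domain(text) would additionally need reliance_bow.csv on disk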
clearCache.py
ADDED
@@ -0,0 +1,12 @@
# import nltk
# nltk.download('all')
from gtts import gTTS

# Sample Hindi text ("Hello, this is a test message.")
text = "नमस्ते, यह एक परीक्षण संदेश है।"

# Generate TTS in Hindi
tts = gTTS(text=text, lang='hi')
tts.save("test_hindi.mp3")

print("✅ Hindi TTS audio saved successfully!")
main2.py
ADDED
@@ -0,0 +1,118 @@
import os
import pandas as pd
from bs4 import BeautifulSoup
from newspaper import Article
import requests
from categorize_text import classify_text_domain
from time import sleep


# Dictionary to track visited links
visited_links = {}


def get_article_metadata(url, company_name):
    """Fetches metadata from a given article URL."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()

        # Filter by company name
        if company_name.lower() not in article.text.lower():
            return None  # Skip articles that do not mention the company

        return {
            "title": article.title,
            "summary": article.summary,
            "url": url,
            "publish_date": article.publish_date,
            "domain": classify_text_domain(article.text)
        }

    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None


def extract_news(company_name, max_articles=10):
    """Extracts news articles for the given company."""

    all_links = [
        f"https://timesofindia.indiatimes.com/topic/{company_name}/news",
        f"https://economictimes.indiatimes.com/topic/{company_name}",
        f"https://www.hindustantimes.com/search?q={company_name}"
    ]

    articles = []

    for base_url in all_links:
        try:
            response = requests.get(base_url, timeout=10)
            if response.status_code != 200:
                print(f"Failed to access {base_url}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract article links
            for a_tag in soup.find_all('a', href=True):
                link = a_tag['href']
                full_link = link if link.startswith("http") else f"{base_url}{link}"

                # Filter for valid TOI, ET, and HT articles
                if ("timesofindia.indiatimes.com" in full_link and "articleshow" in full_link) or \
                   ("economictimes.indiatimes.com" in full_link) or \
                   ("hindustantimes.com" in full_link):

                    if full_link not in visited_links:
                        sleep(1)  # Add delay to prevent rate limiting
                        article_data = get_article_metadata(full_link, company_name)

                        if article_data:
                            visited_links[full_link] = article_data["domain"]
                            articles.append(article_data)

                        if len(articles) >= max_articles:
                            break
        except Exception as e:
            print(f"Error scraping {base_url}: {e}")
            continue

    # Store results in a DataFrame
    df = pd.DataFrame(articles)

    if df.empty:
        print(f"No relevant articles found for {company_name}.")
    else:
        print(f"\nExtracted {len(articles)} articles for {company_name}")
        print(df)

    return df


# ✅ List of 10 companies to extract news for
companies = [
    "Reliance", "Tata", "Infosys", "Wipro", "HDFC",
    "ICICI", "L&T", "Adani", "Bharti Airtel", "Bajaj"
]

# ✅ Loop through each company and extract articles
output_dir = "company_news"
os.makedirs(output_dir, exist_ok=True)

for company in companies:
    print(f"\n🔍 Extracting news for {company}...")

    result_df = extract_news(company, max_articles=10)

    # Save results to CSV
    if not result_df.empty:
        csv_filename = os.path.join(output_dir, f"{company}_news.csv")
        result_df.to_csv(csv_filename, index=False)
        print(f"✅ Saved {company} news articles to {csv_filename}")
    else:
        print(f"⚠️ No articles found for {company}")

print("\n🎯 Extraction completed for all companies!")
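Each row that main2.py writes to company_news/<Company>_news.csv carries the fields returned by get_article_metadata, which is what the /analyze endpoint and sentiment_analysis.py consume. A hypothetical single-row sketch of that schema (all field values are placeholders):

import pandas as pd

# Hypothetical row in the shape produced by get_article_metadata
row = {
    "title": "Example headline about Reliance",
    "summary": "Short auto-generated summary of the article.",
    "url": "https://example.com/article",
    "publish_date": None,          # newspaper3k may not find a date
    "domain": "Reliance",          # label assigned by classify_text_domain
}
pd.DataFrame([row]).to_csv("company_news/Reliance_news.csv", index=False)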
requirements.txt
ADDED
@@ -0,0 +1,9 @@
flask
streamlit
gtts
nltk
pandas
textblob
beautifulsoup4
newspaper3k
requests
sentiment_analysis.py
ADDED
@@ -0,0 +1,55 @@
import pandas as pd
from textblob import TextBlob


def analyze_sentiment(text):
    """Perform sentiment analysis on the given text."""
    try:
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity

        if polarity > 0:
            sentiment = "Positive"
        elif polarity < 0:
            sentiment = "Negative"
        else:
            sentiment = "Neutral"

        return sentiment, round(polarity, 2)

    except Exception as e:
        print(f"Error in sentiment analysis: {e}")
        return "Neutral", 0.0


def perform_sentiment_analysis(csv_file):
    """Analyze sentiment for all articles in the CSV."""
    df = pd.read_csv(csv_file)

    if 'summary' not in df.columns:
        print("No 'summary' column found in CSV.")
        return None

    df['sentiment'], df['polarity'] = zip(*df['summary'].apply(analyze_sentiment))

    # Save the result with sentiment analysis
    output_csv = csv_file.replace('.csv', '_sentiment.csv')
    df.to_csv(output_csv, index=False)
    print(f"✅ Sentiment analysis saved to {output_csv}")

    return df


def comparative_analysis(df):
    """Perform comparative sentiment analysis across multiple articles."""
    sentiment_counts = df['sentiment'].value_counts(normalize=True) * 100

    print("\n📊 Sentiment Distribution:")
    print(sentiment_counts)

    summary = {
        "positive": sentiment_counts.get("Positive", 0),
        "negative": sentiment_counts.get("Negative", 0),
        "neutral": sentiment_counts.get("Neutral", 0)
    }

    return summary
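A small, self-contained sketch of how these helpers combine, using an in-memory DataFrame instead of a scraped CSV (the summary column matches what perform_sentiment_analysis expects; the example sentences are placeholders):

import pandas as pd
from sentiment_analysis import analyze_sentiment, comparative_analysis

# Each call returns a (sentiment, polarity) tuple
print(analyze_sentiment("The company reported record profits."))
print(analyze_sentiment("Shares fell sharply after the announcement."))

# comparative_analysis only needs a 'sentiment' column
df = pd.DataFrame({"summary": [
    "The company reported record profits.",
    "Shares fell sharply after the announcement.",
    "The quarterly report was released today.",
]})
df["sentiment"], df["polarity"] = zip(*df["summary"].apply(analyze_sentiment))
print(comparative_analysis(df))  # percentage of Positive / Negative / Neutral articles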
tts_hindi.py
ADDED
@@ -0,0 +1,27 @@
from TTS.api import TTS
import os


def generate_hindi_coqui_tts(text, company_name):
    """
    Generate high-quality Hindi TTS using Coqui TTS.
    """
    try:
        output_dir = "output"
        os.makedirs(output_dir, exist_ok=True)

        audio_file = os.path.join(output_dir, f"{company_name}_tts_hindi.wav")

        # ✅ Use pre-trained Hindi model
        model_name = "tts_models/hi/ek1/vits"
        tts = TTS(model_name)

        # ✅ Generate and save Hindi TTS
        tts.tts_to_file(text=text, file_path=audio_file)

        print(f"✅ High-quality Hindi TTS saved: {audio_file}")
        return audio_file

    except Exception as e:
        print(f"❌ Error generating Coqui Hindi TTS: {e}")
        return None
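Calling the helper directly is straightforward; this sketch assumes the Coqui TTS package is installed (it is imported here but not listed in requirements.txt) and that the referenced Hindi model can be downloaded on first use:

from tts_hindi import generate_hindi_coqui_tts

# Writes output/Demo_tts_hindi.wav and returns its path, or None on failure
audio_path = generate_hindi_coqui_tts("नमस्ते, यह एक परीक्षण संदेश है।", "Demo")
print(audio_path)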