shubhamprakash108 committed
Commit ac4bcc7 · verified · 1 Parent(s): 935910d

Update utils.py

Files changed (1)
  1. utils.py +170 -170
utils.py CHANGED
@@ -1,170 +1,170 @@
- import requests
- from bs4 import BeautifulSoup
- import json
- import os
- import time
- import re
- from newspaper import Article
- from html import unescape
- from transformers import pipeline, VitsModel, AutoTokenizer
- import torch
- import soundfile as sf
- from bertopic import BERTopic
- from sentence_transformers import SentenceTransformer
-
- def clean_text(text):
-     text = unescape(text)
-     text = re.sub(r'\s+', ' ', text)
-     text = re.sub(r'<.*?>', '', text)
-     text = text.replace('\n', ' ').replace('\r', ' ')
-     return text.strip()
-
- def search_news(company_name, num_articles=10):
-     query = f"{company_name} news".replace(' ', '+')
-     headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-     }
-     search_url = f"https://www.google.com/search?q={query}&tbm=nws"
-
-     try:
-         response = requests.get(search_url, headers=headers)
-         response.raise_for_status()
-         soup = BeautifulSoup(response.text, 'html.parser')
-
-         news_links = []
-         news_divs = soup.find_all('div', class_='SoaBEf')
-
-         for div in news_divs:
-             link_tag = div.find('a')
-             if link_tag:
-                 href = link_tag.get('href')
-                 if href.startswith('/url?q='):
-                     url = href.split('/url?q=')[1].split('&sa=')[0]
-                     news_links.append(url)
-                 elif href.startswith('http'):
-                     news_links.append(href)
-
-         return news_links
-     except Exception as e:
-         print(f"Error searching for news: {str(e)}")
-         return []
-
- def extract_article_content(url):
-     try:
-         article = Article(url)
-         article.download()
-         article.parse()
-
-         if not article.text.strip():
-             raise ValueError("Empty article content")
-
-         return {
-             "title": clean_text(article.title),
-             "content": clean_text(article.text),
-             "url": url
-         }
-     except Exception as e:
-         print(f"Skipping article {url} due to error: {str(e)}")
-         return None
-
- def save_company_news(company_name, num_articles=10):
-     news_urls = search_news(company_name)
-     articles = []
-
-     for url in news_urls:
-         if len(articles) >= num_articles:
-             break
-
-         article_data = extract_article_content(url)
-         if article_data:
-             articles.append(article_data)
-
-         time.sleep(1)
-
-     while len(articles) < num_articles:
-         additional_urls = search_news(company_name, num_articles=10)
-         for url in additional_urls:
-             if len(articles) >= num_articles:
-                 break
-             article_data = extract_article_content(url)
-             if article_data:
-                 articles.append(article_data)
-             time.sleep(1)
-
-     os.makedirs("Company", exist_ok=True)
-     file_path = os.path.join("Company", f"{company_name}.json")
-
-     with open(file_path, "w", encoding="utf-8") as json_file:
-         json.dump(articles, json_file, ensure_ascii=False, indent=4)
-
-     return file_path
-
- def sentiment_analysis_model(text):
-     text = text[:510]
-     classifier = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
-     result = classifier(text)[0]
-     label_mapping = {
-         "LABEL_0": "Negative",
-         "LABEL_1": "Neutral",
-         "LABEL_2": "Positive"
-     }
-     sentiment = label_mapping.get(result["label"], "Unknown")
-     print({"sentiment": sentiment, "score": result["score"]})
-     return {"sentiment": sentiment}
-
- def news_summarization(ARTICLE):
-     summarizer = pipeline("summarization", model="Falconsai/text_summarization")
-     summary = summarizer(ARTICLE, max_length=57)
-     return summary[0]['summary_text']
-
- # def audio_output(text):
- #     model = VitsModel.from_pretrained("facebook/mms-tts-hin")
- #     tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hin")
- #     inputs = tokenizer(text, return_tensors="pt")
- #     with torch.no_grad():
- #         output = model(**inputs).waveform
- #     waveform = output.squeeze().cpu().numpy()
- #     sample_rate = 16000
- #     sf.write("output.wav", waveform, sample_rate)
-
- def audio_output(text, output_file="output.wav"):
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-
-     try:
-         model = VitsModel.from_pretrained("facebook/mms-tts-hin").to(device)
-         tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hin")
-
-         inputs = tokenizer(text, return_tensors="pt").to(device)
-
-         with torch.no_grad():
-             output = model(**inputs).waveform
-         waveform = output.squeeze().cpu().numpy()
-
-         sample_rate = 16000
-         sf.write(output_file, waveform, sample_rate)
-         if device == "cuda":
-             torch.cuda.empty_cache()
-
-         del model
-         del inputs
-         del output
-         del waveform
-
-     except Exception as e:
-         print(f"Error generating audio: {str(e)}")
-
- def Topic_finder(text):
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-     embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
-
-     topic_model = BERTopic.load("ctam8736/bertopic-20-newsgroups")
-     topic_model.embedding_model = embedding_model
-     embeddings = embedding_model.encode([text])
-     topic, _ = topic_model.transform([text], embeddings=embeddings)
-     words = topic_model.get_topic(topic[0])
-     related_words = [word for word, _ in words]
-     return related_words
-
-
-
-
+ import requests
+ from bs4 import BeautifulSoup
+ import json
+ import os
+ import time
+ import re
+ from newspaper import Article
+ from html import unescape
+ from transformers import pipeline, VitsModel, AutoTokenizer
+ import torch
+ import soundfile as sf
+ from bertopic import BERTopic
+ from sentence_transformers import SentenceTransformer
+
+ def clean_text(text):
+     text = unescape(text)
+     text = re.sub(r'\s+', ' ', text)
+     text = re.sub(r'<.*?>', '', text)
+     text = text.replace('\n', ' ').replace('\r', ' ')
+     return text.strip()
+
+ def search_news(company_name, num_articles=2):
+     query = f"{company_name} news".replace(' ', '+')
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+     }
+     search_url = f"https://www.google.com/search?q={query}&tbm=nws"
+
+     try:
+         response = requests.get(search_url, headers=headers)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         news_links = []
+         news_divs = soup.find_all('div', class_='SoaBEf')
+
+         for div in news_divs:
+             link_tag = div.find('a')
+             if link_tag:
+                 href = link_tag.get('href')
+                 if href.startswith('/url?q='):
+                     url = href.split('/url?q=')[1].split('&sa=')[0]
+                     news_links.append(url)
+                 elif href.startswith('http'):
+                     news_links.append(href)
+
+         return news_links
+     except Exception as e:
+         print(f"Error searching for news: {str(e)}")
+         return []
+
+ def extract_article_content(url):
+     try:
+         article = Article(url)
+         article.download()
+         article.parse()
+
+         if not article.text.strip():
+             raise ValueError("Empty article content")
+
+         return {
+             "title": clean_text(article.title),
+             "content": clean_text(article.text),
+             "url": url
+         }
+     except Exception as e:
+         print(f"Skipping article {url} due to error: {str(e)}")
+         return None
+
+ def save_company_news(company_name, num_articles=10):
+     news_urls = search_news(company_name)
+     articles = []
+
+     for url in news_urls:
+         if len(articles) >= num_articles:
+             break
+
+         article_data = extract_article_content(url)
+         if article_data:
+             articles.append(article_data)
+
+         time.sleep(1)
+
+     while len(articles) < num_articles:
+         additional_urls = search_news(company_name, num_articles=10)
+         for url in additional_urls:
+             if len(articles) >= num_articles:
+                 break
+             article_data = extract_article_content(url)
+             if article_data:
+                 articles.append(article_data)
+             time.sleep(1)
+
+     os.makedirs("Company", exist_ok=True)
+     file_path = os.path.join("Company", f"{company_name}.json")
+
+     with open(file_path, "w", encoding="utf-8") as json_file:
+         json.dump(articles, json_file, ensure_ascii=False, indent=4)
+
+     return file_path
+
+ def sentiment_analysis_model(text):
+     text = text[:510]
+     classifier = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
+     result = classifier(text)[0]
+     label_mapping = {
+         "LABEL_0": "Negative",
+         "LABEL_1": "Neutral",
+         "LABEL_2": "Positive"
+     }
+     sentiment = label_mapping.get(result["label"], "Unknown")
+     print({"sentiment": sentiment, "score": result["score"]})
+     return {"sentiment": sentiment}
+
+ def news_summarization(ARTICLE):
+     summarizer = pipeline("summarization", model="Falconsai/text_summarization")
+     summary = summarizer(ARTICLE, max_length=57)
+     return summary[0]['summary_text']
+
+ # def audio_output(text):
+ #     model = VitsModel.from_pretrained("facebook/mms-tts-hin")
+ #     tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hin")
+ #     inputs = tokenizer(text, return_tensors="pt")
+ #     with torch.no_grad():
+ #         output = model(**inputs).waveform
+ #     waveform = output.squeeze().cpu().numpy()
+ #     sample_rate = 16000
+ #     sf.write("output.wav", waveform, sample_rate)
+
+ def audio_output(text, output_file="output.wav"):
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     try:
+         model = VitsModel.from_pretrained("facebook/mms-tts-hin").to(device)
+         tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hin")
+
+         inputs = tokenizer(text, return_tensors="pt").to(device)
+
+         with torch.no_grad():
+             output = model(**inputs).waveform
+         waveform = output.squeeze().cpu().numpy()
+
+         sample_rate = 16000
+         sf.write(output_file, waveform, sample_rate)
+         if device == "cuda":
+             torch.cuda.empty_cache()
+
+         del model
+         del inputs
+         del output
+         del waveform
+
+     except Exception as e:
+         print(f"Error generating audio: {str(e)}")
+
+ def Topic_finder(text):
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
+
+     topic_model = BERTopic.load("ctam8736/bertopic-20-newsgroups")
+     topic_model.embedding_model = embedding_model
+     embeddings = embedding_model.encode([text])
+     topic, _ = topic_model.transform([text], embeddings=embeddings)
+     words = topic_model.get_topic(topic[0])
+     related_words = [word for word, _ in words]
+     return related_words
+
+
+
+
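
For orientation, here is a minimal driver sketch tying the file's utilities together. Only the five function names, their return shapes, and the Company/<name>.json layout come from utils.py itself; the script, the example company, and the output file names are illustrative assumptions, not part of this commit.

# demo.py: hypothetical driver, not part of this commit.
import json

from utils import (Topic_finder, audio_output, news_summarization,
                   save_company_news, sentiment_analysis_model)

# Scrape up to 10 articles and write them to Company/Tesla.json.
path = save_company_news("Tesla", num_articles=10)

with open(path, encoding="utf-8") as f:
    articles = json.load(f)  # list of {"title", "content", "url"} dicts

for article in articles:
    sentiment = sentiment_analysis_model(article["content"])  # {"sentiment": "Positive"/"Neutral"/"Negative"}
    summary = news_summarization(article["content"][:1024])   # truncated: the summarizer has an input limit
    topics = Topic_finder(article["content"])                 # related words from the BERTopic model
    print(article["title"], "->", sentiment["sentiment"], topics[:5])

if articles:
    # facebook/mms-tts-hin is a Hindi TTS model, so for sensible audio the
    # text passed here should be Hindi.
    audio_output(summary, output_file="summary.wav")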
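
Two fragile spots are visible in the committed code: link_tag.get('href') can return None, which would crash the startswith calls, and the while len(articles) < num_articles loop in save_company_news never terminates if the search keeps yielding too few usable links. A hedged defensive variant is sketched below; the name search_news_safe and the max_retries cap are mine, not the repo's.

import requests
from bs4 import BeautifulSoup

def search_news_safe(company_name, max_retries=3):
    # Same Google News scrape as search_news, but None-safe on href and with
    # a bounded retry count instead of an unbounded while-loop in the caller.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    query = f"{company_name} news".replace(' ', '+')
    search_url = f"https://www.google.com/search?q={query}&tbm=nws"

    links = []
    for _ in range(max_retries):
        try:
            response = requests.get(search_url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            continue  # transient network error: try again, up to max_retries
        soup = BeautifulSoup(response.text, 'html.parser')
        for div in soup.find_all('div', class_='SoaBEf'):
            link_tag = div.find('a')
            href = link_tag.get('href') if link_tag else None
            if not href:
                continue  # skip <a> tags with no href instead of crashing
            if href.startswith('/url?q='):
                href = href.split('/url?q=')[1].split('&sa=')[0]
            if href.startswith('http'):
                links.append(href)
        if links:
            break  # got usable links; stop retrying
    return links

Separately, sentiment_analysis_model and news_summarization rebuild their Hugging Face pipeline on every call, reloading model weights per article; hoisting those pipelines to module level would make the per-article loop much cheaper.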