shubhamprakash108 committed
Commit 33fd372 · verified · 1 Parent(s): 75b4702

Upload 3 files

Files changed (3)
  1. app.py +542 -0
  2. requirements.txt +16 -0
  3. utils.py +170 -0
app.py ADDED
@@ -0,0 +1,542 @@
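+ # NOTE: The block below is the original command-line prototype of this
+ # pipeline, kept commented out for reference; the Streamlit app that
+ # follows supersedes it.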
+ # import json
+ # import os
+ # from utils import save_company_news
+ # from utils import sentiment_analysis_model
+ # from utils import news_summarization, audio_output, Topic_finder
+ # from collections import Counter
+ # import time
+ # import re
+ # from deep_translator import GoogleTranslator
+ # from pydub import AudioSegment
+ # import gc
+ # import torch
+
+
+ # print("Company News Summarization")
+
+ # company_name = input("Enter Company Name: ")
+
+ # if company_name:
+ #     file_path = save_company_news(company_name)
+
+ #     if os.path.exists(file_path):
+ #         with open(file_path, "r", encoding="utf-8") as file:
+ #             articles = json.load(file)
+
+ #         for article in articles:
+ #             print(f"\nTitle: {article['title']}")
+ #             print(f"Content: {article['content'][:100]}...")
+ #             print(f"Read more: {article['url']}")
+
+ #         del articles
+ #         gc.collect()
+ #     else:
+ #         print("Failed to fetch news. Try again.")
+ # else:
+ #     print("Please enter a company name.")
+
+ # with open(f"Company/{company_name}.json", "r", encoding="utf-8") as file:
+ #     data = json.load(file)
+
+ # for article in data:
+ #     topics = Topic_finder(article['title'])
+
+ #     sentiment = sentiment_analysis_model(article['content'])
+ #     article["sentiment"] = sentiment['sentiment']
+
+ #     del sentiment
+ #     gc.collect()
+
+ #     summary = news_summarization(article["content"])
+ #     article["summary"] = summary
+
+ #     article["topics"] = topics
+
+ #     if torch.cuda.is_available():
+ #         torch.cuda.empty_cache()
+
+ #     gc.collect()
+
+ # with open(f"Company/{company_name}.json", "w", encoding="utf-8") as file:
+ #     json.dump(data, file, indent=4)
+
+ # with open(f"Company/{company_name}.json", "r", encoding="utf-8") as file:
+ #     articles = json.load(file)
+
+ # sentiment_counts = Counter(article["sentiment"] for article in articles)
+
+ # print("Sentiment Counts:")
+ # print("Positive:", sentiment_counts.get("Positive", 0))
+ # print("Negative:", sentiment_counts.get("Negative", 0))
+ # print("Neutral:", sentiment_counts.get("Neutral", 0))
+
+ # del articles
+ # del sentiment_counts
+ # gc.collect()
+
+ # with open(f"Company/{company_name}.json", "r", encoding="utf-8") as file:
+ #     data = json.load(file)
+
+ # translator = GoogleTranslator(source="en", target="hi")
+
+ # audio_folder = "audio"
+ # os.makedirs(audio_folder, exist_ok=True)
+
+ # for file in os.listdir(audio_folder):
+ #     file_path = os.path.join(audio_folder, file)
+ #     if os.path.isfile(file_path):
+ #         os.remove(file_path)
+
+ # text_data = ""
+ # audio_files = []
+
+ # def split_text(text, max_length=4500):
+ #     sentences = re.split(r'(?<=[.!?])\s+', text)
+ #     chunks = []
+ #     current_chunk = ""
+
+ #     for sentence in sentences:
+ #         if len(current_chunk) + len(sentence) + 1 <= max_length:
+ #             current_chunk += " " + sentence if current_chunk else sentence
+ #         else:
+ #             chunks.append(current_chunk)
+ #             current_chunk = sentence
+
+ #     if current_chunk:
+ #         chunks.append(current_chunk)
+
+ #     return chunks
+
+ # for i, article in enumerate(data, start=1):
+ #     title_translated = translator.translate(article['title'])
+
+ #     content_chunks = split_text(article['content'])
+ #     translated_chunks = []
+
+ #     for chunk in content_chunks:
+ #         try:
+ #             translated_chunk = translator.translate(chunk)
+ #             translated_chunks.append(translated_chunk)
+ #             time.sleep(0.5)
+ #         except Exception as e:
+ #             print(f"Error translating chunk: {str(e)}")
+ #             translated_chunks.append(f"Translation error: {str(e)}")
+
+ #     content_translated = " ".join(translated_chunks)
+
+ #     del content_chunks
+ #     gc.collect()
+
+ #     article_text = (f"अब, आप लेख संख्या {i} सुन रहे हैं जिसका शीर्षक है: {title_translated}\n"
+ #                     f"अब, आप लेख संख्या {i} की सामग्री सुन रहे हैं।\n"
+ #                     f"सामग्री: {content_translated}\n\n")
+
+ #     text_data += article_text
+
+ #     audio_file = f"{audio_folder}/article_{i}.mp3"
+ #     audio_output(article_text, audio_file)
+ #     audio_files.append(audio_file)
+
+ #     del article_text
+ #     del content_translated
+ #     del translated_chunks
+ #     gc.collect()
+
+ #     if torch.cuda.is_available():
+ #         torch.cuda.empty_cache()
+
+ #     time.sleep(1)
+
+ # output_file = f"Company/{company_name}_translated.txt"
+ # with open(output_file, "w", encoding="utf-8") as file:
+ #     file.write(text_data)
+
+ # del text_data
+ # gc.collect()
+
+ # def combine_audio_files(audio_folder, output_file):
+ #     try:
+ #         print(f"Combining audio files from {audio_folder}...")
+ #         audio_files = [f for f in os.listdir(audio_folder) if f.endswith('.mp3') and f != os.path.basename(output_file)]
+
+ #         if not audio_files:
+ #             print("No audio files found to combine.")
+ #             return False
+
+ #         audio_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]) if x.split('_')[-1].split('.')[0].isdigit() else 0)
+ #         print(f"Found {len(audio_files)} audio files to combine.")
+
+ #         combined = AudioSegment.empty()
+
+ #         for file in audio_files:
+ #             file_path = os.path.join(audio_folder, file)
+ #             try:
+ #                 audio = AudioSegment.from_mp3(file_path)
+ #                 combined += audio
+ #                 print(f"Added {file}")
+
+ #                 del audio
+ #                 gc.collect()
+ #             except Exception as e:
+ #                 print(f"Error processing {file}: {str(e)}")
+
+ #         combined.export(output_file, format="mp3")
+ #         print(f"Successfully combined audio files into {output_file}")
+
+ #         del combined
+ #         gc.collect()
+
+ #         return True
+
+ #     except Exception as e:
+ #         print(f"Error combining audio files: {str(e)}")
+ #         return False
+
+ # audio_folder = "audio"
+ # output_file = "combined_news.mp3"
+ # combine_audio_files(audio_folder, output_file)
+ # print("Audio combining process completed!")
+
+ # if torch.cuda.is_available():
+ #     torch.cuda.empty_cache()
+
+ # gc.collect()
+
+ import streamlit as st
+ import json
+ import os
+ from utils import save_company_news
+ from utils import sentiment_analysis_model
+ from utils import news_summarization, audio_output, Topic_finder
+ from collections import Counter
+ import time
+ import re
+ from deep_translator import GoogleTranslator
+ from pydub import AudioSegment
+ import gc
+ import torch
+
+ # Set page config
+ st.set_page_config(
+     page_title="Company News Summarization",
+     page_icon="📰",
+     layout="wide"
+ )
+
+ # Create necessary folders
+ os.makedirs("Company", exist_ok=True)
+ os.makedirs("audio", exist_ok=True)
+
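+ # Split text into sentence-aligned chunks; 4500 characters stays safely
+ # under the ~5000-character per-request limit of the Google translation backend.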
+ def split_text(text, max_length=4500):
+     sentences = re.split(r'(?<=[.!?])\s+', text)
+     chunks = []
+     current_chunk = ""
+
+     for sentence in sentences:
+         if len(current_chunk) + len(sentence) + 1 <= max_length:
+             current_chunk += " " + sentence if current_chunk else sentence
+         else:
+             chunks.append(current_chunk)
+             current_chunk = sentence
+
+     if current_chunk:
+         chunks.append(current_chunk)
+
+     return chunks
+
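+ # Stitch the per-article clips into one MP3. pydub needs ffmpeg (or libav)
+ # on PATH to decode and encode MP3.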
+ def combine_audio_files(audio_folder, output_file):
+     try:
+         st.info(f"Combining audio files from {audio_folder}...")
+         audio_files = [f for f in os.listdir(audio_folder) if f.endswith('.mp3') and f != os.path.basename(output_file)]
+
+         if not audio_files:
+             st.warning("No audio files found to combine.")
+             return False
+
+         # Sort numerically on the article index embedded in the filename.
+         audio_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]) if x.split('_')[-1].split('.')[0].isdigit() else 0)
+         st.info(f"Found {len(audio_files)} audio files to combine.")
+
+         combined = AudioSegment.empty()
+
+         for file in audio_files:
+             file_path = os.path.join(audio_folder, file)
+             try:
+                 # from_file probes the actual container, so this still works
+                 # if the TTS step wrote WAV data under a .mp3 name.
+                 audio = AudioSegment.from_file(file_path)
+                 combined += audio
+
+                 del audio
+                 gc.collect()
+             except Exception as e:
+                 st.error(f"Error processing {file}: {str(e)}")
+
+         combined.export(output_file, format="mp3")
+         st.success(f"Successfully combined audio files into {output_file}")
+
+         del combined
+         gc.collect()
+
+         return True
+
+     except Exception as e:
+         st.error(f"Error combining audio files: {str(e)}")
+         return False
+
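+ # End-to-end pipeline for one company: fetch articles, enrich each with
+ # sentiment/topics/summary, aggregate sentiment, translate to Hindi,
+ # synthesize audio, and combine the clips.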
+ def process_company_news(company_name):
+     with st.spinner("Fetching company news..."):
+         file_path = save_company_news(company_name)
+
+         if not os.path.exists(file_path):
+             st.error("Failed to fetch news. Try again.")
+             return False
+
+         with open(file_path, "r", encoding="utf-8") as file:
+             articles = json.load(file)
+
+         st.success(f"Found {len(articles)} articles for {company_name}")
+
+         # Display a preview of the articles
+         with st.expander("Preview Articles"):
+             for article in articles:
+                 st.subheader(article['title'])
+                 st.write(f"{article['content'][:100]}...")
+                 st.write(f"[Read more]({article['url']})")
+
+         del articles
+         gc.collect()
+
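+     # Per-article NLP pass: topic extraction, sentiment, and summarization.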
+     with st.spinner("Analyzing sentiment, extracting topics, and generating summaries..."):
+         progress_bar = st.progress(0)
+
+         with open(f"Company/{company_name}.json", "r", encoding="utf-8") as file:
+             data = json.load(file)
+
+         total_articles = len(data)
+
+         for i, article in enumerate(data):
+             topics = Topic_finder(article['title'])
+
+             sentiment = sentiment_analysis_model(article['content'])
+             article["sentiment"] = sentiment['sentiment']
+
+             del sentiment
+             gc.collect()
+
+             summary = news_summarization(article["content"])
+             article["summary"] = summary
+
+             article["topics"] = topics
+
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+
+             gc.collect()
+             progress_bar.progress((i + 1) / total_articles)
+
+         with open(f"Company/{company_name}.json", "w", encoding="utf-8") as file:
+             json.dump(data, file, indent=4)
+
+     with st.spinner("Counting sentiment..."):
+         with open(f"Company/{company_name}.json", "r", encoding="utf-8") as file:
+             articles = json.load(file)
+
+         sentiment_counts = Counter(article["sentiment"] for article in articles)
+
+         st.write("### Sentiment Analysis")
+         col1, col2, col3 = st.columns(3)
+         col1.metric("Positive", sentiment_counts.get("Positive", 0))
+         col2.metric("Negative", sentiment_counts.get("Negative", 0))
+         col3.metric("Neutral", sentiment_counts.get("Neutral", 0))
+
+         del articles
+         del sentiment_counts
+         gc.collect()
+
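+     # Hindi translation plus TTS: each article is chunked, translated,
+     # narrated with an article-number preamble, and written to audio/article_<i>.mp3.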
+     with st.spinner("Translating content and generating audio..."):
+         with open(f"Company/{company_name}.json", "r", encoding="utf-8") as file:
+             data = json.load(file)
+
+         translator = GoogleTranslator(source="en", target="hi")
+
+         audio_folder = "audio"
+         os.makedirs(audio_folder, exist_ok=True)
+
+         # Clear previous audio files
+         for file in os.listdir(audio_folder):
+             file_path = os.path.join(audio_folder, file)
+             if os.path.isfile(file_path):
+                 os.remove(file_path)
+
+         text_data = ""
+         audio_files = []
+
+         progress_bar = st.progress(0)
+
+         for i, article in enumerate(data, start=1):
+             title_translated = translator.translate(article['title'])
+
+             content_chunks = split_text(article['content'])
+             translated_chunks = []
+
+             for chunk in content_chunks:
+                 try:
+                     translated_chunk = translator.translate(chunk)
+                     translated_chunks.append(translated_chunk)
+                     time.sleep(0.5)
+                 except Exception as e:
+                     st.error(f"Error translating chunk: {str(e)}")
+                     translated_chunks.append(f"Translation error: {str(e)}")
+
+             content_translated = " ".join(translated_chunks)
+
+             del content_chunks
+             gc.collect()
+
+             # Narration preamble (Hindi): "Now you are listening to article
+             # number {i}, titled: ..." followed by the translated content.
+             article_text = (f"अब, आप लेख संख्या {i} सुन रहे हैं जिसका शीर्षक है: {title_translated}\n"
+                             f"अब, आप लेख संख्या {i} की सामग्री सुन रहे हैं।\n"
+                             f"सामग्री: {content_translated}\n\n")
+
+             text_data += article_text
+
+             audio_file = f"{audio_folder}/article_{i}.mp3"
+             audio_output(article_text, audio_file)
+             audio_files.append(audio_file)
+
+             del article_text
+             del content_translated
+             del translated_chunks
+             gc.collect()
+
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+
+             progress_bar.progress(i / len(data))
+             time.sleep(1)
+
+         output_file = f"Company/{company_name}_translated.txt"
+         with open(output_file, "w", encoding="utf-8") as file:
+             file.write(text_data)
+
+         del text_data
+         gc.collect()
+
+     with st.spinner("Combining audio files..."):
+         output_file = "combined_news.mp3"
+         combine_success = combine_audio_files(audio_folder, output_file)
+
+         if combine_success:
+             st.success("Audio combining process completed!")
+         else:
+             st.error("Failed to combine audio files.")
+
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+     gc.collect()
+
+     return True
+
+ # Main app interface
+ st.title("Company News Summarization and Audio Generation")
+
+ with st.sidebar:
+     st.header("Enter Company Details")
+     company_name = st.text_input("Company Name")
+     process_button = st.button("Process Company News", type="primary")
+
+ # Process data when button is clicked
+ if process_button and company_name:
+     success = process_company_news(company_name)
+     if success:
+         st.session_state.processing_complete = True
+         st.session_state.company_name = company_name
+ elif process_button and not company_name:
+     st.error("Please enter a company name.")
+
+ # Show results after processing
+ if 'processing_complete' in st.session_state and st.session_state.processing_complete:
+     company_name = st.session_state.company_name
+
+     st.header(f"Results for {company_name}")
+
+     # Create tabs for different outputs
+     tab1, tab2, tab3 = st.tabs(["Summary", "Translated Text", "Audio"])
+
+     with tab1:
+         st.subheader("News Summary")
+         try:
+             with open(f"Company/{company_name}.json", "r", encoding="utf-8") as file:
+                 articles = json.load(file)
+
+             for i, article in enumerate(articles, 1):
+                 with st.expander(f"Article {i}: {article['title']}"):
+                     st.write(f"**Summary:** {article['summary']}")
+                     st.write(f"**Sentiment:** {article['sentiment']}")
+                     st.write(f"**Topics:** {', '.join(article['topics'])}")
+                     st.write(f"**URL:** {article['url']}")
+         except Exception as e:
+             st.error(f"Error loading summary data: {str(e)}")
+
+     with tab2:
+         st.subheader("Translated Text (Hindi)")
+         try:
+             with open(f"Company/{company_name}_translated.txt", "r", encoding="utf-8") as file:
+                 text_content = file.read()
+             st.download_button(
+                 label="Download Translated Text",
+                 data=text_content,
+                 file_name=f"{company_name}_translated.txt",
+                 mime="text/plain"
+             )
+             st.text_area("Content", text_content, height=400)
+         except Exception as e:
+             st.error(f"Error loading translated text: {str(e)}")
+
+     with tab3:
+         st.subheader("Audio Files")
+
+         st.write("### Combined Audio")
+         try:
+             with open("combined_news.mp3", "rb") as file:
+                 combined_audio_bytes = file.read()
+
+             st.audio(combined_audio_bytes, format="audio/mp3")
+             st.download_button(
+                 label="Download Combined Audio",
+                 data=combined_audio_bytes,
+                 file_name="combined_news.mp3",
+                 mime="audio/mpeg"  # registered MIME type for MP3
+             )
+         except Exception as e:
+             st.error(f"Error loading combined audio: {str(e)}")
+
+         st.write("### Individual Article Audio Files")
+         try:
+             audio_files = [f for f in os.listdir("audio") if f.endswith('.mp3')]
+             audio_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]) if x.split('_')[-1].split('.')[0].isdigit() else 0)
+
+             for audio_file in audio_files:
+                 with st.expander(f"{audio_file}"):
+                     with open(f"audio/{audio_file}", "rb") as file:
+                         audio_bytes = file.read()
+                     st.audio(audio_bytes, format="audio/mp3")
+                     st.download_button(
+                         label=f"Download {audio_file}",
+                         data=audio_bytes,
+                         file_name=audio_file,
+                         mime="audio/mpeg"
+                     )
+         except Exception as e:
+             st.error(f"Error loading individual audio files: {str(e)}")
+
+ # Instructions at the bottom
+ with st.expander("How to use this app"):
+     st.write("""
+     1. Enter the name of a company in the sidebar.
+     2. Click the 'Process Company News' button to start the analysis.
+     3. Wait for processing to complete (this may take a while, depending on the number of articles).
+     4. View the results in the tabs:
+        - Summary: sentiment, topics, and a summary for each article
+        - Translated Text: the Hindi translation of all articles
+        - Audio: listen to or download the Hindi audio files
+     """)
+
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ requests==2.31.0
+ beautifulsoup4==4.12.2
+ newspaper3k==0.2.8
+ transformers
+ torch
+ scipy
+ numpy
+ pandas
+ IPython
+ soundfile
+ deep_translator
+ pydub
+ bertopic
+ sentence_transformers
+ streamlit
utils.py ADDED
@@ -0,0 +1,170 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import json
+ import os
+ import time
+ import re
+ from newspaper import Article
+ from html import unescape
+ from transformers import pipeline, VitsModel, AutoTokenizer
+ import torch
+ import soundfile as sf
+ from bertopic import BERTopic
+ from sentence_transformers import SentenceTransformer
+
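+ # Normalize scraped text: unescape HTML entities, strip tags, and collapse whitespace.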
+ def clean_text(text):
+     text = unescape(text)
+     text = re.sub(r'\s+', ' ', text)
+     text = re.sub(r'<.*?>', '', text)
+     text = text.replace('\n', ' ').replace('\r', ' ')
+     return text.strip()
+
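+ # Scrape article URLs from a Google News results page. The 'SoaBEf' CSS class
+ # matches Google's current result-card markup and may break without notice;
+ # num_articles is currently unused since Google returns roughly ten results per page.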
+ def search_news(company_name, num_articles=10):
+     query = f"{company_name} news".replace(' ', '+')
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+     }
+     search_url = f"https://www.google.com/search?q={query}&tbm=nws"
+
+     try:
+         response = requests.get(search_url, headers=headers)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         news_links = []
+         news_divs = soup.find_all('div', class_='SoaBEf')
+
+         for div in news_divs:
+             link_tag = div.find('a')
+             if link_tag:
+                 href = link_tag.get('href')
+                 if href.startswith('/url?q='):
+                     url = href.split('/url?q=')[1].split('&sa=')[0]
+                     news_links.append(url)
+                 elif href.startswith('http'):
+                     news_links.append(href)
+
+         return news_links
+     except Exception as e:
+         print(f"Error searching for news: {str(e)}")
+         return []
+
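+ # Download and parse one article with newspaper3k; returns None on failure
+ # so callers can simply skip bad URLs.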
+ def extract_article_content(url):
+     try:
+         article = Article(url)
+         article.download()
+         article.parse()
+
+         if not article.text.strip():
+             raise ValueError("Empty article content")
+
+         return {
+             "title": clean_text(article.title),
+             "content": clean_text(article.text),
+             "url": url
+         }
+     except Exception as e:
+         print(f"Skipping article {url} due to error: {str(e)}")
+         return None
+
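+ # Fetch up to num_articles articles and persist them to Company/<name>.json.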
+ def save_company_news(company_name, num_articles=10):
+     news_urls = search_news(company_name)
+     articles = []
+
+     for url in news_urls:
+         if len(articles) >= num_articles:
+             break
+
+         article_data = extract_article_content(url)
+         if article_data:
+             articles.append(article_data)
+
+         time.sleep(1)
+
+     # Retry a bounded number of times if the first pass came up short.
+     # The cap prevents an infinite loop when a query yields few usable
+     # articles, and seen_urls avoids re-fetching the same links.
+     seen_urls = {article["url"] for article in articles}
+     retries = 0
+     while len(articles) < num_articles and retries < 3:
+         retries += 1
+         additional_urls = search_news(company_name, num_articles=10)
+         for url in additional_urls:
+             if len(articles) >= num_articles:
+                 break
+             if url in seen_urls:
+                 continue
+             article_data = extract_article_content(url)
+             if article_data:
+                 articles.append(article_data)
+                 seen_urls.add(url)
+             time.sleep(1)
+
+     os.makedirs("Company", exist_ok=True)
+     file_path = os.path.join("Company", f"{company_name}.json")
+
+     with open(file_path, "w", encoding="utf-8") as json_file:
+         json.dump(articles, json_file, ensure_ascii=False, indent=4)
+
+     return file_path
+
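+ # Classify sentiment with CardiffNLP's Twitter RoBERTa; the model emits
+ # LABEL_0/1/2, mapped here to Negative/Neutral/Positive.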
+ def sentiment_analysis_model(text):
+     # Crude character-level truncation to keep the input inside the
+     # model's 512-token window.
+     text = text[:510]
+     classifier = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
+     result = classifier(text)[0]
+     label_mapping = {
+         "LABEL_0": "Negative",
+         "LABEL_1": "Neutral",
+         "LABEL_2": "Positive"
+     }
+     sentiment = label_mapping.get(result["label"], "Unknown")
+     print({"sentiment": sentiment, "score": result["score"]})
+     return {"sentiment": sentiment}
+
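+ # Summarize an article with the Falconsai text-summarization model (T5-based).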
+ def news_summarization(ARTICLE):
+     summarizer = pipeline("summarization", model="Falconsai/text_summarization")
+     # truncation=True guards against articles longer than the model's input window.
+     summary = summarizer(ARTICLE, max_length=57, truncation=True)
+     return summary[0]['summary_text']
+
+ # def audio_output(text):
+ #     model = VitsModel.from_pretrained("facebook/mms-tts-hin")
+ #     tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hin")
+ #     inputs = tokenizer(text, return_tensors="pt")
+ #     with torch.no_grad():
+ #         output = model(**inputs).waveform
+ #     waveform = output.squeeze().cpu().numpy()
+ #     sample_rate = 16000
+ #     sf.write("output.wav", waveform, sample_rate)
+
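+ # Synthesize Hindi speech with Meta's MMS-TTS VITS model (16 kHz output).
+ # Note: soundfile infers the container from the extension, so writing to a
+ # .mp3 path requires libsndfile >= 1.1; otherwise pass a .wav path.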
+ def audio_output(text, output_file="output.wav"):
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     try:
+         model = VitsModel.from_pretrained("facebook/mms-tts-hin").to(device)
+         tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hin")
+
+         inputs = tokenizer(text, return_tensors="pt").to(device)
+
+         with torch.no_grad():
+             output = model(**inputs).waveform
+         waveform = output.squeeze().cpu().numpy()
+
+         sample_rate = 16000
+         sf.write(output_file, waveform, sample_rate)
+         if device == "cuda":
+             torch.cuda.empty_cache()
+
+         del model
+         del inputs
+         del output
+         del waveform
+
+     except Exception as e:
+         print(f"Error generating audio: {str(e)}")
+
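+ # Infer topic keywords for a headline using a pretrained BERTopic model
+ # fitted on 20 Newsgroups, with MiniLM sentence embeddings.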
+ def Topic_finder(text):
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
+
+     topic_model = BERTopic.load("ctam8736/bertopic-20-newsgroups")
+     topic_model.embedding_model = embedding_model
+     embeddings = embedding_model.encode([text])
+     topic, _ = topic_model.transform([text], embeddings=embeddings)
+     # get_topic returns False for an unknown topic id; fall back to an empty list.
+     words = topic_model.get_topic(topic[0]) or []
+     related_words = [word for word, _ in words]
+     return related_words