Spaces:

shubhamprakash108
/

news-summarization-and-text-to-speech-application

Sleeping

news-summarization-and-text-to-speech-application

File size: 19,384 Bytes

33fd372

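# NOTE: Everything commented out below (down to the live `import streamlit`
# line) is the earlier command-line version of this pipeline, kept only for
# reference. The Streamlit implementation further down supersedes it.
# import json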
# import os
# from utils import save_company_news
# from utils import sentiment_analysis_model
# from utils import news_summarization, audio_output, Topic_finder
# from collections import Counter
# import time
# import re
# from deep_translator import GoogleTranslator
# from pydub import AudioSegment
# import gc
# import torch


# print("Company News Summarization")
    
# company_name = input("Enter Company Name: ")
    
# if company_name:
#     file_path = save_company_news(company_name)
        
#     if os.path.exists(file_path):
#         with open(file_path, "r", encoding="utf-8") as file:
#             articles = json.load(file)
                
#             for article in articles:
#                 print(f"\nTitle: {article['title']}")
#                 print(f"Content: {article['content'][:100]}...") 
#                 print(f"Read more: {article['url']}")
                
#         del articles
#         gc.collect()
#     else:
#         print("Failed to fetch news. Try again.")
# else:
#     print("Please enter a company name.")

# with open(f"Company/{company_name}.json", "r", encoding="utf-8") as file:
#     data = json.load(file)

# for article in data:
#     topics = Topic_finder(article['title'])
    
#     sentiment = sentiment_analysis_model(article['content'])
#     article["sentiment"] = sentiment['sentiment']
    
#     del sentiment
#     gc.collect()
    
#     summary = news_summarization(article["content"])
#     article["summary"] = summary
    
#     article["topics"] = topics
    
#     if torch.cuda.is_available():
#         torch.cuda.empty_cache()
    
#     gc.collect()

# with open(f"Company/{company_name}.json", "w", encoding="utf-8") as file:
#     json.dump(data, file, indent=4)

# with open(f"Company/{company_name}.json", "r", encoding="utf-8") as file:
#     articles = json.load(file)

# sentiment_counts = Counter(article["sentiment"] for article in articles)

# print("Sentiment Counts:")
# print("Positive:", sentiment_counts.get("Positive", 0))
# print("Negative:", sentiment_counts.get("Negative", 0))
# print("Neutral:", sentiment_counts.get("Neutral", 0))

# del articles
# del sentiment_counts
# gc.collect()

# with open(f"Company/{company_name}.json", "r", encoding="utf-8") as file:
#     data = json.load(file)

# translator = GoogleTranslator(source="en", target="hi")

# audio_folder = "audio"
# os.makedirs(audio_folder, exist_ok=True)

# for file in os.listdir(audio_folder):
#     file_path = os.path.join(audio_folder, file)
#     if os.path.isfile(file_path):
#         os.remove(file_path)

# text_data = ""
# audio_files = []

# def split_text(text, max_length=4500):
#     sentences = re.split(r'(?<=[.!?])\s+', text)
#     chunks = []
#     current_chunk = ""
    
#     for sentence in sentences:
#         if len(current_chunk) + len(sentence) + 1 <= max_length:
#             current_chunk += " " + sentence if current_chunk else sentence
#         else:
#             chunks.append(current_chunk)
#             current_chunk = sentence
    
#     if current_chunk:
#         chunks.append(current_chunk)
    
#     return chunks

# for i, article in enumerate(data, start=1):
#     title_translated = translator.translate(article['title'])
    
#     content_chunks = split_text(article['content'])
#     translated_chunks = []
    
#     for chunk in content_chunks:
#         try:
#             translated_chunk = translator.translate(chunk)
#             translated_chunks.append(translated_chunk)
#             time.sleep(0.5)
#         except Exception as e:
#             print(f"Error translating chunk: {str(e)}")
#             translated_chunks.append(f"Translation error: {str(e)}")
    
#     content_translated = " ".join(translated_chunks)
    
#     del content_chunks
#     gc.collect()

#     article_text = (f"अब, आप लेख संख्या {i} सुन रहे हैं जिसका शीर्षक है: {title_translated}\n"
#                     f"अब, आप लेख संख्या {i} की सामग्री सुन रहे हैं।\n"
#                     f"सामग्री: {content_translated}\n\n")

#     text_data += article_text

#     audio_file = f"{audio_folder}/article_{i}.mp3"
#     audio_output(article_text, audio_file)
#     audio_files.append(audio_file)
    
#     del article_text
#     del content_translated
#     del translated_chunks
#     gc.collect()
    
#     if torch.cuda.is_available():
#         torch.cuda.empty_cache()
    
#     time.sleep(1)

# output_file = f"Company/{company_name}_translated.txt"
# with open(output_file, "w", encoding="utf-8") as file:
#     file.write(text_data)

# del text_data
# gc.collect()

# def combine_audio_files(audio_folder, output_file):
#     try:
#         print(f"Combining audio files from {audio_folder}...")
#         audio_files = [f for f in os.listdir(audio_folder) if f.endswith('.mp3') and f != os.path.basename(output_file)]
        
#         if not audio_files:
#             print("No audio files found to combine.")
#             return False
            
#         audio_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]) if x.split('_')[-1].split('.')[0].isdigit() else 0)
#         print(f"Found {len(audio_files)} audio files to combine.")
        
#         combined = AudioSegment.empty()
        
#         for file in audio_files:
#             file_path = os.path.join(audio_folder, file)
#             try:
#                 audio = AudioSegment.from_mp3(file_path)
#                 combined += audio
#                 print(f"Added {file}")
                
#                 del audio
#                 gc.collect()
#             except Exception as e:
#                 print(f"Error processing {file}: {str(e)}")
                
#         combined.export(output_file, format="mp3")
#         print(f"Successfully combined audio files into {output_file}")
        
#         del combined
#         gc.collect()
        
#         return True
        
#     except Exception as e:
#         print(f"Error combining audio files: {str(e)}")
#         return False

# audio_folder = "audio"
# output_file = "combined_news.mp3"
# combine_audio_files(audio_folder, output_file)
# print("Audio combining process completed!")

# if torch.cuda.is_available():
#     torch.cuda.empty_cache()

# gc.collect()

import streamlit as st
import json
import os
from utils import save_company_news
from utils import sentiment_analysis_model
from utils import news_summarization, audio_output, Topic_finder
from collections import Counter
import time
import re
from deep_translator import GoogleTranslator
from pydub import AudioSegment
import gc
import torch

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(page_title="Company News Summarization", page_icon="📰", layout="wide")

# Working directories: one for per-company JSON/text output, one for audio.
for _required_dir in ("Company", "audio"):
    os.makedirs(_required_dir, exist_ok=True)

def split_text(text, max_length=4500):
    """Split ``text`` into chunks of at most ``max_length`` characters.

    The text is first cut into sentences at whitespace that follows ``.``,
    ``!`` or ``?``. Consecutive sentences are then packed into a chunk, joined
    by a single space, for as long as the chunk stays within ``max_length``.

    A sentence that is longer than ``max_length`` on its own is cut into
    fixed-size pieces so that no returned chunk ever exceeds the limit.
    Empty fragments are skipped, so the result never contains ``""``.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list of non-empty chunk strings; ``[]`` for empty input.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    current_chunk = ""

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not sentence:
            # Empty input or trailing whitespace yields empty fragments.
            continue

        if len(sentence) > max_length:
            # The sentence cannot fit in any chunk: flush what we have and
            # hard-split it. The last piece stays open so that following
            # sentences can still be packed behind it.
            if current_chunk:
                chunks.append(current_chunk)
            pieces = [
                sentence[start:start + max_length]
                for start in range(0, len(sentence), max_length)
            ]
            chunks.extend(pieces[:-1])
            current_chunk = pieces[-1]
        elif not current_chunk:
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= max_length:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def combine_audio_files(audio_folder, output_file):
    """Concatenate the ``.mp3`` files in ``audio_folder`` into ``output_file``.

    Files are ordered by the number found between the last ``_`` and the
    following ``.`` of the file name (e.g. ``article_3.mp3`` -> 3); names
    without such a number sort as 0. A file whose base name equals that of
    ``output_file`` is ignored so a previous result is never merged into
    itself. Progress and problems are reported through Streamlit messages.

    Args:
        audio_folder: Directory that holds the individual ``.mp3`` files.
        output_file: Path of the combined ``.mp3`` file to write.

    Returns:
        ``True`` if at least one file was loaded and the combined file was
        exported; ``False`` if there was nothing to combine, no file could be
        loaded, or an unexpected error occurred.
    """

    def _numeric_suffix(filename):
        # "article_12.mp3" -> "12"; evaluated once per file name.
        suffix = filename.split('_')[-1].split('.')[0]
        return int(suffix) if suffix.isdigit() else 0

    try:
        st.info(f"Combining audio files from {audio_folder}...")
        output_name = os.path.basename(output_file)
        audio_files = [
            f for f in os.listdir(audio_folder)
            if f.endswith('.mp3') and f != output_name
        ]

        if not audio_files:
            st.warning("No audio files found to combine.")
            return False

        audio_files.sort(key=_numeric_suffix)
        st.info(f"Found {len(audio_files)} audio files to combine.")

        combined = AudioSegment.empty()
        loaded_count = 0

        for file in audio_files:
            file_path = os.path.join(audio_folder, file)
            try:
                combined += AudioSegment.from_mp3(file_path)
                loaded_count += 1
            except Exception as e:
                # Best effort: one unreadable file must not abort the rest.
                st.error(f"Error processing {file}: {str(e)}")
            gc.collect()

        if loaded_count == 0:
            # Every file failed to load; exporting would only produce an
            # empty track while reporting success.
            st.error("None of the audio files could be loaded; nothing to combine.")
            return False

        combined.export(output_file, format="mp3")
        st.success(f"Successfully combined audio files into {output_file}")

        del combined
        gc.collect()

        return True

    except Exception as e:
        st.error(f"Error combining audio files: {str(e)}")
        return False

def _load_json(path):
    """Return the parsed JSON document stored at ``path`` (read as UTF-8)."""
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


def _release_memory():
    """Run the garbage collector and, when CUDA is available, empty its cache."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _enrich_articles(articles):
    """Add ``sentiment``, ``summary`` and ``topics`` keys to each article in place."""
    progress_bar = st.progress(0)
    total_articles = len(articles)

    for position, article in enumerate(articles, start=1):
        topics = Topic_finder(article['title'])
        article["sentiment"] = sentiment_analysis_model(article['content'])['sentiment']
        article["summary"] = news_summarization(article["content"])
        article["topics"] = topics

        _release_memory()
        progress_bar.progress(position / total_articles)


def _show_sentiment_counts(articles):
    """Render how many articles carry each sentiment label as three metrics."""
    sentiment_counts = Counter(article["sentiment"] for article in articles)

    st.write("### Sentiment Analysis")
    col1, col2, col3 = st.columns(3)
    col1.metric("Positive", sentiment_counts.get("Positive", 0))
    col2.metric("Negative", sentiment_counts.get("Negative", 0))
    col3.metric("Neutral", sentiment_counts.get("Neutral", 0))


def _remove_files_in(folder):
    """Delete every regular file directly inside ``folder``."""
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        if os.path.isfile(path):
            os.remove(path)


def _translate_and_narrate(articles, audio_folder):
    """Translate each article to Hindi, write its audio file, return all narration text."""
    translator = GoogleTranslator(source="en", target="hi")
    progress_bar = st.progress(0)
    total_articles = len(articles)
    narration_parts = []

    for i, article in enumerate(articles, start=1):
        try:
            title_translated = translator.translate(article['title'])
        except Exception as e:
            # Same best-effort policy as for content chunks: report the
            # problem and keep going, here with the untranslated title.
            st.error(f"Error translating title: {str(e)}")
            title_translated = article['title']

        translated_chunks = []
        for chunk in split_text(article['content']):
            try:
                translated_chunks.append(translator.translate(chunk))
                time.sleep(0.5)  # pause between translation requests
            except Exception as e:
                st.error(f"Error translating chunk: {str(e)}")
                translated_chunks.append(f"Translation error: {str(e)}")

        content_translated = " ".join(translated_chunks)

        article_text = (f"अब, आप लेख संख्या {i} सुन रहे हैं जिसका शीर्षक है: {title_translated}\n"
                        f"अब, आप लेख संख्या {i} की सामग्री सुन रहे हैं।\n"
                        f"सामग्री: {content_translated}\n\n")

        narration_parts.append(article_text)
        audio_output(article_text, f"{audio_folder}/article_{i}.mp3")

        _release_memory()
        progress_bar.progress(i / total_articles)
        time.sleep(1)

    # Joined once at the end instead of growing a string inside the loop.
    return "".join(narration_parts)


def process_company_news(company_name):
    """Run the whole news pipeline for ``company_name`` and report progress in the UI.

    Stages, each shown under its own spinner:

    1. Fetch articles via ``save_company_news`` and preview them.
    2. Add sentiment, summary and topics to every article and write the
       result back to ``Company/<company_name>.json``.
    3. Show the number of articles per sentiment label.
    4. Translate every article to Hindi, write one ``audio/article_<n>.mp3``
       per article and save the text to
       ``Company/<company_name>_translated.txt``.
    5. Merge the per-article audio into ``combined_news.mp3``.

    Args:
        company_name: Company to look up; also used to build the file names.

    Returns:
        ``False`` if the news file could not be fetched, ``True`` once all
        stages have run (a failure to combine audio is reported in the UI but
        does not change the return value).
    """
    json_path = f"Company/{company_name}.json"
    audio_folder = "audio"

    with st.spinner("Fetching company news..."):
        # NOTE(review): save_company_news presumably returns the path of the
        # JSON file it wrote - confirm. A falsy return is treated as failure.
        file_path = save_company_news(company_name)

        if not file_path or not os.path.exists(file_path):
            st.error("Failed to fetch news. Try again.")
            return False

        articles = _load_json(file_path)
        st.success(f"Found {len(articles)} articles for {company_name}")

        # Display a preview of the articles
        with st.expander("Preview Articles"):
            for article in articles:
                st.subheader(article['title'])
                st.write(f"{article['content'][:100]}...")
                st.write(f"[Read more]({article['url']})")

        del articles
        gc.collect()

    with st.spinner("Analyzing sentiment, extracting topics, and generating summaries..."):
        data = _load_json(json_path)
        _enrich_articles(data)

        # Persist the enriched articles; the results view reads this file.
        with open(json_path, "w", encoding="utf-8") as file:
            json.dump(data, file, indent=4)

    with st.spinner("Counting sentiment..."):
        _show_sentiment_counts(data)

    with st.spinner("Translating content and generating audio..."):
        os.makedirs(audio_folder, exist_ok=True)

        # Clear previous audio files
        _remove_files_in(audio_folder)

        text_data = _translate_and_narrate(data, audio_folder)

        with open(f"Company/{company_name}_translated.txt", "w", encoding="utf-8") as file:
            file.write(text_data)

        del text_data
        gc.collect()

    with st.spinner("Combining audio files..."):
        output_file = "combined_news.mp3"

        # Drop the result of an earlier run first; otherwise a failed combine
        # would leave another company's audio in place to be served.
        if os.path.isfile(output_file):
            os.remove(output_file)

        combine_success = combine_audio_files(audio_folder, output_file)

        if combine_success:
            st.success("Audio combining process completed!")
        else:
            st.error("Failed to combine audio files.")

        _release_memory()

    return True

# Main app interface
st.title("Company News Summarization and Audio Generation")

with st.sidebar:
    st.header("Enter Company Details")
    # Surrounding whitespace is dropped so a blank-looking entry counts as empty.
    company_name = st.text_input("Company Name").strip()
    process_button = st.button("Process Company News", type="primary")

# Process data when button is clicked
if process_button:
    if not company_name:
        st.error("Please enter a company name.")
    elif "/" in company_name or "\\" in company_name:
        # The name is interpolated into paths such as
        # Company/<company_name>.json, so path separators are refused to keep
        # every read and write inside the intended folders.
        st.error("Company name must not contain '/' or '\\'.")
    elif process_company_news(company_name):
        # Remember the finished run so the results survive Streamlit reruns.
        st.session_state.processing_complete = True
        st.session_state.company_name = company_name

# Render the stored results once a processing run has completed.
if st.session_state.get('processing_complete'):
    company_name = st.session_state.company_name

    st.header(f"Results for {company_name}")

    # One tab per kind of output.
    summary_tab, text_tab, audio_tab = st.tabs(["Summary", "Translated Text", "Audio"])

    with summary_tab:
        st.subheader("News Summary")
        try:
            with open(f"Company/{company_name}.json", "r", encoding="utf-8") as json_fh:
                articles = json.load(json_fh)

            for number, article in enumerate(articles, 1):
                with st.expander(f"Article {number}: {article['title']}"):
                    st.write(f"**Summary:** {article['summary']}")
                    st.write(f"**Sentiment:** {article['sentiment']}")
                    st.write(f"**Topics:** {', '.join(article['topics'])}")
                    st.write(f"**URL:** {article['url']}")
        except Exception as e:
            st.error(f"Error loading summary data: {str(e)}")

    with text_tab:
        st.subheader("Translated Text (Hindi)")
        try:
            with open(f"Company/{company_name}_translated.txt", "r", encoding="utf-8") as text_fh:
                text_content = text_fh.read()
            st.download_button(
                label="Download Translated Text",
                data=text_content,
                file_name=f"{company_name}_translated.txt",
                mime="text/plain",
            )
            st.text_area("Content", text_content, height=400)
        except Exception as e:
            st.error(f"Error loading translated text: {str(e)}")

    with audio_tab:
        st.subheader("Audio Files")

        def _trailing_number(filename):
            # Number between the last "_" and the following "."; 0 if absent.
            token = filename.split('_')[-1].split('.')[0]
            return int(token) if token.isdigit() else 0

        st.write("### Combined Audio")
        try:
            with open("combined_news.mp3", "rb") as combined_fh:
                combined_audio_bytes = combined_fh.read()

            st.audio(combined_audio_bytes, format="audio/mp3")
            st.download_button(
                label="Download Combined Audio",
                data=combined_audio_bytes,
                file_name="combined_news.mp3",
                mime="audio/mp3",
            )
        except Exception as e:
            st.error(f"Error loading combined audio: {str(e)}")

        st.write("### Individual Article Audio Files")
        try:
            mp3_names = sorted(
                (name for name in os.listdir("audio") if name.endswith('.mp3')),
                key=_trailing_number,
            )

            for audio_file in mp3_names:
                with st.expander(f"{audio_file}"):
                    with open(f"audio/{audio_file}", "rb") as clip_fh:
                        audio_bytes = clip_fh.read()
                    st.audio(audio_bytes, format="audio/mp3")
                    st.download_button(
                        label=f"Download {audio_file}",
                        data=audio_bytes,
                        file_name=audio_file,
                        mime="audio/mp3",
                    )
        except Exception as e:
            st.error(f"Error loading individual audio files: {str(e)}")

# Collapsible usage guide shown at the bottom of the page.
with st.expander("How to use this app"):
    usage_instructions = """

    1. Enter the name of a company in the sidebar.

    2. Click 'Process Company News' button to start the analysis.

    3. Wait for the processing to complete (this may take some time depending on the number of articles).

    4. View the results in the different tabs:

       - Summary: See sentiment analysis, topics, and summaries of each article

       - Translated Text: View the Hindi translation of all articles

       - Audio: Listen to or download the audio files in Hindi

    """
    st.write(usage_instructions)