jarif committed
Commit f8b6c23 · verified · 1 Parent(s): 5f17154

Upload app.py

Files changed (1)
app.py +73 -83
app.py CHANGED
@@ -6,8 +6,7 @@ from langchain.prompts import PromptTemplate
 from langchain_groq import ChatGroq
 from langchain.chains.summarize import load_summarize_chain
 from langchain.schema import Document
-from youtube_transcript_api import YouTubeTranscriptApi
-from urllib.parse import urlparse, parse_qs
+import yt_dlp
 import requests
 from bs4 import BeautifulSoup
 import re
@@ -16,31 +15,19 @@ import re
 load_dotenv()
 
 # Streamlit App
-st.set_page_config(page_title="AI Content Summarizer", page_icon="📚")
-
-# Create two columns for the title
-col1, col2 = st.columns([0.85, 0.15])
-with col1:
-    st.title("AI Content Summarizer")
-    st.caption("Powered by LangChain & Gemma 🤖")
-
-with col2:
-    st.image("https://python.langchain.com/img/favicon.ico", width=50)
-
-st.markdown("""
-### About This App
-This application leverages the power of LangChain and Gemma AI to automatically generate concise summaries from YouTube videos and web articles. Whether you're researching a topic, catching up on content, or trying to quickly grasp key information, this tool can help save time by distilling content into clear, readable summaries.
-""")
+st.set_page_config(page_title="LangChain: Summarize Text From YT or Website", page_icon="🦜")
+st.title("🦜 LangChain: Summarize Text From YT or Website")
+st.subheader("Summarize URL")
 
 # Get API Key & URL input
 groq_api_key = os.getenv("GROQ_API_KEY")
 if not groq_api_key:
     st.error("GROQ API Key not found. Please check your environment variables.")
 
-generic_url = st.text_input("Enter YouTube or Website URL", placeholder="https://example.com or https://youtube.com/watch?v=...")
+generic_url = st.text_input("Enter YouTube or Website URL", label_visibility="collapsed")
 
 # LangChain Model with Groq API
-llm = ChatGroq(model="gemma-7b-it", groq_api_key=groq_api_key)
+llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key)
 
 # Prompt Template
 prompt_template = """
@@ -52,44 +39,36 @@ Focus on the main points and key insights. Write in a professional tone.
 """
 prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
 
-def get_youtube_id(url):
-    """Extract video ID from YouTube URL"""
-    if 'youtube.com' in url:
-        query = parse_qs(urlparse(url).query)
-        return query.get('v', [None])[0]
-    elif 'youtu.be' in url:
-        return urlparse(url).path[1:]
-    return None
-
 def get_youtube_content(url):
-    """Get content from YouTube video using youtube-transcript-api"""
+    """Get content from YouTube video using yt-dlp"""
     try:
-        video_id = get_youtube_id(url)
-        if not video_id:
-            raise ValueError("Could not extract YouTube video ID")
-
-        # Get transcript
-        transcript = YouTubeTranscriptApi.get_transcript(video_id)
-        transcript_text = ' '.join([entry['text'] for entry in transcript])
-
-        # Get video info using a simple request
-        response = requests.get(f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json")
-        if response.status_code == 200:
-            video_info = response.json()
-            title = video_info.get('title', 'Unknown Title')
-            author = video_info.get('author_name', 'Unknown Author')
-        else:
-            title = "Unknown Title"
-            author = "Unknown Author"
-
-        content = f"""
+        ydl_opts = {
+            'format': 'worst',
+            'extract_flat': True,
+            'quiet': True,
+            'no_warnings': True
+        }
+
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(url, download=False)
+
+        title = info.get('title', '')
+        description = info.get('description', '')
+        views = info.get('view_count', 'Unknown')
+        uploader = info.get('uploader', 'Unknown')
+        upload_date = info.get('upload_date', 'Unknown')
+
+        content = f"""
 Video Title: {title}
-Channel: {author}
-Transcript:
-{transcript_text}
-"""
-        return [Document(page_content=content)]
+Uploader: {uploader}
+Upload Date: {upload_date}
+Views: {views}
 
+Description:
+{description}
+"""
+        return [Document(page_content=content)]
+
     except Exception as e:
         st.error(f"Error getting YouTube content: {str(e)}")
         return None
@@ -97,42 +76,48 @@ Transcript:
 def get_website_content(url):
     """Get content from website using requests and BeautifulSoup"""
     try:
+        # Send request with headers to mimic a browser
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
         response = requests.get(url, headers=headers, verify=False)
         response.raise_for_status()
 
+        # Parse HTML
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove unwanted elements
-        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
-            element.decompose()
+        # Remove script and style elements
+        for script in soup(["script", "style"]):
+            script.decompose()
 
+        # Get title
         title = soup.title.string if soup.title else "No title found"
 
-        # Get main content with improved selection
+        # Get main content (adjust selectors based on the website structure)
        main_content = ""
-        selectors = [
-            'article', 'main',
-            '[role="main"]',
-            '.post-content',
-            '.article-content',
-            '.content',
-            '#content'
-        ]
 
-        for selector in selectors:
-            if soup.select_one(selector):
-                main_content = soup.select_one(selector).get_text()
-                break
-
-        if not main_content:
-            paragraphs = soup.find_all('p')
-            main_content = '\n'.join(p.get_text() for p in paragraphs)
+        # Try to find article content first
+        article = soup.find('article')
+        if article:
+            main_content = article.get_text()
+        else:
+            # If no article tag, try common content containers
+            content_tags = ['main', 'div.content', 'div.post-content', 'div.article-content']
+            for tag in content_tags:
+                element = soup.select_one(tag)
+                if element:
+                    main_content = element.get_text()
+                    break
+
+        # If still no content, get all paragraph text
+        if not main_content:
+            paragraphs = soup.find_all('p')
+            main_content = '\n'.join(p.get_text() for p in paragraphs)
 
-        # Clean up text
+        # Clean up the text
+        # Remove extra whitespace and newlines
         main_content = re.sub(r'\s+', ' ', main_content).strip()
+        # Remove any remaining HTML tags
         main_content = re.sub(r'<[^>]+>', '', main_content)
 
         content = f"""
@@ -145,17 +130,19 @@ Content:
         return [Document(page_content=content)]
 
     except Exception as e:
-        st.error(f"Error processing website content: {str(e)}")
+        st.error(f"Error fetching or processing {url}, exception:\n{str(e)}")
        return None
 
-if st.button("Summarize"):
+if st.button("Summarize the Content from YT or Website"):
+    # Validate Input
     if not groq_api_key or not generic_url.strip():
         st.error("Please provide a valid API key and URL.")
     elif not validators.url(generic_url):
-        st.error("Please enter a valid URL.")
+        st.error("Please enter a valid URL (YouTube or a website).")
     else:
         try:
-            with st.spinner("Processing content..."):
+            with st.spinner("Fetching content and summarizing..."):
+                # Load data from YouTube or Website
                 if "youtube.com" in generic_url or "youtu.be" in generic_url:
                     docs = get_youtube_content(generic_url)
                 else:
@@ -164,19 +151,22 @@ if st.button("Summarize"):
                 if docs is None:
                     st.stop()
 
+                # Create the summary chain and run it
                 chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                 output_summary = chain.run(docs)
 
-                st.success("Summary Generated!")
+                # Display the results
+                st.success("Summary Generated Successfully!")
 
-                tab1, tab2 = st.tabs(["Summary", "Original Content"])
+                tab1, tab2 = st.tabs(["Summary", "Raw Content"])
 
                 with tab1:
-                    st.markdown(output_summary)
+                    st.write(output_summary)
 
                 with tab2:
                     if docs:
-                        st.text_area("Raw Content", docs[0].page_content, height=300)
+                        st.text_area("Original Content", docs[0].page_content, height=300)
 
         except Exception as e:
-            st.error(f"An error occurred: {str(e)}")
+            st.error(f"An error occurred: {str(e)}")
+            st.exception(e)
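The core change in this commit swaps the youtube-transcript-api transcript fetch for yt-dlp metadata extraction, so the summary is now built from the video's description rather than its transcript. A minimal standalone sketch of that new path, runnable outside Streamlit (assumes yt-dlp is installed via `pip install yt-dlp`; the video URL is a placeholder, not from the commit):

    # Sketch of the yt-dlp metadata path used by get_youtube_content above.
    import yt_dlp

    ydl_opts = {
        'extract_flat': True,   # skip resolving individual streams where possible
        'quiet': True,
        'no_warnings': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # download=False returns the metadata dict without fetching any media
        info = ydl.extract_info("https://www.youtube.com/watch?v=VIDEO_ID", download=False)

    # The app reads only these standard info-dict keys
    print(info.get('title', ''))
    print(info.get('uploader', 'Unknown'), info.get('view_count', 'Unknown'))
    print(info.get('description', '')[:200])

With download=False, yt_dlp.YoutubeDL.extract_info returns the info dict without downloading media, and title, uploader, upload_date, view_count, and description are the standard keys get_youtube_content reads from it.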