ammansik committed on
Commit e3d3533 · 1 Parent(s): f3c8876

implementation

Files changed (6)
  1. __init__.py +0 -0
  2. app.py +122 -0
  3. audio_to_text.py +39 -0
  4. requirements.txt +7 -0
  5. text_summary.py +201 -0
  6. youtube_extraction.py +31 -0
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,122 @@
+ import tempfile
+ import time
+ from functools import wraps
+ from shutil import rmtree
+
+ import streamlit as st
+
+ from audio_to_text import transcribe_audio
+ from text_summary import (align_chapters, get_automatic_chapters,
+                           summarize_chapters)
+ from youtube_extraction import get_youtube_chapters, youtube_to_audio
+
+
+ def timing_decorator(message):
+     def decorator(func):
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             with st.spinner(message):
+                 start_time = time.time()
+                 result = func(*args, **kwargs)
+                 end_time = time.time()
+                 st.write(f"{message} complete - {end_time - start_time:.2f}s")
+             return result
+
+         return wrapper
+
+     return decorator
+
+
+ @timing_decorator("Downloading YouTube video")
+ def download_youtube(youtube_url, work_dir):
+     audio_fpath = youtube_to_audio(youtube_url, work_dir)
+     # Get YouTube chapters; returns an empty list if none are in the metadata
+     yt_chapters = get_youtube_chapters(youtube_url)
+     return audio_fpath, yt_chapters
+
+
+ @timing_decorator("Transcribing audio")
+ def audio_to_text(audio_fpath):
+     # Transcribe the audio track with Whisper
+     timestamped_text = transcribe_audio(audio_fpath)
+     return timestamped_text
+
+
+ @timing_decorator("Retrieving chapters")
+ def retrieve_chapters(timestamped_text, yt_chapters):
+     # Use the video's own chapters when available; otherwise derive
+     # chapters automatically by clustering the transcript
+     if len(yt_chapters) == 0:
+         chapters = get_automatic_chapters(timestamped_text)
+     else:
+         chapters = align_chapters(timestamped_text, yt_chapters)
+     return chapters
+
+
+ @timing_decorator("Summarizing video")
+ def summarize_youtube_chapters(chapters):
+     # Summarize chapters; returns (per-chapter summaries, overall summary)
+     summarized_chapters, overall_summary = summarize_chapters(chapters)
+     return summarized_chapters, overall_summary
+
+
+ def get_work_dir():
+     # mkdtemp keeps the directory alive until rmtree() below; a
+     # tempfile.TemporaryDirectory object would be deleted as soon as it
+     # is garbage-collected after this function returns
+     work_dir = tempfile.mkdtemp()
+     return work_dir
+
+
+ def convert_seconds(seconds):
+     hours = int(seconds // 3600)
+     minutes = int((seconds % 3600) // 60)
+     seconds = int(seconds % 60)
+
+     return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+
+
+ def summarize_video(youtube_url):
+     st.video(youtube_url)
+     # Create a temporary directory to store the audio file
+     work_dir = get_work_dir()
+
+     # Summarize the video
+     audio_fpath, yt_chapters = download_youtube(youtube_url, work_dir)
+     timestamped_text = audio_to_text(audio_fpath)
+
+     chapters = retrieve_chapters(timestamped_text, yt_chapters)
+     summarized_chapters, overall_summary = summarize_youtube_chapters(chapters)
+
+     st.write(f"**TLDR:** {overall_summary}")
+
+     for summarized_chapter in summarized_chapters:
+         start_time = convert_seconds(summarized_chapter["start"])
+         end_time = convert_seconds(summarized_chapter["end"])
+
+         timestamp = f"{start_time} - {end_time}"
+         title = summarized_chapter["title"]
+         summary = summarized_chapter["summary"]
+
+         # Display a hyperlink that jumps to the chapter start
+         # (YouTube expects integer seconds in the t= parameter)
+         chapter_start = int(summarized_chapter["start"])
+         hyperlink = f"[{timestamp} - {title}]({youtube_url}&t={chapter_start}s)"
+         st.markdown(hyperlink, unsafe_allow_html=True)
+
+         st.write(summary)
+     rmtree(work_dir)
+
+
+ def app():
+     st.title("Video Summarizer")
+     youtube_url = st.text_input("Enter a YouTube URL")
+
+     # Add summarize button
+     summarize_button = st.button("Summarize")
+
+     if summarize_button and youtube_url:
+         summarize_video(youtube_url)
+
+
+ if __name__ == "__main__":
+     app()
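
The app reads the OpenAI key from the CHATGPT_API_KEY environment variable (checked at import time in text_summary.py), so a typical launch looks like this — a minimal sketch, assuming the dependencies from requirements.txt below are installed:

    export CHATGPT_API_KEY="sk-..."   # your OpenAI API key
    streamlit run app.py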
audio_to_text.py ADDED
@@ -0,0 +1,39 @@
+ import whisper
+
+ EOS_TOKENS = [".", "!", "?"]
+
+
+ def transcribe_audio(audio_fpath, max_snt_len=100):
+     model = whisper.load_model("small")
+     result = model.transcribe(audio_fpath)
+
+     # Merge Whisper segments into sentences: close a sentence at
+     # end-of-sentence punctuation, or once it exceeds max_snt_len characters
+     sentences = []
+     snt_start = None
+     snt = ""
+     for segment in result["segments"]:
+         snt += f'{segment["text"]} '
+         if snt_start is None:
+             # "is None" matters here: a start time of 0.0 is falsy
+             snt_start = segment["start"]
+         stripped = segment["text"].strip()
+         if (stripped and stripped[-1] in EOS_TOKENS) or len(snt) > max_snt_len:
+             sentences.append(
+                 {"text": snt.strip(), "start": snt_start, "end": segment["end"]}
+             )
+             snt_start = None
+             snt = ""
+
+     # Flush any trailing text that never hit an end-of-sentence token
+     if len(snt.strip()) > 0:
+         sentences.append(
+             {"text": snt.strip(), "start": snt_start, "end": segment["end"]}
+         )
+
+     # Emit one "<start> <end> <text>" line per sentence
+     timestamped_text = ""
+     for sentence in sentences:
+         timestamped_text += (
+             f'{sentence["start"]} {sentence["end"]} {sentence["text"]}\n'
+         )
+     return timestamped_text
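
transcribe_audio returns one line per merged sentence in the form "<start> <end> <text>", which the chaptering code in text_summary.py splits back apart. A small parsing sketch with hypothetical values:

    line = "0.0 4.2 Welcome to the channel."
    start_time = float(line.split()[0])  # 0.0
    end_time = float(line.split()[1])    # 4.2
    text = " ".join(line.split()[2:])    # "Welcome to the channel."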
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ git+https://github.com/openai/whisper.git
+ openai
+ yt-dlp
+ streamlit
+ scikit-learn
+ tenacity
+ langchain
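
None of the dependencies are version-pinned. The FFmpegExtractAudio post-processor used in youtube_extraction.py also needs an ffmpeg binary on PATH; assuming that is in place, setup is the usual:

    pip install -r requirements.txt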
text_summary.py ADDED
@@ -0,0 +1,201 @@
+ import os
+
+ import numpy as np
+ import openai
+ from langchain.chains.summarize import load_summarize_chain
+ from langchain.docstore.document import Document
+ from langchain.llms import OpenAI
+ from langchain.prompts import PromptTemplate
+ from sklearn.cluster import KMeans
+ from tenacity import (retry, stop_after_attempt,  # for exponential backoff
+                       wait_random_exponential)
+
+ DEFAULT_PROMPT = (
+     "Summarize this YouTube video chapter. Always start with a topical sentence: "
+ )
+ CHAPTER_TITLE = "Give a title to this video chapter based on the transcript: "
+
+ title_template = "Give a title to this text summary: {text}"
+ TITLE_PROMPT = PromptTemplate(template=title_template, input_variables=["text"])
+
+ openai.api_key = os.environ.get("CHATGPT_API_KEY")
+
+ if openai.api_key is None:
+     raise ValueError("CHATGPT_API_KEY environment variable not set")
+
+
+ @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
+ def get_embeddings(text_chunks, model="text-embedding-ada-002"):
+     # Retried with exponential backoff to ride out transient rate limits
+     data = openai.Embedding.create(input=text_chunks, model=model)["data"]
+     embeddings = [item["embedding"] for item in data]
+     return np.array(embeddings)
+
+
+ def text_from_file(text_path):
+     with open(text_path, "r", encoding="utf-8") as text_file:
+         return text_file.read()
+
+
+ def get_chunks(timestamped_transcripts, chunk_lines):
+     # Group the timestamped lines into chunks of chunk_lines lines each
+     chunks = []
+     current_chunk = []
+     for line in timestamped_transcripts:
+         current_chunk.append(line)
+         if len(current_chunk) == chunk_lines:
+             chunks.append("\n".join(current_chunk))
+             current_chunk = []
+
+     if len(current_chunk) > 0:
+         chunks.append("\n".join(current_chunk))
+
+     return chunks
+
+
+ def align_chapters(timestamped_transcript, yt_chapters):
+     # Assign each transcript line to the YouTube chapter it falls into
+     timestamped_transcripts = timestamped_transcript.strip().split("\n")
+
+     chapters = []
+     chapter_text = ""
+     chapter_start_time = 0.0
+     prev_end_time = 0.0
+     chapter_index = 0
+     for idx, trn in enumerate(timestamped_transcripts):
+         trn_start_time = float(trn.split()[0])
+         trn_end_time = float(trn.split()[1])
+         trn_text = " ".join(trn.split()[2:])
+
+         if idx == 0:
+             chapter_start_time = trn_start_time
+
+         # Close the current chapter once the transcript crosses the next
+         # chapter's start time
+         next_index = min(chapter_index + 1, len(yt_chapters) - 1)
+         if trn_start_time >= yt_chapters[next_index]["start_time"]:
+             if len(chapters) == len(yt_chapters):
+                 # Every chapter has been emitted; keep collecting the tail
+                 chapter_text += f"{trn_text}\n"
+             else:
+                 chapters.append(
+                     {
+                         "text": chapter_text,
+                         "start_time": chapter_start_time,
+                         "end_time": prev_end_time,
+                         "title": yt_chapters[chapter_index]["title"],
+                     }
+                 )
+                 chapter_text = trn_text
+                 chapter_start_time = trn_start_time
+                 chapter_index += 1
+         else:
+             chapter_text += f"{trn_text}\n"
+         prev_end_time = trn_end_time
+
+     # Merge any remaining text into the last chapter and fix its end time
+     if len(chapters) == len(yt_chapters):
+         chapter_index = len(yt_chapters) - 1
+         chapters[chapter_index]["text"] += chapter_text
+         chapters[chapter_index]["end_time"] = prev_end_time
+     return chapters
+
+
+ def get_automatic_chapters(timestamped_transcript, chunk_lines=5, num_clusters=3):
+     # strip() drops the trailing empty line so no empty chunk gets embedded
+     timestamped_transcripts = timestamped_transcript.strip().split("\n")
+
+     # Split into chunks and embed each chunk
+     text_chunks = get_chunks(timestamped_transcripts, chunk_lines)
+     embeddings = get_embeddings(text_chunks)
+
+     # Creating and fitting the K-means model
+     kmeans = KMeans(n_clusters=num_clusters)
+     kmeans.fit(embeddings)
+
+     # Getting the cluster labels
+     cluster_labels = kmeans.labels_
+
+     # Start a new chapter wherever consecutive chunks change cluster
+     current_label = -1
+     current_text = ""
+     chapters = []
+     chapter_start_time = 0.0
+     prev_end_time = 0.0
+     for idx, (text_chunk, label) in enumerate(zip(text_chunks, cluster_labels)):
+         start_time, end_time = get_chunk_timestamps(text_chunk)
+
+         if idx == 0:
+             chapter_start_time = start_time
+
+         if label != current_label and current_label != -1:
+             chapters.append(
+                 {
+                     "text": current_text,
+                     "start_time": chapter_start_time,
+                     "end_time": prev_end_time,
+                     "title": "",
+                 }
+             )
+             current_text = ""
+             chapter_start_time = start_time
+
+         current_label = label
+         current_text += get_chunk_text(text_chunk)
+         prev_end_time = end_time
+     if len(current_text) > 0:
+         chapters.append(
+             {
+                 "text": current_text,
+                 "start_time": chapter_start_time,
+                 "end_time": prev_end_time,
+                 "title": "",
+             }
+         )
+     return chapters
+
+
+ def get_chunk_timestamps(chunk):
+     # A chunk is bounded by its first line's start and last line's end
+     start_time = float(chunk.strip().split("\n")[0].split()[0])
+     end_time = float(chunk.strip().split("\n")[-1].split()[1])
+     return start_time, end_time
+
+
+ def get_chunk_text(chunk):
+     # Drop the two leading timestamps on each line; the trailing space
+     # keeps words from fusing across line boundaries
+     chunk_text = ""
+     for chunk_line in chunk.strip().split("\n"):
+         chunk_text += " ".join(chunk_line.split()[2:]) + " "
+     return chunk_text
+
+
+ def summarize_chapters(chapters):
+     llm = OpenAI(temperature=0.9, openai_api_key=os.environ.get("CHATGPT_API_KEY"))
+     chapter_docs = [Document(page_content=chapter["text"]) for chapter in chapters]
+
+     # Map-reduce summarization: the intermediate steps are the per-chapter
+     # summaries and the output text is the overall summary
+     summary_chain = load_summarize_chain(
+         llm, chain_type="map_reduce", return_intermediate_steps=True
+     )
+     summaries = summary_chain(
+         {"input_documents": chapter_docs}, return_only_outputs=True
+     )
+
+     summary_docs = [
+         Document(page_content=summary) for summary in summaries["intermediate_steps"]
+     ]
+
+     # Second pass: generate a title for each chapter summary
+     title_chain = load_summarize_chain(
+         llm,
+         chain_type="map_reduce",
+         return_intermediate_steps=True,
+         map_prompt=TITLE_PROMPT,
+     )
+     titles = title_chain({"input_documents": summary_docs}, return_only_outputs=True)
+
+     summarized_chapters = []
+     for chapter, chapter_summary, chapter_title in zip(
+         chapters, summaries["intermediate_steps"], titles["intermediate_steps"]
+     ):
+         # Prefer the original YouTube chapter title when one exists
+         if len(chapter["title"]) > 0:
+             chapter_title = chapter["title"]
+         summarized_chapters.append(
+             {
+                 "start": chapter["start_time"],
+                 "end": chapter["end_time"],
+                 "title": chapter_title.strip(),
+                 "summary": chapter_summary.strip(),
+             }
+         )
+     return summarized_chapters, summaries["output_text"]
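
A minimal usage sketch of the two entry points above, with hypothetical timestamps; both paths call the OpenAI API, so CHATGPT_API_KEY must be set:

    transcript = (
        "0.0 4.0 Hi, welcome back.\n"
        "4.0 9.5 Today we look at embeddings.\n"
    )
    # No YouTube chapters available, so cluster the transcript into chapters
    chapters = get_automatic_chapters(transcript, chunk_lines=1, num_clusters=2)
    # -> [{"text": ..., "start_time": 0.0, "end_time": ..., "title": ""}, ...]

    summarized, overall = summarize_chapters(chapters)
    # summarized: [{"start", "end", "title", "summary"}, ...]
    # overall: a single TLDR string for the whole video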
youtube_extraction.py ADDED
@@ -0,0 +1,31 @@
+ import glob
+
+ import yt_dlp
+
+
+ def youtube_to_audio(url, output_path, filename_template="youtube_video"):
+     ydl_opts = {
+         "outtmpl": f"{output_path}/{filename_template}",
+         "format": "m4a/bestaudio/best",
+         "postprocessors": [
+             {  # Extract audio using ffmpeg
+                 "key": "FFmpegExtractAudio",
+                 "preferredcodec": "m4a",
+             }
+         ],
+     }
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         ydl.download([url])
+     # yt-dlp appends the extension, so glob for the produced file
+     file_path = glob.glob(f"{output_path}/{filename_template}*")[0]
+     return file_path
+
+
+ def get_youtube_chapters(url):
+     video_chapters = []
+     ydl_opts = {}
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         info = ydl.extract_info(url, download=False)
+         if "chapters" in info and info["chapters"]:
+             video_chapters = info["chapters"]
+
+     return video_chapters
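
A quick usage sketch — the URL and work directory are placeholders:

    url = "https://www.youtube.com/watch?v=VIDEO_ID"
    audio_fpath = youtube_to_audio(url, "/tmp/yt_work")  # downloads m4a audio
    chapters = get_youtube_chapters(url)
    # chapters: list of {"start_time", "end_time", "title"} dicts,
    # or [] when the uploader defined no chapters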