ammansik committed on
Commit e3d3533 · 1 Parent(s): f3c8876

implementation

Files changed (6)
  1. __init__.py +0 -0
  2. app.py +122 -0
  3. audio_to_text.py +39 -0
  4. requirements.txt +7 -0
  5. text_summary.py +201 -0
  6. youtube_extraction.py +31 -0
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,122 @@
+ import tempfile
+ import time
+ from functools import wraps
+ from shutil import rmtree
+
+ import streamlit as st
+
+ from audio_to_text import transcribe_audio
+ from text_summary import (align_chapters, get_automatic_chapters,
+                           summarize_chapters)
+ from youtube_extraction import get_youtube_chapters, youtube_to_audio
+
+
+ def timing_decorator(message):
+     def decorator(func):
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             with st.spinner(message):
+                 start_time = time.time()
+                 result = func(*args, **kwargs)
+                 end_time = time.time()
+                 st.write(f"{message} complete - {end_time - start_time:.2f}s")
+             return result
+
+         return wrapper
+
+     return decorator
+
+
+ @timing_decorator("Downloading YouTube video")
+ def download_youtube(youtube_url, work_dir):
+     audio_fpath = youtube_to_audio(youtube_url, work_dir)
+     # Get YouTube chapters; returns an empty list if none are in the metadata
+     yt_chapters = get_youtube_chapters(youtube_url)
+     return audio_fpath, yt_chapters
+
+
+ @timing_decorator("Transcribing audio")
+ def audio_to_text(audio_fpath):
+     # Transcribe the audio track with Whisper
+     timestamped_text = transcribe_audio(audio_fpath)
+     return timestamped_text
+
+
+ @timing_decorator("Retrieving chapters")
+ def retrieve_chapters(timestamped_text, yt_chapters):
+     # Use the video's own chapters when available; otherwise derive
+     # chapters automatically by clustering the transcript
+     if len(yt_chapters) == 0:
+         chapters = get_automatic_chapters(timestamped_text)
+     else:
+         chapters = align_chapters(timestamped_text, yt_chapters)
+     return chapters
+
+
+ @timing_decorator("Summarizing video")
+ def summarize_youtube_chapters(chapters):
+     # Summarize chapters; returns (per-chapter summaries, overall summary)
+     summarized_chapters, overall_summary = summarize_chapters(chapters)
+     return summarized_chapters, overall_summary
+
+
+ def get_work_dir():
+     # mkdtemp keeps the directory alive until rmtree() below; a
+     # tempfile.TemporaryDirectory object would be deleted as soon as it
+     # is garbage-collected after this function returns
+     work_dir = tempfile.mkdtemp()
+     return work_dir
+
+
+ def convert_seconds(seconds):
+     hours = int(seconds // 3600)
+     minutes = int((seconds % 3600) // 60)
+     seconds = int(seconds % 60)
+
+     return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+
+
+ def summarize_video(youtube_url):
+     st.video(youtube_url)
+     # Create a temporary directory to store the audio file
+     work_dir = get_work_dir()
+
+     # Summarize the video
+     audio_fpath, yt_chapters = download_youtube(youtube_url, work_dir)
+     timestamped_text = audio_to_text(audio_fpath)
+
+     chapters = retrieve_chapters(timestamped_text, yt_chapters)
+     summarized_chapters, overall_summary = summarize_youtube_chapters(chapters)
+
+     st.write(f"**TLDR:** {overall_summary}")
+
+     for summarized_chapter in summarized_chapters:
+         start_time = convert_seconds(summarized_chapter["start"])
+         end_time = convert_seconds(summarized_chapter["end"])
+
+         timestamp = f"{start_time} - {end_time}"
+         title = summarized_chapter["title"]
+         summary = summarized_chapter["summary"]
+
+         # Display a hyperlink that jumps to the chapter start
+         # (YouTube expects integer seconds in the t= parameter)
+         chapter_start = int(summarized_chapter["start"])
+         hyperlink = f"[{timestamp} - {title}]({youtube_url}&t={chapter_start}s)"
+         st.markdown(hyperlink, unsafe_allow_html=True)
+
+         st.write(summary)
+     rmtree(work_dir)
+
+
+ def app():
+     st.title("Video Summarizer")
+     youtube_url = st.text_input("Enter a YouTube URL")
+
+     # Add summarize button
+     summarize_button = st.button("Summarize")
+
+     if summarize_button and youtube_url:
+         summarize_video(youtube_url)
+
+
+ if __name__ == "__main__":
+     app()
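
The app reads the OpenAI key from the CHATGPT_API_KEY environment variable (checked at import time in text_summary.py), so a typical launch looks like this — a minimal sketch, assuming the dependencies from requirements.txt below are installed:

    export CHATGPT_API_KEY="sk-..."   # your OpenAI API key
    streamlit run app.py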
audio_to_text.py ADDED
@@ -0,0 +1,39 @@
+ import whisper
+
+ EOS_TOKENS = [".", "!", "?"]
+
+
+ def transcribe_audio(audio_fpath, max_snt_len=100):
+     model = whisper.load_model("small")
+     result = model.transcribe(audio_fpath)
+
+     # Merge Whisper segments into sentences: close a sentence at
+     # end-of-sentence punctuation, or once it exceeds max_snt_len characters
+     sentences = []
+     snt_start = None
+     snt = ""
+     for segment in result["segments"]:
+         snt += f'{segment["text"]} '
+         if snt_start is None:
+             # "is None" matters here: a start time of 0.0 is falsy
+             snt_start = segment["start"]
+         stripped = segment["text"].strip()
+         if (stripped and stripped[-1] in EOS_TOKENS) or len(snt) > max_snt_len:
+             sentences.append(
+                 {"text": snt.strip(), "start": snt_start, "end": segment["end"]}
+             )
+             snt_start = None
+             snt = ""
+
+     # Flush any trailing text that never hit an end-of-sentence token
+     if len(snt.strip()) > 0:
+         sentences.append(
+             {"text": snt.strip(), "start": snt_start, "end": segment["end"]}
+         )
+
+     # Emit one "<start> <end> <text>" line per sentence
+     timestamped_text = ""
+     for sentence in sentences:
+         timestamped_text += (
+             f'{sentence["start"]} {sentence["end"]} {sentence["text"]}\n'
+         )
+     return timestamped_text
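
transcribe_audio returns one line per merged sentence in the form "<start> <end> <text>", which the chaptering code in text_summary.py splits back apart. A small parsing sketch with hypothetical values:

    line = "0.0 4.2 Welcome to the channel."
    start_time = float(line.split()[0])  # 0.0
    end_time = float(line.split()[1])    # 4.2
    text = " ".join(line.split()[2:])    # "Welcome to the channel."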
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ git+https://github.com/openai/whisper.git
+ openai
+ yt-dlp
+ streamlit
+ scikit-learn
+ tenacity
+ langchain
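
None of the dependencies are version-pinned. The FFmpegExtractAudio post-processor used in youtube_extraction.py also needs an ffmpeg binary on PATH; assuming that is in place, setup is the usual:

    pip install -r requirements.txt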
text_summary.py ADDED
@@ -0,0 +1,201 @@
+ import os
+
+ import numpy as np
+ import openai
+ from langchain.chains.summarize import load_summarize_chain
+ from langchain.docstore.document import Document
+ from langchain.llms import OpenAI
+ from langchain.prompts import PromptTemplate
+ from sklearn.cluster import KMeans
+ from tenacity import (retry, stop_after_attempt,  # for exponential backoff
+                       wait_random_exponential)
+
+ DEFAULT_PROMPT = (
+     "Summarize this YouTube video chapter. Always start with a topical sentence: "
+ )
+ CHAPTER_TITLE = "Give a title to this video chapter based on the transcript: "
+
+ title_template = "Give a title to this text summary: {text}"
+ TITLE_PROMPT = PromptTemplate(template=title_template, input_variables=["text"])
+
+ openai.api_key = os.environ.get("CHATGPT_API_KEY")
+
+ if openai.api_key is None:
+     raise ValueError("CHATGPT_API_KEY environment variable not set")
+
+
+ @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
+ def get_embeddings(text_chunks, model="text-embedding-ada-002"):
+     # Retried with exponential backoff to ride out transient rate limits
+     data = openai.Embedding.create(input=text_chunks, model=model)["data"]
+     embeddings = [item["embedding"] for item in data]
+     return np.array(embeddings)
+
+
+ def text_from_file(text_path):
+     with open(text_path, "r", encoding="utf-8") as text_file:
+         return text_file.read()
+
+
+ def get_chunks(timestamped_transcripts, chunk_lines):
+     # Group the timestamped lines into chunks of chunk_lines lines each
+     chunks = []
+     current_chunk = []
+     for line in timestamped_transcripts:
+         current_chunk.append(line)
+         if len(current_chunk) == chunk_lines:
+             chunks.append("\n".join(current_chunk))
+             current_chunk = []
+
+     if len(current_chunk) > 0:
+         chunks.append("\n".join(current_chunk))
+
+     return chunks
+
+
+ def align_chapters(timestamped_transcript, yt_chapters):
+     # Assign each transcript line to the YouTube chapter it falls into
+     timestamped_transcripts = timestamped_transcript.strip().split("\n")
+
+     chapters = []
+     chapter_text = ""
+     chapter_start_time = 0.0
+     prev_end_time = 0.0
+     chapter_index = 0
+     for idx, trn in enumerate(timestamped_transcripts):
+         trn_start_time = float(trn.split()[0])
+         trn_end_time = float(trn.split()[1])
+         trn_text = " ".join(trn.split()[2:])
+
+         if idx == 0:
+             chapter_start_time = trn_start_time
+
+         # Close the current chapter once the transcript crosses the next
+         # chapter's start time
+         next_index = min(chapter_index + 1, len(yt_chapters) - 1)
+         if trn_start_time >= yt_chapters[next_index]["start_time"]:
+             if len(chapters) == len(yt_chapters):
+                 # Every chapter has been emitted; keep collecting the tail
+                 chapter_text += f"{trn_text}\n"
+             else:
+                 chapters.append(
+                     {
+                         "text": chapter_text,
+                         "start_time": chapter_start_time,
+                         "end_time": prev_end_time,
+                         "title": yt_chapters[chapter_index]["title"],
+                     }
+                 )
+                 chapter_text = trn_text
+                 chapter_start_time = trn_start_time
+                 chapter_index += 1
+         else:
+             chapter_text += f"{trn_text}\n"
+         prev_end_time = trn_end_time
+
+     # Merge any remaining text into the last chapter and fix its end time
+     if len(chapters) == len(yt_chapters):
+         chapter_index = len(yt_chapters) - 1
+         chapters[chapter_index]["text"] += chapter_text
+         chapters[chapter_index]["end_time"] = prev_end_time
+     return chapters
+
+
+ def get_automatic_chapters(timestamped_transcript, chunk_lines=5, num_clusters=3):
+     # strip() drops the trailing empty line so no empty chunk gets embedded
+     timestamped_transcripts = timestamped_transcript.strip().split("\n")
+
+     # Split into chunks and embed each chunk
+     text_chunks = get_chunks(timestamped_transcripts, chunk_lines)
+     embeddings = get_embeddings(text_chunks)
+
+     # Creating and fitting the K-means model
+     kmeans = KMeans(n_clusters=num_clusters)
+     kmeans.fit(embeddings)
+
+     # Getting the cluster labels
+     cluster_labels = kmeans.labels_
+
+     # Start a new chapter wherever consecutive chunks change cluster
+     current_label = -1
+     current_text = ""
+     chapters = []
+     chapter_start_time = 0.0
+     prev_end_time = 0.0
+     for idx, (text_chunk, label) in enumerate(zip(text_chunks, cluster_labels)):
+         start_time, end_time = get_chunk_timestamps(text_chunk)
+
+         if idx == 0:
+             chapter_start_time = start_time
+
+         if label != current_label and current_label != -1:
+             chapters.append(
+                 {
+                     "text": current_text,
+                     "start_time": chapter_start_time,
+                     "end_time": prev_end_time,
+                     "title": "",
+                 }
+             )
+             current_text = ""
+             chapter_start_time = start_time
+
+         current_label = label
+         current_text += get_chunk_text(text_chunk)
+         prev_end_time = end_time
+     if len(current_text) > 0:
+         chapters.append(
+             {
+                 "text": current_text,
+                 "start_time": chapter_start_time,
+                 "end_time": prev_end_time,
+                 "title": "",
+             }
+         )
+     return chapters
+
+
+ def get_chunk_timestamps(chunk):
+     # A chunk is bounded by its first line's start and last line's end
+     start_time = float(chunk.strip().split("\n")[0].split()[0])
+     end_time = float(chunk.strip().split("\n")[-1].split()[1])
+     return start_time, end_time
+
+
+ def get_chunk_text(chunk):
+     # Drop the two leading timestamps on each line; the trailing space
+     # keeps words from fusing across line boundaries
+     chunk_text = ""
+     for chunk_line in chunk.strip().split("\n"):
+         chunk_text += " ".join(chunk_line.split()[2:]) + " "
+     return chunk_text
+
+
+ def summarize_chapters(chapters):
+     llm = OpenAI(temperature=0.9, openai_api_key=os.environ.get("CHATGPT_API_KEY"))
+     chapter_docs = [Document(page_content=chapter["text"]) for chapter in chapters]
+
+     # Map-reduce summarization: the intermediate steps are the per-chapter
+     # summaries and the output text is the overall summary
+     summary_chain = load_summarize_chain(
+         llm, chain_type="map_reduce", return_intermediate_steps=True
+     )
+     summaries = summary_chain(
+         {"input_documents": chapter_docs}, return_only_outputs=True
+     )
+
+     summary_docs = [
+         Document(page_content=summary) for summary in summaries["intermediate_steps"]
+     ]
+
+     # Second pass: generate a title for each chapter summary
+     title_chain = load_summarize_chain(
+         llm,
+         chain_type="map_reduce",
+         return_intermediate_steps=True,
+         map_prompt=TITLE_PROMPT,
+     )
+     titles = title_chain({"input_documents": summary_docs}, return_only_outputs=True)
+
+     summarized_chapters = []
+     for chapter, chapter_summary, chapter_title in zip(
+         chapters, summaries["intermediate_steps"], titles["intermediate_steps"]
+     ):
+         # Prefer the original YouTube chapter title when one exists
+         if len(chapter["title"]) > 0:
+             chapter_title = chapter["title"]
+         summarized_chapters.append(
+             {
+                 "start": chapter["start_time"],
+                 "end": chapter["end_time"],
+                 "title": chapter_title.strip(),
+                 "summary": chapter_summary.strip(),
+             }
+         )
+     return summarized_chapters, summaries["output_text"]
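
A minimal usage sketch of the two entry points above, with hypothetical timestamps; both paths call the OpenAI API, so CHATGPT_API_KEY must be set:

    transcript = (
        "0.0 4.0 Hi, welcome back.\n"
        "4.0 9.5 Today we look at embeddings.\n"
    )
    # No YouTube chapters available, so cluster the transcript into chapters
    chapters = get_automatic_chapters(transcript, chunk_lines=1, num_clusters=2)
    # -> [{"text": ..., "start_time": 0.0, "end_time": ..., "title": ""}, ...]

    summarized, overall = summarize_chapters(chapters)
    # summarized: [{"start", "end", "title", "summary"}, ...]
    # overall: a single TLDR string for the whole video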
youtube_extraction.py ADDED
@@ -0,0 +1,31 @@
+ import glob
+
+ import yt_dlp
+
+
+ def youtube_to_audio(url, output_path, filename_template="youtube_video"):
+     ydl_opts = {
+         "outtmpl": f"{output_path}/{filename_template}",
+         "format": "m4a/bestaudio/best",
+         "postprocessors": [
+             {  # Extract audio using ffmpeg
+                 "key": "FFmpegExtractAudio",
+                 "preferredcodec": "m4a",
+             }
+         ],
+     }
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         ydl.download([url])
+     # yt-dlp appends the extension, so glob for the produced file
+     file_path = glob.glob(f"{output_path}/{filename_template}*")[0]
+     return file_path
+
+
+ def get_youtube_chapters(url):
+     video_chapters = []
+     ydl_opts = {}
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         info = ydl.extract_info(url, download=False)
+         if "chapters" in info and info["chapters"]:
+             video_chapters = info["chapters"]
+
+     return video_chapters
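
A quick usage sketch — the URL and work directory are placeholders:

    url = "https://www.youtube.com/watch?v=VIDEO_ID"
    audio_fpath = youtube_to_audio(url, "/tmp/yt_work")  # downloads m4a audio
    chapters = get_youtube_chapters(url)
    # chapters: list of {"start_time", "end_time", "title"} dicts,
    # or [] when the uploader defined no chapters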