Sneha-Kaurav committed on
Commit
376cc1a
·
verified ·
1 Parent(s): c5f9761

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -0
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
4
+ from youtube_transcript_api import YouTubeTranscriptApi
5
+ from youtube_transcript_api.formatters import TextFormatter
6
+ import gradio as gr
7
+
8
# Checkpoint identifier shared by the tokenizer and the model so both are
# always loaded from the same source.
model_name = "bilal521/t5-youtube-summarizer"

# Both objects are created once at import time and reused by every request.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
12
+
13
+ # Clean and summarize text
14
def summarize_with_t5(text):
    """Summarize *text* with the module-level T5 ``model`` and ``tokenizer``.

    Args:
        text: Text to summarize. ``None``, an empty string, or a
            whitespace-only string is treated as "nothing to summarize".

    Returns:
        The decoded summary string, or ``""`` when there is no input text.
    """
    # Without this guard ``None`` would raise on ``.strip()`` and an empty
    # string would send only the bare task prefix to the model, which would
    # still be forced to emit at least ``min_length`` tokens.
    if not text or not text.strip():
        return ""

    input_text = "summarize: " + text.strip()

    # The encoded input is capped at 512 tokens; anything beyond that is
    # dropped by ``truncation=True`` and does not influence the summary.
    inputs = tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    summary_ids = model.generate(
        inputs,
        max_length=256,
        min_length=80,
        num_beams=5,
        length_penalty=2.0,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )

    # ``generate`` returns a batch; only one sequence was submitted.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
29
+
30
+ # Extract video ID from any YouTube URL
31
def extract_video_id(url):
    """Return the 11-character YouTube video ID found in *url*, or ``None``.

    Recognized forms include ``watch?v=<id>``, ``youtu.be/<id>``,
    ``/embed/<id>``, ``/e/<id>``, ``/v/<id>``, ``/shorts/<id>`` and
    ``/live/<id>``, on both ``youtube.com`` and ``youtube-nocookie.com``.

    Args:
        url: The text to search. Non-string values yield ``None`` instead of
            raising.

    Returns:
        The video ID as a string, or ``None`` when no ID is found.
    """
    # ``re.search`` raises TypeError on non-strings (e.g. ``None``); callers
    # only need to know that no ID could be extracted.
    if not isinstance(url, str):
        return None

    regex = (
        r"(?:youtube(?:-nocookie)?\.com/"
        r"(?:[^/\n\s]+/\S+/|(?:v|e(?:mbed)?|shorts|live)/|\S*?[?&]v=)"
        r"|youtu\.be/)"
        r"([a-zA-Z0-9_-]{11})"
    )
    match = re.search(regex, url)
    return match.group(1) if match else None
35
+
36
+ # Optional: Clean up repeated or spammy lines
37
def clean_transcript(text):
    """Collapse a multi-line transcript into one de-duplicated line.

    Each line is stripped of surrounding whitespace. Blank lines, lines that
    begin with an ``http://`` or ``https://`` URL, and lines already seen
    (compared case-insensitively) are dropped. The surviving lines keep their
    first-seen order and spelling and are joined with single spaces.
    """
    url_prefix = re.compile(r'https?:\/\/')
    # Maps lower-cased line -> original spelling; dict insertion order gives
    # the output order, and key membership is the duplicate check.
    kept = {}
    for raw in text.split("\n"):
        candidate = raw.strip()
        key = candidate.lower()
        if candidate and key not in kept and url_prefix.match(candidate) is None:
            kept[key] = candidate
    return " ".join(kept.values())
50
+
51
+ # Main logic to fetch transcript and summarize
52
def get_youtube_transcript(video_url):
    """Fetch a video's English transcript and return a summary of it.

    Despite the name, the value returned on success is the summary produced
    by ``summarize_with_t5``, not the raw transcript.

    Args:
        video_url: Text expected to contain a YouTube URL.

    Returns:
        The summary string on success; otherwise a human-readable message
        describing why no summary could be produced. This function does not
        raise: every failure is reported through the returned string so the
        UI always has something to display.
    """
    # Blank or non-string input can never contain a video ID; answer directly
    # instead of letting the parser raise on ``None``.
    if not isinstance(video_url, str) or not video_url.strip():
        return "Could not extract video ID. Please check the URL."

    video_id = extract_video_id(video_url.strip())
    if not video_id:
        return "Could not extract video ID. Please check the URL."

    try:
        transcript = YouTubeTranscriptApi().fetch(video_id, languages=['en'])
        raw_text = TextFormatter().format_transcript(transcript)
        cleaned_text = clean_transcript(raw_text)

        # An empty transcript would otherwise be sent to the model, which
        # would produce a summary of nothing.
        if not cleaned_text:
            return "The transcript for this video is empty, so there is nothing to summarize."

        return summarize_with_t5(cleaned_text)

    except Exception as e:
        # UI boundary: surface any fetch/model failure as text rather than
        # crashing the request handler.
        return f"Error occurred: {e}"
69
+
70
# Gradio UI: one URL text box in, one multi-line summary box out.
_url_box = gr.Textbox(
    label="YouTube Video URL",
    lines=1,
    placeholder="Paste your YouTube URL here",
)
_summary_box = gr.Textbox(label="Summarized Transcript", lines=10)

demo = gr.Interface(
    fn=get_youtube_transcript,
    inputs=[_url_box],
    outputs=[_summary_box],
    title="YouTube Video Summarizer",
    description="This app extracts and summarizes the transcript of a YouTube video using a fine-tuned T5 model.",
)

# Start serving the interface as soon as the module is executed.
demo.launch()