youtube_video_similarity / utils /helper_funcs.py
aapot
Add gradio error handling for incorrect video urls
25bd6d3
raw
history blame
4.24 kB
import html
import itertools
import random

import gradio as gr
import pandas as pd
import requests
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
def is_youtube_video_available(url):
    """Report whether pytube can resolve the video behind *url*.

    Args:
        url: URL of the YouTube video to probe.

    Returns:
        True when the video's title could be read, False when building the
        pytube object or reading its title raised an exception.
    """
    try:
        # Both steps live inside the try: the constructor itself can raise on
        # a URL it cannot parse, not only the ``title`` lookup that follows.
        # The title is read purely to trigger that lookup; its value is unused.
        YouTube(url).title
    except Exception:
        return False
    return True
def get_example_videos(rr_examples_url, num_rr_examples):
    """Collect the example video pairs: a fixed set plus a fetched set.

    Args:
        rr_examples_url: URL of a JSON endpoint expected to return a list of
            objects that each carry ``rejected_video_id`` and
            ``recommendation_id``.
        num_rr_examples: Maximum number of fetched pairs to return. When more
            pairs are available, a random sample of this size is drawn.

    Returns:
        A tuple ``(example_videos, example_videos_rr)``. Each element is a
        list of ``[url_1, url_2]`` pairs in which both videos passed
        ``is_youtube_video_available``. ``example_videos_rr`` is empty when
        the endpoint cannot be fetched or its payload is malformed.
    """
    availability = {}

    def pair_is_available(pair):
        # The same video shows up in several pairs and every check goes
        # through pytube, so remember the answer per URL for this call.
        for url in pair:
            if url not in availability:
                availability[url] = is_youtube_video_available(url)
            if not availability[url]:
                return False
        return True

    example_videos = [
        ['https://www.youtube.com/watch?v=WfVF-Ec4naQ',
         'https://www.youtube.com/watch?v=4hrNt28t7Cw'],
        ['https://www.youtube.com/watch?v=GbpjLP-UvIU',
         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
         'https://www.youtube.com/watch?v=BlQ2mP2EE4A'],
        ['https://www.youtube.com/watch?v=fdzY1f2P91k',
         'https://www.youtube.com/watch?v=9gIVGJQ3xWE'],
    ]
    example_videos = [pair for pair in example_videos if pair_is_available(pair)]

    try:
        # Without a timeout a stalled endpoint would block this call forever.
        response = requests.get(rr_examples_url, timeout=10)
        example_videos_rr = [
            [f'https://www.youtube.com/watch?v={ex["rejected_video_id"]}',
             f'https://www.youtube.com/watch?v={ex["recommendation_id"]}']
            for ex in response.json()
        ]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: a failed request, a non-JSON body or entries without
        # the expected keys all just mean "no fetched examples".
        example_videos_rr = []

    # Remove duplicate video pairs (there seems to be one duplicate).
    # groupby only merges adjacent equal items, hence the sort first.
    example_videos_rr = [
        pair for pair, _ in itertools.groupby(sorted(example_videos_rr))
    ]
    example_videos_rr = [
        pair for pair in example_videos_rr if pair_is_available(pair)
    ]

    if len(example_videos_rr) > num_rr_examples:
        example_videos_rr = random.sample(example_videos_rr, num_rr_examples)

    return example_videos, example_videos_rr
def get_youtube_embedded_html(embed_url, video_position):
    """Render the HTML snippet that embeds a YouTube player.

    Args:
        embed_url: URL placed in the iframe's ``src`` attribute.
        video_position: Label shown in the "Video ..." caption above the
            player.

    Returns:
        A string holding a caption paragraph followed by the iframe, each on
        its own line, with a leading and a trailing newline.
    """
    caption = f'<p>Video {video_position}</p>'
    player = (
        f'<iframe width="100%" height="360px" src="{embed_url}" '
        'title="YouTube video player" frameborder="0" '
        'allow="accelerometer; autoplay; clipboard-write; encrypted-media; '
        'gyroscope; picture-in-picture; fullscreen" '
        'allowfullscreen></iframe>'
    )
    return '\n' + caption + '\n' + player + '\n'
def update_youtube_embedded_html(video_url, video_position):
    """Return the HTML to display for the video at *video_url*.

    Args:
        video_url: URL of the YouTube video.
        video_position: Label forwarded to ``get_youtube_embedded_html`` for
            the caption above the player.

    Returns:
        The embedded-player HTML when pytube can derive an embed URL,
        otherwise a short HTML error paragraph that names the URL.
    """
    try:
        embed_url = YouTube(video_url).embed_url
    except Exception:
        # The URL is echoed back inside HTML and may be arbitrary text
        # (presumably typed by the user), so escape it to keep it from being
        # interpreted as markup. str() keeps non-string values such as None
        # printable, as the original f-string did.
        safe_url = html.escape(str(video_url))
        return (
            '\n<p>There was error in fetching details for video with the '
            f'URL: {safe_url}</p>\n'
        )
    return get_youtube_embedded_html(embed_url, video_position)
def get_youtube_video_data(url):
    """Fetch channel id, title, description and transcript for a video.

    Args:
        url: URL of the YouTube video.

    Returns:
        A tuple ``(channel_id, video_title, video_description,
        video_transcript)``. ``video_transcript`` is the transcript text with
        newlines replaced by spaces, or ``None`` when no transcript could be
        retrieved.

    Raises:
        gr.Error: If pytube cannot build the video object or read its
            metadata.
    """
    # Languages tried first, in this order, before any other available one.
    preferred_langs = ['en', 'en-US', 'es', 'de']

    try:
        video = YouTube(url)
        # Attribute access can fail as well as the constructor
        # (is_youtube_video_available relies on ``title`` raising), so the
        # metadata reads are reported through the same gr.Error.
        channel_id = video.channel_id
        video_title = video.title
        video_description = video.description
    except Exception as err:
        raise gr.Error(
            f'Could not find YouTube video with the URL {url}') from err

    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video.video_id)
        other_langs = [
            tr.language_code for tr in transcript_list
            if tr.language_code not in preferred_langs
        ]
        transcript_segments = YouTubeTranscriptApi.get_transcript(
            video.video_id, languages=preferred_langs + other_langs)
    except Exception:
        # A missing transcript is not fatal: callers get None in its place.
        return channel_id, video_title, video_description, None

    video_transcript = TextFormatter().format_transcript(
        transcript_segments).replace('\n', ' ')
    return channel_id, video_title, video_description, video_transcript
def get_input_data_df(video1_url, video2_url):
    """Assemble a one-row DataFrame describing a pair of videos.

    Args:
        video1_url: URL of the video that fills the ``regret_*`` columns.
        video2_url: URL of the video that fills the ``recommendation_*``
            columns.

    Returns:
        A ``pandas.DataFrame`` with a single row holding the title,
        description and transcript of each video, plus ``channel_sim``,
        which is 1 when both videos report the same channel id and 0
        otherwise. Errors from ``get_youtube_video_data`` propagate.
    """
    regret_channel, *regret_fields = get_youtube_video_data(video1_url)
    recommendation_channel, *recommendation_fields = get_youtube_video_data(
        video2_url)

    column_names = [
        'regret_title',
        'regret_description',
        'regret_transcript',
        'recommendation_title',
        'recommendation_description',
        'recommendation_transcript',
        'channel_sim',
    ]
    same_channel = int(regret_channel == recommendation_channel)
    row = [*regret_fields, *recommendation_fields, same_channel]

    return pd.DataFrame([row], columns=column_names)