Spaces:
Runtime error
Runtime error
File size: 4,020 Bytes
7288748 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
from typing import List, Generator, Tuple
from pathlib import Path
from itertools import islice
import scrapetube
from youtubesearchpython import ChannelsSearch
from pytube import Playlist
from utils import accepts_types
from loading.serialization import Serializer
class YoutubeVideoPreprocessor:
    """Create one JSON descriptor file per video of a YouTube channel or playlist.

    Each JSON file holds the following information:
        - channel_name: The name passed to `preprocess` (replaced by the
          playlist title when ``mode`` is ``"playlist"``).
        - url: The url of the video.

    Args:
        mode (`str`):
            How the ``name`` given to `preprocess` is interpreted.
            ``"channel_name"`` (default) searches for a channel with that
            name, ``"playlist"`` treats it as a playlist URL. Any other
            value makes `preprocess` return placeholder test paths.
        serializer:
            Object exposing ``dump(obj=..., save_path=...)``, used to write
            each video dictionary to disk. Defaults to `Serializer`.

    TODO: Change it to accept also URL of video list, name of video list, etc.
    """

    def __init__(self,
                 mode: str = "channel_name",
                 serializer = Serializer) -> None:
        self.mode = mode
        self.serializer = serializer

    def preprocess(self,
                   name: str,
                   num_videos: int,
                   videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Write JSON descriptor files for videos not yet in the dataset.

        Args:
            name (`str`):
                Channel name (``"channel_name"`` mode) or playlist URL
                (``"playlist"`` mode).
            num_videos (`int`):
                Maximum number of JSON files to create.
            videos_in_ds (`List[str]`):
                Video ids to skip.

        Returns:
            A tuple ``(load_paths, dataset_folder)``: the paths of the JSON
            files that were written and the folder that contains them.

        Raises:
            IndexError: In ``"channel_name"`` mode, when the channel search
                returns no result.
        """
        if self.mode == "channel_name":
            # TODO: Add credits
            search_results = ChannelsSearch(name, limit=1).result()['result']
            if not search_results:
                # Same exception type as the bare `[0]` lookup raised before,
                # but with a message that says what actually went wrong.
                raise IndexError(f"No YouTube channel found for name {name!r}.")
            channel_id = search_results[0]['id']
            videos = scrapetube.get_channel(channel_id=channel_id)
        elif self.mode == "playlist":
            playlist_id = self._extract_playlist_id(name)
            # From here on `name` is the playlist title; it becomes both the
            # dataset folder name and the "channel_name" field.
            name = Playlist(name).title
            videos = scrapetube.get_playlist(playlist_id)
        else:
            # TODO: implement this part
            test_files_folder = self._youtube_folder()/"test/files"
            return [Path("test.json"), Path("test1.json")], test_files_folder

        return self._convert_videos_to_json_files(name,
                                                  videos,
                                                  num_videos,
                                                  videos_in_ds)

    @staticmethod
    def _youtube_folder() -> Path:
        """Return the base folder under which dataset folders are created."""
        return Path.home()/"whisper_gpt_pipeline/youtube_transcriber"

    @staticmethod
    def _extract_playlist_id(url: str) -> str:
        """Return the playlist id contained in a playlist URL.

        Reads the ``list`` query parameter so that URLs carrying further
        parameters (e.g. ``...&list=PL123&index=3``) still yield the playlist
        id. When no ``list`` parameter is present, falls back to the text
        after the last ``=`` (the whole string if it contains no ``=``).
        """
        # Local import: keeps this block self-contained without touching the
        # file-level import section.
        from urllib.parse import parse_qs, urlparse

        list_values = parse_qs(urlparse(url).query).get("list")
        if list_values:
            return list_values[0]
        return url.split("=")[-1]

    def _convert_videos_to_json_files(self,
                                      name: str,
                                      videos: Generator,
                                      num_videos: int,
                                      videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Dump up to ``num_videos`` new videos as ``<index>.json`` files.

        Args:
            name (`str`):
                Name of the dataset folder; also stored as ``channel_name``
                in every JSON file.
            videos (`Generator`):
                Iterable of dictionaries that each have a ``"videoId"`` key.
            num_videos (`int`):
                Maximum number of files to write.
            videos_in_ds (`List[str]`):
                Video ids to skip.

        Returns:
            A tuple ``(load_paths, dataset_folder)``.
        """
        dataset_folder = self._youtube_folder()/name
        dataset_folder.mkdir(parents=True, exist_ok=True)

        # Build the lookup set once instead of scanning the list per video.
        known_ids = set(videos_in_ds or ())
        new_videos = (video for video in videos
                      if video["videoId"] not in known_ids)

        load_paths = []
        # islice stops pulling from `videos` as soon as enough new videos were
        # found, so no more entries are fetched than necessary. A negative
        # count is treated as "write nothing".
        # NOTE(review): numbering restarts at 0 on every call, so files from a
        # previous call for the same folder share these names -- confirm that
        # is intended.
        for i, video in enumerate(islice(new_videos, max(num_videos, 0))):
            save_path = dataset_folder/f"{i}.json"
            save_path.touch(exist_ok=True)
            video_dict = {"channel_name": name,
                          "url": f"https://www.youtube.com/watch?v={video['videoId']}"}
            self.serializer.dump(obj=video_dict, save_path=save_path)
            load_paths.append(save_path)
        return load_paths, dataset_folder