File size: 4,020 Bytes
7288748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from typing import List, Generator, Tuple
from pathlib import Path
from itertools import islice

import scrapetube
from youtubesearchpython import ChannelsSearch
from pytube import Playlist

from utils import accepts_types
from loading.serialization import Serializer

class YoutubeVideoPreprocessor:
    """Create one JSON file per video of a YouTube channel or playlist.

    Each JSON file holds the fields later used to build a YoutubeVideo object:
    - channel_name: The name of the YouTube channel (the playlist title in
      "playlist" mode)
    - url: The url of the video
    Args:
        mode (`str`, defaults to `"channel_name"`):
            How the `name` passed to `preprocess` is interpreted:
            `"channel_name"` searches for a channel with that name,
            `"playlist"` treats it as a playlist URL. Any other value returns
            fixed test paths.
        serializer (defaults to `Serializer`):
            Object whose `dump(obj=..., save_path=...)` writes a dict to disk.
    TODO: Change it to accept also URL of video list, name of video list, etc.
    """
    def __init__(self, 
                 mode: str = "channel_name", 
                 serializer = Serializer) -> None:
        self.mode = mode
        self.serializer = serializer
    
    def preprocess(self,
                   name: str,
                   num_videos: int,
                   videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Write the JSON files for up to `num_videos` new videos.

        Args:
            name (`str`):
                Channel name in "channel_name" mode, playlist URL in
                "playlist" mode.
            num_videos (`int`):
                Maximum number of JSON files to create.
            videos_in_ds (`List[str]`):
                Video ids that must be skipped.
        Returns:
            load_paths (`List[Path]`):
                The paths of the JSON files that were written.
            dataset_folder (`Path`):
                The folder that contains those files.
        """
        if self.mode == "channel_name":
            # TODO: Add credits
            channels_search = ChannelsSearch(name, limit=1)
            # Only the first search hit is used as the channel.
            channel_id = channels_search.result()['result'][0]['id']
            videos = scrapetube.get_channel(channel_id=channel_id)
        elif self.mode == "playlist":
            playlist_id = self._extract_playlist_id(name)
            # From here on the playlist title replaces the URL: it is used as
            # the dataset folder name and as the "channel_name" field.
            name = Playlist(name).title
            videos = scrapetube.get_playlist(playlist_id)
        else:
            # TODO: implement this part
            youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
            test_files_folder = youtube_folder/"test/files"
            return [Path("test.json"), Path("test1.json")], test_files_folder

        return self._convert_videos_to_json_files(name,
                                                  videos,
                                                  num_videos,
                                                  videos_in_ds)

    @staticmethod
    def _extract_playlist_id(playlist_url: str) -> str:
        """Return the playlist id contained in `playlist_url`.

        The `list` query parameter is preferred so that URLs carrying extra
        parameters (e.g. `...?list=PL123&si=abc`) still resolve to `PL123`.
        When there is no such parameter, the text after the last "=" is
        returned, which also covers a bare playlist id.
        """
        from urllib.parse import parse_qs, urlparse

        list_values = parse_qs(urlparse(playlist_url).query).get("list")
        if list_values:
            return list_values[0]
        return playlist_url.split("=")[-1]

    def _convert_videos_to_json_files(self, 
                                      name:str, 
                                      videos: Generator,
                                      num_videos: int,
                                      videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Dump one JSON file per new video into the dataset folder of `name`.

        Args:
            name (`str`):
                Used as the dataset folder name and stored as "channel_name".
            videos (`Generator`):
                Yields dicts that have a "videoId" key.
            num_videos (`int`):
                Maximum number of files to write; nothing is written when it
                is zero or negative.
            videos_in_ds (`List[str]`):
                Video ids to skip; skipped videos do not count towards
                `num_videos`.
        Returns:
            load_paths (`List[Path]`):
                The written files, named `0.json`, `1.json`, ... in order.
            dataset_folder (`Path`):
                `~/whisper_gpt_pipeline/youtube_transcriber/<name>`.
        """
        youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
        dataset_folder = youtube_folder/name
        dataset_folder.mkdir(parents=True, exist_ok=True)

        # Build the lookup once instead of scanning the list for every video.
        known_ids = set(videos_in_ds)
        new_videos = (video for video in videos
                      if video["videoId"] not in known_ids)

        load_paths = []
        # islice stops pulling from `videos` as soon as the quota is reached
        # and simply ends early when the source runs out of videos.
        # NOTE(review): numbering restarts at 0 on every call, so files left
        # in the folder by an earlier call are overwritten -- confirm intended.
        for index, video in enumerate(islice(new_videos, max(num_videos, 0))):
            save_path = Path(dataset_folder, f"{index}.json")
            save_path.touch(exist_ok=True)
            video_dict = {"channel_name": name,
                          "url":f"https://www.youtube.com/watch?v={video['videoId']}"}
            self.serializer.dump(obj=video_dict, save_path=save_path)
            load_paths.append(save_path)
        return load_paths, dataset_folder