File size: 1,316 Bytes
7288748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Adapted from Eduardo Matallanas
from datasets import load_dataset, Dataset
from datasets.data_files import EmptyDatasetError

class HFDataset():
  """
  Create a dataset to save the transcripts from Youtube.

  Wraps a dataset loaded with ``load_dataset(name)`` and tracks a few
  facts about it.

  Attributes:
    name: Dataset identifier passed to ``load_dataset`` / ``push_to_hub``.
    dataset: The loaded dataset, or an empty ``Dataset`` when nothing
      could be loaded.
    exist: ``True`` when loading succeeded or the dataset was found but
      empty; ``False`` when it was not found or no name was given.
    is_empty: ``True`` when ``dataset`` is the empty placeholder.
    list_of_ids: IDs extracted from the ``URL`` column of every split
      (empty list when there is no data).
  """
  def __init__(self, name: str) -> None:
    """
    Load the dataset called ``name``, or start empty when ``name`` is "".

    Args:
      name: Dataset identifier; an empty string skips loading entirely.
    """
    self.name = name
    if name != "":
      self._init_dataset()
    else:
      # No name means nothing to load. Still initialise every attribute
      # (including list_of_ids) so all construction paths leave the
      # instance in the same shape.
      self._set_empty(exist=False)

  def _set_empty(self, exist: bool) -> None:
    """Reset the instance to an empty dataset, recording whether it exists."""
    self.dataset = Dataset.from_dict({})
    self.exist = exist
    self.is_empty = True
    self.list_of_ids = []

  def _init_dataset(self) -> None:
    """
    Try to load ``self.name`` and populate the state attributes.

    ``EmptyDatasetError`` is treated as "exists but has no data";
    ``FileNotFoundError`` is treated as "does not exist". Any other
    exception propagates to the caller.
    """
    try:
      self.dataset = load_dataset(self.name)
      self.exist = True
      self.is_empty = False
      self.list_of_ids = self._get_list_of_id()
    # NOTE(review): EmptyDatasetError is presumably a subclass of
    # FileNotFoundError, so it must be handled first - confirm before
    # reordering these handlers.
    except EmptyDatasetError:
      self._set_empty(exist=True)
    except FileNotFoundError:
      self._set_empty(exist=False)

  def upload(self) -> None:
    """Push ``self.dataset`` to the Hub under ``self.name``."""
    self.dataset.push_to_hub(self.name)

  def _get_list_of_id(self) -> list:
    """
    Collect the IDs of every row, across all splits of ``self.dataset``.

    The ID of a row is whatever follows the last ``=`` in its ``URL``
    value (the whole value when it contains no ``=``).

    Returns:
      A flat list of IDs, in split order and then row order.
    """
    # Read the URL column directly rather than mapping over the dataset:
    # the result is the same, without building a derived dataset.
    return [
      url.split("=")[-1]
      for split in self.dataset
      for url in self.dataset[split]["URL"]
    ]