Spaces:
Runtime error
Runtime error
| # Adapted from Eduardo Matallanas | |
| from datasets import load_dataset, Dataset | |
| from datasets.data_files import EmptyDatasetError | |
class HFDataset:
    """
    Create a dataset to save the transcripts from Youtube.

    Attributes set by the constructor:
        name: The dataset name given by the caller, passed unchanged to
            ``load_dataset`` and ``push_to_hub``.
        dataset: The object returned by ``load_dataset(name)`` when loading
            succeeds, otherwise an empty ``Dataset``.
        exist: ``False`` when ``name`` is ``""`` or loading raised
            ``FileNotFoundError``; ``True`` otherwise.
        is_empty: ``False`` only when ``load_dataset`` succeeded.
        list_of_ids: IDs derived from the ``"URL"`` column of every split,
            or ``[]`` when nothing was loaded.
    """

    def __init__(self, name) -> None:
        """Store ``name`` and try to load the dataset unless it is ``""``."""
        self.name = name
        if name != "":
            self._init_dataset()
        else:
            # No name means nothing to load; still initialise every
            # attribute (including list_of_ids) so later reads never fail.
            self._set_empty(exist=False)

    def _set_empty(self, exist):
        """Reset the instance to an empty dataset with the given ``exist`` flag."""
        self.dataset = Dataset.from_dict({})
        self.exist = exist
        self.is_empty = True
        self.list_of_ids = []

    def _init_dataset(self):
        """Load ``self.name`` and fill ``dataset``/``exist``/``is_empty``/``list_of_ids``."""
        # Only the load itself is guarded, so errors raised while computing
        # the IDs are not mistaken for a missing or empty dataset.
        try:
            loaded = load_dataset(self.name)
        except EmptyDatasetError:
            # Must stay ahead of the FileNotFoundError handler.
            # NOTE(review): EmptyDatasetError appears to derive from
            # FileNotFoundError in `datasets` - confirm for the pinned version.
            self._set_empty(exist=True)
            return
        except FileNotFoundError:
            self._set_empty(exist=False)
            return

        self.dataset = loaded
        self.exist = True
        self.is_empty = False
        self.list_of_ids = self._get_list_of_id()

    def upload(self):
        """Push ``self.dataset`` to the Hub under ``self.name``."""
        self.dataset.push_to_hub(self.name)

    def _get_list_of_id(self) -> list:
        """
        Return the IDs of every URL in every split as one flat list.

        ``self.dataset`` is iterated as a mapping of split name to split, and
        each split must expose a ``"URL"`` column. The ID is whatever follows
        the last ``=`` in the URL. Order is split order, then row order.
        """
        # NOTE(review): a URL with extra query parameters after the id
        # (e.g. "...watch?v=ID&t=10s") yields the last parameter's value, not
        # the id. Left as-is because other code may depend on this exact rule.
        return [
            url.split("=")[-1]
            for split in self.dataset
            for url in self.dataset[split]["URL"]
        ]