Joshua Lochner committed · Commit 643d00a · Parent(s): a0ca50e

Remove PreprocessingDatasetArguments class

Files changed:
- src/preprocess.py +9 -24
- src/shared.py +2 -1
src/preprocess.py CHANGED

@@ -420,6 +420,14 @@ class PreprocessArguments:
     do_split: bool = field(
         default=False, metadata={'help': 'Generate training, testing and validation data.'}
     )
+
+    positive_file: Optional[str] = field(
+        default='sponsor_segments.json', metadata={'help': 'File to output sponsored segments to (a jsonlines file).'}
+    )
+    negative_file: Optional[str] = field(
+        default='normal_segments.json', metadata={'help': 'File to output normal segments to (a jsonlines file).'}
+    )
+
     percentage_positive: float = field(
         default=0.5, metadata={'help': 'Ratio of positive (sponsor) segments to include in final output'})

@@ -488,29 +496,6 @@ def download_file(url, filename):
     return total_bytes == os.path.getsize(filename)


-@dataclass
-class PreprocessingDatasetArguments(DatasetArguments):
-    # excess_file: Optional[str] = field(
-    #     default='excess.json',
-    #     metadata={
-    #         'help': 'The excess segments left after the split'
-    #     },
-    # )
-
-    positive_file: Optional[str] = field(
-        default='sponsor_segments.json', metadata={'help': 'File to output sponsored segments to (a jsonlines file).'}
-    )
-    negative_file: Optional[str] = field(
-        default='normal_segments.json', metadata={'help': 'File to output normal segments to (a jsonlines file).'}
-    )
-
-    def __post_init__(self):
-        # TODO check if train/validation datasets exist
-        if self.train_file is None and self.validation_file is None:
-            raise ValueError(
-                'Need either a dataset name or a training/validation file.')
-
-
 def main():
     # Responsible for getting transcrips using youtube_transcript_api,
     # then labelling it according to SponsorBlock's API

@@ -519,7 +504,7 @@ def main():
     # Generate final.json from sponsorTimes.csv
     hf_parser = HfArgumentParser((
         PreprocessArguments,
-        PreprocessingDatasetArguments,
+        DatasetArguments,
         segment.SegmentationArguments,
         model_module.ModelArguments,
         GeneralArguments
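For reference, the preprocess.py change folds the two output-file fields into PreprocessArguments. Below is a minimal sketch of the affected portion of the dataclass after this commit, reproducing only the fields visible in the hunks above; the rest of the class is omitted.

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class PreprocessArguments:
    # Sketch: only the fields shown in the diff above; the real class
    # defines additional fields that are not reproduced here.
    do_split: bool = field(
        default=False, metadata={'help': 'Generate training, testing and validation data.'}
    )

    positive_file: Optional[str] = field(
        default='sponsor_segments.json', metadata={'help': 'File to output sponsored segments to (a jsonlines file).'}
    )
    negative_file: Optional[str] = field(
        default='normal_segments.json', metadata={'help': 'File to output normal segments to (a jsonlines file).'}
    )

    percentage_positive: float = field(
        default=0.5, metadata={'help': 'Ratio of positive (sponsor) segments to include in final output'})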
src/shared.py CHANGED

@@ -137,7 +137,8 @@ class DatasetArguments:
     def __post_init__(self):
         if self.train_file is None or self.validation_file is None:
             raise ValueError(
-
+                'Need either a dataset name or a training/validation file.')
+
         else:
             train_extension = self.train_file.split(".")[-1]
             assert train_extension in [
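With PreprocessingDatasetArguments removed, the parser in main() now takes DatasetArguments from src/shared.py directly. A hedged sketch of how the parsed groups would typically be unpacked, assuming the standard parse_args_into_dataclasses() call and the project-local import paths shown below (neither appears in this diff):

# Sketch only: the unpacking call and the import paths are assumptions;
# the diff above shows just the tuple passed to HfArgumentParser.
from transformers import HfArgumentParser

from shared import DatasetArguments, GeneralArguments  # assumed module layout
from preprocess import PreprocessArguments             # assumed module layout
import segment                                         # assumed module
import model as model_module                           # assumed alias

hf_parser = HfArgumentParser((
    PreprocessArguments,
    DatasetArguments,
    segment.SegmentationArguments,
    model_module.ModelArguments,
    GeneralArguments
))

# parse_args_into_dataclasses() returns one instance per dataclass, in order.
preprocess_args, dataset_args, segmentation_args, model_args, general_args = \
    hf_parser.parse_args_into_dataclasses()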