Spaces:
Running
Running
Joshua Lochner
committed on
Commit
·
fb87012
1
Parent(s):
02e576a
Improve caching and downloading of classifier for predictions
Browse files
- src/evaluate.py +4 -3
- src/predict.py +20 -8
src/evaluate.py
CHANGED
@@ -205,7 +205,7 @@ def main():
|
|
205 |
|
206 |
evaluation_args, dataset_args, segmentation_args, classifier_args, _ = hf_parser.parse_args_into_dataclasses()
|
207 |
|
208 |
-
model, tokenizer = get_model_tokenizer(evaluation_args.model_path)
|
209 |
|
210 |
# # TODO find better way of evaluating videos not trained on
|
211 |
# dataset = load_dataset('json', data_files=os.path.join(
|
@@ -313,8 +313,9 @@ def main():
|
|
313 |
[w['text'] for w in missed_segment['words']]), '"', sep='')
|
314 |
print('\t\tCategory:',
|
315 |
missed_segment.get('category'))
|
316 |
-
|
317 |
-
|
|
|
318 |
|
319 |
segments_to_submit.append({
|
320 |
'segment': [missed_segment['start'], missed_segment['end']],
|
|
|
205 |
|
206 |
evaluation_args, dataset_args, segmentation_args, classifier_args, _ = hf_parser.parse_args_into_dataclasses()
|
207 |
|
208 |
+
model, tokenizer = get_model_tokenizer(evaluation_args.model_path, evaluation_args.cache_dir)
|
209 |
|
210 |
# # TODO find better way of evaluating videos not trained on
|
211 |
# dataset = load_dataset('json', data_files=os.path.join(
|
|
|
313 |
[w['text'] for w in missed_segment['words']]), '"', sep='')
|
314 |
print('\t\tCategory:',
|
315 |
missed_segment.get('category'))
|
316 |
+
if 'probability' in missed_segment:
|
317 |
+
print('\t\tProbability:',
|
318 |
+
missed_segment['probability'])
|
319 |
|
320 |
segments_to_submit.append({
|
321 |
'segment': [missed_segment['start'], missed_segment['end']],
|
src/predict.py
CHANGED
@@ -11,8 +11,8 @@ from segment import (
|
|
11 |
SegmentationArguments
|
12 |
)
|
13 |
import preprocess
|
14 |
-
from errors import TranscriptError, ModelLoadError
|
15 |
-
from model import get_classifier_vectorizer, get_model_tokenizer
|
16 |
from transformers import HfArgumentParser
|
17 |
from transformers.trainer_utils import get_last_checkpoint
|
18 |
from dataclasses import dataclass, field
|
@@ -29,6 +29,7 @@ class TrainingOutputArguments:
|
|
29 |
'help': 'Path to pretrained model used for prediction'
|
30 |
}
|
31 |
)
|
|
|
32 |
|
33 |
output_dir: Optional[str] = OutputArguments.__dataclass_fields__[
|
34 |
'output_dir']
|
@@ -43,7 +44,8 @@ class TrainingOutputArguments:
|
|
43 |
self.model_path = last_checkpoint
|
44 |
return
|
45 |
|
46 |
-
raise ModelLoadError(
|
|
|
47 |
|
48 |
|
49 |
@dataclass
|
@@ -65,6 +67,13 @@ MERGE_TIME_WITHIN = 8 # Merge predictions if they are within x seconds
|
|
65 |
|
66 |
@dataclass(frozen=True, eq=True)
|
67 |
class ClassifierArguments:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
classifier_dir: Optional[str] = field(
|
69 |
default='classifiers',
|
70 |
metadata={
|
@@ -90,7 +99,6 @@ class ClassifierArguments:
|
|
90 |
default=0.5, metadata={'help': 'Remove all predictions whose classification probability is below this threshold.'})
|
91 |
|
92 |
|
93 |
-
# classifier, vectorizer,
|
94 |
def filter_and_add_probabilities(predictions, classifier_args):
|
95 |
"""Use classifier to filter predictions"""
|
96 |
if not predictions:
|
@@ -160,8 +168,11 @@ def predict(video_id, model, tokenizer, segmentation_args, words=None, classifie
|
|
160 |
|
161 |
# TODO add back
|
162 |
if classifier_args is not None:
|
163 |
-
|
164 |
-
predictions
|
|
|
|
|
|
|
165 |
|
166 |
return predictions
|
167 |
|
@@ -290,7 +301,7 @@ def main():
|
|
290 |
print('No video ID supplied. Use `--video_id`.')
|
291 |
return
|
292 |
|
293 |
-
model, tokenizer = get_model_tokenizer(predict_args.model_path)
|
294 |
|
295 |
predict_args.video_id = predict_args.video_id.strip()
|
296 |
predictions = predict(predict_args.video_id, model, tokenizer,
|
@@ -308,8 +319,9 @@ def main():
|
|
308 |
' '.join([w['text'] for w in prediction['words']]), '"', sep='')
|
309 |
print('Time:', seconds_to_time(
|
310 |
prediction['start']), '\u2192', seconds_to_time(prediction['end']))
|
311 |
-
print('Probability:', prediction.get('probability'))
|
312 |
print('Category:', prediction.get('category'))
|
|
|
|
|
313 |
print()
|
314 |
|
315 |
|
|
|
11 |
SegmentationArguments
|
12 |
)
|
13 |
import preprocess
|
14 |
+
from errors import TranscriptError, ModelLoadError, ClassifierLoadError
|
15 |
+
from model import ModelArguments, get_classifier_vectorizer, get_model_tokenizer
|
16 |
from transformers import HfArgumentParser
|
17 |
from transformers.trainer_utils import get_last_checkpoint
|
18 |
from dataclasses import dataclass, field
|
|
|
29 |
'help': 'Path to pretrained model used for prediction'
|
30 |
}
|
31 |
)
|
32 |
+
cache_dir: Optional[str] = ModelArguments.__dataclass_fields__['cache_dir']
|
33 |
|
34 |
output_dir: Optional[str] = OutputArguments.__dataclass_fields__[
|
35 |
'output_dir']
|
|
|
44 |
self.model_path = last_checkpoint
|
45 |
return
|
46 |
|
47 |
+
raise ModelLoadError(
|
48 |
+
'Unable to find model, explicitly set `--model_path`')
|
49 |
|
50 |
|
51 |
@dataclass
|
|
|
67 |
|
68 |
@dataclass(frozen=True, eq=True)
|
69 |
class ClassifierArguments:
|
70 |
+
classifier_model: Optional[str] = field(
|
71 |
+
default='Xenova/sponsorblock-classifier',
|
72 |
+
metadata={
|
73 |
+
'help': 'Use a pretrained classifier'
|
74 |
+
}
|
75 |
+
)
|
76 |
+
|
77 |
classifier_dir: Optional[str] = field(
|
78 |
default='classifiers',
|
79 |
metadata={
|
|
|
99 |
default=0.5, metadata={'help': 'Remove all predictions whose classification probability is below this threshold.'})
|
100 |
|
101 |
|
|
|
102 |
def filter_and_add_probabilities(predictions, classifier_args):
|
103 |
"""Use classifier to filter predictions"""
|
104 |
if not predictions:
|
|
|
168 |
|
169 |
# TODO add back
|
170 |
if classifier_args is not None:
|
171 |
+
try:
|
172 |
+
predictions = filter_and_add_probabilities(
|
173 |
+
predictions, classifier_args)
|
174 |
+
except ClassifierLoadError:
|
175 |
+
print('Unable to load classifer')
|
176 |
|
177 |
return predictions
|
178 |
|
|
|
301 |
print('No video ID supplied. Use `--video_id`.')
|
302 |
return
|
303 |
|
304 |
+
model, tokenizer = get_model_tokenizer(predict_args.model_path, predict_args.cache_dir)
|
305 |
|
306 |
predict_args.video_id = predict_args.video_id.strip()
|
307 |
predictions = predict(predict_args.video_id, model, tokenizer,
|
|
|
319 |
' '.join([w['text'] for w in prediction['words']]), '"', sep='')
|
320 |
print('Time:', seconds_to_time(
|
321 |
prediction['start']), '\u2192', seconds_to_time(prediction['end']))
|
|
|
322 |
print('Category:', prediction.get('category'))
|
323 |
+
if 'probability' in prediction:
|
324 |
+
print('Probability:', prediction['probability'])
|
325 |
print()
|
326 |
|
327 |
|