Spaces:

Xenova
/

sponsorblock-ml

Running

App Files Files Community

Joshua Lochner committed on Feb 16, 2022

Commit

de9c8c4

1 Parent(s): 776c8b2

Add `no_cuda` argument to not use GPU

Browse files

Files changed (5) hide show

src/evaluate.py +6 -9
src/model.py +7 -1
src/predict.py +13 -8
src/preprocess.py +4 -1
src/train.py +1 -1

src/evaluate.py CHANGED Viewed

@@ -143,12 +143,12 @@ def main():
         dataset_args.data_dir, dataset_args.processed_file)
     if not os.path.exists(final_path):
-        logger.error('ERROR: Processed database not found.',
-                     f'Run `python src/preprocess.py --update_database --do_process_database` to generate "{final_path}".')
         return
     model, tokenizer = get_model_tokenizer(
-        evaluation_args.model_path, evaluation_args.cache_dir)
     with open(final_path) as fp:
         final_data = json.load(fp)
@@ -178,14 +178,8 @@ def main():
     try:
         with tqdm(video_ids) as progress:
             for video_index, video_id in enumerate(progress):
                 progress.set_description(f'Processing {video_id}')
-                sponsor_segments = final_data.get(video_id)
-                if not sponsor_segments:
-                    logger.warning('No labels found for', video_id)
-                    continue
                 words = get_words(video_id)
                 if not words:
                     continue
@@ -194,6 +188,8 @@ def main():
                 predictions = predict(video_id, model, tokenizer,
                                       segmentation_args, words, classifier_args)
                 if sponsor_segments:
                     labelled_words = add_labels_to_words(
                         words, sponsor_segments)
@@ -229,6 +225,7 @@ def main():
                             words, seg['start'], seg['end'])
                 else:
                     # Not in database (all segments missed)
                     missed_segments = predictions
                     incorrect_segments = []

         dataset_args.data_dir, dataset_args.processed_file)
     if not os.path.exists(final_path):
+        logger.error('ERROR: Processed database not found.\n'
+                     f'Run `python src/preprocess.py --update_database --do_create` to generate "{final_path}".')
         return
     model, tokenizer = get_model_tokenizer(
+        evaluation_args.model_path, evaluation_args.cache_dir, evaluation_args.no_cuda)
     with open(final_path) as fp:
         final_data = json.load(fp)
     try:
         with tqdm(video_ids) as progress:
             for video_index, video_id in enumerate(progress):
                 progress.set_description(f'Processing {video_id}')
                 words = get_words(video_id)
                 if not words:
                     continue
                 predictions = predict(video_id, model, tokenizer,
                                       segmentation_args, words, classifier_args)
+                # Get labels
+                sponsor_segments = final_data.get(video_id)
                 if sponsor_segments:
                     labelled_words = add_labels_to_words(
                         words, sponsor_segments)
                             words, seg['start'], seg['end'])
                 else:
+                    # logger.warning(f'No labels found for {video_id}')
                     # Not in database (all segments missed)
                     missed_segments = predictions
                     incorrect_segments = []

src/model.py CHANGED Viewed

@@ -7,6 +7,7 @@ import pickle
 import os
 from dataclasses import dataclass, field
 from typing import Optional
 @dataclass
@@ -22,6 +23,9 @@ class ModelArguments:
             'help': 'Path to pretrained model or model identifier from huggingface.co/models'
         }
     )
     # config_name: Optional[str] = field( # TODO remove?
     #     default=None, metadata={'help': 'Pretrained config name or path if not the same as model_name'}
     # )
@@ -93,13 +97,15 @@ def get_classifier_vectorizer(classifier_args):
 @lru_cache(maxsize=None)
-def get_model_tokenizer(model_name_or_path, cache_dir=None):
     if model_name_or_path is None:
         raise ModelLoadError('Invalid model_name_or_path.')
     # Load pretrained model and tokenizer
     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name_or_path, cache_dir=cache_dir)
     tokenizer = AutoTokenizer.from_pretrained(
         model_name_or_path, max_length=model.config.d_model, cache_dir=cache_dir)

 import os
 from dataclasses import dataclass, field
 from typing import Optional
+import torch
 @dataclass
             'help': 'Path to pretrained model or model identifier from huggingface.co/models'
         }
     )
+    no_cuda: bool = field(default=False, metadata={
+                          'help': 'Do not use CUDA even when it is available'})
     # config_name: Optional[str] = field( # TODO remove?
     #     default=None, metadata={'help': 'Pretrained config name or path if not the same as model_name'}
     # )
 @lru_cache(maxsize=None)
+def get_model_tokenizer(model_name_or_path, cache_dir=None, no_cuda=False):
     if model_name_or_path is None:
         raise ModelLoadError('Invalid model_name_or_path.')
     # Load pretrained model and tokenizer
     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name_or_path, cache_dir=cache_dir)
+    if not no_cuda:
+        model.to('cuda' if torch.cuda.is_available() else 'cpu')
     tokenizer = AutoTokenizer.from_pretrained(
         model_name_or_path, max_length=model.config.d_model, cache_dir=cache_dir)

src/predict.py CHANGED Viewed

@@ -25,6 +25,7 @@ import preprocess
 from errors import PredictionException, TranscriptError, ModelLoadError, ClassifierLoadError
 from model import ModelArguments, get_classifier_vectorizer, get_model_tokenizer
 # Public innertube key (b64 encoded so that it is not incorrectly flagged)
 INNERTUBE_KEY = base64.b64decode(
@@ -114,6 +115,8 @@ class InferenceArguments:
     output_as_json: bool = field(default=False, metadata={
                                  'help': 'Output evaluations as JSON'})
     def __post_init__(self):
         # Try to load model from latest checkpoint
         if self.model_path is None:
@@ -137,8 +140,8 @@ class InferenceArguments:
             channel_video_ids = list(itertools.islice(get_all_channel_vids(
                 self.channel_id), start, end))
-            print('Found', len(channel_video_ids),
-                  'for channel', self.channel_id)
             self.video_ids += channel_video_ids
@@ -300,8 +303,9 @@ CATEGORIES = [None, 'SPONSOR', 'SELFPROMO', 'INTERACTION']
 def predict_sponsor_text(text, model, tokenizer):
     """Given a body of text, predict the words which are part of the sponsor"""
     input_ids = tokenizer(
-        f'{CustomTokens.EXTRACT_SEGMENTS_PREFIX.value} {text}', return_tensors='pt', truncation=True).input_ids
     max_out_len = round(min(
         max(
@@ -389,7 +393,7 @@ def segments_to_predictions(segments, model, tokenizer):
 def main():
     # Test on unseen data
-    logging.getLogger().setLevel(logging.DEBUG)
     hf_parser = HfArgumentParser((
         PredictArguments,
@@ -399,11 +403,12 @@ def main():
     predict_args, segmentation_args, classifier_args = hf_parser.parse_args_into_dataclasses()
     if not predict_args.video_ids:
-        print('No video IDs supplied. Use `--video_id`, `--video_ids`, or `--channel_id`.')
         return
     model, tokenizer = get_model_tokenizer(
-        predict_args.model_path, predict_args.cache_dir)
     for video_id in predict_args.video_ids:
         video_id = video_id.strip()
@@ -411,11 +416,11 @@ def main():
             predictions = predict(video_id, model, tokenizer,
                                   segmentation_args, classifier_args=classifier_args)
         except TranscriptError:
-            print('No transcript available for', video_id, end='\n\n')
             continue
         video_url = f'https://www.youtube.com/watch?v={video_id}'
         if not predictions:
-            print('No predictions found for', video_url, end='\n\n')
             continue
         # TODO use predict_args.output_as_json

 from errors import PredictionException, TranscriptError, ModelLoadError, ClassifierLoadError
 from model import ModelArguments, get_classifier_vectorizer, get_model_tokenizer
+logger = logging.getLogger(__name__)
 # Public innertube key (b64 encoded so that it is not incorrectly flagged)
 INNERTUBE_KEY = base64.b64decode(
     output_as_json: bool = field(default=False, metadata={
                                  'help': 'Output evaluations as JSON'})
+    no_cuda: bool = ModelArguments.__dataclass_fields__['no_cuda']
     def __post_init__(self):
         # Try to load model from latest checkpoint
         if self.model_path is None:
             channel_video_ids = list(itertools.islice(get_all_channel_vids(
                 self.channel_id), start, end))
+            logger.info(
+                f'Found {len(channel_video_ids)} for channel {self.channel_id}')
             self.video_ids += channel_video_ids
 def predict_sponsor_text(text, model, tokenizer):
     """Given a body of text, predict the words which are part of the sponsor"""
+    model_device = next(model.parameters()).device
     input_ids = tokenizer(
+        f'{CustomTokens.EXTRACT_SEGMENTS_PREFIX.value} {text}', return_tensors='pt', truncation=True).input_ids.to(model_device)
     max_out_len = round(min(
         max(
 def main():
     # Test on unseen data
+    # logging.getLogger().setLevel(logging.DEBUG)
     hf_parser = HfArgumentParser((
         PredictArguments,
     predict_args, segmentation_args, classifier_args = hf_parser.parse_args_into_dataclasses()
     if not predict_args.video_ids:
+        logger.error(
+            'No video IDs supplied. Use `--video_id`, `--video_ids`, or `--channel_id`.')
         return
     model, tokenizer = get_model_tokenizer(
+        predict_args.model_path, predict_args.cache_dir, predict_args.no_cuda)
     for video_id in predict_args.video_ids:
         video_id = video_id.strip()
             predictions = predict(video_id, model, tokenizer,
                                   segmentation_args, classifier_args=classifier_args)
         except TranscriptError:
+            logger.warning('No transcript available for', video_id, end='\n\n')
             continue
         video_url = f'https://www.youtube.com/watch?v={video_id}'
         if not predictions:
+            logger.info('No predictions found for', video_url, end='\n\n')
             continue
         # TODO use predict_args.output_as_json

src/preprocess.py CHANGED Viewed

@@ -558,6 +558,8 @@ def main():
     @lru_cache(maxsize=1)
     def read_db():
         if not preprocess_args.overwrite and os.path.exists(processed_db_path):
             with open(processed_db_path) as fp:
                 return json.load(fp)
         print('Processing raw database')
@@ -790,7 +792,8 @@ def main():
         # , max_videos, max_segments
         from model import get_model_tokenizer
-        model, tokenizer = get_model_tokenizer(model_args.model_name_or_path)
         # TODO
         # count_videos = 0

     @lru_cache(maxsize=1)
     def read_db():
         if not preprocess_args.overwrite and os.path.exists(processed_db_path):
+            print(
+                'Using cached processed database (use `--overwrite` to avoid this behaviour).')
             with open(processed_db_path) as fp:
                 return json.load(fp)
         print('Processing raw database')
         # , max_videos, max_segments
         from model import get_model_tokenizer
+        model, tokenizer = get_model_tokenizer(
+            model_args.model_name_or_path, model_args.cache_dir, model_args.no_cuda)
         # TODO
         # count_videos = 0

src/train.py CHANGED Viewed

@@ -297,7 +297,7 @@ def main():
         from model import get_model_tokenizer
         model, tokenizer = get_model_tokenizer(
-            model_args.model_name_or_path, model_args.cache_dir)
         # max_tokenizer_length = model.config.d_model
         # Preprocessing the datasets.

         from model import get_model_tokenizer
         model, tokenizer = get_model_tokenizer(
+            model_args.model_name_or_path, model_args.cache_dir, model_args.no_cuda)
         # max_tokenizer_length = model.config.d_model
         # Preprocessing the datasets.