Joshua Lochner committed on
Commit
de9c8c4
·
1 Parent(s): 776c8b2

Add `no_cuda` argument to not use GPU

Browse files
Files changed (5) hide show
  1. src/evaluate.py +6 -9
  2. src/model.py +7 -1
  3. src/predict.py +13 -8
  4. src/preprocess.py +4 -1
  5. src/train.py +1 -1
src/evaluate.py CHANGED
@@ -143,12 +143,12 @@ def main():
143
  dataset_args.data_dir, dataset_args.processed_file)
144
 
145
  if not os.path.exists(final_path):
146
- logger.error('ERROR: Processed database not found.',
147
- f'Run `python src/preprocess.py --update_database --do_process_database` to generate "{final_path}".')
148
  return
149
 
150
  model, tokenizer = get_model_tokenizer(
151
- evaluation_args.model_path, evaluation_args.cache_dir)
152
 
153
  with open(final_path) as fp:
154
  final_data = json.load(fp)
@@ -178,14 +178,8 @@ def main():
178
  try:
179
  with tqdm(video_ids) as progress:
180
  for video_index, video_id in enumerate(progress):
181
-
182
  progress.set_description(f'Processing {video_id}')
183
 
184
- sponsor_segments = final_data.get(video_id)
185
- if not sponsor_segments:
186
- logger.warning('No labels found for', video_id)
187
- continue
188
-
189
  words = get_words(video_id)
190
  if not words:
191
  continue
@@ -194,6 +188,8 @@ def main():
194
  predictions = predict(video_id, model, tokenizer,
195
  segmentation_args, words, classifier_args)
196
 
 
 
197
  if sponsor_segments:
198
  labelled_words = add_labels_to_words(
199
  words, sponsor_segments)
@@ -229,6 +225,7 @@ def main():
229
  words, seg['start'], seg['end'])
230
 
231
  else:
 
232
  # Not in database (all segments missed)
233
  missed_segments = predictions
234
  incorrect_segments = []
 
143
  dataset_args.data_dir, dataset_args.processed_file)
144
 
145
  if not os.path.exists(final_path):
146
+ logger.error('ERROR: Processed database not found.\n'
147
+ f'Run `python src/preprocess.py --update_database --do_create` to generate "{final_path}".')
148
  return
149
 
150
  model, tokenizer = get_model_tokenizer(
151
+ evaluation_args.model_path, evaluation_args.cache_dir, evaluation_args.no_cuda)
152
 
153
  with open(final_path) as fp:
154
  final_data = json.load(fp)
 
178
  try:
179
  with tqdm(video_ids) as progress:
180
  for video_index, video_id in enumerate(progress):
 
181
  progress.set_description(f'Processing {video_id}')
182
 
 
 
 
 
 
183
  words = get_words(video_id)
184
  if not words:
185
  continue
 
188
  predictions = predict(video_id, model, tokenizer,
189
  segmentation_args, words, classifier_args)
190
 
191
+ # Get labels
192
+ sponsor_segments = final_data.get(video_id)
193
  if sponsor_segments:
194
  labelled_words = add_labels_to_words(
195
  words, sponsor_segments)
 
225
  words, seg['start'], seg['end'])
226
 
227
  else:
228
+ # logger.warning(f'No labels found for {video_id}')
229
  # Not in database (all segments missed)
230
  missed_segments = predictions
231
  incorrect_segments = []
src/model.py CHANGED
@@ -7,6 +7,7 @@ import pickle
7
  import os
8
  from dataclasses import dataclass, field
9
  from typing import Optional
 
10
 
11
 
12
  @dataclass
@@ -22,6 +23,9 @@ class ModelArguments:
22
  'help': 'Path to pretrained model or model identifier from huggingface.co/models'
23
  }
24
  )
 
 
 
25
  # config_name: Optional[str] = field( # TODO remove?
26
  # default=None, metadata={'help': 'Pretrained config name or path if not the same as model_name'}
27
  # )
@@ -93,13 +97,15 @@ def get_classifier_vectorizer(classifier_args):
93
 
94
 
95
  @lru_cache(maxsize=None)
96
- def get_model_tokenizer(model_name_or_path, cache_dir=None):
97
  if model_name_or_path is None:
98
  raise ModelLoadError('Invalid model_name_or_path.')
99
 
100
  # Load pretrained model and tokenizer
101
  model = AutoModelForSeq2SeqLM.from_pretrained(
102
  model_name_or_path, cache_dir=cache_dir)
 
 
103
 
104
  tokenizer = AutoTokenizer.from_pretrained(
105
  model_name_or_path, max_length=model.config.d_model, cache_dir=cache_dir)
 
7
  import os
8
  from dataclasses import dataclass, field
9
  from typing import Optional
10
+ import torch
11
 
12
 
13
  @dataclass
 
23
  'help': 'Path to pretrained model or model identifier from huggingface.co/models'
24
  }
25
  )
26
+ no_cuda: bool = field(default=False, metadata={
27
+ 'help': 'Do not use CUDA even when it is available'})
28
+
29
  # config_name: Optional[str] = field( # TODO remove?
30
  # default=None, metadata={'help': 'Pretrained config name or path if not the same as model_name'}
31
  # )
 
97
 
98
 
99
  @lru_cache(maxsize=None)
100
+ def get_model_tokenizer(model_name_or_path, cache_dir=None, no_cuda=False):
101
  if model_name_or_path is None:
102
  raise ModelLoadError('Invalid model_name_or_path.')
103
 
104
  # Load pretrained model and tokenizer
105
  model = AutoModelForSeq2SeqLM.from_pretrained(
106
  model_name_or_path, cache_dir=cache_dir)
107
+ if not no_cuda:
108
+ model.to('cuda' if torch.cuda.is_available() else 'cpu')
109
 
110
  tokenizer = AutoTokenizer.from_pretrained(
111
  model_name_or_path, max_length=model.config.d_model, cache_dir=cache_dir)
src/predict.py CHANGED
@@ -25,6 +25,7 @@ import preprocess
25
  from errors import PredictionException, TranscriptError, ModelLoadError, ClassifierLoadError
26
  from model import ModelArguments, get_classifier_vectorizer, get_model_tokenizer
27
 
 
28
 
29
  # Public innertube key (b64 encoded so that it is not incorrectly flagged)
30
  INNERTUBE_KEY = base64.b64decode(
@@ -114,6 +115,8 @@ class InferenceArguments:
114
  output_as_json: bool = field(default=False, metadata={
115
  'help': 'Output evaluations as JSON'})
116
 
 
 
117
  def __post_init__(self):
118
  # Try to load model from latest checkpoint
119
  if self.model_path is None:
@@ -137,8 +140,8 @@ class InferenceArguments:
137
 
138
  channel_video_ids = list(itertools.islice(get_all_channel_vids(
139
  self.channel_id), start, end))
140
- print('Found', len(channel_video_ids),
141
- 'for channel', self.channel_id)
142
 
143
  self.video_ids += channel_video_ids
144
 
@@ -300,8 +303,9 @@ CATEGORIES = [None, 'SPONSOR', 'SELFPROMO', 'INTERACTION']
300
 
301
  def predict_sponsor_text(text, model, tokenizer):
302
  """Given a body of text, predict the words which are part of the sponsor"""
 
303
  input_ids = tokenizer(
304
- f'{CustomTokens.EXTRACT_SEGMENTS_PREFIX.value} {text}', return_tensors='pt', truncation=True).input_ids
305
 
306
  max_out_len = round(min(
307
  max(
@@ -389,7 +393,7 @@ def segments_to_predictions(segments, model, tokenizer):
389
 
390
  def main():
391
  # Test on unseen data
392
- logging.getLogger().setLevel(logging.DEBUG)
393
 
394
  hf_parser = HfArgumentParser((
395
  PredictArguments,
@@ -399,11 +403,12 @@ def main():
399
  predict_args, segmentation_args, classifier_args = hf_parser.parse_args_into_dataclasses()
400
 
401
  if not predict_args.video_ids:
402
- print('No video IDs supplied. Use `--video_id`, `--video_ids`, or `--channel_id`.')
 
403
  return
404
 
405
  model, tokenizer = get_model_tokenizer(
406
- predict_args.model_path, predict_args.cache_dir)
407
 
408
  for video_id in predict_args.video_ids:
409
  video_id = video_id.strip()
@@ -411,11 +416,11 @@ def main():
411
  predictions = predict(video_id, model, tokenizer,
412
  segmentation_args, classifier_args=classifier_args)
413
  except TranscriptError:
414
- print('No transcript available for', video_id, end='\n\n')
415
  continue
416
  video_url = f'https://www.youtube.com/watch?v={video_id}'
417
  if not predictions:
418
- print('No predictions found for', video_url, end='\n\n')
419
  continue
420
 
421
  # TODO use predict_args.output_as_json
 
25
  from errors import PredictionException, TranscriptError, ModelLoadError, ClassifierLoadError
26
  from model import ModelArguments, get_classifier_vectorizer, get_model_tokenizer
27
 
28
+ logger = logging.getLogger(__name__)
29
 
30
  # Public innertube key (b64 encoded so that it is not incorrectly flagged)
31
  INNERTUBE_KEY = base64.b64decode(
 
115
  output_as_json: bool = field(default=False, metadata={
116
  'help': 'Output evaluations as JSON'})
117
 
118
+ no_cuda: bool = ModelArguments.__dataclass_fields__['no_cuda']
119
+
120
  def __post_init__(self):
121
  # Try to load model from latest checkpoint
122
  if self.model_path is None:
 
140
 
141
  channel_video_ids = list(itertools.islice(get_all_channel_vids(
142
  self.channel_id), start, end))
143
+ logger.info(
144
+ f'Found {len(channel_video_ids)} for channel {self.channel_id}')
145
 
146
  self.video_ids += channel_video_ids
147
 
 
303
 
304
  def predict_sponsor_text(text, model, tokenizer):
305
  """Given a body of text, predict the words which are part of the sponsor"""
306
+ model_device = next(model.parameters()).device
307
  input_ids = tokenizer(
308
+ f'{CustomTokens.EXTRACT_SEGMENTS_PREFIX.value} {text}', return_tensors='pt', truncation=True).input_ids.to(model_device)
309
 
310
  max_out_len = round(min(
311
  max(
 
393
 
394
  def main():
395
  # Test on unseen data
396
+ # logging.getLogger().setLevel(logging.DEBUG)
397
 
398
  hf_parser = HfArgumentParser((
399
  PredictArguments,
 
403
  predict_args, segmentation_args, classifier_args = hf_parser.parse_args_into_dataclasses()
404
 
405
  if not predict_args.video_ids:
406
+ logger.error(
407
+ 'No video IDs supplied. Use `--video_id`, `--video_ids`, or `--channel_id`.')
408
  return
409
 
410
  model, tokenizer = get_model_tokenizer(
411
+ predict_args.model_path, predict_args.cache_dir, predict_args.no_cuda)
412
 
413
  for video_id in predict_args.video_ids:
414
  video_id = video_id.strip()
 
416
  predictions = predict(video_id, model, tokenizer,
417
  segmentation_args, classifier_args=classifier_args)
418
  except TranscriptError:
419
+ logger.warning('No transcript available for', video_id, end='\n\n')
420
  continue
421
  video_url = f'https://www.youtube.com/watch?v={video_id}'
422
  if not predictions:
423
+ logger.info('No predictions found for', video_url, end='\n\n')
424
  continue
425
 
426
  # TODO use predict_args.output_as_json
src/preprocess.py CHANGED
@@ -558,6 +558,8 @@ def main():
558
  @lru_cache(maxsize=1)
559
  def read_db():
560
  if not preprocess_args.overwrite and os.path.exists(processed_db_path):
 
 
561
  with open(processed_db_path) as fp:
562
  return json.load(fp)
563
  print('Processing raw database')
@@ -790,7 +792,8 @@ def main():
790
  # , max_videos, max_segments
791
 
792
  from model import get_model_tokenizer
793
- model, tokenizer = get_model_tokenizer(model_args.model_name_or_path)
 
794
 
795
  # TODO
796
  # count_videos = 0
 
558
  @lru_cache(maxsize=1)
559
  def read_db():
560
  if not preprocess_args.overwrite and os.path.exists(processed_db_path):
561
+ print(
562
+ 'Using cached processed database (use `--overwrite` to avoid this behaviour).')
563
  with open(processed_db_path) as fp:
564
  return json.load(fp)
565
  print('Processing raw database')
 
792
  # , max_videos, max_segments
793
 
794
  from model import get_model_tokenizer
795
+ model, tokenizer = get_model_tokenizer(
796
+ model_args.model_name_or_path, model_args.cache_dir, model_args.no_cuda)
797
 
798
  # TODO
799
  # count_videos = 0
src/train.py CHANGED
@@ -297,7 +297,7 @@ def main():
297
 
298
  from model import get_model_tokenizer
299
  model, tokenizer = get_model_tokenizer(
300
- model_args.model_name_or_path, model_args.cache_dir)
301
  # max_tokenizer_length = model.config.d_model
302
 
303
  # Preprocessing the datasets.
 
297
 
298
  from model import get_model_tokenizer
299
  model, tokenizer = get_model_tokenizer(
300
+ model_args.model_name_or_path, model_args.cache_dir, model_args.no_cuda)
301
  # max_tokenizer_length = model.config.d_model
302
 
303
  # Preprocessing the datasets.