Joshua Lochner committed on
Commit
2115d78
·
1 Parent(s): 18c7914

Add max_segment_duration argument

Browse files
Files changed (1) hide show
  1. src/preprocess.py +19 -5
src/preprocess.py CHANGED
@@ -373,6 +373,15 @@ class PreprocessArguments:
373
  # Downvotes will make this negative.
374
  # 1 = At least one positive vote
375
 
 
 
 
 
 
 
 
 
 
376
  min_views: int = field(
377
  default=5, metadata={'help': 'Minimum number of views a segment must have to be considered. 0 = show all'})
378
 
@@ -388,7 +397,7 @@ class PreprocessArguments:
388
 
389
  max_date: str = field(
390
  # default='01/01/9999', # Include all
391
- default='01/03/2022',
392
  metadata={'help': 'Only use videos that have some segment from before this date (exclusive). This allows for videos to have segments be corrected, but ignores new videos (posted after this date) to enter the pool.'})
393
 
394
  # max_unseen_date: str = field( # TODO
@@ -606,8 +615,13 @@ def main():
606
  min_date = datetime.strptime(preprocess_args.min_date, '%d/%m/%Y')
607
  max_date = datetime.strptime(preprocess_args.max_date, '%d/%m/%Y')
608
  for key in list(db):
609
-
610
- if any(datetime.fromtimestamp(x['submission_time']) < min_date for x in db[key]):
 
 
 
 
 
611
  # Remove videos where any of its segments were submitted before min_date
612
  # (essentially removes videos uploaded before min_date)
613
  # Prevents issues where some segments of a video are excluded
@@ -759,9 +773,9 @@ def main():
759
  # TODO use overwrite param
760
 
761
  positive_file = os.path.join(
762
- dataset_args.data_dir, dataset_args.positive_file)
763
  negative_file = os.path.join(
764
- dataset_args.data_dir, dataset_args.negative_file)
765
 
766
  if preprocess_args.do_generate:
767
  logger.info('Generating')
 
373
  # Downvotes will make this negative.
374
  # 1 = At least one positive vote
375
 
376
+ max_segment_duration: float = field(
377
+ default=180, # 3 minutes
378
+ # >180 => 2.8%
379
+ # >200 => 2.1%
380
+ # >250 => 1.1%
381
+ # >300 => 0.06%
382
+ metadata={'help': 'Ignore all segments whose duration in seconds is longer than this value (negative means no limit)'})
383
+
384
+
385
  min_views: int = field(
386
  default=5, metadata={'help': 'Minimum number of views a segment must have to be considered. 0 = show all'})
387
 
 
397
 
398
  max_date: str = field(
399
  # default='01/01/9999', # Include all
400
+ default='15/04/2022',
401
  metadata={'help': 'Only use videos that have some segment from before this date (exclusive). This allows for videos to have segments be corrected, but ignores new videos (posted after this date) to enter the pool.'})
402
 
403
  # max_unseen_date: str = field( # TODO
 
615
  min_date = datetime.strptime(preprocess_args.min_date, '%d/%m/%Y')
616
  max_date = datetime.strptime(preprocess_args.max_date, '%d/%m/%Y')
617
  for key in list(db):
618
+ if preprocess_args.max_segment_duration >= 0 and any(x['end'] - x['start'] > preprocess_args.max_segment_duration for x in db[key]):
619
+ # Remove videos that have at least one segment that is longer than
620
+ # the maximum allowed segment duration. This avoids introducing
621
+ # segments into training that might contain ignored context (since
622
+ # they are too long, so the middle might be normal content)
623
+ del db[key]
624
+ elif any(datetime.fromtimestamp(x['submission_time']) < min_date for x in db[key]):
625
  # Remove videos where any of its segments were submitted before min_date
626
  # (essentially removes videos uploaded before min_date)
627
  # Prevents issues where some segments of a video are excluded
 
773
  # TODO use overwrite param
774
 
775
  positive_file = os.path.join(
776
+ dataset_args.data_dir, preprocess_args.positive_file)
777
  negative_file = os.path.join(
778
+ dataset_args.data_dir, preprocess_args.negative_file)
779
 
780
  if preprocess_args.do_generate:
781
  logger.info('Generating')