Spaces:
Running
Running
Joshua Lochner
committed on
Commit
·
2115d78
1
Parent(s):
18c7914
Add max_segment_duration argument
Browse files: src/preprocess.py (+19 -5)
src/preprocess.py
CHANGED
@@ -373,6 +373,15 @@ class PreprocessArguments:
|
|
373 |
# Downvotes will make this negative.
|
374 |
# 1 = At least one positive vote
|
375 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
376 |
min_views: int = field(
|
377 |
default=5, metadata={'help': 'Minimum number of views a segment must have to be considered. 0 = show all'})
|
378 |
|
@@ -388,7 +397,7 @@ class PreprocessArguments:
|
|
388 |
|
389 |
max_date: str = field(
|
390 |
# default='01/01/9999', # Include all
|
391 |
-
default='
|
392 |
metadata={'help': 'Only use videos that have some segment from before this date (exclusive). This allows for videos to have segments be corrected, but ignores new videos (posted after this date) to enter the pool.'})
|
393 |
|
394 |
# max_unseen_date: str = field( # TODO
|
@@ -606,8 +615,13 @@ def main():
|
|
606 |
min_date = datetime.strptime(preprocess_args.min_date, '%d/%m/%Y')
|
607 |
max_date = datetime.strptime(preprocess_args.max_date, '%d/%m/%Y')
|
608 |
for key in list(db):
|
609 |
-
|
610 |
-
|
|
|
|
|
|
|
|
|
|
|
611 |
# Remove videos where any of its segments were submitted before min_date
|
612 |
# (essentially removes videos uploaded before min_date)
|
613 |
# Prevents issues where some segments of a video are excluded
|
@@ -759,9 +773,9 @@ def main():
|
|
759 |
# TODO use overwrite param
|
760 |
|
761 |
positive_file = os.path.join(
|
762 |
-
dataset_args.data_dir,
|
763 |
negative_file = os.path.join(
|
764 |
-
dataset_args.data_dir,
|
765 |
|
766 |
if preprocess_args.do_generate:
|
767 |
logger.info('Generating')
|
|
|
373 |
# Downvotes will make this negative.
|
374 |
# 1 = At least one positive vote
|
375 |
|
376 |
+
max_segment_duration: float = field(
|
377 |
+
default=180, # 3 minutes
|
378 |
+
# >180 => 2.8%
|
379 |
+
# >200 => 2.1%
|
380 |
+
# >250 => 1.1%
|
381 |
+
# >300 => 0.06%
|
382 |
+
metadata={'help': 'Ignore all segments whose duration in seconds is longer than this value (negative means no limit)'})
|
383 |
+
|
384 |
+
|
385 |
min_views: int = field(
|
386 |
default=5, metadata={'help': 'Minimum number of views a segment must have to be considered. 0 = show all'})
|
387 |
|
|
|
397 |
|
398 |
max_date: str = field(
|
399 |
# default='01/01/9999', # Include all
|
400 |
+
default='15/04/2022',
|
401 |
metadata={'help': 'Only use videos that have some segment from before this date (exclusive). This allows for videos to have segments be corrected, but ignores new videos (posted after this date) to enter the pool.'})
|
402 |
|
403 |
# max_unseen_date: str = field( # TODO
|
|
|
615 |
min_date = datetime.strptime(preprocess_args.min_date, '%d/%m/%Y')
|
616 |
max_date = datetime.strptime(preprocess_args.max_date, '%d/%m/%Y')
|
617 |
for key in list(db):
|
618 |
+
if preprocess_args.max_segment_duration >= 0 and any(x['end'] - x['start'] > preprocess_args.max_segment_duration for x in db[key]):
|
619 |
+
# Remove videos that have at least one segment that is longer than
|
620 |
+
# the maximum allowed segment duration. This avoids introducing
|
621 |
+
# segments into training that might contain ignored context (since
|
622 |
+
# they are too long, so the middle might be normal content)
|
623 |
+
del db[key]
|
624 |
+
elif any(datetime.fromtimestamp(x['submission_time']) < min_date for x in db[key]):
|
625 |
# Remove videos where any of its segments were submitted before min_date
|
626 |
# (essentially removes videos uploaded before min_date)
|
627 |
# Prevents issues where some segments of a video are excluded
|
|
|
773 |
# TODO use overwrite param
|
774 |
|
775 |
positive_file = os.path.join(
|
776 |
+
dataset_args.data_dir, preprocess_args.positive_file)
|
777 |
negative_file = os.path.join(
|
778 |
+
dataset_args.data_dir, preprocess_args.negative_file)
|
779 |
|
780 |
if preprocess_args.do_generate:
|
781 |
logger.info('Generating')
|