Joshua Lochner committed on
Commit
d7b6d7f
1 Parent(s): 9a5d9ed

Add `do_process_database` option to preprocessing script

Browse files
Files changed (1) hide show
  1. src/preprocess.py +8 -0
src/preprocess.py CHANGED
@@ -305,6 +305,9 @@ class PreprocessArguments:
305
  default='27/01/2022',
306
  metadata={'help': 'Only use videos that have some segment from before this date (exclusive). This allows for videos to have segments be corrected, but ignores new videos (posted after this date) to enter the pool.'})
307
 
 
 
 
308
  do_transcribe: bool = field(
309
  default=False, metadata={'help': 'Get transcripts for videos'}
310
  )
@@ -588,6 +591,8 @@ def main():
588
  # Always include segments locked by VIPs, regardless of view count
589
  del db[key]
590
 
 
 
591
  print('Saved', len(db), 'videos')
592
 
593
  with open(processed_db_path, 'w') as fp:
@@ -595,6 +600,9 @@ def main():
595
 
596
  return db
597
 
 
 
 
598
  # 'videoID', 'startTime', 'endTime', 'votes', 'locked', 'incorrectVotes', 'UUID',
599
  # 'userID', 'timeSubmitted', 'views', 'category', 'actionType', 'service', 'videoDuration',
600
  # 'hidden', 'reputation', 'shadowHidden', 'hashedVideoID', 'userAgent', 'description'
 
305
  default='27/01/2022',
306
  metadata={'help': 'Only use videos that have some segment from before this date (exclusive). This allows for videos to have segments be corrected, but ignores new videos (posted after this date) to enter the pool.'})
307
 
308
+ do_process_database: bool = field(
309
+ default=False, metadata={'help': 'Process the raw database'}
310
+ )
311
  do_transcribe: bool = field(
312
  default=False, metadata={'help': 'Get transcripts for videos'}
313
  )
 
591
  # Always include segments locked by VIPs, regardless of view count
592
  del db[key]
593
 
594
+ # TODO remove videos that contain a full-video label?
595
+
596
  print('Saved', len(db), 'videos')
597
 
598
  with open(processed_db_path, 'w') as fp:
 
600
 
601
  return db
602
 
603
+ if preprocess_args.do_process_database:
604
+ read_db()
605
+
606
  # 'videoID', 'startTime', 'endTime', 'votes', 'locked', 'incorrectVotes', 'UUID',
607
  # 'userID', 'timeSubmitted', 'views', 'category', 'actionType', 'service', 'videoDuration',
608
  # 'hidden', 'reputation', 'shadowHidden', 'hashedVideoID', 'userAgent', 'description'