Spaces:
Running
Running
Joshua Lochner
committed on
Commit
•
d7b6d7f
1
Parent(s):
9a5d9ed
Add `do_process_database` option to preprocessing script
Browse files
- src/preprocess.py +8 -0
src/preprocess.py
CHANGED
@@ -305,6 +305,9 @@ class PreprocessArguments:
|
|
305 |
default='27/01/2022',
|
306 |
metadata={'help': 'Only use videos that have some segment from before this date (exclusive). This allows for videos to have segments be corrected, but ignores new videos (posted after this date) to enter the pool.'})
|
307 |
|
|
|
|
|
|
|
308 |
do_transcribe: bool = field(
|
309 |
default=False, metadata={'help': 'Get transcripts for videos'}
|
310 |
)
|
@@ -588,6 +591,8 @@ def main():
|
|
588 |
# Always include segments locked by VIPs, regardless of view count
|
589 |
del db[key]
|
590 |
|
|
|
|
|
591 |
print('Saved', len(db), 'videos')
|
592 |
|
593 |
with open(processed_db_path, 'w') as fp:
|
@@ -595,6 +600,9 @@ def main():
|
|
595 |
|
596 |
return db
|
597 |
|
|
|
|
|
|
|
598 |
# 'videoID', 'startTime', 'endTime', 'votes', 'locked', 'incorrectVotes', 'UUID',
|
599 |
# 'userID', 'timeSubmitted', 'views', 'category', 'actionType', 'service', 'videoDuration',
|
600 |
# 'hidden', 'reputation', 'shadowHidden', 'hashedVideoID', 'userAgent', 'description'
|
|
|
305 |
default='27/01/2022',
|
306 |
metadata={'help': 'Only use videos that have some segment from before this date (exclusive). This allows for videos to have segments be corrected, but ignores new videos (posted after this date) to enter the pool.'})
|
307 |
|
308 |
+
do_process_database: bool = field(
|
309 |
+
default=False, metadata={'help': 'Process the raw database'}
|
310 |
+
)
|
311 |
do_transcribe: bool = field(
|
312 |
default=False, metadata={'help': 'Get transcripts for videos'}
|
313 |
)
|
|
|
591 |
# Always include segments locked by VIPs, regardless of view count
|
592 |
del db[key]
|
593 |
|
594 |
+
# TODO remove videos that contain a full-video label?
|
595 |
+
|
596 |
print('Saved', len(db), 'videos')
|
597 |
|
598 |
with open(processed_db_path, 'w') as fp:
|
|
|
600 |
|
601 |
return db
|
602 |
|
603 |
+
if preprocess_args.do_process_database:
|
604 |
+
read_db()
|
605 |
+
|
606 |
# 'videoID', 'startTime', 'endTime', 'votes', 'locked', 'incorrectVotes', 'UUID',
|
607 |
# 'userID', 'timeSubmitted', 'views', 'category', 'actionType', 'service', 'videoDuration',
|
608 |
# 'hidden', 'reputation', 'shadowHidden', 'hashedVideoID', 'userAgent', 'description'
|