remove unnecessary files, add preprocessor_config.json
- __init__.py +0 -0
- __pycache__/__init__.cpython-310.pyc +0 -0
- __pycache__/benchmark_utils.cpython-310.pyc +0 -0
- __pycache__/converters.cpython-310.pyc +0 -0
- __pycache__/renamers.cpython-310.pyc +0 -0
- __pycache__/trimmers.cpython-310.pyc +0 -0
- converters.py +0 -206
- preprocessor_config.json +14 -0
- renamers.py +0 -77
- trimmers.py +0 -139
__init__.py
DELETED
Empty file

__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (128 Bytes)

__pycache__/benchmark_utils.cpython-310.pyc
DELETED
Binary file (10.2 kB)

__pycache__/converters.cpython-310.pyc
DELETED
Binary file (5.32 kB)

__pycache__/renamers.cpython-310.pyc
DELETED
Binary file (1.61 kB)

__pycache__/trimmers.cpython-310.pyc
DELETED
Binary file (2.77 kB)
converters.py
DELETED
@@ -1,206 +0,0 @@
-import os
-import csv
-import re
-import pandas as pd
-from pathlib import Path
-import numpy as np
-# functions to convert between different transcript/annotation formats
-
-#######
-# "table" refers to a pd.DataFrame with the following cols:
-# [uttID, speaker, transcript, start_sec, end_sec]
-#########
-
-# separate functions write to csv, tsv, or ELAN-compatible files (ELAN interprets ALL commas as delimiters, so we use tabs instead)
-
-def HHMMSS_to_sec(time_str):
-    """Get seconds from a timestamp string with milliseconds."""
-    if not time_str:
-        return None
-    if time_str.count(':') == 2:
-        h, m, s = time_str.split(':')
-    elif time_str.count(':') == 3:
-        # unusual timestamps with an extra colon-delimited field following the seconds
-        h, m, s, u = time_str.split(':')
-        # determine whether the subsecond field is tenths, hundredths, or thousandths by counting its digits
-        if len(u) == 1:
-            print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
-            ms = float(u)/10
-        elif len(u) == 2:  # hundredths
-            ms = float(u)/100
-        elif len(u) == 3:  # thousandths
-            ms = float(u)/1000
-        else:
-            print(f'input string format not supported: {time_str}')
-            return None
-        s = int(s) + ms
-    elif time_str.count(':') == 1:
-        # print('missing HH from timestamp, assuming MM:SS')
-        m, s = time_str.split(':')
-        h = 0
-    elif time_str.count(':') == 0 and time_str.count('.') == 1:
-        # print('missing HH:MM from timestamp, assuming SS.ms')
-        s = float(time_str)
-        h = 0
-        m = 0
-    else:
-        print(f'input string format not supported: {time_str}')
-        return None
-    return int(h) * 3600 + int(m) * 60 + float(s)
-
-def sec_to_timecode(time_sec):
-    # convert seconds to HH:MM:SS:hundredths as used in .xlsx transcripts
-    h = int(time_sec // 3600)
-    m = int((time_sec - 3600*h) // 60)
-    s = int(time_sec - 3600*h - 60*m)
-    u = round(100 * (time_sec - 3600*h - 60*m - s))
-    timecode = f'{h}:{m:02}:{s:02}:{u:02}'
-    return timecode
-
-def docx_scraped_tsv_to_table(ooona_file):
-    # ooona output is a table in a Word docx;
-    # for now we manually copy it out and save it as tsv,
-    # but the timestamp format is wrong.
-    # input cols are SHOT START END SPEAKER DIALOGUE
-    with open(ooona_file) as in_file:
-        reader = csv.reader(in_file, delimiter="\t")
-        next(reader)  # skip header
-        rows = []
-        for i, line in enumerate(reader):
-            utt_ix, start_time, end_time, speaker, transcript = line
-            start_sec = HHMMSS_to_sec(start_time)
-            end_sec = HHMMSS_to_sec(end_time)
-            rows.append([utt_ix, speaker, transcript, start_sec, end_sec])
-    utt_table = pd.DataFrame(rows, columns=['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec'])
-    return utt_table
-    # table = pd.read_csv(ooona_file, sep='\t')
-
-def molly_xlsx_to_table(xl_file):
-    # contractor transcribers provide an xlsx with the following columns:
-    # utt_ix: int
-    # Timecode: "HH:MM:SS:ss - HH:MM:SS:ss"
-    # Duration: HH:MM:SS:ss
-    # Speaker: str
-    # Dialogue: str
-    # Annotations: blank
-    # Error Type: blank
-    with pd.ExcelFile(xl_file) as xls:
-        sheetname = xls.sheet_names
-        table = pd.DataFrame(pd.read_excel(xls, sheetname[0]))
-    table.columns = table.columns.str.lower()
-    table[['start_time', 'end_time']] = table['timecode'].str.split('-', expand=True)
-    table['start_sec'] = table['start_time'].str.strip().apply(HHMMSS_to_sec)
-    table['end_sec'] = table['end_time'].str.strip().apply(HHMMSS_to_sec)
-    table.drop(labels=['annotations', 'error type', 'duration'], axis=1, inplace=True)
-    table = table[['#', 'speaker', 'dialogue', 'start_sec', 'end_sec']]
-    table.rename(columns={'#': 'uttID', 'dialogue': 'transcript'}, inplace=True)
-    table.reset_index(inplace=True, drop=True)
-    table = table.replace('', np.nan).dropna(subset=['speaker', 'transcript'], how='all')  # drop rows where both speaker and utterance are missing ('dialogue' was renamed to 'transcript' above)
-    return table
-
-def LoFi_xlsx_to_table(xl_file):
-    # LoFi transcripts have the following columns:
-    # utt_ix: int
-    # Timecode: "HH:MM:SS:ss - HH:MM:SS:ss"
-    # Duration: HH:MM:SS:ss
-    # Speaker: str
-    # Dialogue: str
-    # Annotations: blank
-    # Error Type: blank
-    with pd.ExcelFile(xl_file) as xls:
-        sheetname = xls.sheet_names
-        table = pd.DataFrame(pd.read_excel(xls, sheetname[0]))
-    table[['start_time', 'end_time']] = table['Timecode'].str.split('-', expand=True)
-    table['start_sec'] = table['start_time'].str.strip().apply(HHMMSS_to_sec)
-    table['end_sec'] = table['end_time'].str.strip().apply(HHMMSS_to_sec)
-    table.drop(labels=['Annotations', 'Error Type', 'Duration'], axis=1, inplace=True)
-    table = table[['#', 'Speaker', 'Dialogue', 'start_sec', 'end_sec']]
-    table.rename(columns={'#': 'uttID', 'Speaker': 'speaker', 'Dialogue': 'transcript'}, inplace=True)
-
-    return table
-
-def saga_to_table(saga_txt):
-    # saga's own transcripts are txt files in the following format:
-    #
-    # speaker (start time MM:SS)
-    # utterance
-    # <blank line>
-    # TODO: make more robust by pattern matching instead of modulo arithmetic
-    with open(saga_txt) as in_file:
-        reader = csv.reader(in_file, delimiter="\n")
-        count = 0
-        rows = []
-        for i, line in enumerate(reader):
-            print((count, line))
-            if count % 3 == 0:
-                spk_time = line[0].split('(')
-                if len(spk_time) < 2:
-                    # speaker did not change; reuse the previous row's speaker
-                    timestamp = spk_time[0].strip('):( ')
-                    speaker = rows[-1][1]  # prev speaker (column 1 of the row)
-                else:
-                    speaker = spk_time[0]
-                    timestamp = spk_time[1].replace('):', '')
-                start_sec = HHMMSS_to_sec(timestamp)
-            if count % 3 == 1:
-                transcript = line[0]
-            if count % 3 == 2:
-                rows.append([i, speaker, transcript, start_sec, None])
-            count += 1
-    utt_table = pd.DataFrame(rows, columns=['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec'])
-    return utt_table
-
-def table_to_ELAN_tsv(table: pd.DataFrame, path: str):
-    # write table to a tsv compatible with ELAN import
-    table.to_csv(path, index=False, float_format='%.3f', sep='\t')
-
-def table_to_standard_csv(table: pd.DataFrame, path: str):
-    # write table to the standard csv format agreed upon by the whole team
-    # TODO: convert times in seconds back to HH:MM:SS?
-    # TODO: split utterances into sentences?
-    table.to_csv(path, index=False, float_format='%.3f')
-
-def table_to_utt_labels_csv(table: pd.DataFrame, path: str):
-    # write table to the utt_labels csv format compatible with rosy's isatasr lib
-    table.rename(columns={'transcript': 'utterance', 'uttID': 'seg'}, inplace=True)
-    table = table.replace('', np.nan).dropna(subset=['speaker', 'utterance'], how='all')  # drop rows where both speaker and utterance are missing
-    table.to_csv(path, index=False, float_format='%.3f')
-
-def table_to_molly_xlsx(tbl: pd.DataFrame, path: str):
-    tblx = tbl
-    tblx.rename(columns={'uttID': '#', 'speaker': 'Speaker', 'transcript': 'Dialogue'}, inplace=True)
-    tblx['dur_s'] = tblx['end_sec'] - tblx['start_sec']
-    tblx['start_timecode'] = tblx['start_sec'].apply(sec_to_timecode)
-    tblx['end_timecode'] = tblx['end_sec'].apply(sec_to_timecode)
-    tblx['Duration'] = tblx['dur_s'].apply(sec_to_timecode)
-    tblx['Timecode'] = [' - '.join(i) for i in zip(tblx['start_timecode'], tblx['end_timecode'])]
-    tblx['Annotations'] = ''
-    tblx['Error Type'] = ''
-    tblx = tblx[['#', 'Timecode', 'Duration', 'Speaker', 'Dialogue', 'Annotations', 'Error Type']]
-    tblx.to_excel(path, sheet_name=Path(path).stem, index=False)
-
-def utt_labels_csv_to_table(label_csv: str):
-    # utt_labels csv is the usual format used for diarized, timed transcripts in this repo.
-    # There are several versions with different columns (with/without segment and/or utterance index).
-    # table:
-    # [uttID, speaker, transcript, start_sec, end_sec]
-    table = pd.read_csv(label_csv, keep_default_na=False)
-    # choose which column to use for uttID in table
-    if 'utt' in table.columns:
-        table = table.rename(columns={"utt": "uttID"}).drop('seg', axis=1)
-    elif 'seg' in table.columns:
-        table = table.rename(columns={"seg": "uttID"})
-    else:
-        table = table.reset_index().rename(columns={"index": "uttID"})
-
-    return table
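For reference, a minimal sketch of how these converters were typically chained before removal. The file paths are hypothetical, and the levi.converters module path is taken from the import in trimmers.py below:

from levi.converters import molly_xlsx_to_table, table_to_ELAN_tsv

# read a contractor xlsx into the common table format (hypothetical path)
table = molly_xlsx_to_table('transcripts/session01.xlsx')
# table now has columns [uttID, speaker, transcript, start_sec, end_sec]
# write a tab-separated file; ELAN treats every comma as a delimiter, so tsv is required
table_to_ELAN_tsv(table, 'transcripts/session01.tsv')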
preprocessor_config.json
ADDED
@@ -0,0 +1,14 @@
+{
+  "chunk_length": 30,
+  "feature_extractor_type": "WhisperFeatureExtractor",
+  "feature_size": 80,
+  "hop_length": 160,
+  "n_fft": 400,
+  "n_samples": 480000,
+  "nb_max_frames": 3000,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "WhisperProcessor",
+  "return_attention_mask": false,
+  "sampling_rate": 16000
+}
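This is the stock Whisper feature-extractor configuration: 80 mel bins, 16 kHz input, 30-second chunks (30 s x 16000 Hz = 480000 samples). A minimal loading sketch, assuming the transformers library is installed and this file sits in the directory passed to from_pretrained:

from transformers import WhisperFeatureExtractor

# from_pretrained reads preprocessor_config.json from a local directory or Hub repo id ('.' is illustrative)
feature_extractor = WhisperFeatureExtractor.from_pretrained('.')
assert feature_extractor.sampling_rate == 16000  # matches the config above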
renamers.py
DELETED
@@ -1,77 +0,0 @@
-import csv
-import os
-import glob
-import shutil
-import re
-
-
-# rename files from original filename (hexadecimal salad) to Session_ID (human readable) and back
-DEFAULT_MAP_PATH = '../../SessionIDs_from_catalog.csv'
-
-def make_SessionID_map(path=DEFAULT_MAP_PATH):
-    """Generate dictionaries from a csv file with columns for File_Name and Session_ID,
-    copied from columns 1 & 2 of the Catalog on OneDrive.
-    """
-    SID_to_FN = {}
-    FN_to_SID = {}
-    with open(path, encoding='utf-8-sig') as f:
-        reader = csv.reader(f)
-        headers = next(reader)
-        assert (headers[0] == 'File_Name' or headers[0] == 'Conference_ID') and (headers[1] == 'Session_ID'), "Headers are wrong, expected ('File_Name' or 'Conference_ID') and 'Session_ID'"
-
-        for line in reader:
-            filename, sessionID = line
-            filename = filename.split('.')[0]  # remove extensions
-            if len(filename.strip()) > 0 and len(sessionID.strip()) > 0:
-                SID_to_FN[sessionID] = filename
-                FN_to_SID[filename] = sessionID
-    return SID_to_FN, FN_to_SID
-
-
-def rename_files_SID_to_FN(path, recursive=True, overwrite=False):
-    SID_to_FN, _ = make_SessionID_map()
-    # TODO: deal with matching nested sIDs, see commented code below
-    newpaths = []
-    for sID in SID_to_FN.keys():
-        srclist = glob.glob(os.path.join(path, '**', f'*{sID}.*'), recursive=recursive)
-        for srcpath in srclist:
-            newpath = srcpath.replace(sID, SID_to_FN[sID])
-            print(newpath)
-            if overwrite:
-                shutil.move(srcpath, newpath)
-            else:
-                shutil.copy(srcpath, newpath)
-            newpaths.append(newpath)
-    return newpaths
-
-
-# # get sessnames
-# sesslist = [s for s in os.listdir(path)]
-# srclist = [os.path.join(src_dir, filename) for filename in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, filename))]
-# for src in srclist:
-#     sessname_matches = [sessname in src for sessname in sesslist]
-#     if sum(sessname_matches) > 1:
-#         print('!!!! multiple matches, will take longest match. TODO: implement this')
-#     elif not any(sessname_matches):
-#         print(f'!!!! no sessname matches for file {src}')
-#     else:
-#         sessname = sesslist[sessname_matches.index(True)]
-#         print(f'...copying to {sessname}')
-#         shutil.copy(src, os.path.join(dest_dir, sessname))
-
-def rename_files_FN_to_SID(path, recursive=True):
-    # TODO: not yet implemented; only builds the map
-    _, FN_to_SID = make_SessionID_map()
-
-def extract_conferenceID_from_filename(filename):
-    """Extract the conferenceID from a transcript filename."""
-    conferenceID = filename.split(' ')[0]
-    conferenceID = re.sub(r'_?[a-zA-Z]*(\.*[a-zA-Z]*).xlsx', '', conferenceID)
-    conferenceID = re.sub(r'TMcoded|Transcript', '', conferenceID)
-    conferenceID = re.sub(r'_start\d+_end\d+_?', '', conferenceID)
-    conferenceID = re.sub(r'\d{5}_\d{4}-\d{2}-\d{2}_', '', conferenceID)
-    return conferenceID
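A minimal usage sketch, assuming the catalog csv exists at the default path relative to the working directory, and that data/sessions is a hypothetical directory of recordings:

from levi.renamers import make_SessionID_map, rename_files_SID_to_FN

# build both lookup directions from the catalog (reads ../../SessionIDs_from_catalog.csv by default)
SID_to_FN, FN_to_SID = make_SessionID_map()
# copy (overwrite=False) every file whose name contains a known Session_ID to its File_Name equivalent
newpaths = rename_files_SID_to_FN('data/sessions', recursive=True, overwrite=False)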
trimmers.py
DELETED
@@ -1,139 +0,0 @@
-from pathlib import Path
-import os
-import csv
-import subprocess
-import pandas as pd
-import sys
-sys.path.append('..')
-
-from levi.converters import HHMMSS_to_sec
-
-def trim_media(media_in,
-               media_out,
-               start,
-               end):
-
-    # options for writing out audio if converting
-    WAV_CHANNELS = 1
-    WAV_SAMPLE_RATE = 16000
-
-    media_type = Path(media_in).suffix
-    ext = Path(media_out).suffix
-
-    # accept either HH:MM:SS strings or numeric seconds
-    if isinstance(start, str):
-        start_sec = HHMMSS_to_sec(start)
-    else:
-        start_sec = float(start)
-    if isinstance(end, str):
-        end_sec = HHMMSS_to_sec(end)
-    else:
-        end_sec = float(end)
-
-    if ext == '.wav':
-        # convert to wav in the standard format for audio models
-        print(f'...Using ffmpeg to trim media from {start} to {end}\n   and convert to {WAV_SAMPLE_RATE}Hz WAV with {WAV_CHANNELS} channel(s)...')
-        print(f'...generating {media_out}...')
-        subprocess.call(['ffmpeg',
-                         '-y',
-                         '-i', media_in,
-                         '-ss', f'{start_sec}',
-                         '-to', f'{end_sec}',
-                         '-acodec', 'pcm_s16le',
-                         '-ac', str(WAV_CHANNELS),    # subprocess args must be strings
-                         '-ar', str(WAV_SAMPLE_RATE),
-                         media_out,
-                         '-hide_banner',
-                         '-loglevel', 'warning'
-                         ], shell=False)
-    else:
-        print(f'...Using ffmpeg to trim media from {start_sec} to {end_sec}...')
-        print(f'...generating {media_out}...')
-        subprocess.call(['ffmpeg',
-                         '-y',
-                         '-i', media_in,
-                         '-ss', f'{start_sec}',
-                         '-to', f'{end_sec}',
-                         '-c', 'copy',  # stream copy: no re-encoding
-                         media_out,
-                         '-hide_banner',
-                         '-loglevel', 'warning'
-                         ], shell=False)
-
-def trim_media_batch(extract_timings_csv,
-                     outpath,
-                     suffix='',
-                     convert_to=False):
-    """Trim a batch of media files given a csv of timings.
-
-    Args:
-        extract_timings_csv (str): path to csv with columns:
-            filepath, start (HH:MM:SS), end (HH:MM:SS)
-        outpath (str): output path
-        suffix (str, optional): save output trimmed files with this suffix. Defaults to ''.
-        convert_to (str or False, optional): one of [False, 'wav', 'mp4']. Defaults to False.
-    Returns:
-        outfiles (list): list of file paths created
-    """
-    os.makedirs(outpath, exist_ok=True)
-
-    samples_df = pd.read_csv(
-        extract_timings_csv,
-        skip_blank_lines=True,
-        index_col=False,
-        names=['media_in', 'startHMS', 'endHMS'],
-        header=0
-    ).dropna().sort_values(by='media_in', ignore_index=True).reset_index(drop=True)
-
-    print(f'TRIMMING {len(samples_df.index)} FILES...')
-
-    # enumerate samples per session to detect multiple samples from the same recording
-    samples_df['count'] = samples_df.groupby('media_in').cumcount()
-
-    outfiles = []
-    for i, rec in samples_df.iterrows():
-        media_in, startHMS, endHMS, count = rec.values
-        suffix_use = f'{suffix}{count}' if count > 0 else suffix  # if multiple samples per recording, give each a different name
-
-        if not os.path.exists(media_in):
-            print(f'!!!WARNING: media not found: {media_in}')
-            continue
-
-        media_type = Path(media_in).suffix
-        sessname = Path(media_in).stem
-        print(f'...Input media: {media_in}')
-
-        if convert_to == 'wav':
-            ext = '.wav'
-        elif convert_to == 'mp4':
-            ext = '.mp4'
-        else:
-            ext = media_type
-
-        outfile = os.path.expanduser(os.path.join(outpath, f'{sessname}{suffix_use}{ext}'))
-        trim_media(media_in, outfile, HHMMSS_to_sec(startHMS), HHMMSS_to_sec(endHMS))
-        outfiles.append(outfile)
-    return outfiles
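A minimal usage sketch, assuming ffmpeg is on PATH and a hypothetical timings.csv whose rows look like media/session01.mp4,00:01:30,00:06:30:

from levi.trimmers import trim_media_batch

# trim each listed file to its [start, end] window and convert to 16 kHz mono wav
outfiles = trim_media_batch('timings.csv', outpath='trimmed/', suffix='_sample', convert_to='wav')
print(outfiles)  # e.g. ['trimmed/session01_sample.wav']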