|
import os |
|
import csv |
|
import re |
|
import pandas as pd |
|
from pathlib import Path |
|
import numpy as np |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def HHMMSS_to_sec(time_str):
    """Convert a timestamp string to a number of seconds.

    The layout is chosen by the number of ':' separators:

    * ``HH:MM:SS``   - the seconds field may carry a decimal fraction.
    * ``HH:MM:SS:u`` - ``u`` is a fraction of a second whose unit depends on
      its width: 1 digit = tenths, 2 = hundredths, 3 = thousandths.
    * ``MM:SS``
    * ``S.f``        - no colon and exactly one '.'; plain seconds.

    Surrounding whitespace is ignored.

    Args:
        time_str: The timestamp text. ``None``, NaN and empty values are
            treated as "no timestamp".

    Returns:
        The time in seconds as a float, or ``None`` when the input is
        missing or its layout is not supported (a message is printed for
        unsupported layouts).
    """
    # Missing spreadsheet cells reach this function as None or float NaN when
    # it is applied over a pandas column; NaN is the only value that is not
    # equal to itself.
    if time_str is None or time_str != time_str:
        return None
    if not time_str:
        return None

    # Stray whitespace would otherwise change len(u) below and silently
    # select the wrong fraction unit.
    time_str = time_str.strip()
    if not time_str:
        return None

    colons = time_str.count(':')
    if colons == 2:
        h, m, s = time_str.split(':')
    elif colons == 3:
        h, m, s, u = time_str.split(':')
        width = len(u)
        if width not in (1, 2, 3):
            print(f'input string format not supported: {time_str}')
            return None
        if width == 1:
            print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
        # 1 digit -> /10, 2 digits -> /100, 3 digits -> /1000
        s = int(s) + float(u) / 10 ** width
    elif colons == 1:
        m, s = time_str.split(':')
        h = 0
    elif colons == 0 and time_str.count('.') == 1:
        s = float(time_str)
        h = 0
        m = 0
    else:
        print(f'input string format not supported: {time_str}')
        return None
    return int(h) * 3600 + int(m) * 60 + float(s)
|
|
|
def sec_to_timecode(time_sec):
    """Format a number of seconds as an ``H:MM:SS:CC`` timecode.

    ``CC`` is hundredths of a second. The value is rounded to the nearest
    hundredth *before* it is split into fields, so a fraction that rounds up
    to a full second carries into the seconds (and minutes/hours) instead of
    producing a three-digit ``100`` in the last field.

    Args:
        time_sec: Time in seconds (int or float).

    Returns:
        The timecode string, e.g. ``'1:02:03:50'`` for ``3723.5``.
    """
    total_hundredths = int(round(time_sec * 100))
    total_seconds, hundredths = divmod(total_hundredths, 100)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f'{hours}:{minutes:02}:{seconds:02}:{hundredths:02}'
|
|
|
def docx_scraped_tsv_to_table(ooona_file):
    """Read a tab-separated transcript file into an utterance table.

    The first row is skipped (presumably a header row). Every following
    non-blank row must have exactly five fields, in this order:
    utterance index, start time, end time, speaker, transcript. The two time
    fields are converted to seconds with ``HHMMSS_to_sec``.

    Args:
        ooona_file: Path of the TSV file to read.

    Returns:
        A DataFrame with columns ``uttID``, ``speaker``, ``transcript``,
        ``start_sec`` and ``end_sec``; empty when the file has no data rows.

    Raises:
        ValueError: If a non-blank row does not have exactly five fields.
    """
    rows = []
    # newline='' is what the csv module asks for, so that it can do its own
    # newline handling (e.g. line breaks inside quoted fields).
    with open(ooona_file, newline='') as in_file:
        reader = csv.reader(in_file, delimiter="\t")
        # Skip the first row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for line in reader:
            if not line:
                # csv.reader yields an empty list for a blank line.
                continue
            utt_ix, start_time, end_time, speaker, transcript = line
            rows.append([
                utt_ix,
                speaker,
                transcript,
                HHMMSS_to_sec(start_time),
                HHMMSS_to_sec(end_time),
            ])
    return pd.DataFrame(rows, columns=['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec'])
|
|
|
|
|
def molly_xlsx_to_table(xl_file):
    """Load the first sheet of an Excel workbook as an utterance table.

    Column headers are matched case-insensitively (they are lower-cased
    first). The sheet must provide ``#``, ``speaker``, ``dialogue`` and
    ``timecode`` columns; any other columns are ignored. Each ``timecode``
    cell is split at the first '-' into a start and an end timestamp, which
    are converted to seconds with ``HHMMSS_to_sec``.

    Rows in which both the speaker and the dialogue are empty are dropped,
    and the result is re-indexed from 0.

    Args:
        xl_file: Path (or other object accepted by ``pd.ExcelFile``) of the
            workbook to read.

    Returns:
        A DataFrame with columns ``uttID``, ``speaker``, ``transcript``,
        ``start_sec`` and ``end_sec``.

    Raises:
        KeyError: If a required column is missing.
    """
    with pd.ExcelFile(xl_file) as xls:
        table = pd.read_excel(xls, sheet_name=xls.sheet_names[0])

    table.columns = table.columns.str.lower()

    # n=1 keeps the result at two columns even if an end stamp contains '-'.
    bounds = table['timecode'].str.split('-', n=1, expand=True)
    # na_action='ignore' leaves missing timecodes (e.g. from empty rows) as
    # NaN instead of passing NaN into the string parser.
    table['start_sec'] = bounds[0].str.strip().map(HHMMSS_to_sec, na_action='ignore')
    table['end_sec'] = bounds[1].str.strip().map(HHMMSS_to_sec, na_action='ignore')

    # Selecting the wanted columns makes a separate drop of the unwanted ones
    # unnecessary; rename() returns a new frame rather than editing a slice.
    table = table[['#', 'speaker', 'dialogue', 'start_sec', 'end_sec']].rename(
        columns={'#': 'uttID', 'dialogue': 'transcript'}
    )

    # The filter must use the post-rename name: 'dialogue' no longer exists
    # at this point and would raise KeyError.
    table = table.replace('', np.nan).dropna(subset=['speaker', 'transcript'], how='all')
    return table.reset_index(drop=True)
|
|
|
def LoFi_xlsx_to_table(xl_file):
    """Load the first sheet of an Excel workbook as an utterance table.

    The sheet must contain the columns ``#``, ``Speaker``, ``Dialogue``,
    ``Timecode``, ``Annotations``, ``Error Type`` and ``Duration`` (exact
    capitalisation). Each ``Timecode`` cell is split on '-' into a start and
    an end timestamp, and both are converted to seconds with
    ``HHMMSS_to_sec``.

    Args:
        xl_file: Path (or other object accepted by ``pd.ExcelFile``) of the
            workbook to read.

    Returns:
        A DataFrame with columns ``uttID``, ``speaker``, ``transcript``,
        ``start_sec`` and ``end_sec``.
    """
    with pd.ExcelFile(xl_file) as workbook:
        first_sheet = workbook.sheet_names[0]
        sheet = pd.DataFrame(pd.read_excel(workbook, first_sheet))

    sheet[['start_time', 'end_time']] = sheet['Timecode'].str.split('-', expand=True)
    for text_col, sec_col in (('start_time', 'start_sec'), ('end_time', 'end_sec')):
        sheet[sec_col] = sheet[text_col].str.strip().apply(HHMMSS_to_sec)

    # An explicit drop (rather than simply not selecting these columns) keeps
    # the requirement that all three are present in the sheet.
    sheet = sheet.drop(columns=['Annotations', 'Error Type', 'Duration'])
    selected = sheet[['#', 'Speaker', 'Dialogue', 'start_sec', 'end_sec']]
    return selected.rename(
        columns={'#': 'uttID', 'Speaker': 'speaker', 'Dialogue': 'transcript'}
    )
|
|
|
def saga_to_table(saga_txt):
    """Parse a three-lines-per-utterance text transcript into a table.

    The file is read as repeating groups of three lines:

    1. a header of the form ``speaker (timestamp):``; when the header
       contains no '(' the whole line is taken as the timestamp and the
       speaker is carried over from the previous utterance;
    2. the transcript text;
    3. a separator line, whose content is ignored.

    Lines are read as plain text rather than through ``csv.reader``, so a
    transcript that starts with a double quote keeps its quote characters.
    Blank lines at the end of the file are ignored, and a final utterance
    that is not followed by a separator line is still emitted.

    Args:
        saga_txt: Path of the text file to read.

    Returns:
        A DataFrame with columns ``uttID``, ``speaker``, ``transcript``,
        ``start_sec`` and ``end_sec``. ``uttID`` is the 0-based line index of
        the utterance's separator line and ``end_sec`` is always ``None``.

    Raises:
        ValueError: If a header line is blank, which means the three-line
            grouping has been lost.
    """
    with open(saga_txt) as in_file:
        lines = [raw.rstrip('\n') for raw in in_file]

    # Trailing blank lines carry no utterance and would otherwise be read as
    # an empty header.
    while lines and not lines[-1].strip():
        lines.pop()

    rows = []
    speaker = transcript = start_sec = None
    for i, line in enumerate(lines):
        position = i % 3
        if position == 0:
            if not line.strip():
                raise ValueError(
                    f'{saga_txt}: line {i + 1}: expected a '
                    '"speaker (timestamp):" header but found a blank line'
                )
            spk_time = line.split('(')
            if len(spk_time) < 2:
                timestamp = spk_time[0].strip('):( ')
                # Rows are [uttID, speaker, ...], so the previous speaker is
                # at index 1 (index 0 is the utterance id).
                speaker = rows[-1][1] if rows else None
            else:
                speaker = spk_time[0]
                timestamp = spk_time[1].replace('):', '')
            start_sec = HHMMSS_to_sec(timestamp)
        elif position == 1:
            transcript = line
        else:
            rows.append([i, speaker, transcript, start_sec, None])

    # A file that ends right after a transcript line has one pending
    # utterance; give it the id its separator line would have had.
    if len(lines) % 3 == 2:
        rows.append([len(lines), speaker, transcript, start_sec, None])

    return pd.DataFrame(rows, columns=['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec'])
|
|
|
def table_to_ELAN_tsv(table:pd.DataFrame, path:str):
    """Write *table* to *path* as a tab-separated file.

    The index is not written and floats are formatted with three decimals.
    """
    tsv_options = {'sep': '\t', 'float_format': '%.3f', 'index': False}
    table.to_csv(path, **tsv_options)
|
|
|
def table_to_standard_csv(table:pd.DataFrame, path:str):
    """Write *table* to *path* as a comma-separated file.

    The index is not written and floats are formatted with three decimals.
    """
    csv_options = {'float_format': '%.3f', 'index': False}
    table.to_csv(path, **csv_options)
|
|
|
def table_to_utt_labels_csv(table:pd.DataFrame, path:str):
    """Write *table* to *path* as an utterance-labels CSV.

    In the written file ``transcript`` is named ``utterance`` and ``uttID``
    is named ``seg``. Rows in which both ``speaker`` and ``utterance`` are
    empty are left out. The index is not written and floats are formatted
    with three decimals.

    The caller's DataFrame is not modified.
    """
    # rename() without inplace returns a new frame; renaming in place would
    # change the column names of the table the caller still holds.
    labels = table.rename(columns={'transcript': 'utterance', 'uttID': 'seg'})
    labels = labels.replace('', np.nan).dropna(subset=['speaker', 'utterance'], how='all')
    labels.to_csv(path, index=False, float_format='%.3f')
|
|
|
def table_to_molly_xlsx(tbl:pd.DataFrame,path:str):
    """Write an utterance table to an Excel workbook.

    The sheet has the columns ``#``, ``Timecode``, ``Duration``, ``Speaker``,
    ``Dialogue``, ``Annotations`` and ``Error Type``. ``Timecode`` is
    ``"<start> - <end>"`` and ``Duration`` is ``end_sec - start_sec``, all
    formatted with ``sec_to_timecode``; the last two columns are written
    empty. The sheet is named after the file stem of *path*.

    The caller's DataFrame is not modified.

    Args:
        tbl: Table with ``uttID``, ``speaker``, ``transcript``, ``start_sec``
            and ``end_sec`` columns.
        path: Destination ``.xlsx`` path.
    """
    # rename() without inplace returns a new frame, so neither the rename nor
    # the columns added below leak into the table the caller still holds.
    tblx = tbl.rename(columns={'uttID':'#', 'speaker':'Speaker','transcript':'Dialogue'})

    start_timecode = tblx['start_sec'].apply(sec_to_timecode)
    end_timecode = tblx['end_sec'].apply(sec_to_timecode)
    tblx['Duration'] = (tblx['end_sec'] - tblx['start_sec']).apply(sec_to_timecode)
    tblx['Timecode'] = [' - '.join(pair) for pair in zip(start_timecode, end_timecode)]
    tblx['Annotations'] = ''
    tblx['Error Type'] = ''

    tblx = tblx[['#','Timecode','Duration','Speaker','Dialogue','Annotations','Error Type']]
    # Excel limits worksheet names to 31 characters.
    tblx.to_excel(path, sheet_name=Path(path).stem[:31], index=False)
|
|
|
def utt_labels_csv_to_table(label_csv:str):
    """Read an utterance-labels CSV and make sure it has a ``uttID`` column.

    The file is read with ``keep_default_na=False``, so empty cells stay
    empty strings instead of becoming NaN. The id column is then chosen as
    follows:

    * ``uttID`` already present: the table is returned unchanged;
    * ``utt`` present: it is renamed to ``uttID`` and a ``seg`` column, if
      there is one, is dropped;
    * otherwise ``seg`` present: it is renamed to ``uttID``;
    * otherwise the 0-based row index becomes ``uttID``.

    Args:
        label_csv: Path (or file-like object) of the CSV to read.

    Returns:
        The loaded DataFrame with a ``uttID`` column.
    """
    table = pd.read_csv(label_csv, keep_default_na=False)
    columns = table.columns

    if 'uttID' in columns:
        # Renaming or re-indexing would create a second 'uttID' column.
        return table
    if 'utt' in columns:
        # errors='ignore': a file with 'utt' but no 'seg' is still valid.
        return table.rename(columns={"utt": "uttID"}).drop(columns='seg', errors='ignore')
    if 'seg' in columns:
        return table.rename(columns={"seg": "uttID"})
    return table.reset_index().rename(columns={"index": "uttID"})