import os
import csv
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Functions to convert between different transcript/annotation formats.
#
# "table" refers to a pd.DataFrame with the following columns:
#   [uttID, speaker, transcript, start_sec, end_sec]
#
# Separate functions write to csv, tsv or an ELAN-compatible file (ELAN
# interprets ALL commas as delimiters, so tab is used for ELAN instead).

# Column order shared by every function that builds a "table".
_TABLE_COLUMNS = ['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec']

# Excel rejects worksheet names longer than this many characters.
_MAX_SHEET_NAME_LEN = 31


def HHMMSS_to_sec(time_str):
    """Convert a timestamp string to seconds.

    Supported formats:
      * ``HH:MM:SS`` (the seconds field may carry a decimal part)
      * ``HH:MM:SS:u`` where ``u`` is 1, 2 or 3 digits, read as tenths,
        hundredths or thousandths of a second respectively
      * ``MM:SS``
      * ``S.s`` (plain seconds containing exactly one ``.``)

    Args:
        time_str: timestamp string. Non-string values (e.g. the NaN that
            pandas passes for an empty spreadsheet cell) are treated as
            missing.

    Returns:
        Seconds as a float, or None when the input is missing, empty, or in
        an unsupported format (a message is printed for unsupported formats).

    Raises:
        ValueError: if a field of a supported layout is not numeric.
    """
    # pandas `.apply` hands NaN/None through for blank cells; calling str
    # methods on those would raise AttributeError.
    if not isinstance(time_str, str):
        return None
    time_str = time_str.strip()
    if not time_str:
        return None

    fields = time_str.split(':')
    if len(fields) == 3:
        h, m, s = fields
    elif len(fields) == 4:
        # Timestamps with a sub-second field following the seconds, delimited
        # by a colon. The unit is determined by counting the digits.
        h, m, s, u = fields
        if len(u) == 1:
            print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
            fraction = float(u) / 10
        elif len(u) == 2:  # hundredths
            fraction = float(u) / 100
        elif len(u) == 3:  # thousandths
            fraction = float(u) / 1000
        else:
            print(f'input string format not supported: {time_str}')
            return None
        s = int(s) + fraction
    elif len(fields) == 2:
        # Missing HH, assume MM:SS.
        h = 0
        m, s = fields
    elif len(fields) == 1 and time_str.count('.') == 1:
        # Missing HH:MM, assume plain seconds.
        h = 0
        m = 0
        s = float(time_str)
    else:
        print(f'input string format not supported: {time_str}')
        return None
    return int(h) * 3600 + int(m) * 60 + float(s)


def sec_to_timecode(time_sec):
    """Convert seconds to ``H:MM:SS:hh`` (hundredths) as used in .xlsx transcripts.

    Args:
        time_sec: time in seconds; None/NaN is treated as missing.

    Returns:
        The timecode string, or ``''`` when ``time_sec`` is missing.
    """
    if time_sec is None or pd.isna(time_sec):
        return ''
    # Round the *total* number of hundredths first so a carry propagates into
    # seconds/minutes/hours. Rounding only the fractional part could yield a
    # three-digit field such as "0:00:01:100", which HHMMSS_to_sec would then
    # read back as thousandths.
    total_hundredths = int(round(float(time_sec) * 100))
    h, remainder = divmod(total_hundredths, 3600 * 100)
    m, remainder = divmod(remainder, 60 * 100)
    s, u = divmod(remainder, 100)
    return f'{h}:{m:02}:{s:02}:{u:02}'


def docx_scraped_tsv_to_table(ooona_file):
    """Read a tsv copied out of an ooona Word-table export into a table.

    ooona output is a table in a Word docx; for now it is manually copied out
    and saved as tsv. Input columns are SHOT, START, END, SPEAKER, DIALOGUE,
    and the timestamps are converted to seconds with HHMMSS_to_sec.

    Args:
        ooona_file: path to the tab-separated file (first row is a header).

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec].
    """
    rows = []
    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation recommends.
    with open(ooona_file, newline='') as in_file:
        reader = csv.reader(in_file, delimiter="\t")
        next(reader, None)  # skip header (tolerating an empty file)
        for line in reader:
            if not line:
                # blank line, e.g. trailing newline(s) at the end of the file
                continue
            utt_ix, start_time, end_time, speaker, transcript = line
            rows.append([
                utt_ix,
                speaker,
                transcript,
                HHMMSS_to_sec(start_time),
                HHMMSS_to_sec(end_time),
            ])
    return pd.DataFrame(rows, columns=_TABLE_COLUMNS)


def _read_first_sheet(xl_file):
    """Return the first worksheet of an Excel file as a DataFrame."""
    with pd.ExcelFile(xl_file) as xls:
        return pd.read_excel(xls, xls.sheet_names[0])


def _timecoded_sheet_to_table(sheet, id_col, timecode_col, speaker_col, dialogue_col):
    """Build a table from a sheet whose timecode column holds "start - end"."""
    # partition always yields three columns (before, separator, after), so the
    # result is well-formed even for rows with a missing or malformed timecode.
    bounds = sheet[timecode_col].str.partition('-')
    return pd.DataFrame({
        'uttID': sheet[id_col],
        'speaker': sheet[speaker_col],
        'transcript': sheet[dialogue_col],
        'start_sec': bounds[0].str.strip().apply(HHMMSS_to_sec),
        'end_sec': bounds[2].str.strip().apply(HHMMSS_to_sec),
    })


def molly_xlsx_to_table(xl_file):
    """Read a contractor-transcribed xlsx into a table.

    Contractor transcribers provide an xlsx with the following columns
    (matched case-insensitively):
        #:           int
        Timecode:    "HH:MM:SS:ss - HH:MM:SS:ss"
        Duration:    HH:MM:SS:ss
        Speaker:     str
        Dialogue:    str
        Annotations: blank
        Error Type:  blank

    Rows in which both speaker and dialogue are missing are dropped.

    Args:
        xl_file: path to the xlsx file; only the first sheet is read.

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec].
    """
    sheet = _read_first_sheet(xl_file)
    sheet.columns = sheet.columns.str.lower()
    table = _timecoded_sheet_to_table(sheet, '#', 'timecode', 'speaker', 'dialogue')
    # Drop rows with missing values in both speaker and utterance. The
    # dialogue column is called 'transcript' by this point.
    table = table.replace('', np.nan).dropna(subset=['speaker', 'transcript'], how='all')
    return table.reset_index(drop=True)


def LoFi_xlsx_to_table(xl_file):
    """Read a LoFi transcript xlsx into a table.

    LoFi transcripts have the following columns:
        #:           int
        Timecode:    "HH:MM:SS:ss - HH:MM:SS:ss"
        Duration:    HH:MM:SS:ss
        Speaker:     str
        Dialogue:    str
        Annotations: blank
        Error Type:  blank

    Args:
        xl_file: path to the xlsx file; only the first sheet is read.

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec].
    """
    sheet = _read_first_sheet(xl_file)
    return _timecoded_sheet_to_table(sheet, '#', 'Timecode', 'Speaker', 'Dialogue')


def saga_to_table(saga_txt):
    """Read one of saga's own txt transcripts into a table.

    The file is read as repeating groups of three lines:

        speaker (start time MM:SS)
        utterance
        <blank>

    A header line without "(" (or with nothing before it) is treated as a
    bare timestamp and reuses the previous speaker. ``end_sec`` is always
    None because the format carries no end times. ``uttID`` is the 0-based
    index of the third line of each group (2, 5, 8, ...).

    TODO: make more robust by pattern matching instead of modulo.

    Args:
        saga_txt: path to the txt transcript.

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec].
    """
    with open(saga_txt) as in_file:
        lines = in_file.read().splitlines()

    rows = []
    speaker = None  # most recent speaker seen, reused when a header has none
    for first in range(0, len(lines), 3):
        group = lines[first:first + 3]
        if len(group) < 2:
            break  # dangling header with no utterance line
        header, transcript = group[0], group[1]

        parts = header.split('(')
        if len(parts) < 2:
            # no "(": the whole header is the timestamp, speaker not changed
            timestamp = parts[0]
        else:
            timestamp = parts[1]
            if parts[0].strip():
                speaker = parts[0].strip()
        start_sec = HHMMSS_to_sec(timestamp.strip('):( '))

        # Emit the row as soon as header and utterance are known, so the last
        # utterance is kept even without a trailing blank line.
        rows.append([first + 2, speaker, transcript, start_sec, None])
    return pd.DataFrame(rows, columns=_TABLE_COLUMNS)


def table_to_ELAN_tsv(table: pd.DataFrame, path: str):
    """Write table to a tsv compatible with ELAN import (3-decimal floats, no index)."""
    table.to_csv(path, index=False, float_format='%.3f', sep='\t')


def table_to_standard_csv(table: pd.DataFrame, path: str):
    """Write table to the standard csv format agreed upon by the whole team.

    TODO: convert times in seconds back to HH:MM:SS?
    TODO: split utterances into sentences?
    """
    table.to_csv(path, index=False, float_format='%.3f')


def table_to_utt_labels_csv(table: pd.DataFrame, path: str):
    """Write table to the utt_labels csv format compatible with rosy's isatasr lib.

    Renames ``transcript`` -> ``utterance`` and ``uttID`` -> ``seg`` in the
    output, and drops rows in which both speaker and utterance are missing.
    The caller's DataFrame is left unmodified.
    """
    # rename without inplace=True so the caller's frame keeps its column names
    out = table.rename(columns={'transcript': 'utterance', 'uttID': 'seg'})
    # drop rows with missing values in both speaker and utterance
    out = out.replace('', np.nan).dropna(subset=['speaker', 'utterance'], how='all')
    out.to_csv(path, index=False, float_format='%.3f')


def table_to_molly_xlsx(tbl: pd.DataFrame, path: str):
    """Write table to an xlsx laid out like the contractor ("molly") transcripts.

    Output columns: #, Timecode ("start - end"), Duration, Speaker, Dialogue,
    Annotations (blank), Error Type (blank). The sheet is named after the
    file stem, truncated to Excel's 31-character limit. Missing or
    non-numeric start/end times produce blank timecode fields. The caller's
    DataFrame is left unmodified.
    """
    # Coerce to numeric so None/blank times become NaN instead of breaking the
    # subtraction below.
    start = pd.to_numeric(tbl['start_sec'], errors='coerce')
    end = pd.to_numeric(tbl['end_sec'], errors='coerce')
    start_timecode = start.apply(sec_to_timecode)
    end_timecode = end.apply(sec_to_timecode)

    # Build a fresh frame rather than renaming/adding columns on `tbl`.
    sheet = pd.DataFrame({
        '#': tbl['uttID'],
        'Timecode': pd.Series(
            [' - '.join(pair) for pair in zip(start_timecode, end_timecode)],
            index=tbl.index,
            dtype=object,
        ),
        'Duration': (end - start).apply(sec_to_timecode),
        'Speaker': tbl['speaker'],
        'Dialogue': tbl['transcript'],
        'Annotations': '',
        'Error Type': '',
    })
    sheet_name = Path(path).stem[:_MAX_SHEET_NAME_LEN]
    sheet.to_excel(path, sheet_name=sheet_name, index=False)


def utt_labels_csv_to_table(label_csv: str):
    """Read a utt_labels csv into a DataFrame with a ``uttID`` column.

    utt_labels csv is the usual format used for diarized, timed transcripts
    in this repo. There are several versions with different columns
    (with/without segment and/or utterance index). The ``uttID`` column is
    taken from ``utt`` if present (any ``seg`` column is then dropped), else
    from ``seg``, else from the row index. Other columns are passed through
    unchanged; empty cells are kept as empty strings rather than NaN.
    """
    table = pd.read_csv(label_csv, keep_default_na=False)

    # choose which column to use for uttID in table
    if 'utt' in table.columns:
        # 'seg' is optional in this variant, so don't fail when it is absent
        table = table.rename(columns={"utt": "uttID"}).drop(columns='seg', errors='ignore')
    elif 'seg' in table.columns:
        table = table.rename(columns={"seg": "uttID"})
    else:
        table = table.reset_index().rename(columns={"index": "uttID"})
    return table