LEVI_whisper_medium.en / converters.py
rosyvs's picture
Upload folder using huggingface_hub
6d504a5 verified
raw
history blame
8.73 kB
import os
import csv
import re
import pandas as pd
from pathlib import Path
import numpy as np
# Functions to convert between different transcript/annotation formats.
#######
# "table" refers to a pd.DataFrame with the following columns:
# [uttID, speaker, transcript, start_sec, end_sec]
#########
# separate function to write to csv, tsv or ELAN compatible (ELAN interprets ALL commas as delimiter so we need to use tab instead)
def HHMMSS_to_sec(time_str):
    """Convert a timestamp string to a number of seconds.

    Accepted layouts (fields are separated by ':'):
      * ``HH:MM:SS``    - the seconds field may carry a decimal fraction
      * ``HH:MM:SS:u``  - ``u`` is a 1-3 digit fraction of a second, read as
                          tenths, hundredths or thousandths respectively
      * ``MM:SS``       - hours assumed to be 0
      * ``SS`` / ``SS.fff`` - no colon at all, plain seconds

    Args:
        time_str: timestamp text; surrounding whitespace is ignored.

    Returns:
        Seconds as a float, or None when the input is missing (None, empty
        string, NaN) or its layout is not recognised (a message is printed
        in the latter case).
    """
    # Blank spreadsheet cells arrive here as float NaN when this function is
    # used with Series.apply; NaN is the only value not equal to itself.
    if isinstance(time_str, float) and time_str != time_str:
        return None
    if not time_str:
        return None
    time_str = time_str.strip()
    if not time_str:
        return None

    n_colons = time_str.count(':')
    if n_colons == 2:
        h, m, s = time_str.split(':')
    elif n_colons == 3:
        # weird timestamps where a sub-second field follows the seconds,
        # delimited by a colon
        h, m, s, u = time_str.split(':')
        # the number of digits tells us the unit of the sub-second field
        if len(u) == 1:  # tenths
            print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
            frac = float(u) / 10
        elif len(u) == 2:  # hundredths
            frac = float(u) / 100
        elif len(u) == 3:  # thousandths
            frac = float(u) / 1000
        else:
            print(f'input string format not supported: {time_str}')
            return None
        s = int(s) + frac
    elif n_colons == 1:
        # missing HH, assume MM:SS
        h = 0
        m, s = time_str.split(':')
    elif n_colons == 0:
        # missing HH:MM, assume plain seconds ("12.5" as well as "12")
        try:
            return float(time_str)
        except ValueError:
            print(f'input string format not supported: {time_str}')
            return None
    else:
        print(f'input string format not supported: {time_str}')
        return None
    return int(h) * 3600 + int(m) * 60 + float(s)
def sec_to_timecode(time_sec):
    """Format seconds as ``H:MM:SS:hh`` (hh = hundredths), as used in .xlsx transcripts.

    Args:
        time_sec: non-negative time in seconds (int or float).

    Returns:
        The timecode string, e.g. ``3723.45 -> '1:02:03:45'``.
    """
    # Round once, on the total number of hundredths, and derive every field
    # from that integer. Rounding only the fractional part can yield 100
    # hundredths (e.g. 1.999 -> '0:00:01:100'), which is then misread as
    # thousandths when the timecode is parsed back.
    total_hundredths = int(round(time_sec * 100))
    total_seconds, u = divmod(total_hundredths, 100)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f'{h}:{m:02}:{s:02}:{u:02}'
def docx_scraped_tsv_to_table(ooona_file):
    """Read a tab-separated copy of an ooona transcript into a table.

    ooona output is a table in a Word docx; for now it is manually copied out
    and saved as tsv. Input columns are SHOT, START, END, SPEAKER, DIALOGUE,
    preceded by one header row. START/END are converted to seconds with
    HHMMSS_to_sec.

    Args:
        ooona_file: path to the tsv file.

    Returns:
        pd.DataFrame with columns
        [uttID, speaker, transcript, start_sec, end_sec].
    """
    rows = []
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(ooona_file, newline='') as in_file:
        reader = csv.reader(in_file, delimiter="\t")
        next(reader, None)  # skip header; the default avoids StopIteration on an empty file
        for line in reader:
            if not line:
                # csv.reader yields [] for a blank line; nothing to unpack
                continue
            utt_ix, start_time, end_time, speaker, transcript = line
            start_sec = HHMMSS_to_sec(start_time)
            end_sec = HHMMSS_to_sec(end_time)
            rows.append([utt_ix, speaker, transcript, start_sec, end_sec])
    utt_table = pd.DataFrame(rows, columns=['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec'])
    return utt_table
# table = pd.read_csv(ooona_file, sep='\t')
def molly_xlsx_to_table(xl_file):
    """Read a contractor-transcribed .xlsx (first sheet) into a table.

    Contractor transcribers provide an xlsx with the following columns
    (matched case-insensitively):
        #:           utterance index
        Timecode:    "HH:MM:SS:ss - HH:MM:SS:ss"
        Duration:    HH:MM:SS:ss
        Speaker:     str
        Dialogue:    str
        Annotations: blank
        Error Type:  blank
    Only '#', 'Timecode', 'Speaker' and 'Dialogue' are used.

    Args:
        xl_file: path to the workbook.

    Returns:
        pd.DataFrame with columns
        [uttID, speaker, transcript, start_sec, end_sec]; rows in which both
        speaker and dialogue are empty are dropped.
    """
    with pd.ExcelFile(xl_file) as xls:
        table = pd.read_excel(xls, xls.sheet_names[0])
    table.columns = table.columns.str.lower()

    # Drop rows with missing values in both speaker and utterance. This has to
    # happen while the column is still called 'dialogue' (filtering after the
    # rename to 'transcript' raises KeyError) and before the timecodes of
    # such blank rows are parsed.
    table = table.replace('', np.nan).dropna(subset=['speaker', 'dialogue'], how='all')

    def _to_sec(cell):
        # a missing timecode is NaN (a float), which cannot be parsed
        return HHMMSS_to_sec(cell) if isinstance(cell, str) else None

    # n=1: split on the first '-' only so exactly two parts are produced
    bounds = table['timecode'].str.split('-', n=1, expand=True)
    table['start_sec'] = bounds[0].str.strip().apply(_to_sec)
    table['end_sec'] = bounds[1].str.strip().apply(_to_sec)

    table = table[['#', 'speaker', 'dialogue', 'start_sec', 'end_sec']]
    table = table.rename(columns={'#': 'uttID', 'dialogue': 'transcript'})
    return table.reset_index(drop=True)
def LoFi_xlsx_to_table(xl_file):
    """Read a LoFi transcript .xlsx (first sheet) into a table.

    LoFi transcripts have the following columns:
        #:           utterance index
        Timecode:    "HH:MM:SS:ss - HH:MM:SS:ss"
        Duration:    HH:MM:SS:ss
        Speaker:     str
        Dialogue:    str
        Annotations: blank
        Error Type:  blank

    Returns a pd.DataFrame with columns
    [uttID, speaker, transcript, start_sec, end_sec].
    """
    with pd.ExcelFile(xl_file) as workbook:
        first_sheet = workbook.sheet_names[0]
        table = pd.DataFrame(pd.read_excel(workbook, first_sheet))

    # "start - end" -> two text columns, then each one converted to seconds
    table[['start_time', 'end_time']] = table['Timecode'].str.split('-', expand=True)
    for edge in ('start', 'end'):
        stripped = table[f'{edge}_time'].str.strip()
        table[f'{edge}_sec'] = stripped.apply(HHMMSS_to_sec)

    table = table.drop(labels=['Annotations', 'Error Type', 'Duration'], axis=1)
    selected = table[['#', 'Speaker', 'Dialogue', 'start_sec', 'end_sec']]
    new_names = {'#': 'uttID', 'Speaker': 'speaker', 'Dialogue': 'transcript'}
    return selected.rename(columns=new_names)
def saga_to_table(saga_txt):
    """Read one of saga's own .txt transcripts into a table.

    The file is expected to consist of repeating 3-line records:

        speaker (start time MM:SS)
        utterance
        <blank line>

    When a header line contains no '(' the whole line is treated as the
    timestamp and the speaker of the previous record is reused.

    Args:
        saga_txt: path to the transcript text file.

    Returns:
        pd.DataFrame with columns
        [uttID, speaker, transcript, start_sec, end_sec]. uttID is the
        0-based index of the line that closes the record, and end_sec is
        always None because the format carries no end time.
    """
    # TODO: make more robust by pattern matching instead of modulo
    rows = []
    speaker = None       # carried over when a header line names no speaker
    transcript = None
    start_sec = None
    pending = False      # True once an utterance was read but not yet stored
    last_ix = -1

    # Plain line iteration rather than csv.reader: the text is free-form, and
    # csv quote processing would alter lines that begin with a quote mark.
    with open(saga_txt) as in_file:
        for i, raw_line in enumerate(in_file):
            last_ix = i
            line = raw_line.rstrip('\n')
            position = i % 3
            if position == 0:
                # header: "speaker (timestamp)"
                spk_time = line.split('(')
                if len(spk_time) < 2:
                    # speaker not changed, keep the previous one
                    timestamp = spk_time[0].strip('):( ')
                else:
                    speaker = spk_time[0]
                    timestamp = spk_time[1].strip('):( ')
                start_sec = HHMMSS_to_sec(timestamp)
            elif position == 1:
                transcript = line
                pending = True
            else:
                rows.append([i, speaker, transcript, start_sec, None])
                pending = False

    if pending:
        # the file ended without the closing blank line; keep the last
        # utterance, numbered as if that blank line were present
        rows.append([last_ix + 1, speaker, transcript, start_sec, None])

    utt_table = pd.DataFrame(rows, columns=['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec'])
    return utt_table
def table_to_ELAN_tsv(table:pd.DataFrame, path:str):
    """Write `table` to `path` as a tab-separated file for ELAN import.

    Floats are written with three decimals and the index is omitted.
    """
    elan_options = {'sep': '\t', 'float_format': '%.3f', 'index': False}
    table.to_csv(path, **elan_options)
def table_to_standard_csv(table:pd.DataFrame, path:str):
    """Write `table` to `path` in the standard csv format agreed upon by the whole team.

    Floats are written with three decimals and the index is omitted.
    """
    # TODO: convert times in seconds back to HH:MM:SS?
    # TODO: split utterances into sentences?
    csv_options = {'float_format': '%.3f', 'index': False}
    table.to_csv(path, **csv_options)
def table_to_utt_labels_csv(table:pd.DataFrame, path:str):
    """Write `table` to `path` in the utt_labels csv format compatible with rosy's isatasr lib.

    'transcript' is written as 'utterance' and 'uttID' as 'seg'. Rows in
    which both speaker and utterance are empty are dropped. Floats are
    written with three decimals and the index is omitted.

    The caller's DataFrame is left untouched.
    """
    # rename() without inplace returns a new frame, so the caller's column
    # names stay as they were
    labels = table.rename(columns={'transcript': 'utterance', 'uttID': 'seg'})
    # drop rows with missing values in both speaker and utterance
    labels = labels.replace('', np.nan).dropna(subset=['speaker', 'utterance'], how='all')
    labels.to_csv(path, index=False, float_format='%.3f')
def table_to_molly_xlsx(tbl:pd.DataFrame,path:str):
    """Write `tbl` to `path` as an .xlsx in the contractor ("molly") layout.

    Output columns: '#', 'Timecode' ("start - end"), 'Duration', 'Speaker',
    'Dialogue', 'Annotations' (blank), 'Error Type' (blank). Times are
    formatted with sec_to_timecode. The sheet is named after the file stem.

    Args:
        tbl: table with columns [uttID, speaker, transcript, start_sec, end_sec].
             It is not modified.
        path: destination .xlsx path.
    """
    # rename() without inplace returns a new frame; all further column
    # additions happen on that frame, never on the caller's DataFrame
    tblx = tbl.rename(columns={'uttID': '#', 'speaker': 'Speaker', 'transcript': 'Dialogue'})
    start_timecode = tblx['start_sec'].apply(sec_to_timecode)
    end_timecode = tblx['end_sec'].apply(sec_to_timecode)
    tblx['Duration'] = (tblx['end_sec'] - tblx['start_sec']).apply(sec_to_timecode)
    tblx['Timecode'] = [' - '.join(pair) for pair in zip(start_timecode, end_timecode)]
    tblx['Annotations'] = ''
    tblx['Error Type'] = ''
    tblx = tblx[['#', 'Timecode', 'Duration', 'Speaker', 'Dialogue', 'Annotations', 'Error Type']]
    # Excel limits worksheet names to 31 characters
    sheet_name = Path(path).stem[:31]
    tblx.to_excel(path, sheet_name=sheet_name, index=False)
def utt_labels_csv_to_table(label_csv:str):
    """Read a utt_labels csv into a DataFrame with a 'uttID' column.

    utt_labels csv is the usual format used for diarized, timed transcripts
    in this repo. There are several versions with different columns
    (with/without segment and/or utterance index). uttID is chosen as follows:
        * 'utt' column present -> renamed to uttID ('seg', if any, is dropped)
        * else 'seg' present   -> renamed to uttID
        * else                 -> the 0-based row number is used

    Empty cells are kept as empty strings (keep_default_na=False).
    All other columns are returned under the names they have in the file.
    NOTE(review): an 'utterance' column is therefore not renamed to
    'transcript' here - confirm whether callers expect that.

    Args:
        label_csv: path to the csv file.

    Returns:
        pd.DataFrame containing a 'uttID' column plus the remaining columns.
    """
    table = pd.read_csv(label_csv, keep_default_na=False)
    # choose which column to use for uttID in table
    if 'utt' in table.columns:
        # errors='ignore': files with 'utt' but without 'seg' are valid too
        return table.rename(columns={"utt": "uttID"}).drop(columns='seg', errors='ignore')
    if 'seg' in table.columns:
        return table.rename(columns={"seg": "uttID"})
    return table.reset_index().rename(columns={"index": "uttID"})