Upload folder using huggingface_hub

Browse files

Files changed (11) hide show

LEVI_whisper_benchmark.py +47 -0
__init__.py +0 -0
__pycache__/__init__.cpython-310.pyc +0 -0
__pycache__/benchmark_utils.cpython-310.pyc +0 -0
__pycache__/converters.cpython-310.pyc +0 -0
__pycache__/renamers.cpython-310.pyc +0 -0
__pycache__/trimmers.cpython-310.pyc +0 -0
benchmark_utils.py +353 -0
converters.py +206 -0
renamers.py +77 -0
trimmers.py +139 -0

LEVI_whisper_benchmark.py ADDED Viewed

	@@ -0,0 +1,47 @@

#%% imports
import os
from benchmark_utils import ASRmanifest, wer_from_csv
#%% setup paths
corpora_root = '/shared/corpora/forSAGA/'  # root path where audio files are, inserted in place of $DATAROOT in manifest
manif_root = '/shared/corpora/forSAGA/data_manifests/'  # path to dir containing data manifest csvs
output_dir = './ASR_output/'  # where to save ASR output
manifest = 'LEVI_LoFi_v2_TEST_norm_wer_isat'  # name of test manifest
model_name = 'LEVI_whisper_medium.en'  # name of save directory of model you want to evaluate
hf_org = 'levicu'
model_path = f'{hf_org}/{model_name}'
#%% setup paths for Rosy TESTING:
# NOTE: when this file is run top to bottom, every value in this cell
# overrides the one set in the cell above; run only the cell you need.
corpora_root = '/shared/corpora/'  # root path where audio files are, inserted in place of $DATAROOT in manifest
manif_root = '/shared/corpora/data_manifests/ASR/'  # path to dir containing data manifest csvs
output_dir = '/home/rosy/whisat-output/'  # where to save ASR output
manifest = 'LEVI_LoFi_v2_TEST_punc+cased'  # name of test manifest
model_name = 'LEVI_LoFi_v2_MediumEN_Lora_Int8'  # name of save directory of model you want to evaluate
model_path = '/shared/models/LEVI_LoFi_v2_MediumEN_Lora_Int8/final/'
# The last assignment wins. The Hub id of the stock checkpoint uses a hyphen
# ("whisper-medium.en"); the underscore spelling does not resolve.
# model_name (used only to label the output csv) still names the LoRA model.
model_path = 'openai/whisper-medium.en'
#%%
# generate paths
manifest_csv = os.path.join(manif_root, f'{manifest}.csv')
out_csv = os.path.join(output_dir, f'{model_name}_on_{manifest}.csv')
# the results csv is opened for writing below, so its directory must exist
os.makedirs(output_dir, exist_ok=True)
#%% Inference
ASRmanifest(
    manifest_csv=manifest_csv,
    out_csv=out_csv,
    corpora_root=corpora_root,
    model_path=model_path,
)
#%% Evaluation
print(f'reading results from {out_csv}')
print(f'{model_name} on {manifest}')
wer_meas = wer_from_csv(
    out_csv,
    refcol='transcript',
    hypcol='asr',
    printout=True,
    text_norm_method='levi',
)

__init__.py ADDED Viewed

File without changes

__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (128 Bytes). View file

__pycache__/benchmark_utils.cpython-310.pyc ADDED Viewed

Binary file (10.2 kB). View file

__pycache__/converters.cpython-310.pyc ADDED Viewed

Binary file (5.32 kB). View file

__pycache__/renamers.cpython-310.pyc ADDED Viewed

Binary file (1.61 kB). View file

__pycache__/trimmers.cpython-310.pyc ADDED Viewed

Binary file (2.77 kB). View file

benchmark_utils.py ADDED Viewed

	@@ -0,0 +1,353 @@

+#%% imports
+import pandas as pd
+import time
+from tqdm import tqdm
+import torch
+from torch.cuda.amp import autocast
+import transformers
+from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, GenerationConfig
+from transformers import pipeline, AutomaticSpeechRecognitionPipeline
+from peft import PeftModel, PeftConfig
+import warnings
+import jiwer
+from jiwer.process import WordOutput
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import os
+import math
+from decimal import InvalidOperation
+import contractions
+from whisper.normalizers.english import EnglishTextNormalizer
+from num2words import num2words
+import csv
+import re
+import string
+#%% define functions
def ASRmanifest(
                manifest_csv: str,
                out_csv: str,
                corpora_root: str,
                model_path:str,
                ):
    """Run Whisper ASR on a dataset specified in a manifest

    Every manifest row is copied to the output csv with an extra 'asr'
    column holding the transcription. Rows whose audio cannot be read
    (FileNotFoundError / ValueError from the pipeline) are reported on
    stdout and left out of the output.

    Args:
        manifest_csv (str): path to manifest csv listing files to transcribe;
            must contain a 'wav' column
        out_csv (str): path to write output csv (parent directory is created if needed)
        corpora_root (str): root path where audio files are, inserted in place of $DATAROOT in manifest
        model_path (str): path to model directory / huggingface model name
    """
    df = pd.read_csv(manifest_csv, keep_default_na=False)
    fieldnames = list(df.columns) + ['asr']
    asr_pipeline = prepare_pipeline(
        model_path=model_path,
        generate_opts={
            'max_new_tokens': 448,
            'num_beams': 1,  # greedy
            'repetition_penalty': 1,
            'do_sample': False,
        },
    )
    message = "This may take a while on CPU." if asr_pipeline.device.type=="cpu" else "Using GPU"
    print(f'Running ASR for {len(df)} files. {message} ...')
    # open() does not create missing directories
    out_dir = os.path.dirname(out_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # get the start time
    st = time.time()
    # utf-8 is pinned because the results are read back with pandas, which assumes utf-8
    with open(out_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=',')
        writer.writeheader()
        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
            audiofile = row['wav'].replace('$DATAROOT', corpora_root)
            try:
                with torch.no_grad(), autocast():
                    result = asr_pipeline(audiofile)
                asrtext = result['text']
            except (FileNotFoundError, ValueError) as e:
                # keep going, but say why the file was dropped
                print(f'SKIPPED: {audiofile} ({e})')
                continue
            row['asr'] = asrtext
            writer.writerow(row.to_dict())
    compute_time = time.time() - st
    print(f'...transcription complete in {compute_time:.1f} sec')
def load_model(
    model_path:str,
    language='english',
    use_int8 = False,
    device_map='auto'):
    """Load a Whisper model and processor, falling back to a LoRA adapter checkpoint

    A full checkpoint is tried first (the processor is looked up in
    model_path, then in its parent directory). If that raises OSError and
    model_path contains an "adapter_model" sub-directory, the adapter's base
    model is loaded, the adapter weights are merged into it and the base
    model's processor is used. Python warnings and transformers logging are
    silenced as a side effect.

    Args:
        model_path (str): path to model directory / huggingface model name
        language (str, optional): language passed to the processor. Defaults to 'english'.
        use_int8 (bool, optional): forwarded as load_in_8bit. Defaults to False.
        device_map (str, optional): forwarded to from_pretrained. Defaults to 'auto'.
    Returns:
        tuple: (model in eval mode, processor)
    Raises:
        OSError: when neither a full model nor an adapter is found
    """
    warnings.filterwarnings("ignore")
    transformers.utils.logging.set_verbosity_error()
    # options shared by every weight / processor load below
    weight_opts = dict(load_in_8bit=use_int8, device_map=device_map, use_cache=False)
    processor_opts = dict(language=language, task="transcribe")
    try:
        model = WhisperForConditionalGeneration.from_pretrained(model_path, **weight_opts)
        try:
            processor = WhisperProcessor.from_pretrained(model_path, **processor_opts)
        except OSError:
            print('missing tokenizer and preprocessor config files in save dir, checking directory above...')
            processor = WhisperProcessor.from_pretrained(os.path.join(model_path, '..'), **processor_opts)
    except OSError as e:
        print(f'{e}: possibly missing model or config file in model path. Will check for adapter...')
        # checkpoint dir needs adapter_model subdir with adapter_model.bin and adapter_config.json
        adapter_dir = os.path.join(model_path, "adapter_model")
        if not os.path.isdir(adapter_dir):
            raise
        print('found adapter...loading PEFT model')
        peft_config = PeftConfig.from_pretrained(adapter_dir)
        print(f'...loading and merging LORA weights to base model {peft_config.base_model_name_or_path}')
        base_model = WhisperForConditionalGeneration.from_pretrained(
            peft_config.base_model_name_or_path, **weight_opts)
        model = PeftModel.from_pretrained(base_model, adapter_dir).merge_and_unload()
        processor = WhisperProcessor.from_pretrained(
            peft_config.base_model_name_or_path, **processor_opts)
    model.eval()
    return (model, processor)
def prepare_pipeline(model_path, generate_opts):
    """Build an automatic-speech-recognition pipeline around a loaded model

    Args:
        model_path (str): path to model directory / huggingface model name
        generate_opts (dict): generation options, passed to the pipeline as generate_kwargs
    Returns:
        pipeline: ASR pipeline
    """
    model, processor = load_model(model_path=model_path)
    pipeline_kwargs = {
        'model': model,
        'tokenizer': processor.tokenizer,
        'feature_extractor': processor.feature_extractor,
        'generate_kwargs': generate_opts,
    }
    return pipeline("automatic-speech-recognition", **pipeline_kwargs)
+#%% WER evaluation functions
def get_normalizer(text_norm_method='isat'):
    """Return the text-normalisation callable for the named method

    Args:
        text_norm_method (str, optional): one of 'whisper', 'whisper_keep_tags',
            'isat' or 'levi'. Defaults to 'isat'.
    Returns:
        callable: maps a transcript to its normalised form
    Raises:
        NotImplementedError: for any other method name
    """
    # factories are lazy so that only the selected normaliser is resolved/built
    factories = {
        'whisper': lambda: whisper_norm_text_for_wer,
        'whisper_keep_tags': lambda: EnglishTextNormalizer(),
        'isat': lambda: norm_text_for_wer,
        'levi': lambda: levi_norm_text_for_wer,
    }
    for method, make_normalizer in factories.items():
        if text_norm_method == method:
            return make_normalizer()
    raise NotImplementedError(f'unrecognized normalizer method: {text_norm_method}')
def strip_punct(instr, keep_math=False):
    """Strip punctuation word by word

    Each whitespace-separated word loses punctuation at its edges, commas
    are removed from words that start like a grouped number (e.g. "1,000")
    and become spaces elsewhere, and hyphens become spaces unless keep_math
    is set.

    Args:
        instr (str): text to clean
        keep_math (bool, optional): keep % ( ) * + - / at word edges and
            leave hyphens alone. Defaults to False.
    Returns:
        str: cleaned words joined by single spaces, edges trimmed
    """
    if keep_math:
        edge_chars = '!"#$&\',.:;<=>?@[\\]^_`{|}~'
    else:
        edge_chars = string.punctuation
    cleaned = []
    for token in instr.split():
        # delete punct from start and end of word
        token = token.strip(edge_chars)
        # thousands separators: drop the commas so the digits stay together
        if re.match(r'(\d*),(\d)', token) is not None:
            token = token.replace(',', '')
        # any other comma inside a word becomes a space
        token = token.replace(',', ' ')
        if not keep_math:
            # hyphens inside words become space
            token = token.replace('-', ' ').strip()
        cleaned.append(token)
    return ' '.join(cleaned).strip()
def remove_in_brackets(text):
    """Replace every clause in (), [] or <> - brackets included - with one space

    The opening and closing bracket types are not required to match, and a
    run of closing brackets is consumed as a whole.

    Args:
        text (str): text possibly containing bracketed annotations
    Returns:
        str: text with each bracketed clause replaced by a single space
    """
    # raw string: the pattern is unchanged, but the backslashes are no longer
    # invalid string escapes (a SyntaxWarning on recent Python versions)
    return re.sub(r"[\(\[\<].*?[\)\]\>]+", " ", text)
def caught_num2words(text):
    """Spell out the numbers in text, returning unconvertible input unchanged

    Currency amounts and percentages are rewritten first ("$5" -> "5 dollars",
    "5%" -> "5 percent"), punctuation is stripped, and each remaining word is
    passed to num2words. Words that num2words cannot convert are kept as they are.

    Args:
        text (str): a word or short phrase
    Returns:
        str: text with numerals spelled out where possible
    """
    # first do currency replacements #TODO: plurals vs singular
    if '$' in text:
        text = re.sub(r'\$([0-9]+)', r'\g<1> dollars', text)
    # the euro and pound patterns must match their own symbol, not '$'
    if '€' in text:
        text = re.sub(r'€([0-9]+)', r'\g<1> euro', text)
    if '£' in text:
        text = re.sub(r'£([0-9]+)', r'\g<1> pounds', text)
    if '%' in text:
        text = re.sub(r'([0-9]+)%', r'\g<1> percent', text)
    # strip punctuation
    text = strip_punct(text, keep_math=True)
    text = text.strip('*=/')
    # catch strings that might be converted to infinity or NaN and return as is...
    naughty_words = ['INF','Inf','inf','NAN','NaN', 'nan', 'NONE','None','none','Infinity','infinity']
    if text in naughty_words:
        return text
    try:
        words = text.split()
        if len(words) > 1:
            return ' '.join([caught_num2words(word) for word in words])
        return num2words(text)
    except (InvalidOperation, ValueError, OverflowError):
        # not a number, or too large for num2words: leave the text untouched
        return text
def spell_math(text):
    """Spell out mathematical operators (- + * x / =) as words

    Known limitation: a hyphen directly followed by digits is always read as
    "minus", so a token such as "covid-19" becomes "covid minus 19".

    Args:
        text (str): text possibly containing math expressions
    Returns:
        str: text with operators replaced by 'minus', 'plus', 'times',
            'divided by' and 'equals'
    """
    # a hyphen glued between a word character and a numeral ("5-3") needs a
    # separating space, otherwise the result is the unusable token "5minus"
    text = re.sub(r'(?<=\w)-(\d+)', r' minus \g<1>', text)
    # numerals preceded by hyphen become negative
    text = re.sub(r'\-(\d+)', r'minus \g<1>', text)
    text = re.sub(r'(\d+\s?)\-(\s?\d?)', r'\g<1> minus \g<2>', text)
    text = re.sub(r'(\w+\s+)\-(\s?\w+)', r'\g<1> minus \g<2>', text) # need to be more careful with - as this could be a hyphenated word not minus
    text = re.sub(r'(\w+\s?)\+(\s?\w+)', r'\g<1> plus \g<2>', text)
    text = re.sub(r'(\w+\s?)\*(\s?\w+)', r'\g<1> times \g<2>', text)
    text = re.sub(r'(\d+\s?)x(\s?\d)', r'\g<1> times \g<2>', text) # need to be more careful with x as this could be a variable not times
    text = re.sub(r'(\w+\s?)\/(\s?\w+)', r'\g<1> divided by \g<2>', text)
    text = re.sub(r'(\w+\s?)\=(\s?\w+)', r'\g<1> equals \g<2>', text)
    return text
def expand_contractions(str):
    """Expand the contractions in each whitespace-separated word

    Args:
        str (str): text to expand (the name shadows the builtin; it is kept
            so that keyword callers keep working)
    Returns:
        str: the words, each passed through contractions.fix, joined by single spaces
    """
    return ' '.join(contractions.fix(word) for word in str.split())
def norm_text_for_wer(text):
    """Format text or lists of text (e.g. asr, transcript) for wer computation

    A list is joined into a single string, then the following steps are
    applied: drop bracketed annotations and %TAGS, spell out numbers, expand
    contractions, strip punctuation, lowercase, collapse whitespace.
    Note that the clean_REV_transcript function should be applied first to
    remove REV-specific keywords and extract text from docx format tables.

    Args:
        text (str | list): transcript text, or a list of text pieces
    Returns:
        str: normalised text
    """
    if isinstance(text, list):
        text = ' '.join(text)
    text = str(text)
    text = text.replace('\n', ' ') # replace newline with space
    text = remove_in_brackets(text) # removes non-spoken annotations such as [inaudible]
    text = re.sub(r'%\w+', '', text) # remove %HESITATION etc
    # the loop variable is no longer called `str`, which shadowed the builtin used above
    text = ' '.join([caught_num2words(word) for word in text.split(' ')]) # spell out numbers
    text = expand_contractions(text)
    text = strip_punct(text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text) # replace multiple space with single
    return text
def levi_norm_text_for_wer(text):
    """Format text or lists of text (e.g. asr, transcript) for wer computation, specialised for math language

    Same steps as the 'isat' normalisation, except that math operators are
    spelled out first and the punctuation stripping keeps math symbols.

    Args:
        text (str | list): transcript text, or a list of text pieces
    Returns:
        str: normalised text
    """
    if isinstance(text, list):
        text = ' '.join(text)
    text = str(text)
    text = text.replace('\n', ' ') # replace newline with space
    text = remove_in_brackets(text) # removes non-spoken annotations such as [inaudible]
    text = re.sub(r'%\w+', '', text) # remove %HESITATION etc
    text = spell_math(text)
    # the loop variable is no longer called `str`, which shadowed the builtin used above
    text = ' '.join([caught_num2words(word) for word in text.split(' ')]) # spell out numbers
    text = expand_contractions(text)
    text = strip_punct(text, keep_math=True)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text) # replace multiple space with single
    return text
def whisper_norm_text_for_wer(text):
    """Format text for wer computation with the Whisper English normaliser

    Corpus-specific %TAGS and bracketed annotations are stripped before the
    text is handed to EnglishTextNormalizer.

    Args:
        text (str | list): transcript text, or a list of text pieces
    Returns:
        str: normalised text
    """
    if isinstance(text, list):
        text = ' '.join(text)
    text = str(text)
    text = text.replace('\n', ' ') # replace newline with space
    text = re.sub(r'%\w+', '', text) # remove %HESITATION etc
    text = remove_in_brackets(text) # removes non-spoken annotations such as [inaudible]
    normalizer = EnglishTextNormalizer()
    return normalizer(text)
def wer_from_df(
    df,
    refcol='ref',
    hypcol='hyp',
    return_alignments=False,
    normalise = True,
    text_norm_method='isat',
    printout=True):
    """Compute WER from a dataframe containing a ref col and a hyp col

    WER is computed on the edit operation counts over the whole df,
    not averaged over single utterances. Rows whose reference is empty
    (after normalisation, if applied) are left out.

    Args:
        df (pandas DataFrame): containing rows per utterance
        refcol (str, optional): column name containing reference transcript. Defaults to 'ref'.
        hypcol (str, optional): column name containing hypothesis transcript. Defaults to 'hyp'.
        return_alignments (bool, optional): Return full word-level alignments. Defaults to False.
        normalise (bool, optional): Apply text normalisation to ref and hyp. Defaults to True.
        text_norm_method (str, optional): normaliser name, see get_normalizer. Defaults to 'isat'.
        printout (bool, optional): Print WER metrics. Defaults to True.
    Returns:
        dict: jiwer measures plus 'word_count', 'sub_rate', 'del_rate' and 'ins_rate'
    """
    normalizer = get_normalizer(text_norm_method)
    refs = df[refcol].astype(str)
    hyps = df[hypcol].astype(str)
    if normalise:
        refs, hyps = refs.apply(normalizer), hyps.apply(normalizer)
    # an empty reference cannot be scored, so drop those rows from both sides
    if (refs == '').any():
        keep = refs.str.len() > 0
        refs, hyps = refs[keep], hyps[keep]
    wer_meas = jiwer.compute_measures(list(refs), list(hyps))
    if not return_alignments:
        # remove alignments
        for key in ('ops', 'truth', 'hypothesis'):
            del wer_meas[key]
    # number of reference words = everything that was hit, substituted or deleted
    n_words = wer_meas['substitutions'] + wer_meas['deletions'] + wer_meas['hits']
    wer_meas['word_count'] = n_words
    for rate, count in (('sub_rate', 'substitutions'),
                        ('del_rate', 'deletions'),
                        ('ins_rate', 'insertions')):
        wer_meas[rate] = wer_meas[count] / n_words
    if printout:
        for key in ['wer', 'sub_rate', 'del_rate', 'ins_rate']:
            print((f"{key}={100*wer_meas[key]:.1f}" ))
        print(f"word_count={int(wer_meas['word_count'])}")
    return wer_meas
def wer_from_csv(
    csv_path,
    refcol='ref',
    hypcol='hyp',
    return_alignments=False,
    normalise = True,
    text_norm_method='isat' ,
    printout=True):
    """Compute WER from a csv file; thin wrapper around wer_from_df

    Args:
        csv_path (str): csv with one row per utterance; read with every column cast to str
        refcol, hypcol, return_alignments, normalise, text_norm_method, printout:
            forwarded unchanged to wer_from_df
    Returns:
        dict: the measures returned by wer_from_df
    """
    results = pd.read_csv(csv_path).astype(str)
    return wer_from_df(
        results,
        refcol=refcol,
        hypcol=hypcol,
        return_alignments=return_alignments,
        normalise=normalise,
        text_norm_method=text_norm_method,
        printout=printout,
    )

converters.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import os
+import csv
+import re
+import pandas as pd
+from pathlib import Path
+import numpy as np
+# functions to convert between different transcript/annotation formats
+#######
+# "table" refers to a pd.Dataframe w the following cols
+# [uttID, speaker, transcript, start_sec, end_sec]
+#########
+# separate function to write to csv, tsv or ELAN compatible (ELAN interprets ALL commas as delimiter so we need to use tab instead)
def HHMMSS_to_sec(time_str):
    """Convert a timestamp string to seconds.

    Supported layouts: HH:MM:SS(.fff), HH:MM:SS:f / :ff / :fff (sub-second
    field after a fourth colon), MM:SS and SS.fff. Anything else - including
    a bare integer string - is reported on stdout and yields None.

    Args:
        time_str (str): timestamp; a falsy value returns None
    Returns:
        float | None: seconds, or None for empty/unsupported input
    """
    if not time_str:
        return None
    n_colons = time_str.count(':')
    if n_colons == 3:
        # weird timestamps where a sub-second field follows the seconds, delimited by colon
        h, m, s, u = time_str.split(':')
        # the digit count tells whether the field is tenths, hundredths or thousandths
        divisor = {1: 10, 2: 100, 3: 1000}.get(len(u))
        if divisor is None:
            print(f'input string format not supported: {time_str}')
            return None
        if divisor == 10:
            print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
        ms = float(u) / divisor
        s = int(s) + ms
    elif n_colons == 2:
        h, m, s = time_str.split(':')
    elif n_colons == 1:
        # missing HH from timestamp, assuming MM:SS
        h = 0
        m, s = time_str.split(':')
    elif n_colons == 0 and time_str.count('.') == 1:
        # missing HH:MM from timestamp, assuming SS.ms
        h, m, s = 0, 0, float(time_str)
    else:
        print(f'input string format not supported: {time_str}')
        return None
    return int(h) * 3600 + int(m) * 60 + float(s)
def sec_to_timecode(time_sec):
    """Convert seconds to HH:MM:SS:hundredths as used in .xlsx transcripts

    The value is rounded to whole hundredths before it is split into fields,
    so a fraction that rounds up carries into the seconds (1.999 gives
    '0:00:02:00') instead of producing a three-digit hundredths field.

    Args:
        time_sec (float): time in seconds
    Returns:
        str: timecode formatted as H:MM:SS:hh
    """
    total_hundredths = int(round(100 * time_sec))
    total_sec, u = divmod(total_hundredths, 100)
    h, remainder = divmod(total_sec, 3600)
    m, s = divmod(remainder, 60)
    timecode = f'{h}:{m:02}:{s:02}:{u:02}'
    return timecode
def docx_scraped_tsv_to_table(ooona_file):
    """Read a tsv copied out of an ooona .docx table into a standard table

    The input columns are SHOT, START, END, SPEAKER, DIALOGUE; the header
    row is skipped and the START/END timestamps are converted to seconds.

    Args:
        ooona_file (str): path to the tab-separated file
    Returns:
        pd.DataFrame: columns [uttID, speaker, transcript, start_sec, end_sec]
    """
    with open(ooona_file) as in_file:
        reader = csv.reader(in_file, delimiter="\t")
        next(reader) # skip header
        rows = [
            [utt_ix, speaker, transcript, HHMMSS_to_sec(start_time), HHMMSS_to_sec(end_time)]
            for utt_ix, start_time, end_time, speaker, transcript in reader
        ]
    columns = ['uttID','speaker','transcript','start_sec','end_sec']
    return pd.DataFrame(rows, columns=columns)
+    # table = pd.read_csv(ooona_file, sep='\t')
def molly_xlsx_to_table(xl_file):
    """Read a contractor transcript .xlsx (first sheet) into a standard table

    Expected columns (matched case-insensitively):
        #:           int, utterance index
        Timecode:    "HH:MM:SS:ss - HH:MM:SS:ss"
        Duration:    HH:MM:SS:ss
        Speaker:     str
        Dialogue:    str
        Annotations: blank
        Error Type:  blank

    Args:
        xl_file (str): path to the .xlsx file
    Returns:
        pd.DataFrame: columns [uttID, speaker, transcript, start_sec, end_sec],
            without the rows where speaker and transcript are both missing
    """
    with pd.ExcelFile(xl_file) as xls:
        table = pd.read_excel(xls, xls.sheet_names[0])
    table.columns = table.columns.str.lower()
    table[['start_time','end_time']] = table['timecode'].str.split('-',expand=True)
    table['start_sec'] = table['start_time'].str.strip().apply(HHMMSS_to_sec)
    table['end_sec'] = table['end_time'].str.strip().apply(HHMMSS_to_sec)
    # selecting the wanted columns makes a separate drop of the others unnecessary
    table = table[['#','speaker','dialogue','start_sec','end_sec']].rename(
        columns={'#':'uttID', 'dialogue':'transcript'})
    table = table.reset_index(drop=True)
    # drop rows with missing values in both speaker and utterance; the utterance
    # column is called 'transcript' by now ('dialogue' no longer exists)
    table = table.replace('', np.nan).dropna(subset=['speaker','transcript'], how='all')
    return table
def LoFi_xlsx_to_table(xl_file):
    """Read a LoFi transcript .xlsx (first sheet) into a standard table

    LoFi transcripts have the following columns:
        #:           int, utterance index
        Timecode:    "HH:MM:SS:ss - HH:MM:SS:ss"
        Duration:    HH:MM:SS:ss
        Speaker:     str
        Dialogue:    str
        Annotations: blank
        Error Type:  blank

    Args:
        xl_file (str): path to the .xlsx file
    Returns:
        pd.DataFrame: columns [uttID, speaker, transcript, start_sec, end_sec]
    """
    with pd.ExcelFile(xl_file) as xls:
        first_sheet = xls.sheet_names[0]
        table = pd.DataFrame(pd.read_excel(xls, first_sheet))
    # "start - end" -> two columns, each then converted to seconds
    table[['start_time','end_time']] = table['Timecode'].str.split('-',expand=True)
    for source, target in (('start_time', 'start_sec'), ('end_time', 'end_sec')):
        table[target] = table[source].str.strip().apply(HHMMSS_to_sec)
    table.drop(labels=['Annotations','Error Type','Duration'], axis=1, inplace=True)
    wanted = ['#','Speaker','Dialogue','start_sec','end_sec']
    new_names = {'#':'uttID','Speaker':'speaker', 'Dialogue':'transcript'}
    return table[wanted].rename(columns=new_names)
def saga_to_table(saga_txt):
    """Read one of saga's own .txt transcripts into a standard table

    The file is expected to repeat this three-line pattern:

        speaker (start time MM:SS)
        utterance
        <blank line>

    A header line without "(" is treated as a bare timestamp and inherits the
    speaker of the previous utterance. Every line read is echoed to stdout.

    Args:
        saga_txt (str): path to the transcript
    Returns:
        pd.DataFrame: columns [uttID, speaker, transcript, start_sec, end_sec];
            end_sec is always None
    """
    # TODO: make more robust by pattern matching instead of modulo
    with open(saga_txt) as in_file:
        reader = csv.reader(in_file, delimiter="\n")
        count = 0
        rows = []
        for i, line in enumerate(reader):
            print((count, line))
            phase = count % 3
            if phase == 0:
                spk_time = line[0].split('(')
                if len(spk_time) < 2:
                    timestamp = spk_time[0].strip('):( ')
                    # rows hold [uttID, speaker, ...]: the speaker is field 1, not field 0
                    speaker = rows[-1][1]   # prev speaker
                else:
                    speaker = spk_time[0]
                    timestamp = spk_time[1].replace('):', '')
                start_sec = HHMMSS_to_sec(timestamp)
            elif phase == 1:
                transcript = line[0]
            else:
                rows.append([i, speaker, transcript, start_sec, None])
            count += 1
        # a file that ends right after an utterance (no trailing blank line)
        # would otherwise lose its last record
        if count % 3 == 2:
            rows.append([count, speaker, transcript, start_sec, None])
    utt_table = pd.DataFrame(rows, columns=['uttID','speaker','transcript','start_sec','end_sec'])
    return utt_table
def table_to_ELAN_tsv(table:pd.DataFrame, path:str):
    """Write table to a tab-separated file compatible with ELAN import

    Args:
        table (pd.DataFrame): table to write
        path (str): output file; floats are written with three decimals, no index column
    """
    export_opts = {'sep': '\t', 'index': False, 'float_format': '%.3f'}
    table.to_csv(path, **export_opts)
def table_to_standard_csv(table:pd.DataFrame, path:str):
    """Write table to the standard csv format agreed upon by the whole team

    Args:
        table (pd.DataFrame): table to write
        path (str): output file; floats are written with three decimals, no index column
    """
    # TODO: convert times in seconds back to HH:MM:SS?
    # TODO: split utterances into sentences?
    export_opts = {'index': False, 'float_format': '%.3f'}
    table.to_csv(path, **export_opts)
def table_to_utt_labels_csv(table:pd.DataFrame, path:str):
    """Write table to the utt_labels csv format compatible with rosy's isatasr lib

    'transcript' is written as 'utterance' and 'uttID' as 'seg'; rows where
    speaker and utterance are both empty are left out. The DataFrame passed
    in is not modified.

    Args:
        table (pd.DataFrame): table with speaker and transcript columns
        path (str): output csv; floats are written with three decimals, no index column
    """
    # rename on a copy: an in-place rename would change the caller's column names
    labels = table.rename(columns={'transcript':'utterance', 'uttID':'seg'})
    # drop rows with missing values in speaker and utterance
    labels = labels.replace('', np.nan).dropna(subset=['speaker','utterance'], how='all')
    labels.to_csv(path, index=False, float_format='%.3f')
def table_to_molly_xlsx(tbl:pd.DataFrame,path:str):
    """Write a standard table to the .xlsx layout used by contractor transcribers

    Output columns: #, Timecode ("start - end"), Duration, Speaker, Dialogue,
    Annotations, Error Type (the last two are empty). The sheet is named after
    the file stem. The DataFrame passed in is not modified.

    Args:
        tbl (pd.DataFrame): table with uttID, speaker, transcript, start_sec, end_sec
        path (str): output .xlsx path
    """
    # rename() returns a new frame, so the helper columns added below
    # never reach the caller's table
    tblx = tbl.rename(columns={'uttID':'#', 'speaker':'Speaker','transcript':'Dialogue'})
    tblx['dur_s'] = tblx['end_sec']-tblx['start_sec']
    tblx['start_timecode'] = tblx['start_sec'].apply(sec_to_timecode)
    tblx['end_timecode'] = tblx['end_sec'].apply(sec_to_timecode)
    tblx['Duration'] = tblx['dur_s'].apply(sec_to_timecode)
    tblx['Timecode'] = [' - '.join(i) for i in zip(tblx['start_timecode'], tblx['end_timecode'])]
    tblx['Annotations'] = ''
    tblx['Error Type'] = ''
    tblx = tblx[['#','Timecode','Duration','Speaker','Dialogue','Annotations','Error Type']]
    tblx.to_excel(path, sheet_name=Path(path).stem, index=False)
def utt_labels_csv_to_table(label_csv:str):
    """Read an utt_labels csv into a table with a uttID column

    utt_labels csv is the usual format used for diarized, timed transcripts
    in this repo. There are several versions with different columns
    (with/without segment and/or utterance index). The uttID is taken from
    'utt' if present (any 'seg' column is then dropped), else from 'seg',
    else from the row index.

    Args:
        label_csv (str): path to the csv
    Returns:
        pd.DataFrame: the csv contents with a 'uttID' column
    """
    table = pd.read_csv(label_csv,keep_default_na=False)
    # choose which column to use for uttID in table
    if 'utt' in table.columns:
        # 'seg' may be absent in this variant, so do not insist on dropping it
        table = table.rename(columns={"utt":"uttID"}).drop(columns='seg', errors='ignore')
    elif 'seg' in table.columns:
        table = table.rename(columns={"seg":"uttID"})
    else:
        table = table.reset_index().rename(columns={"index":"uttID"})
    return table

renamers.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import csv
+import os
+import glob
+import shutil
+import re
+# rename files from original filename (hexadecimal salad) to Session_ID (human readable) and back
# default location of the File_Name -> Session_ID catalog export
# (a module-level `global` statement has no effect, so none is used)
DEFAULT_MAP_PATH = '../../SessionIDs_from_catalog.csv'
def make_SessionID_map(path=DEFAULT_MAP_PATH):
    """Generate lookup dictionaries from a csv with File_Name and Session_ID columns

    The csv is copied from columns 1 & 2 of the Catalog on OneDrive. Only the
    first two fields of each row are used; blank rows and rows where either
    field is empty are skipped. Everything from the first '.' of the file
    name onwards is removed.

    Args:
        path (str, optional): csv path. Defaults to DEFAULT_MAP_PATH.
    Returns:
        tuple: (SID_to_FN, FN_to_SID) dictionaries
    Raises:
        AssertionError: when the header row is not
            ('File_Name' or 'Conference_ID', 'Session_ID')
    """
    SID_to_FN = {}
    FN_to_SID = {}
    with open(path, encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        headers = next(reader)
        assert (headers[0]=='File_Name' or headers[0]=='Conference_ID') and (headers[1]=='Session_ID'), "Headers are wrong, expected ('File_Name' or 'Conference_ID') and 'Session_ID'"
        for line in reader:
            # blank lines come back as [] and must not break the unpacking below
            if len(line) < 2:
                continue
            filename, sessionID = line[:2]
            filename = filename.split('.')[0] # remove extensions
            if filename.strip() and sessionID.strip():
                SID_to_FN[sessionID] = filename
                FN_to_SID[filename] = sessionID
    return (SID_to_FN, FN_to_SID)
def rename_files_SID_to_FN(path, recursive=True, overwrite=False):
    """Copy or move files so that the Session_ID in their path becomes the original file name

    Files matching '*<Session_ID>.*' under path are located with glob and the
    Session_ID is replaced everywhere it occurs in the path string. The map is
    read from the default catalog csv.

    Args:
        path (str): directory to search
        recursive (bool, optional): passed to glob. Defaults to True.
        overwrite (bool, optional): True moves the file, otherwise it is copied. Defaults to False.
    Returns:
        list: the new paths created
    """
    SID_to_FN, _ = make_SessionID_map()
    #TODO: deal with matching nested sIDs, see commented code below
    # explicit comparison kept on purpose: only a value equal to True selects "move"
    transfer = shutil.move if overwrite == True else shutil.copy
    newpaths = []
    for sID, filename in SID_to_FN.items():
        pattern = os.path.join(path, '**', f'*{sID}.*')
        for srcpath in glob.glob(pattern, recursive=recursive):
            newpath = srcpath.replace(sID, filename)
            print(newpath)
            transfer(srcpath, newpath)
            newpaths.append(newpath)
    return newpaths
+    # # get sessnames
+    # sesslist = [s for s in os.listdir(path) ]
+    # srclist = [os.path.join(src_dir, filename) for filename in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, filename))]
+    # for src in srclist:
+    #     sessname_matches = [sessname in src for sessname in sesslist]
+    #     if sum(sessname_matches)>1:
+    #         print('!!!!    multiple matches, will take longest match. TODO: implement this you dope')
+    #     elif not any(sessname_matches):
+    #         print(f'!!!!    no sessname matches for file {src}')
+    #     else:
+    #         sessname = sesslist[sessname_matches.index(True)]
+    #         print(f'...copying to {sessname}')
+    #         shutil.copy(src, os.path.join(dest_dir,sessname))
def rename_files_FN_to_SID(path, recursive=True):
    """Placeholder for the reverse rename (file name -> Session_ID)

    Not implemented yet: only the map is loaded from the default catalog csv;
    no file is touched and None is returned.

    Args:
        path (str): unused for now
        recursive (bool, optional): unused for now. Defaults to True.
    """
    FN_to_SID = make_SessionID_map()[1]
def extract_conferenceID_from_filename(filename):
    """Extract conferenceID from filename

    Only the part before the first space is considered. Removed from it, in
    this order: a trailing "[_letters].xlsx" tail, the words
    "TMcoded"/"Transcript", a "_start<N>_end<N>" marker, and a leading
    "<5 digits>_<YYYY-MM-DD>_" prefix.

    Args:
        filename (str): file name to parse
    Returns:
        str: what is left after the removals
    """
    conferenceID = filename.split(' ')[0]
    # raw strings: same patterns as before, without invalid string escapes
    # NOTE(review): the '.' before 'xlsx' is unescaped and matches any character - confirm intended
    cleanup_patterns = (
        r'_?[a-zA-Z]*(\.*[a-zA-Z]*).xlsx',
        r'TMcoded|Transcript',
        r'_start\d+_end\d+_?',
        r'\d{5}_\d{4}-\d{2}-\d{2}_',
    )
    for pattern in cleanup_patterns:
        conferenceID = re.sub(pattern, '', conferenceID)
    return conferenceID

trimmers.py ADDED Viewed

	@@ -0,0 +1,139 @@

+from pathlib import Path
+import os
+import csv
+import subprocess
+import pandas as pd
+import sys
+sys.path.append('..')
+from levi.converters import HHMMSS_to_sec
def trim_media(media_in,
                media_out,
                start,
                end):
    """Cut the span [start, end] out of a media file with ffmpeg

    When media_out ends in '.wav' the audio is re-encoded as 16 kHz mono
    16-bit PCM; for any other extension the streams are copied unchanged.
    An existing output file is overwritten (-y).

    Args:
        media_in (str): input media path
        media_out (str): output path; its extension selects the mode
        start (str | float): start time, timestamp string or seconds
        end (str | float): end time, timestamp string or seconds
    """
    # options for writing out audio if converting
    WAV_CHANNELS = 1
    WAV_SAMPLE_RATE = 16000
    ext = Path(media_out).suffix
    start_sec = HHMMSS_to_sec(start) if isinstance(start, str) else float(start)
    end_sec = HHMMSS_to_sec(end) if isinstance(end, str) else float(end)
    if ext == '.wav':
        # convert to wav with standard format for audio models
        print(f'...Using ffmpeg to trim video from {start} to {end} \n   and convert to {WAV_SAMPLE_RATE}Hz WAV with {WAV_CHANNELS} channels...')
        # every element of the argument list must be a string: passing the
        # ints directly makes subprocess raise TypeError
        codec_args = ['-acodec', 'pcm_s16le',
                      '-ac', str(WAV_CHANNELS),
                      '-ar', str(WAV_SAMPLE_RATE)]
    else:
        print(f'...Using ffmpeg to trim video from {start_sec} to {end_sec}...')
        codec_args = ['-c', 'copy']
    print(f'...generating {media_out}...')
    # global options go first; placed after the output file they count as
    # trailing options, which ffmpeg may ignore
    command = ['ffmpeg',
               '-hide_banner',
               '-loglevel', 'warning',
               '-y',
               '-i', media_in,
               '-ss', f'{start_sec}',
               '-to', f'{end_sec}',
               *codec_args,
               media_out]
    subprocess.call(command, shell=False)
def trim_media_batch(extract_timings_csv,
                    outpath,
                    suffix='',
                    convert_to=False):
    """trim a batch of media files given a csv of timings

    Args:
        extract_timings_csv (str): path to csv with columns:
            filepath, start (HH:MM:SS), end (HH:MM:SS); the header row is
            replaced, rows with missing values are dropped
        outpath (str): output directory, created if needed ('~' is expanded)
        suffix (str, optional): save output trimmed files with this suffix. Defaults to ''.
        convert_to (str | bool, optional): 'wav' or 'mp4' forces that output
            extension; any other value keeps the input extension. Defaults to False.
    Returns:
        outfiles (list): list of file paths created
    """
    # expand '~' once, so the directory that gets created is the same one
    # the trimmed files are written to
    outpath = os.path.expanduser(outpath)
    os.makedirs(outpath, exist_ok=True)
    samples_df = pd.read_csv(
        extract_timings_csv,
        skip_blank_lines=True,
        index_col=False,
        names=['media_in','startHMS','endHMS'],
        header=0
        ).dropna().sort_values(
            by='media_in',ignore_index=True).reset_index(drop=True)
    print(f'TRIMMING {len(samples_df.index)} FILES...')
    # enumerate samples by session and check if there are multiple samples from a given session
    samples_df['count'] = samples_df.groupby('media_in').cumcount()
    outfiles = []
    for _, rec in samples_df.iterrows():
        media_in, startHMS, endHMS, count = rec.values
        # if multiple samples per recording, give a different name
        suffix_use = f'{suffix}{count}' if count > 0 else suffix
        if not os.path.exists(media_in):
            print(f'!!!WARNING: media not found: {media_in}')
            continue
        sessname = Path(media_in).stem
        print(f'...Input media: {media_in}')
        if convert_to == 'wav':
            ext = '.wav'
        elif convert_to == 'mp4':
            ext = '.mp4'
        else:
            ext = Path(media_in).suffix
        outfile = os.path.join(outpath, f'{sessname}{suffix_use}{ext}')
        trim_media(media_in, outfile, HHMMSS_to_sec(startHMS), HHMMSS_to_sec(endHMS))
        outfiles.append(outfile)
    return outfiles