levicu
/

LEVI_whisper_medium.en

Automatic Speech Recognition

Inference Endpoints

Model card Files Files and versions Community

LEVI_whisper_medium.en / renamers.py

rosyvs's picture

Upload folder using huggingface_hub

6d504a5 verified 10 months ago

3.06 kB

	import csv
	import os
	import glob
	import shutil
	import re


	# rename files from original filename (hexadecimal salad) to Session_ID (human readable) and back
	global DEFAULT_MAP_PATH
	DEFAULT_MAP_PATH = '../../SessionIDs_from_catalog.csv'

	def make_SessionID_map(path=DEFAULT_MAP_PATH):
	"""generate dictionary from csv file with columns for File_Name and Session_ID -
	copied from columsn 1 & 2 of the Catalog on OneDrive
	"""
	SID_to_FN={}
	FN_to_SID={}
	with open(path,encoding='utf-8-sig') as f:
	reader = csv.reader(f)
	headers = next(reader)
	assert (headers[0]=='File_Name' or headers[0]=='Conference_ID') & (headers[1]=='Session_ID'), "Headers are wrong, expected ('File_Name' or 'Conference_ID') and 'Session_ID'"

	for line in reader:
	filename,sessionID=line
	filename=filename.split('.')[0] # remove extensions
	if (len(filename.strip())>0 and len(sessionID.strip())>0):
	SID_to_FN[sessionID]=filename
	FN_to_SID[filename]=sessionID
	return(SID_to_FN, FN_to_SID)


	def rename_files_SID_to_FN(path, recursive=True, overwrite=False):
	SID_to_FN, _=make_SessionID_map()
	#TODO: deal with matching nested sIDs, see commented code below
	newpaths=[]
	for sID in SID_to_FN.keys():
	srclist = glob.glob(os.path.join(path,'*', f'{sID}.*'), recursive=recursive)
	# print(f'siD: {sID}')
	# print(srclist)
	for srcpath in srclist:
	newpath = srcpath.replace(sID, SID_to_FN[sID])
	print(newpath)
	if overwrite==True:
	shutil.move(srcpath, newpath)
	else:
	shutil.copy(srcpath, newpath)
	newpaths.append(newpath)
	return newpaths


	# # get sessnames
	# sesslist = [s for s in os.listdir(path) ]
	# srclist = [os.path.join(src_dir, filename) for filename in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, filename))]
	# for src in srclist:
	# sessname_matches = [sessname in src for sessname in sesslist]
	# if sum(sessname_matches)>1:
	# print('!!!! multiple matches, will take longest match. TODO: implement this you dope')
	# elif not any(sessname_matches):
	# print(f'!!!! no sessname matches for file {src}')
	# else:
	# sessname = sesslist[sessname_matches.index(True)]
	# print(f'...copying to {sessname}')
	# shutil.copy(src, os.path.join(dest_dir,sessname))

	def rename_files_FN_to_SID(path, recursive=True):
	_, FN_to_SID=make_SessionID_map()

	def extract_conferenceID_from_filename(filename):
	"""extract conferenceID from filename
	"""
	conferenceID=filename.split(' ')[0]
	conferenceID = re.sub('_?[a-zA-Z](\.[a-zA-Z]*).xlsx','', conferenceID)
	conferenceID=re.sub('TMcoded\|Transcript','', conferenceID)
	conferenceID=re.sub('_start\d+_end\d+_?','', conferenceID)
	conferenceID=re.sub(
	'\d{5}_\d{4}-\d{2}-\d{2}_','', conferenceID)
	return conferenceID