"""Utilities for attaching audio durations to TSV manifests."""
import argparse

import audioread
import pandas as pd
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map


def map_duration(tsv_withdur, tsv_toadd):
    """Copy columns (notably the duration) from one TSV into another by 'name'.

    Both files are expected to share a 'name' column, and ``tsv_withdur``
    carries the duration information; the goal is to add that information to
    the matching rows of ``tsv_toadd``.

    The two tables are merged on 'name' (pandas' default inner join, so rows
    of ``tsv_toadd`` without a match in ``tsv_withdur`` are not kept). Only
    columns that also exist in ``tsv_withdur`` survive; for non-key columns
    present in both files the value from ``tsv_toadd`` is the one retained.

    Args:
        tsv_withdur: Path of the tab-separated file that provides the columns.
        tsv_toadd: Path of the tab-separated file to enrich. It is
            overwritten in place with the merged result.

    Returns:
        The merged DataFrame that was written to ``tsv_toadd``.
    """
    df1 = pd.read_csv(tsv_withdur, sep='\t')
    df2 = pd.read_csv(tsv_toadd, sep='\t')
    # Left-hand (tsv_toadd) columns keep their names; clashing right-hand
    # columns get a '_y' suffix so they can be discarded below.
    df = df2.merge(df1, on=['name'], suffixes=('', '_y'))
    # Drop everything tsv_withdur does not define: the '_y' duplicates and
    # any column that exists only in tsv_toadd.
    known_columns = set(df1.columns)
    dropset = [col for col in df.columns if col not in known_columns]
    df = df.drop(columns=dropset)
    df.to_csv(tsv_toadd, sep='\t', index=False)
    return df


def add_duration(args):
    """Measure the duration of one audio file.

    Takes a single tuple argument so it can be used directly as the mapped
    function of ``process_map``.

    Args:
        args: ``(index, audiopath)`` pair; ``index`` is passed through
            untouched so the caller can match the result to its row.

    Returns:
        ``(index, duration)`` where ``duration`` is the value reported by
        audioread, or ``-1`` if the file could not be opened or decoded.
    """
    index, audiopath = args
    try:
        with audioread.audio_open(audiopath) as f:
            totalsec = f.duration
    except Exception:
        # Best effort: any failure to read the file is reported as -1.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate instead of being recorded as a bad file.
        totalsec = -1
    return (index, totalsec)


def add_dur2tsv(tsv_path, save_path, max_workers=4):
    """Add a 'duration' column to a TSV that has an 'audio_path' column.

    Durations are computed in parallel worker processes. Files that cannot
    be read get a duration of ``-1`` and are reported on stdout.

    Args:
        tsv_path: Path of the tab-separated input file.
        save_path: Path the augmented table is written to (may equal
            ``tsv_path`` to update the file in place).
        max_workers: Number of worker processes used to read the audio files.
    """
    df = pd.read_csv(tsv_path, sep='\t')
    # Pair every row index with its audio path so results can be mapped back.
    item_list = [
        (item[0], item.audio_path)
        for item in tqdm(df.itertuples(), total=len(df))
    ]
    r = process_map(add_duration, item_list, max_workers=max_workers, chunksize=1)
    index2dur = {}
    for index, dur in r:
        if dur == -1:
            bad_wav = df.loc[index, 'audio_path']
            print(f'bad wav:{bad_wav}')
        index2dur[index] = dur
    df['duration'] = df.index.map(index2dur)
    df.to_csv(save_path, sep='\t', index=False)


def parse_args():
    """Parse the command-line arguments of this script.

    Returns:
        The parsed namespace, with a single ``tsv_path`` attribute.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tsv_path",
        type=str,
        help="TSV file with an 'audio_path' column; updated in place with a 'duration' column.",
    )
    return parser.parse_args()


if __name__ == '__main__':
    pargs = parse_args()
    # The input file is overwritten with the duration-augmented table.
    add_dur2tsv(pargs.tsv_path, pargs.tsv_path)
    # Example of copying durations between two existing TSVs:
    # map_duration(tsv_withdur='tsv_maker/filter_audioset.tsv',
    #              tsv_toadd='MAA1 Dataset tsvs/V3/refilter_audioset.tsv')