Spaces:
Build error
Build error
Upload 35 files
Browse files- audio_detection/__init__.py +0 -0
- audio_detection/audio_infer/__init__.py +0 -0
- audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc +0 -0
- audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv +1350 -0
- audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv +606 -0
- audio_detection/audio_infer/metadata/class_labels_indices.csv +528 -0
- audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc +0 -0
- audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc +0 -0
- audio_detection/audio_infer/pytorch/evaluate.py +42 -0
- audio_detection/audio_infer/pytorch/finetune_template.py +127 -0
- audio_detection/audio_infer/pytorch/inference.py +206 -0
- audio_detection/audio_infer/pytorch/losses.py +14 -0
- audio_detection/audio_infer/pytorch/main.py +378 -0
- audio_detection/audio_infer/pytorch/models.py +951 -0
- audio_detection/audio_infer/pytorch/pytorch_utils.py +251 -0
- audio_detection/audio_infer/results/YDlWd7Wmdi1E.png +0 -0
- audio_detection/audio_infer/useful_ckpts/audio_detection.pth +3 -0
- audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc +0 -0
- audio_detection/audio_infer/utils/config.py +94 -0
- audio_detection/audio_infer/utils/crash.py +12 -0
- audio_detection/audio_infer/utils/create_black_list.py +64 -0
- audio_detection/audio_infer/utils/create_indexes.py +126 -0
- audio_detection/audio_infer/utils/data_generator.py +421 -0
- audio_detection/audio_infer/utils/dataset.py +224 -0
- audio_detection/audio_infer/utils/plot_for_paper.py +565 -0
- audio_detection/audio_infer/utils/plot_statistics.py +0 -0
- audio_detection/audio_infer/utils/utilities.py +172 -0
- audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc +0 -0
- audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc +0 -0
- audio_detection/target_sound_detection/src/models.py +1288 -0
- audio_detection/target_sound_detection/src/utils.py +353 -0
- audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth +3 -0
- audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth +3 -0
- audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt +3 -0
- audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth +3 -0
audio_detection/__init__.py
ADDED
File without changes
|
audio_detection/audio_infer/__init__.py
ADDED
File without changes
|
audio_detection/audio_infer/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (171 Bytes). View file
|
|
audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv
ADDED
@@ -0,0 +1,1350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-JMT0mK0Dbg_30.000_40.000.wav 30.000 40.000 Train horn
|
2 |
+
3ACjUf9QpAQ_30.000_40.000.wav 30.000 40.000 Train horn
|
3 |
+
3S2-TODd__k_90.000_100.000.wav 90.000 100.000 Train horn
|
4 |
+
3YJewEC-NWo_30.000_40.000.wav 30.000 40.000 Train horn
|
5 |
+
3jXAh3V2FO8_30.000_40.000.wav 30.000 40.000 Train horn
|
6 |
+
53oq_Otm_XI_30.000_40.000.wav 30.000 40.000 Train horn
|
7 |
+
8IaInXpdd9M_0.000_10.000.wav 0.000 10.000 Train horn
|
8 |
+
8nU1aVscJec_30.000_40.000.wav 30.000 40.000 Train horn
|
9 |
+
9LQEZJPNVpw_30.000_40.000.wav 30.000 40.000 Train horn
|
10 |
+
AHom7lBbtoY_30.000_40.000.wav 30.000 40.000 Train horn
|
11 |
+
Ag_zT74ZGNc_9.000_19.000.wav 9.000 19.000 Train horn
|
12 |
+
BQpa8whzwAE_30.000_40.000.wav 30.000 40.000 Train horn
|
13 |
+
CCX_4cW_SAU_0.000_10.000.wav 0.000 10.000 Train horn
|
14 |
+
CLIdVCUO_Vw_30.000_40.000.wav 30.000 40.000 Train horn
|
15 |
+
D_nXtMgbPNY_30.000_40.000.wav 30.000 40.000 Train horn
|
16 |
+
GFQnh84kNwU_30.000_40.000.wav 30.000 40.000 Train horn
|
17 |
+
I4qODX0fypE_30.000_40.000.wav 30.000 40.000 Train horn
|
18 |
+
IdqEbjujFb8_30.000_40.000.wav 30.000 40.000 Train horn
|
19 |
+
L3a132_uApg_50.000_60.000.wav 50.000 60.000 Train horn
|
20 |
+
LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Train horn
|
21 |
+
MCYY8tJsnfY_7.000_17.000.wav 7.000 17.000 Train horn
|
22 |
+
MPSf7dJpV5w_30.000_40.000.wav 30.000 40.000 Train horn
|
23 |
+
NdCr5IDnkxc_30.000_40.000.wav 30.000 40.000 Train horn
|
24 |
+
P54KKbTA_TE_0.000_7.000.wav 0.000 7.000 Train horn
|
25 |
+
PJUy17bXlhc_40.000_50.000.wav 40.000 50.000 Train horn
|
26 |
+
QrAoRSA13bM_30.000_40.000.wav 30.000 40.000 Train horn
|
27 |
+
R_Lpb-51Kl4_30.000_40.000.wav 30.000 40.000 Train horn
|
28 |
+
Rq-22Cycrpg_30.000_40.000.wav 30.000 40.000 Train horn
|
29 |
+
TBjrN1aMRrM_30.000_40.000.wav 30.000 40.000 Train horn
|
30 |
+
XAUtk9lwzU8_30.000_40.000.wav 30.000 40.000 Train horn
|
31 |
+
XW8pSKLyr0o_20.000_30.000.wav 20.000 30.000 Train horn
|
32 |
+
Y10I9JSvJuQ_30.000_40.000.wav 30.000 40.000 Train horn
|
33 |
+
Y_jwEflLthg_190.000_200.000.wav 190.000 200.000 Train horn
|
34 |
+
YilfKdY7w6Y_60.000_70.000.wav 60.000 70.000 Train horn
|
35 |
+
ZcTI8fQgEZE_240.000_250.000.wav 240.000 250.000 Train horn
|
36 |
+
_8MvhMlbwiE_40.000_50.000.wav 40.000 50.000 Train horn
|
37 |
+
_dkeW6lqmq4_30.000_40.000.wav 30.000 40.000 Train horn
|
38 |
+
aXsUHAKbyLs_30.000_40.000.wav 30.000 40.000 Train horn
|
39 |
+
arevYmB0qGg_30.000_40.000.wav 30.000 40.000 Train horn
|
40 |
+
d1o334I5X_k_30.000_40.000.wav 30.000 40.000 Train horn
|
41 |
+
dSzZWgbJ378_30.000_40.000.wav 30.000 40.000 Train horn
|
42 |
+
ePVb5Upev8k_40.000_50.000.wav 40.000 50.000 Train horn
|
43 |
+
g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Train horn
|
44 |
+
g9JVq7wfDIo_30.000_40.000.wav 30.000 40.000 Train horn
|
45 |
+
gTFCK9TuLOQ_30.000_40.000.wav 30.000 40.000 Train horn
|
46 |
+
hYqzr_rIIAw_30.000_40.000.wav 30.000 40.000 Train horn
|
47 |
+
iZgzRfa-xPQ_30.000_40.000.wav 30.000 40.000 Train horn
|
48 |
+
k8H8rn4NaSM_0.000_10.000.wav 0.000 10.000 Train horn
|
49 |
+
lKQ-I_P7TEM_20.000_30.000.wav 20.000 30.000 Train horn
|
50 |
+
nfY_zkJceDw_30.000_40.000.wav 30.000 40.000 Train horn
|
51 |
+
pW5SI1ZKUpA_30.000_40.000.wav 30.000 40.000 Train horn
|
52 |
+
pxmrmtEnROk_30.000_40.000.wav 30.000 40.000 Train horn
|
53 |
+
q7zzKHFWGkg_30.000_40.000.wav 30.000 40.000 Train horn
|
54 |
+
qu8vVFWKszA_30.000_40.000.wav 30.000 40.000 Train horn
|
55 |
+
stdjjG6Y5IU_30.000_40.000.wav 30.000 40.000 Train horn
|
56 |
+
tdRMxc4UWRk_30.000_40.000.wav 30.000 40.000 Train horn
|
57 |
+
tu-cxDG2mW8_0.000_10.000.wav 0.000 10.000 Train horn
|
58 |
+
txXSE7kgrc8_30.000_40.000.wav 30.000 40.000 Train horn
|
59 |
+
xabrKa79prM_30.000_40.000.wav 30.000 40.000 Train horn
|
60 |
+
yBVxtq9k8Sg_0.000_10.000.wav 0.000 10.000 Train horn
|
61 |
+
-WoudI3gGvk_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
62 |
+
0_gci63CtFY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
63 |
+
2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
64 |
+
3NX4HaOVBoo_240.000_250.000.wav 240.000 250.000 Air horn, truck horn
|
65 |
+
9NPKQDaNCRk_0.000_6.000.wav 0.000 6.000 Air horn, truck horn
|
66 |
+
9ct4w4aYWdc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
67 |
+
9l9QXgsJSfo_120.000_130.000.wav 120.000 130.000 Air horn, truck horn
|
68 |
+
CN0Bi4MDpA4_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
|
69 |
+
CU2MyVM_B48_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
70 |
+
Cg-DWc9nPfQ_90.000_100.000.wav 90.000 100.000 Air horn, truck horn
|
71 |
+
D62L3husEa0_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
72 |
+
GO2zKyMtBV4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
73 |
+
Ge_KWS-0098_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
74 |
+
Hk7HqLBHWng_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
75 |
+
IpyingiCwV8_0.000_3.000.wav 0.000 3.000 Air horn, truck horn
|
76 |
+
Isuh9pOuH6I_300.000_310.000.wav 300.000 310.000 Air horn, truck horn
|
77 |
+
IuTfMfzkr5Y_120.000_130.000.wav 120.000 130.000 Air horn, truck horn
|
78 |
+
MFxsgcZZtFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
|
79 |
+
N3osL4QmOL8_49.000_59.000.wav 49.000 59.000 Air horn, truck horn
|
80 |
+
NOZsDTFLm7M_0.000_9.000.wav 0.000 9.000 Air horn, truck horn
|
81 |
+
OjVY3oM1jEU_40.000_50.000.wav 40.000 50.000 Air horn, truck horn
|
82 |
+
PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Air horn, truck horn
|
83 |
+
TYLZuBBu8ms_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
|
84 |
+
UdHR1P_NIbo_110.000_120.000.wav 110.000 120.000 Air horn, truck horn
|
85 |
+
YilfKdY7w6Y_60.000_70.000.wav 60.000 70.000 Air horn, truck horn
|
86 |
+
Yt4ZWNjvJOY_50.000_60.000.wav 50.000 60.000 Air horn, truck horn
|
87 |
+
Z5M3fGT3Xjk_60.000_70.000.wav 60.000 70.000 Air horn, truck horn
|
88 |
+
ZauRsP1uH74_12.000_22.000.wav 12.000 22.000 Air horn, truck horn
|
89 |
+
a_6CZ2JaEuc_0.000_2.000.wav 0.000 2.000 Air horn, truck horn
|
90 |
+
b7m5Kt5U7Vc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
91 |
+
bIObkrK06rk_15.000_25.000.wav 15.000 25.000 Air horn, truck horn
|
92 |
+
cdrjKqyDrak_420.000_430.000.wav 420.000 430.000 Air horn, truck horn
|
93 |
+
ckSYn557ZyE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
|
94 |
+
cs-RPPsg_ks_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
95 |
+
ctsq33oUBT8_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
96 |
+
eCFUwyU9ZWA_9.000_19.000.wav 9.000 19.000 Air horn, truck horn
|
97 |
+
ePVb5Upev8k_40.000_50.000.wav 40.000 50.000 Air horn, truck horn
|
98 |
+
fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
99 |
+
fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
100 |
+
g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
101 |
+
gjlo4evwjlE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
102 |
+
i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Air horn, truck horn
|
103 |
+
ieZVo7W3BQ4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
104 |
+
ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
|
105 |
+
jko48cNdvFA_80.000_90.000.wav 80.000 90.000 Air horn, truck horn
|
106 |
+
kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
107 |
+
kUrb38hMwPs_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
|
108 |
+
km_hVyma2vo_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
|
109 |
+
m1e9aOwRiDQ_0.000_9.000.wav 0.000 9.000 Air horn, truck horn
|
110 |
+
mQJcObz1k_E_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
111 |
+
pk75WDyNZKc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
112 |
+
rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Air horn, truck horn
|
113 |
+
suuYwAifIAQ_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
114 |
+
wDdEZ46B-tM_460.000_470.000.wav 460.000 470.000 Air horn, truck horn
|
115 |
+
wHISHmuP58s_80.000_90.000.wav 80.000 90.000 Air horn, truck horn
|
116 |
+
xwqIKDz1bT4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
117 |
+
y4Ko6VNiqB0_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
118 |
+
yhcmPrU3QSk_61.000_71.000.wav 61.000 71.000 Air horn, truck horn
|
119 |
+
3FWHjjZGT9U_80.000_90.000.wav 80.000 90.000 Car alarm
|
120 |
+
3YChVhqW42E_130.000_140.000.wav 130.000 140.000 Car alarm
|
121 |
+
3YRkin3bMlQ_170.000_180.000.wav 170.000 180.000 Car alarm
|
122 |
+
4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Car alarm
|
123 |
+
4JDah6Ckr9k_5.000_15.000.wav 5.000 15.000 Car alarm
|
124 |
+
5hL1uGb4sas_30.000_40.000.wav 30.000 40.000 Car alarm
|
125 |
+
969Zfj4IoPk_20.000_30.000.wav 20.000 30.000 Car alarm
|
126 |
+
AyfuBDN3Vdw_40.000_50.000.wav 40.000 50.000 Car alarm
|
127 |
+
B-ZqhRg3km4_60.000_70.000.wav 60.000 70.000 Car alarm
|
128 |
+
BDnwA3AaclE_10.000_20.000.wav 10.000 20.000 Car alarm
|
129 |
+
ES-rjFfuxq4_120.000_130.000.wav 120.000 130.000 Car alarm
|
130 |
+
EWbZq5ruCpg_0.000_10.000.wav 0.000 10.000 Car alarm
|
131 |
+
F50h9HiyC3k_40.000_50.000.wav 40.000 50.000 Car alarm
|
132 |
+
F5AP8kQvogM_30.000_40.000.wav 30.000 40.000 Car alarm
|
133 |
+
FKJuDOAumSk_20.000_30.000.wav 20.000 30.000 Car alarm
|
134 |
+
GmbNjZi4xBw_30.000_40.000.wav 30.000 40.000 Car alarm
|
135 |
+
H7lOMlND9dc_30.000_40.000.wav 30.000 40.000 Car alarm
|
136 |
+
Hu8lxbHYaqg_40.000_50.000.wav 40.000 50.000 Car alarm
|
137 |
+
IziTYkSwq9Q_30.000_40.000.wav 30.000 40.000 Car alarm
|
138 |
+
JcO2TTtiplA_30.000_40.000.wav 30.000 40.000 Car alarm
|
139 |
+
KKx7dWRg8s8_8.000_18.000.wav 8.000 18.000 Car alarm
|
140 |
+
Kf9Kr69mwOA_14.000_24.000.wav 14.000 24.000 Car alarm
|
141 |
+
L535vIV3ED4_40.000_50.000.wav 40.000 50.000 Car alarm
|
142 |
+
LOjT44tFx1A_0.000_10.000.wav 0.000 10.000 Car alarm
|
143 |
+
Mxn2FKuNwiI_20.000_30.000.wav 20.000 30.000 Car alarm
|
144 |
+
Nkqx09b-xyI_70.000_80.000.wav 70.000 80.000 Car alarm
|
145 |
+
QNKo1W1WRbc_22.000_32.000.wav 22.000 32.000 Car alarm
|
146 |
+
R0VxYDfjyAU_60.000_70.000.wav 60.000 70.000 Car alarm
|
147 |
+
TJ58vMpSy1w_30.000_40.000.wav 30.000 40.000 Car alarm
|
148 |
+
ToU1kRagUjY_0.000_10.000.wav 0.000 10.000 Car alarm
|
149 |
+
TrQGIZqrW0s_30.000_40.000.wav 30.000 40.000 Car alarm
|
150 |
+
ULFhHR0OLSE_30.000_40.000.wav 30.000 40.000 Car alarm
|
151 |
+
ULS3ffQkCW4_30.000_40.000.wav 30.000 40.000 Car alarm
|
152 |
+
U_9NuNORYQM_1.000_11.000.wav 1.000 11.000 Car alarm
|
153 |
+
UkCEuwYUW8c_110.000_120.000.wav 110.000 120.000 Car alarm
|
154 |
+
Wak5QxsS-QU_30.000_40.000.wav 30.000 40.000 Car alarm
|
155 |
+
XzE7mp3pVik_0.000_10.000.wav 0.000 10.000 Car alarm
|
156 |
+
Y-4dtrP-RNo_7.000_17.000.wav 7.000 17.000 Car alarm
|
157 |
+
Zltlj0fDeS4_30.000_40.000.wav 30.000 40.000 Car alarm
|
158 |
+
cB1jkzgH2es_150.000_160.000.wav 150.000 160.000 Car alarm
|
159 |
+
eIMjkADTWzA_60.000_70.000.wav 60.000 70.000 Car alarm
|
160 |
+
eL7s5CoW0UA_0.000_7.000.wav 0.000 7.000 Car alarm
|
161 |
+
i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Car alarm
|
162 |
+
iWl-5LNURFc_30.000_40.000.wav 30.000 40.000 Car alarm
|
163 |
+
iX34nDCq9NU_10.000_20.000.wav 10.000 20.000 Car alarm
|
164 |
+
ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Car alarm
|
165 |
+
l6_h_YHuTbY_30.000_40.000.wav 30.000 40.000 Car alarm
|
166 |
+
lhedRVb85Fk_30.000_40.000.wav 30.000 40.000 Car alarm
|
167 |
+
monelE7hnwI_20.000_30.000.wav 20.000 30.000 Car alarm
|
168 |
+
o2CmtHNUrXg_30.000_40.000.wav 30.000 40.000 Car alarm
|
169 |
+
pXX6cK4xtiY_11.000_21.000.wav 11.000 21.000 Car alarm
|
170 |
+
stnVta2ip9g_30.000_40.000.wav 30.000 40.000 Car alarm
|
171 |
+
uvuVg9Cl0n0_30.000_40.000.wav 30.000 40.000 Car alarm
|
172 |
+
vF2zXcbADUk_20.000_30.000.wav 20.000 30.000 Car alarm
|
173 |
+
vN7dJyt-nj0_20.000_30.000.wav 20.000 30.000 Car alarm
|
174 |
+
w8Md65mE5Vc_30.000_40.000.wav 30.000 40.000 Car alarm
|
175 |
+
ySqfMcFk5LM_30.000_40.000.wav 30.000 40.000 Car alarm
|
176 |
+
ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Car alarm
|
177 |
+
za8KPcQ0dTw_30.000_40.000.wav 30.000 40.000 Car alarm
|
178 |
+
-2sE5CH8Wb8_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
179 |
+
-fJsZm3YRc0_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
180 |
+
-oSzD8P2BtU_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
181 |
+
-pzwalZ0ub0_5.000_15.000.wav 5.000 15.000 Reversing beeps
|
182 |
+
-t-htrAtNvM_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
183 |
+
-zNEcuo28oE_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
184 |
+
077aWlQn6XI_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
185 |
+
0O-gZoirpRA_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
186 |
+
10aF24rMeu0_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
187 |
+
1P5FFxXLSpY_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
188 |
+
1n_s2Gb5R1Q_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
189 |
+
2HZcxlRs-hg_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
190 |
+
2Jpg_KvJWL0_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
191 |
+
2WTk_j_fivY_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
192 |
+
38F6eeIR-s0_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
193 |
+
3xh2kScw64U_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
194 |
+
4MIHbR4QZhE_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
195 |
+
4Tpy1lsfcSM_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
196 |
+
4XMY2IvVSf0_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
197 |
+
4ep09nZl3LA_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
198 |
+
4t1VqRz4w2g_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
199 |
+
4tKvAMmAUMM_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
200 |
+
5-x2pk3YYAs_11.000_21.000.wav 11.000 21.000 Reversing beeps
|
201 |
+
5DW8WjxxCag_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
202 |
+
5DjZHCumLfs_11.000_21.000.wav 11.000 21.000 Reversing beeps
|
203 |
+
5V0xKS-FGMk_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
204 |
+
5fLzQegwHUg_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
205 |
+
6Y8bKS6KLeE_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
206 |
+
6xEHP-C-ZuU_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
207 |
+
6yyToq9cW9A_60.000_70.000.wav 60.000 70.000 Reversing beeps
|
208 |
+
7Gua0-UrKIw_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
209 |
+
7nglQSmcjAk_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
210 |
+
81DteAPIhoE_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
211 |
+
96a4smrM_30_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
212 |
+
9EsgN-WS2qY_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
213 |
+
9OcAwC8y-eQ_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
214 |
+
9Ti98L4PRCo_17.000_27.000.wav 17.000 27.000 Reversing beeps
|
215 |
+
9yhMtJ50sys_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
216 |
+
A9KMqwqLboE_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
217 |
+
AFwmMFq_xlc_390.000_400.000.wav 390.000 400.000 Reversing beeps
|
218 |
+
AvhBRiwWJU4_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
219 |
+
CL5vkiMs2c0_10.000_20.000.wav 10.000 20.000 Reversing beeps
|
220 |
+
DcU6AzN7imA_210.000_220.000.wav 210.000 220.000 Reversing beeps
|
221 |
+
ISBJKY8hwnM_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
222 |
+
LA5TekLaIPI_10.000_20.000.wav 10.000 20.000 Reversing beeps
|
223 |
+
NqzZbJJl3E4_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
224 |
+
PSt0xAYgf4g_0.000_10.000.wav 0.000 10.000 Reversing beeps
|
225 |
+
Q1CMSV81_ws_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
226 |
+
_gG0KNGD47M_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
227 |
+
ckt7YEGcSoY_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
228 |
+
eIkUuCRE_0U_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
229 |
+
kH6fFjIZkB0_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
230 |
+
mCJ0aqIygWE_24.000_34.000.wav 24.000 34.000 Reversing beeps
|
231 |
+
nFqf1vflJaI_350.000_360.000.wav 350.000 360.000 Reversing beeps
|
232 |
+
nMaSkwx6cHE_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
233 |
+
oHKTmTLEy68_11.000_21.000.wav 11.000 21.000 Reversing beeps
|
234 |
+
saPU2JNoytU_0.000_10.000.wav 0.000 10.000 Reversing beeps
|
235 |
+
tQd0vFueRKs_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
236 |
+
vzP6soELj2Q_0.000_10.000.wav 0.000 10.000 Reversing beeps
|
237 |
+
0x82_HySIVU_30.000_40.000.wav 30.000 40.000 Bicycle
|
238 |
+
1IQdvfm9SDY_30.000_40.000.wav 30.000 40.000 Bicycle
|
239 |
+
1_hGvbEiYAs_30.000_40.000.wav 30.000 40.000 Bicycle
|
240 |
+
26CM8IXODG4_2.000_12.000.wav 2.000 12.000 Bicycle
|
241 |
+
2f7Ad-XpbnY_30.000_40.000.wav 30.000 40.000 Bicycle
|
242 |
+
3-a8i_MEUl8_30.000_40.000.wav 30.000 40.000 Bicycle
|
243 |
+
7KiTXYwaD04_7.000_17.000.wav 7.000 17.000 Bicycle
|
244 |
+
7gkjn-LLInI_30.000_40.000.wav 30.000 40.000 Bicycle
|
245 |
+
84flVacRHUI_21.000_31.000.wav 21.000 31.000 Bicycle
|
246 |
+
9VziOIkNXsE_30.000_40.000.wav 30.000 40.000 Bicycle
|
247 |
+
ANofTuuN0W0_160.000_170.000.wav 160.000 170.000 Bicycle
|
248 |
+
B6n0op0sLPA_30.000_40.000.wav 30.000 40.000 Bicycle
|
249 |
+
D4_zTwsCRds_60.000_70.000.wav 60.000 70.000 Bicycle
|
250 |
+
DEs_Sp9S1Nw_30.000_40.000.wav 30.000 40.000 Bicycle
|
251 |
+
GjsxrMRRdfQ_3.000_13.000.wav 3.000 13.000 Bicycle
|
252 |
+
GkpUU3VX4wQ_30.000_40.000.wav 30.000 40.000 Bicycle
|
253 |
+
H9HNXYxRmv8_30.000_40.000.wav 30.000 40.000 Bicycle
|
254 |
+
HPWRKwrs-rY_370.000_380.000.wav 370.000 380.000 Bicycle
|
255 |
+
HrQxbNO5jXU_6.000_16.000.wav 6.000 16.000 Bicycle
|
256 |
+
IYaEZkAO0LU_30.000_40.000.wav 30.000 40.000 Bicycle
|
257 |
+
Idzfy0XbZRo_7.000_17.000.wav 7.000 17.000 Bicycle
|
258 |
+
Iigfz_GeXVs_30.000_40.000.wav 30.000 40.000 Bicycle
|
259 |
+
JWCtQ_94YoQ_30.000_40.000.wav 30.000 40.000 Bicycle
|
260 |
+
JXmBrD4b4EI_30.000_40.000.wav 30.000 40.000 Bicycle
|
261 |
+
LSZPNwZex9s_30.000_40.000.wav 30.000 40.000 Bicycle
|
262 |
+
M5kwg1kx4q0_30.000_40.000.wav 30.000 40.000 Bicycle
|
263 |
+
NrR1wmCpqAk_12.000_22.000.wav 12.000 22.000 Bicycle
|
264 |
+
O1_Rw2dHb1I_2.000_12.000.wav 2.000 12.000 Bicycle
|
265 |
+
OEN0TySl1Jw_10.000_20.000.wav 10.000 20.000 Bicycle
|
266 |
+
PF7uY9ydMYc_30.000_40.000.wav 30.000 40.000 Bicycle
|
267 |
+
SDl0tWf9Q44_30.000_40.000.wav 30.000 40.000 Bicycle
|
268 |
+
SkXXjcw9sJI_30.000_40.000.wav 30.000 40.000 Bicycle
|
269 |
+
Ssa1m5Mnllw_0.000_9.000.wav 0.000 9.000 Bicycle
|
270 |
+
UB-A1oyNyyg_0.000_6.000.wav 0.000 6.000 Bicycle
|
271 |
+
UqyvFyQthHo_30.000_40.000.wav 30.000 40.000 Bicycle
|
272 |
+
Wg4ik5zZxBc_250.000_260.000.wav 250.000 260.000 Bicycle
|
273 |
+
WvquSD2PcCE_30.000_40.000.wav 30.000 40.000 Bicycle
|
274 |
+
YIJBuXUi64U_30.000_40.000.wav 30.000 40.000 Bicycle
|
275 |
+
aBHdl_TiseI_30.000_40.000.wav 30.000 40.000 Bicycle
|
276 |
+
aeHCq6fFkNo_30.000_40.000.wav 30.000 40.000 Bicycle
|
277 |
+
amKDjVcs1Vg_30.000_40.000.wav 30.000 40.000 Bicycle
|
278 |
+
ehYwty_G2L4_13.000_23.000.wav 13.000 23.000 Bicycle
|
279 |
+
jOlVJv7jAHg_30.000_40.000.wav 30.000 40.000 Bicycle
|
280 |
+
lGFDQ-ZwUfk_30.000_40.000.wav 30.000 40.000 Bicycle
|
281 |
+
lmTHvLGQy3g_50.000_60.000.wav 50.000 60.000 Bicycle
|
282 |
+
nNHW3Uxlb-g_30.000_40.000.wav 30.000 40.000 Bicycle
|
283 |
+
o98R4ruf8kw_30.000_40.000.wav 30.000 40.000 Bicycle
|
284 |
+
oiLHBkHgkAo_0.000_8.000.wav 0.000 8.000 Bicycle
|
285 |
+
qL0ESQcaPhM_30.000_40.000.wav 30.000 40.000 Bicycle
|
286 |
+
qjz5t9M4YCw_30.000_40.000.wav 30.000 40.000 Bicycle
|
287 |
+
qrCWPsqG9vA_30.000_40.000.wav 30.000 40.000 Bicycle
|
288 |
+
r06tmeUDgc8_3.000_13.000.wav 3.000 13.000 Bicycle
|
289 |
+
sAMjMyCdGOc_30.000_40.000.wav 30.000 40.000 Bicycle
|
290 |
+
tKdRlWz-1pg_30.000_40.000.wav 30.000 40.000 Bicycle
|
291 |
+
uNpSMpqlkMA_0.000_10.000.wav 0.000 10.000 Bicycle
|
292 |
+
vOYj9W7Jsxk_8.000_18.000.wav 8.000 18.000 Bicycle
|
293 |
+
xBKrmKdjAIA_0.000_10.000.wav 0.000 10.000 Bicycle
|
294 |
+
xfNeZaw4o3U_17.000_27.000.wav 17.000 27.000 Bicycle
|
295 |
+
xgiJqbhhU3c_30.000_40.000.wav 30.000 40.000 Bicycle
|
296 |
+
0vg9qxNKXOw_30.000_40.000.wav 30.000 40.000 Skateboard
|
297 |
+
10YXuv9Go0E_140.000_150.000.wav 140.000 150.000 Skateboard
|
298 |
+
3-a8i_MEUl8_30.000_40.000.wav 30.000 40.000 Skateboard
|
299 |
+
6kXUG1Zo6VA_0.000_10.000.wav 0.000 10.000 Skateboard
|
300 |
+
84fDGWoRtsU_210.000_220.000.wav 210.000 220.000 Skateboard
|
301 |
+
8kbHA22EWd0_330.000_340.000.wav 330.000 340.000 Skateboard
|
302 |
+
8m-a_6wLTkU_230.000_240.000.wav 230.000 240.000 Skateboard
|
303 |
+
9QwaP-cvdeU_360.000_370.000.wav 360.000 370.000 Skateboard
|
304 |
+
9ZYj5toEbGA_0.000_10.000.wav 0.000 10.000 Skateboard
|
305 |
+
9gkppwB5CXA_30.000_40.000.wav 30.000 40.000 Skateboard
|
306 |
+
9hlXgXWXYXQ_0.000_6.000.wav 0.000 6.000 Skateboard
|
307 |
+
ALxn5-2bVyI_30.000_40.000.wav 30.000 40.000 Skateboard
|
308 |
+
ANPjV_rudog_30.000_40.000.wav 30.000 40.000 Skateboard
|
309 |
+
ATAL-_Dblvg_0.000_7.000.wav 0.000 7.000 Skateboard
|
310 |
+
An-4jPvUT14_60.000_70.000.wav 60.000 70.000 Skateboard
|
311 |
+
BGR0QnX4k6w_30.000_40.000.wav 30.000 40.000 Skateboard
|
312 |
+
BlhUt8AJJO8_30.000_40.000.wav 30.000 40.000 Skateboard
|
313 |
+
CD7INyI79fM_170.000_180.000.wav 170.000 180.000 Skateboard
|
314 |
+
CNcxzB9F-Q8_100.000_110.000.wav 100.000 110.000 Skateboard
|
315 |
+
DqOGYyFVnKk_200.000_210.000.wav 200.000 210.000 Skateboard
|
316 |
+
E0gBwPTHxqE_30.000_40.000.wav 30.000 40.000 Skateboard
|
317 |
+
E3XIdP8kxwg_110.000_120.000.wav 110.000 120.000 Skateboard
|
318 |
+
FQZnQhiM41U_0.000_6.000.wav 0.000 6.000 Skateboard
|
319 |
+
FRwFfq3Tl1g_310.000_320.000.wav 310.000 320.000 Skateboard
|
320 |
+
JJo971B_eDg_30.000_40.000.wav 30.000 40.000 Skateboard
|
321 |
+
KXkxqxoCylc_30.000_40.000.wav 30.000 40.000 Skateboard
|
322 |
+
L4Z7XkS6CtA_30.000_40.000.wav 30.000 40.000 Skateboard
|
323 |
+
LjEqr0Z7xm0_0.000_6.000.wav 0.000 6.000 Skateboard
|
324 |
+
MAbDEeLF4cQ_30.000_40.000.wav 30.000 40.000 Skateboard
|
325 |
+
MUBbiivNYZs_30.000_40.000.wav 30.000 40.000 Skateboard
|
326 |
+
Nq8GyBrTI8Y_30.000_40.000.wav 30.000 40.000 Skateboard
|
327 |
+
PPq9QZmV7jc_25.000_35.000.wav 25.000 35.000 Skateboard
|
328 |
+
PVgL5wFOKMs_30.000_40.000.wav 30.000 40.000 Skateboard
|
329 |
+
Tcq_xAdCMr4_30.000_40.000.wav 30.000 40.000 Skateboard
|
330 |
+
UtZofZjccBs_290.000_300.000.wav 290.000 300.000 Skateboard
|
331 |
+
VZfrDZhI7BU_30.000_40.000.wav 30.000 40.000 Skateboard
|
332 |
+
WxChkRrVOIs_0.000_7.000.wav 0.000 7.000 Skateboard
|
333 |
+
YV0noe1sZAs_150.000_160.000.wav 150.000 160.000 Skateboard
|
334 |
+
YjScrri_F7U_0.000_10.000.wav 0.000 10.000 Skateboard
|
335 |
+
YrGQKTbiG1g_30.000_40.000.wav 30.000 40.000 Skateboard
|
336 |
+
ZM67kt6G-d4_30.000_40.000.wav 30.000 40.000 Skateboard
|
337 |
+
ZaUaqnLdg6k_30.000_40.000.wav 30.000 40.000 Skateboard
|
338 |
+
ZhpkRcAEJzc_3.000_13.000.wav 3.000 13.000 Skateboard
|
339 |
+
_43OOP6UEw0_30.000_40.000.wav 30.000 40.000 Skateboard
|
340 |
+
_6Fyave4jqA_260.000_270.000.wav 260.000 270.000 Skateboard
|
341 |
+
aOoZ0bCoaZw_30.000_40.000.wav 30.000 40.000 Skateboard
|
342 |
+
gV6y9L24wWg_0.000_10.000.wav 0.000 10.000 Skateboard
|
343 |
+
hHb0Eq1I7Fk_0.000_10.000.wav 0.000 10.000 Skateboard
|
344 |
+
lGf_L6i6AZI_20.000_30.000.wav 20.000 30.000 Skateboard
|
345 |
+
leOH87itNWM_30.000_40.000.wav 30.000 40.000 Skateboard
|
346 |
+
mIkW7mWlnXw_30.000_40.000.wav 30.000 40.000 Skateboard
|
347 |
+
qadmKrM0ppo_20.000_30.000.wav 20.000 30.000 Skateboard
|
348 |
+
rLUIHCc4b9A_0.000_7.000.wav 0.000 7.000 Skateboard
|
349 |
+
u3vBJgEVJvk_0.000_10.000.wav 0.000 10.000 Skateboard
|
350 |
+
vHKBrtPDSvA_150.000_160.000.wav 150.000 160.000 Skateboard
|
351 |
+
wWmydRt0Z-w_21.000_31.000.wav 21.000 31.000 Skateboard
|
352 |
+
xeHt-R5ScmI_0.000_10.000.wav 0.000 10.000 Skateboard
|
353 |
+
xqGtIVeeXY4_330.000_340.000.wav 330.000 340.000 Skateboard
|
354 |
+
y_lfY0uzmr0_30.000_40.000.wav 30.000 40.000 Skateboard
|
355 |
+
02Ak1eIyj3M_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
356 |
+
0N0C0Wbe6AI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
357 |
+
2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
358 |
+
4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
|
359 |
+
5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
360 |
+
6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
361 |
+
7eeN-fXbso8_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
|
362 |
+
8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
363 |
+
8qMHvgA9mGw_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
|
364 |
+
9CRb-PToaAM_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
365 |
+
AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
366 |
+
BGp9-Ro5h8Y_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
367 |
+
CDrpqsGqfPo_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
|
368 |
+
Cc7-P0py1Mc_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
369 |
+
Daqv2F6SEmQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
370 |
+
F9Dbcxr-lAI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
371 |
+
GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
372 |
+
GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
373 |
+
H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
374 |
+
LNQ7fzfdLiY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
375 |
+
MEUcv-QM0cQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
376 |
+
QWVub6-0jX4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
377 |
+
R8G5Y0HASxY_60.000_70.000.wav 60.000 70.000 Ambulance (siren)
|
378 |
+
RVTKY5KR3ME_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
|
379 |
+
Sm0pPvXPA9U_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
380 |
+
VXI3-DI4xNs_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
381 |
+
W8fIlauyJkk_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
382 |
+
ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
|
383 |
+
ZxlbI2Rj1VY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
384 |
+
ZyuX_gMFiss_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
385 |
+
bA8mt0JI0Ko_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
386 |
+
bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
387 |
+
cHm1cYBAXMI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
388 |
+
cR79KnWpiQA_70.000_80.000.wav 70.000 80.000 Ambulance (siren)
|
389 |
+
dPcw4R5lczw_500.000_510.000.wav 500.000 510.000 Ambulance (siren)
|
390 |
+
epwDz5WBkvc_80.000_90.000.wav 80.000 90.000 Ambulance (siren)
|
391 |
+
fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
392 |
+
gw9pYEG2Zb0_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
|
393 |
+
iEX8L_oEbsU_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
394 |
+
iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
395 |
+
iSnWMz4FUAg_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
396 |
+
kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
397 |
+
kSjvt2Z_pBo_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
398 |
+
ke35yF1LHs4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
399 |
+
lqGtL8sUo_g_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
400 |
+
mAfPu0meA_Y_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
|
401 |
+
mlS9LLiMIG8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
402 |
+
oPR7tUEUptk_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
403 |
+
qsHc2X1toLs_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
404 |
+
rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
405 |
+
rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
|
406 |
+
s0iddDFzL9s_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
407 |
+
tcKlq7_cOkw_8.000_18.000.wav 8.000 18.000 Ambulance (siren)
|
408 |
+
u3yYpMwG4Us_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
409 |
+
vBXPyBiyJG0_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
410 |
+
vVqUvv1SSu8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
411 |
+
vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
412 |
+
ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
|
413 |
+
z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
414 |
+
zbiJEml563w_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
|
415 |
+
-HxRz4w60-Y_150.000_160.000.wav 150.000 160.000 Fire engine, fire truck (siren)
|
416 |
+
-_dElQcyJnA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
417 |
+
0K1mroXg8bs_9.000_19.000.wav 9.000 19.000 Fire engine, fire truck (siren)
|
418 |
+
0SvSNVatkv0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
419 |
+
2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
420 |
+
31WGUPOYS5g_22.000_32.000.wav 22.000 32.000 Fire engine, fire truck (siren)
|
421 |
+
3h3_IZWhX0g_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
422 |
+
4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
|
423 |
+
5fjy_2ajEkg_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
424 |
+
6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
425 |
+
8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
426 |
+
8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Fire engine, fire truck (siren)
|
427 |
+
ARIVxBOc0BQ_40.000_50.000.wav 40.000 50.000 Fire engine, fire truck (siren)
|
428 |
+
AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
429 |
+
Bs2KqqI9F_k_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
430 |
+
Cc7-P0py1Mc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
431 |
+
D4M3YT75ZrQ_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren)
|
432 |
+
DWXQ_cSUW98_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
433 |
+
Daqv2F6SEmQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
434 |
+
DpagxUQwXDo_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
435 |
+
FFSI6Bg2M-Q_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
436 |
+
GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
437 |
+
GbIuxmaiCOk_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
438 |
+
GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
439 |
+
H6c8ZDrdUaM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
440 |
+
H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
441 |
+
HQQxGJKg1iM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
442 |
+
IiCh2H3JtsE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
443 |
+
InrS4Fdndr4_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
|
444 |
+
JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
445 |
+
MEUcv-QM0cQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
446 |
+
PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Fire engine, fire truck (siren)
|
447 |
+
VXI3-DI4xNs_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
448 |
+
Xggsbzzes3M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
449 |
+
YbiiaDBU-HI_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
|
450 |
+
ZeH6Fc7Y900_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
451 |
+
ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
|
452 |
+
bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
453 |
+
cHm1cYBAXMI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
454 |
+
fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
455 |
+
iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
456 |
+
k2a30--j37Q_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
457 |
+
kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
458 |
+
kr8ssbrDDMY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
459 |
+
pvYwIdGrS90_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
460 |
+
qsHc2X1toLs_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
461 |
+
rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
462 |
+
rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
|
463 |
+
u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
464 |
+
u9aHjYGbl5o_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
465 |
+
uUiZrgUpw2A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
466 |
+
vBXPyBiyJG0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
467 |
+
vVqUvv1SSu8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
468 |
+
vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
469 |
+
wD0P-doqkXo_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren)
|
470 |
+
xbr7x2V6mxk_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
471 |
+
ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren)
|
472 |
+
z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
473 |
+
zpzJKMG5iGc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
474 |
+
02Ak1eIyj3M_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
475 |
+
0CJFt950vOk_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
476 |
+
0phl6nlC-n0_10.000_20.000.wav 10.000 20.000 Civil defense siren
|
477 |
+
1jhbNtCWC9w_50.000_60.000.wav 50.000 60.000 Civil defense siren
|
478 |
+
4Ukj2TTJxHM_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
479 |
+
4XAVaSz_P7c_150.000_160.000.wav 150.000 160.000 Civil defense siren
|
480 |
+
69AIBPnJN5E_0.000_10.000.wav 0.000 10.000 Civil defense siren
|
481 |
+
8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Civil defense siren
|
482 |
+
8ILgvaJVPCI_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
483 |
+
9MWHXCLAX8I_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
484 |
+
A5y-aZc0CiM_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
485 |
+
AQCZH4OdNSM_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
486 |
+
AVBUh6qeHrQ_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
487 |
+
BhQPDafekdw_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
488 |
+
CJXNdudcJrs_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
489 |
+
CU2MyVM_B48_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
490 |
+
DdZw0XDv0JI_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
491 |
+
DgWHUawAGnI_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
492 |
+
Do9Dffb6vHA_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
493 |
+
GO2zKyMtBV4_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
494 |
+
GeRgy4of730_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
495 |
+
IIypdzgZAaI_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
496 |
+
JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
497 |
+
JqHJ7015aWM_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
498 |
+
K7a1P4RX_5w_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
499 |
+
KrTocA-I550_190.000_200.000.wav 190.000 200.000 Civil defense siren
|
500 |
+
KumYcZVLOVU_350.000_360.000.wav 350.000 360.000 Civil defense siren
|
501 |
+
L60HS_jbZu0_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
502 |
+
MZ1Yh6mRC-E_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
503 |
+
R8XUrRCFkzs_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
504 |
+
SyWbolNFst4_60.000_70.000.wav 60.000 70.000 Civil defense siren
|
505 |
+
TYLZuBBu8ms_0.000_10.000.wav 0.000 10.000 Civil defense siren
|
506 |
+
Tx6eSkU2lKc_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
507 |
+
VcflBZLflSU_130.000_140.000.wav 130.000 140.000 Civil defense siren
|
508 |
+
WXsTHg_DiYA_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
509 |
+
Wz5ffJxCElQ_10.000_20.000.wav 10.000 20.000 Civil defense siren
|
510 |
+
X2MlmcY8UZU_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
511 |
+
XYLheTmlEYI_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
512 |
+
YyxlD_FwZXM_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
513 |
+
adCuLs-4nmI_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
514 |
+
cPjtrTq3F-I_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
515 |
+
eHDm93tI4Ok_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
516 |
+
etppP5Sdo14_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
517 |
+
fRKxUc1gQBw_50.000_60.000.wav 50.000 60.000 Civil defense siren
|
518 |
+
feIue4LHzfM_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
519 |
+
gr-Yen6Sj_Q_0.000_10.000.wav 0.000 10.000 Civil defense siren
|
520 |
+
hl3Kqi9Wi_g_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
521 |
+
iKca2cbowd4_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
522 |
+
kzFyGWdj6MI_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
523 |
+
m3LGopSVju4_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
524 |
+
ne4IMxs-hMk_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
525 |
+
nuu2iNisoQc_6.000_16.000.wav 6.000 16.000 Civil defense siren
|
526 |
+
oYeql9xE19k_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
527 |
+
rGUrM19BnJ8_110.000_120.000.wav 110.000 120.000 Civil defense siren
|
528 |
+
u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
529 |
+
uCRAnDBXxgI_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
530 |
+
vQG4HZR2KSk_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
531 |
+
vjsG5b2yNzc_190.000_200.000.wav 190.000 200.000 Civil defense siren
|
532 |
+
yO7guxGY-_k_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
533 |
+
-9GUUhB3QV0_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
534 |
+
-HxRz4w60-Y_150.000_160.000.wav 150.000 160.000 Police car (siren)
|
535 |
+
-UBVqmhbT50_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
536 |
+
-_dElQcyJnA_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
537 |
+
0N0C0Wbe6AI_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
538 |
+
0SvSNVatkv0_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
539 |
+
145N68nh4m0_120.000_130.000.wav 120.000 130.000 Police car (siren)
|
540 |
+
2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
541 |
+
31WGUPOYS5g_22.000_32.000.wav 22.000 32.000 Police car (siren)
|
542 |
+
5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
543 |
+
6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
544 |
+
8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
545 |
+
8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Police car (siren)
|
546 |
+
8E7okHnCcTA_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
547 |
+
9CRb-PToaAM_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
548 |
+
9OFUd38sBNM_0.000_8.000.wav 0.000 8.000 Police car (siren)
|
549 |
+
AQCZH4OdNSM_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
550 |
+
AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
551 |
+
CDrpqsGqfPo_10.000_20.000.wav 10.000 20.000 Police car (siren)
|
552 |
+
DK_6C29B2zs_14.000_24.000.wav 14.000 24.000 Police car (siren)
|
553 |
+
GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
554 |
+
GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
555 |
+
H6c8ZDrdUaM_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
556 |
+
H7lOMlND9dc_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
557 |
+
H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
558 |
+
IiCh2H3JtsE_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
559 |
+
InrS4Fdndr4_0.000_10.000.wav 0.000 10.000 Police car (siren)
|
560 |
+
JgDuU9kpHpM_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
561 |
+
JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
562 |
+
LNQ7fzfdLiY_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
563 |
+
PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Police car (siren)
|
564 |
+
QWVub6-0jX4_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
565 |
+
Wak5QxsS-QU_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
566 |
+
YbiiaDBU-HI_10.000_20.000.wav 10.000 20.000 Police car (siren)
|
567 |
+
Z34SD-OEpJI_10.000_20.000.wav 10.000 20.000 Police car (siren)
|
568 |
+
ZeH6Fc7Y900_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
569 |
+
ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Police car (siren)
|
570 |
+
ZyuX_gMFiss_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
571 |
+
bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
572 |
+
eIMjkADTWzA_60.000_70.000.wav 60.000 70.000 Police car (siren)
|
573 |
+
epwDz5WBkvc_80.000_90.000.wav 80.000 90.000 Police car (siren)
|
574 |
+
fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
575 |
+
fNcrlqPrAqM_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
576 |
+
g_DBLppDZAs_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
577 |
+
gw9pYEG2Zb0_20.000_30.000.wav 20.000 30.000 Police car (siren)
|
578 |
+
iEX8L_oEbsU_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
579 |
+
iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
580 |
+
kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
581 |
+
kSjvt2Z_pBo_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
582 |
+
lqGtL8sUo_g_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
583 |
+
mAfPu0meA_Y_20.000_30.000.wav 20.000 30.000 Police car (siren)
|
584 |
+
mlS9LLiMIG8_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
585 |
+
pzup58Eyhuo_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
586 |
+
rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
587 |
+
rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Police car (siren)
|
588 |
+
u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
589 |
+
u3yYpMwG4Us_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
590 |
+
u9aHjYGbl5o_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
591 |
+
uUiZrgUpw2A_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
592 |
+
vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
593 |
+
xbr7x2V6mxk_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
594 |
+
z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
595 |
+
-FKrYTj_eCU_0.000_10.000.wav 0.000 10.000 Screaming
|
596 |
+
0G50t4FlbIA_60.000_70.000.wav 60.000 70.000 Screaming
|
597 |
+
1LTxZ2aNytc_30.000_40.000.wav 30.000 40.000 Screaming
|
598 |
+
2FEhG1UXb_E_370.000_380.000.wav 370.000 380.000 Screaming
|
599 |
+
45vBbOhzS6g_50.000_60.000.wav 50.000 60.000 Screaming
|
600 |
+
4PYTtp78Ig0_60.000_70.000.wav 60.000 70.000 Screaming
|
601 |
+
5QNq0IEPICQ_30.000_40.000.wav 30.000 40.000 Screaming
|
602 |
+
5YcIJuYQECc_0.000_6.000.wav 0.000 6.000 Screaming
|
603 |
+
5kQF4r03yRI_0.000_6.000.wav 0.000 6.000 Screaming
|
604 |
+
7ARVgI_wx5Y_30.000_40.000.wav 30.000 40.000 Screaming
|
605 |
+
AIFvFuZPr68_30.000_40.000.wav 30.000 40.000 Screaming
|
606 |
+
Aw43FUCkIb8_20.000_30.000.wav 20.000 30.000 Screaming
|
607 |
+
AxM2BofYfPY_30.000_40.000.wav 30.000 40.000 Screaming
|
608 |
+
BFqHyCoypfM_16.000_26.000.wav 16.000 26.000 Screaming
|
609 |
+
Bk_xS_fKCpk_30.000_40.000.wav 30.000 40.000 Screaming
|
610 |
+
C4YMjmJ7tt4_90.000_100.000.wav 90.000 100.000 Screaming
|
611 |
+
CMWoAvgD0A0_9.000_19.000.wav 9.000 19.000 Screaming
|
612 |
+
DZfYFhywhRs_30.000_40.000.wav 30.000 40.000 Screaming
|
613 |
+
ElJFYwRtrH4_30.000_40.000.wav 30.000 40.000 Screaming
|
614 |
+
FcUVtXJMkJs_30.000_40.000.wav 30.000 40.000 Screaming
|
615 |
+
G--718JDmAQ_0.000_10.000.wav 0.000 10.000 Screaming
|
616 |
+
GPJ1uQwmNHk_30.000_40.000.wav 30.000 40.000 Screaming
|
617 |
+
H3vSRzkG82U_30.000_40.000.wav 30.000 40.000 Screaming
|
618 |
+
HS28EUWt8dE_110.000_120.000.wav 110.000 120.000 Screaming
|
619 |
+
KkGTB8ESMCM_0.000_10.000.wav 0.000 10.000 Screaming
|
620 |
+
MQ0YasvMcuQ_1.000_11.000.wav 1.000 11.000 Screaming
|
621 |
+
Msl9dI5yweA_90.000_100.000.wav 90.000 100.000 Screaming
|
622 |
+
Ntn6YvZM3kA_0.000_10.000.wav 0.000 10.000 Screaming
|
623 |
+
NwTHlpXdk4M_30.000_40.000.wav 30.000 40.000 Screaming
|
624 |
+
OHjfSfqa804_0.000_10.000.wav 0.000 10.000 Screaming
|
625 |
+
OzWJuqG2F3Y_30.000_40.000.wav 30.000 40.000 Screaming
|
626 |
+
QDW_uCMnMMU_0.000_8.000.wav 0.000 8.000 Screaming
|
627 |
+
SxI3Lnzzmkw_110.000_120.000.wav 110.000 120.000 Screaming
|
628 |
+
TVvbfuGu9eM_70.000_80.000.wav 70.000 80.000 Screaming
|
629 |
+
YCk9F0Uq3BE_70.000_80.000.wav 70.000 80.000 Screaming
|
630 |
+
Z54pSnNw2iM_30.000_40.000.wav 30.000 40.000 Screaming
|
631 |
+
a59ivTlYoNk_310.000_320.000.wav 310.000 320.000 Screaming
|
632 |
+
auC_LgwFF8g_30.000_40.000.wav 30.000 40.000 Screaming
|
633 |
+
bi8R9JbF2cc_80.000_90.000.wav 80.000 90.000 Screaming
|
634 |
+
cdbYsoEasio_70.000_80.000.wav 70.000 80.000 Screaming
|
635 |
+
dfsvT5xImNg_80.000_90.000.wav 80.000 90.000 Screaming
|
636 |
+
e2AaF6siR1A_540.000_550.000.wav 540.000 550.000 Screaming
|
637 |
+
gB1ytjgpcW4_190.000_200.000.wav 190.000 200.000 Screaming
|
638 |
+
gE-0JxMtUh0_20.000_30.000.wav 20.000 30.000 Screaming
|
639 |
+
hWiGgsuGnzs_100.000_110.000.wav 100.000 110.000 Screaming
|
640 |
+
l-iIfi3SNpw_120.000_130.000.wav 120.000 130.000 Screaming
|
641 |
+
mT-f0lGk-JM_30.000_40.000.wav 30.000 40.000 Screaming
|
642 |
+
nApE_Biu13k_10.000_20.000.wav 10.000 20.000 Screaming
|
643 |
+
nRMmafPUAEU_80.000_90.000.wav 80.000 90.000 Screaming
|
644 |
+
nYAbLuyqPis_30.000_40.000.wav 30.000 40.000 Screaming
|
645 |
+
nlYlNF30bVg_30.000_40.000.wav 30.000 40.000 Screaming
|
646 |
+
sUp-UXzgmrA_0.000_10.000.wav 0.000 10.000 Screaming
|
647 |
+
syIwNMo2TUA_0.000_7.000.wav 0.000 7.000 Screaming
|
648 |
+
uTu0a1wd9-M_21.000_31.000.wav 21.000 31.000 Screaming
|
649 |
+
xVG7dfH5DL0_320.000_330.000.wav 320.000 330.000 Screaming
|
650 |
+
xvAQ44hx3_k_220.000_230.000.wav 220.000 230.000 Screaming
|
651 |
+
yNTkb2zgA_M_70.000_80.000.wav 70.000 80.000 Screaming
|
652 |
+
zCdOEvduBTo_30.000_40.000.wav 30.000 40.000 Screaming
|
653 |
+
zMICvbCJ6zc_550.000_560.000.wav 550.000 560.000 Screaming
|
654 |
+
-0RWZT-miFs_420.000_430.000.wav 420.000 430.000 Car
|
655 |
+
-1pRmoJIGQc_11.000_21.000.wav 11.000 21.000 Car
|
656 |
+
-7eDqv-6AKQ_30.000_40.000.wav 30.000 40.000 Car
|
657 |
+
-CZ1LIc8aos_20.000_30.000.wav 20.000 30.000 Car
|
658 |
+
-HWygXWSNRA_30.000_40.000.wav 30.000 40.000 Car
|
659 |
+
-PVEno65928_30.000_40.000.wav 30.000 40.000 Car
|
660 |
+
-WgJ-M292Yc_30.000_40.000.wav 30.000 40.000 Car
|
661 |
+
0O-gZoirpRA_30.000_40.000.wav 30.000 40.000 Car
|
662 |
+
0QwxnzHf_0E_30.000_40.000.wav 30.000 40.000 Car
|
663 |
+
0bg1nzEVdgY_0.000_10.000.wav 0.000 10.000 Car
|
664 |
+
0lpPdWvg7Eo_0.000_10.000.wav 0.000 10.000 Car
|
665 |
+
11Pn3yJifSQ_4.000_14.000.wav 4.000 14.000 Car
|
666 |
+
1BgqrhbyRFw_30.000_40.000.wav 30.000 40.000 Car
|
667 |
+
1F9zCsJyw6k_430.000_440.000.wav 430.000 440.000 Car
|
668 |
+
1HayoASR-54_80.000_90.000.wav 80.000 90.000 Car
|
669 |
+
1P5FFxXLSpY_30.000_40.000.wav 30.000 40.000 Car
|
670 |
+
1hIg-Lsvc7Q_30.000_40.000.wav 30.000 40.000 Car
|
671 |
+
27m49pmJ8Og_370.000_380.000.wav 370.000 380.000 Car
|
672 |
+
2E_N8lnoVKE_30.000_40.000.wav 30.000 40.000 Car
|
673 |
+
2Fdau5KTEls_30.000_40.000.wav 30.000 40.000 Car
|
674 |
+
2STASUlGAjs_30.000_40.000.wav 30.000 40.000 Car
|
675 |
+
2fi0m8ei_B4_30.000_40.000.wav 30.000 40.000 Car
|
676 |
+
2uMXfAIMeN0_180.000_190.000.wav 180.000 190.000 Car
|
677 |
+
32V2zsK7GME_110.000_120.000.wav 110.000 120.000 Car
|
678 |
+
3YChVhqW42E_130.000_140.000.wav 130.000 140.000 Car
|
679 |
+
3_OLj6XChvM_30.000_40.000.wav 30.000 40.000 Car
|
680 |
+
3hLxPQpmfQo_30.000_40.000.wav 30.000 40.000 Car
|
681 |
+
3mDPQ_CPopw_30.000_40.000.wav 30.000 40.000 Car
|
682 |
+
3mor5mPSYoU_7.000_17.000.wav 7.000 17.000 Car
|
683 |
+
3xh2kScw64U_30.000_40.000.wav 30.000 40.000 Car
|
684 |
+
40s88hEcn5I_170.000_180.000.wav 170.000 180.000 Car
|
685 |
+
42P93B_GzGA_30.000_40.000.wav 30.000 40.000 Car
|
686 |
+
4KZWpXlcpM4_60.000_70.000.wav 60.000 70.000 Car
|
687 |
+
4TshFWSsrn8_290.000_300.000.wav 290.000 300.000 Car
|
688 |
+
4WRgvRI06zc_30.000_40.000.wav 30.000 40.000 Car
|
689 |
+
4aJfQpHt9lY_160.000_170.000.wav 160.000 170.000 Car
|
690 |
+
4hd2CLrzCZs_30.000_40.000.wav 30.000 40.000 Car
|
691 |
+
4zCHl7pRsNY_30.000_40.000.wav 30.000 40.000 Car
|
692 |
+
5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Car
|
693 |
+
5oirFKi6Sfo_190.000_200.000.wav 190.000 200.000 Car
|
694 |
+
5vmxFp1r1ZM_30.000_40.000.wav 30.000 40.000 Car
|
695 |
+
5z1rE_l-0Ow_0.000_8.000.wav 0.000 8.000 Car
|
696 |
+
620GoTv5Ic8_30.000_40.000.wav 30.000 40.000 Car
|
697 |
+
6BitLl5Bnxw_30.000_40.000.wav 30.000 40.000 Car
|
698 |
+
6FVA4hqp1Ro_30.000_40.000.wav 30.000 40.000 Car
|
699 |
+
6U942AYlcXA_30.000_40.000.wav 30.000 40.000 Car
|
700 |
+
6b2ZMMrLTz8_5.000_15.000.wav 5.000 15.000 Car
|
701 |
+
6ibh38autyA_30.000_40.000.wav 30.000 40.000 Car
|
702 |
+
6kuESYFcEqw_30.000_40.000.wav 30.000 40.000 Car
|
703 |
+
73cuZZq-J3w_20.000_30.000.wav 20.000 30.000 Car
|
704 |
+
764IcMEMVUk_90.000_100.000.wav 90.000 100.000 Car
|
705 |
+
7NH1WJlSiYI_30.000_40.000.wav 30.000 40.000 Car
|
706 |
+
7lJu9wEsErY_220.000_230.000.wav 220.000 230.000 Car
|
707 |
+
8CqqK9CzuXM_30.000_40.000.wav 30.000 40.000 Car
|
708 |
+
8SYLYWR47EE_30.000_40.000.wav 30.000 40.000 Car
|
709 |
+
8Wk-ZmlsUqY_28.000_38.000.wav 28.000 38.000 Car
|
710 |
+
8q8JrJNAa-Q_30.000_40.000.wav 30.000 40.000 Car
|
711 |
+
8rMlNbKlp_s_0.000_10.000.wav 0.000 10.000 Car
|
712 |
+
8sGJFPr2Nmc_30.000_40.000.wav 30.000 40.000 Car
|
713 |
+
8yRROnG0-lA_30.000_40.000.wav 30.000 40.000 Car
|
714 |
+
9Ti98L4PRCo_17.000_27.000.wav 17.000 27.000 Car
|
715 |
+
9fzAWj5YJ9c_30.000_40.000.wav 30.000 40.000 Car
|
716 |
+
9rq8h4oMJ98_30.000_40.000.wav 30.000 40.000 Car
|
717 |
+
9ye2Fn62xDc_60.000_70.000.wav 60.000 70.000 Car
|
718 |
+
ACGuC6SH4V4_150.000_160.000.wav 150.000 160.000 Car
|
719 |
+
AFz5TIs_Gug_30.000_40.000.wav 30.000 40.000 Car
|
720 |
+
AedlWfHafgw_21.000_31.000.wav 21.000 31.000 Car
|
721 |
+
AlsDSDTiaWI_30.000_40.000.wav 30.000 40.000 Car
|
722 |
+
B3SkK0wuOhY_130.000_140.000.wav 130.000 140.000 Car
|
723 |
+
B9n4a5ciI48_16.000_26.000.wav 16.000 26.000 Car
|
724 |
+
BAekfGvUtFM_30.000_40.000.wav 30.000 40.000 Car
|
725 |
+
BNLOvQbrPdc_290.000_300.000.wav 290.000 300.000 Car
|
726 |
+
BS1fqEDAvh0_330.000_340.000.wav 330.000 340.000 Car
|
727 |
+
Bqx_SZgCzZw_10.000_20.000.wav 10.000 20.000 Car
|
728 |
+
CZB6WXDuM1g_30.000_40.000.wav 30.000 40.000 Car
|
729 |
+
C_pnsyNXphA_30.000_40.000.wav 30.000 40.000 Car
|
730 |
+
Ck5ZjBf1nLM_30.000_40.000.wav 30.000 40.000 Car
|
731 |
+
CqNyeZeHb8Y_30.000_40.000.wav 30.000 40.000 Car
|
732 |
+
Cs1d7Ibk8CA_220.000_230.000.wav 220.000 230.000 Car
|
733 |
+
CuS-ok0xG9g_0.000_10.000.wav 0.000 10.000 Car
|
734 |
+
CuaBHNKycvI_30.000_40.000.wav 30.000 40.000 Car
|
735 |
+
Cwur_jvxMzY_360.000_370.000.wav 360.000 370.000 Car
|
736 |
+
DEGSyVygE98_110.000_120.000.wav 110.000 120.000 Car
|
737 |
+
DLxTYAUifjU_30.000_40.000.wav 30.000 40.000 Car
|
738 |
+
DkKpnvJk9u0_30.000_40.000.wav 30.000 40.000 Car
|
739 |
+
DkVfro9iq80_30.000_40.000.wav 30.000 40.000 Car
|
740 |
+
Dw1q9rBv7oU_30.000_40.000.wav 30.000 40.000 Car
|
741 |
+
E8NgxTz1d90_30.000_40.000.wav 30.000 40.000 Car
|
742 |
+
ExqedxdXuBc_70.000_80.000.wav 70.000 80.000 Car
|
743 |
+
FCxEMSNSEuI_160.000_170.000.wav 160.000 170.000 Car
|
744 |
+
FEoMTMxzn3U_30.000_40.000.wav 30.000 40.000 Car
|
745 |
+
FFSWmryaZ60_30.000_40.000.wav 30.000 40.000 Car
|
746 |
+
FYk2paHPSdg_30.000_40.000.wav 30.000 40.000 Car
|
747 |
+
Fo_FDiZhzDo_30.000_40.000.wav 30.000 40.000 Car
|
748 |
+
GteozUDpJRc_30.000_40.000.wav 30.000 40.000 Car
|
749 |
+
GwBS2NzjAvA_30.000_40.000.wav 30.000 40.000 Car
|
750 |
+
H8d1mZOqb1c_110.000_120.000.wav 110.000 120.000 Car
|
751 |
+
HFF_PpqLQ9w_30.000_40.000.wav 30.000 40.000 Car
|
752 |
+
HHlb-h2Pc7o_30.000_40.000.wav 30.000 40.000 Car
|
753 |
+
Hu8lxbHYaqg_40.000_50.000.wav 40.000 50.000 Car
|
754 |
+
I-HlrcP6Qg4_30.000_40.000.wav 30.000 40.000 Car
|
755 |
+
I7vs2H-Htt8_480.000_490.000.wav 480.000 490.000 Car
|
756 |
+
IblhEF_MiH8_400.000_410.000.wav 400.000 410.000 Car
|
757 |
+
JgXnbgS_XBk_480.000_490.000.wav 480.000 490.000 Car
|
758 |
+
Ju7Kg_H2iZQ_30.000_40.000.wav 30.000 40.000 Car
|
759 |
+
KiCB6pP6EEo_100.000_110.000.wav 100.000 110.000 Car
|
760 |
+
Kwpn3utYEHM_30.000_40.000.wav 30.000 40.000 Car
|
761 |
+
Ky9Kw-0XwAs_30.000_40.000.wav 30.000 40.000 Car
|
762 |
+
KzKDk-UgS54_30.000_40.000.wav 30.000 40.000 Car
|
763 |
+
L1qC8DicAZE_70.000_80.000.wav 70.000 80.000 Car
|
764 |
+
L4N0LOYZrFo_30.000_40.000.wav 30.000 40.000 Car
|
765 |
+
L535vIV3ED4_40.000_50.000.wav 40.000 50.000 Car
|
766 |
+
L9YtOeck3A0_0.000_10.000.wav 0.000 10.000 Car
|
767 |
+
LEtkHiZZugk_30.000_40.000.wav 30.000 40.000 Car
|
768 |
+
LLkNFGrrgUo_30.000_40.000.wav 30.000 40.000 Car
|
769 |
+
LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Car
|
770 |
+
M7NvD1WJQ7o_70.000_80.000.wav 70.000 80.000 Car
|
771 |
+
M8BFtmQRHq4_200.000_210.000.wav 200.000 210.000 Car
|
772 |
+
Mxn2FKuNwiI_20.000_30.000.wav 20.000 30.000 Car
|
773 |
+
NMqSBlEq14Q_30.000_40.000.wav 30.000 40.000 Car
|
774 |
+
NoPbk9fy6uw_10.000_20.000.wav 10.000 20.000 Car
|
775 |
+
O36torHptH4_30.000_40.000.wav 30.000 40.000 Car
|
776 |
+
OBwh-KGukE8_30.000_40.000.wav 30.000 40.000 Car
|
777 |
+
Oa2Os8eOUjs_30.000_40.000.wav 30.000 40.000 Car
|
778 |
+
PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Car
|
779 |
+
PfXdcsW8dJI_540.000_550.000.wav 540.000 550.000 Car
|
780 |
+
QAWuHvVCI6g_30.000_40.000.wav 30.000 40.000 Car
|
781 |
+
QBMDnMRwQCc_70.000_80.000.wav 70.000 80.000 Car
|
782 |
+
QzrS-S7OerE_370.000_380.000.wav 370.000 380.000 Car
|
783 |
+
R0BtkTm_CPI_30.000_40.000.wav 30.000 40.000 Car
|
784 |
+
SEHxfje9Eio_30.000_40.000.wav 30.000 40.000 Car
|
785 |
+
Sb3V17F8xU8_360.000_370.000.wav 360.000 370.000 Car
|
786 |
+
SkbFczIabRY_30.000_40.000.wav 30.000 40.000 Car
|
787 |
+
SqWkV-UQ6CI_30.000_40.000.wav 30.000 40.000 Car
|
788 |
+
TWDytzefXXc_10.000_20.000.wav 10.000 20.000 Car
|
789 |
+
Tv67JhZDAYs_30.000_40.000.wav 30.000 40.000 Car
|
790 |
+
VTwVF3xRSWg_12.000_22.000.wav 12.000 22.000 Car
|
791 |
+
VulCKZgWspc_570.000_580.000.wav 570.000 580.000 Car
|
792 |
+
Vx6mttDHWfo_30.000_40.000.wav 30.000 40.000 Car
|
793 |
+
W11cJ9HZNaY_30.000_40.000.wav 30.000 40.000 Car
|
794 |
+
WLXQgcx8qTI_30.000_40.000.wav 30.000 40.000 Car
|
795 |
+
WMbdMQ7rdFs_30.000_40.000.wav 30.000 40.000 Car
|
796 |
+
WZoQD6cInx8_360.000_370.000.wav 360.000 370.000 Car
|
797 |
+
WffmaOr2p8I_30.000_40.000.wav 30.000 40.000 Car
|
798 |
+
WoynilrteLU_30.000_40.000.wav 30.000 40.000 Car
|
799 |
+
WxrKq0aI0iM_130.000_140.000.wav 130.000 140.000 Car
|
800 |
+
X60eVxecY3I_30.000_40.000.wav 30.000 40.000 Car
|
801 |
+
X8fEzx-fA0U_80.000_90.000.wav 80.000 90.000 Car
|
802 |
+
XVxlZqwWcBI_10.000_20.000.wav 10.000 20.000 Car
|
803 |
+
Xnd8ERrynEo_120.000_130.000.wav 120.000 130.000 Car
|
804 |
+
XqXLI7bDb-I_0.000_7.000.wav 0.000 7.000 Car
|
805 |
+
XyCjByHuDIk_260.000_270.000.wav 260.000 270.000 Car
|
806 |
+
XzE7mp3pVik_0.000_10.000.wav 0.000 10.000 Car
|
807 |
+
Y5e8BW513ww_20.000_30.000.wav 20.000 30.000 Car
|
808 |
+
YJdBwuIn4Ec_30.000_40.000.wav 30.000 40.000 Car
|
809 |
+
YTFJUFWcRns_30.000_40.000.wav 30.000 40.000 Car
|
810 |
+
YY9aConw2QE_0.000_10.000.wav 0.000 10.000 Car
|
811 |
+
Yc_WuISxfLI_30.000_40.000.wav 30.000 40.000 Car
|
812 |
+
Ys_rO2Ieg1U_30.000_40.000.wav 30.000 40.000 Car
|
813 |
+
Z34SD-OEpJI_10.000_20.000.wav 10.000 20.000 Car
|
814 |
+
Z8cigemT5_g_210.000_220.000.wav 210.000 220.000 Car
|
815 |
+
ZJW7ymsioQc_16.000_26.000.wav 16.000 26.000 Car
|
816 |
+
ZY6A9ZDkudg_130.000_140.000.wav 130.000 140.000 Car
|
817 |
+
_Mw9lKigni4_30.000_40.000.wav 30.000 40.000 Car
|
818 |
+
_ZiJA6phEq8_30.000_40.000.wav 30.000 40.000 Car
|
819 |
+
_yU0-fmspFY_210.000_220.000.wav 210.000 220.000 Car
|
820 |
+
a5vTn5286-A_80.000_90.000.wav 80.000 90.000 Car
|
821 |
+
aCX6vJhHO2c_30.000_40.000.wav 30.000 40.000 Car
|
822 |
+
aHEAK0iWqKk_180.000_190.000.wav 180.000 190.000 Car
|
823 |
+
aOVPHKqKjyQ_90.000_100.000.wav 90.000 100.000 Car
|
824 |
+
aUq4glO5ryE_30.000_40.000.wav 30.000 40.000 Car
|
825 |
+
aW3DY8XDrmw_22.000_32.000.wav 22.000 32.000 Car
|
826 |
+
aa4uhPvKviY_30.000_40.000.wav 30.000 40.000 Car
|
827 |
+
akgqVmFFDiY_30.000_40.000.wav 30.000 40.000 Car
|
828 |
+
buOEFwXhoe0_310.000_320.000.wav 310.000 320.000 Car
|
829 |
+
cHCIoXF7moA_30.000_40.000.wav 30.000 40.000 Car
|
830 |
+
cW859JAzVZ0_30.000_40.000.wav 30.000 40.000 Car
|
831 |
+
cbYZQRz09bc_390.000_400.000.wav 390.000 400.000 Car
|
832 |
+
d-do1XZ8f_E_30.000_40.000.wav 30.000 40.000 Car
|
833 |
+
d3gMwtMK6Gs_30.000_40.000.wav 30.000 40.000 Car
|
834 |
+
d6AioJ8CkTc_30.000_40.000.wav 30.000 40.000 Car
|
835 |
+
dAud19zNZyw_190.000_200.000.wav 190.000 200.000 Car
|
836 |
+
dC1TVxwiitc_30.000_40.000.wav 30.000 40.000 Car
|
837 |
+
dFqOBLxhEl8_20.000_30.000.wav 20.000 30.000 Car
|
838 |
+
dSfcznv4KLo_30.000_40.000.wav 30.000 40.000 Car
|
839 |
+
dThSTe35jb0_50.000_60.000.wav 50.000 60.000 Car
|
840 |
+
dfwr8wgZU8M_40.000_50.000.wav 40.000 50.000 Car
|
841 |
+
dmJH84FnQa8_30.000_40.000.wav 30.000 40.000 Car
|
842 |
+
e9xPBfEJni8_230.000_240.000.wav 230.000 240.000 Car
|
843 |
+
eAl9WwRaWUE_30.000_40.000.wav 30.000 40.000 Car
|
844 |
+
eAt6si6k65c_30.000_40.000.wav 30.000 40.000 Car
|
845 |
+
eHiqCLHmoxI_0.000_8.000.wav 0.000 8.000 Car
|
846 |
+
eV5JX81GzqA_150.000_160.000.wav 150.000 160.000 Car
|
847 |
+
er1vQ-nse_g_30.000_40.000.wav 30.000 40.000 Car
|
848 |
+
eyFPHlybqDg_30.000_40.000.wav 30.000 40.000 Car
|
849 |
+
f70nsY7ThBA_220.000_230.000.wav 220.000 230.000 Car
|
850 |
+
fJLCT3xDGxA_30.000_40.000.wav 30.000 40.000 Car
|
851 |
+
fZMPDCNyQxE_30.000_40.000.wav 30.000 40.000 Car
|
852 |
+
f__6chtFRM0_30.000_40.000.wav 30.000 40.000 Car
|
853 |
+
fdDTuo_COG8_90.000_100.000.wav 90.000 100.000 Car
|
854 |
+
gFJjYWXeBn0_30.000_40.000.wav 30.000 40.000 Car
|
855 |
+
g_DBLppDZAs_30.000_40.000.wav 30.000 40.000 Car
|
856 |
+
gaFQgJLQHtU_90.000_100.000.wav 90.000 100.000 Car
|
857 |
+
gc6VlixMHXE_30.000_40.000.wav 30.000 40.000 Car
|
858 |
+
hN1ykzC8kZM_30.000_40.000.wav 30.000 40.000 Car
|
859 |
+
hQ_yyPI46FI_11.000_21.000.wav 11.000 21.000 Car
|
860 |
+
haiMRJEH-Aw_0.000_9.000.wav 0.000 9.000 Car
|
861 |
+
hsC_sT0A4XM_30.000_40.000.wav 30.000 40.000 Car
|
862 |
+
ihQDd1CqFBw_70.000_80.000.wav 70.000 80.000 Car
|
863 |
+
ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Car
|
864 |
+
j2R1zurR39E_30.000_40.000.wav 30.000 40.000 Car
|
865 |
+
j42ETHcp044_0.000_10.000.wav 0.000 10.000 Car
|
866 |
+
j7OEpDiK3IA_30.000_40.000.wav 30.000 40.000 Car
|
867 |
+
jCeUZwd8b2w_0.000_10.000.wav 0.000 10.000 Car
|
868 |
+
jZxusrD28rM_30.000_40.000.wav 30.000 40.000 Car
|
869 |
+
kdDgTDfo9HY_100.000_110.000.wav 100.000 110.000 Car
|
870 |
+
l6_h_YHuTbY_30.000_40.000.wav 30.000 40.000 Car
|
871 |
+
lRrv5m9Xu4k_30.000_40.000.wav 30.000 40.000 Car
|
872 |
+
lb1awXgoyQE_0.000_10.000.wav 0.000 10.000 Car
|
873 |
+
llZBUsAwRWc_30.000_40.000.wav 30.000 40.000 Car
|
874 |
+
lu5teS1j1RQ_0.000_10.000.wav 0.000 10.000 Car
|
875 |
+
mCmjh_EJtb4_30.000_40.000.wav 30.000 40.000 Car
|
876 |
+
nFqf1vflJaI_350.000_360.000.wav 350.000 360.000 Car
|
877 |
+
njodYtK0Hqg_30.000_40.000.wav 30.000 40.000 Car
|
878 |
+
noymXcxyxis_30.000_40.000.wav 30.000 40.000 Car
|
879 |
+
o2CmtHNUrXg_30.000_40.000.wav 30.000 40.000 Car
|
880 |
+
oPJVdi0cqNE_30.000_40.000.wav 30.000 40.000 Car
|
881 |
+
oxJYMzEmtk4_10.000_20.000.wav 10.000 20.000 Car
|
882 |
+
pPnLErF3GOY_30.000_40.000.wav 30.000 40.000 Car
|
883 |
+
pXX6cK4xtiY_11.000_21.000.wav 11.000 21.000 Car
|
884 |
+
qC5M7BAsKOA_0.000_10.000.wav 0.000 10.000 Car
|
885 |
+
qg4WxBm8h_w_510.000_520.000.wav 510.000 520.000 Car
|
886 |
+
qxLdv8u_Ujw_0.000_5.000.wav 0.000 5.000 Car
|
887 |
+
rgeu0Gtf3Es_40.000_50.000.wav 40.000 50.000 Car
|
888 |
+
s3-i5eUpe6c_30.000_40.000.wav 30.000 40.000 Car
|
889 |
+
s5s3aR8Z7I8_350.000_360.000.wav 350.000 360.000 Car
|
890 |
+
syCQldBsAtg_30.000_40.000.wav 30.000 40.000 Car
|
891 |
+
tAfucDIyRiM_30.000_40.000.wav 30.000 40.000 Car
|
892 |
+
teoER4j9H14_290.000_300.000.wav 290.000 300.000 Car
|
893 |
+
uFSkczD2i14_30.000_40.000.wav 30.000 40.000 Car
|
894 |
+
uUyB4q7jgn4_30.000_40.000.wav 30.000 40.000 Car
|
895 |
+
uYqlVTlSgbM_40.000_50.000.wav 40.000 50.000 Car
|
896 |
+
v8Kry1CbTkM_310.000_320.000.wav 310.000 320.000 Car
|
897 |
+
vF2zXcbADUk_20.000_30.000.wav 20.000 30.000 Car
|
898 |
+
vHlqKDR7ggA_30.000_40.000.wav 30.000 40.000 Car
|
899 |
+
vPDXFKcdaS4_0.000_10.000.wav 0.000 10.000 Car
|
900 |
+
vW1nk4o9u5g_30.000_40.000.wav 30.000 40.000 Car
|
901 |
+
vdFYBSlmsXw_30.000_40.000.wav 30.000 40.000 Car
|
902 |
+
vtE1J8HsCUs_30.000_40.000.wav 30.000 40.000 Car
|
903 |
+
w0vy1YvNcOg_30.000_40.000.wav 30.000 40.000 Car
|
904 |
+
wDKrcZ7xLY8_80.000_90.000.wav 80.000 90.000 Car
|
905 |
+
wM-sBzIDzok_30.000_40.000.wav 30.000 40.000 Car
|
906 |
+
wUY4eWJt17w_30.000_40.000.wav 30.000 40.000 Car
|
907 |
+
we66pU0MN1M_30.000_40.000.wav 30.000 40.000 Car
|
908 |
+
wjfMWiYLDWA_30.000_40.000.wav 30.000 40.000 Car
|
909 |
+
wu3-_VKULZU_30.000_40.000.wav 30.000 40.000 Car
|
910 |
+
wwNIm8bgzKc_30.000_40.000.wav 30.000 40.000 Car
|
911 |
+
xqH9TpH6Xy0_0.000_10.000.wav 0.000 10.000 Car
|
912 |
+
xsT5ZJUnBg0_160.000_170.000.wav 160.000 170.000 Car
|
913 |
+
y9DFJEsiTLk_110.000_120.000.wav 110.000 120.000 Car
|
914 |
+
yESwp_fg0Po_70.000_80.000.wav 70.000 80.000 Car
|
915 |
+
yQg3eMb0QKU_30.000_40.000.wav 30.000 40.000 Car
|
916 |
+
yQjnNR7fXKo_50.000_60.000.wav 50.000 60.000 Car
|
917 |
+
zCuKYr_oMlE_60.000_70.000.wav 60.000 70.000 Car
|
918 |
+
zz35Va7tYmA_30.000_40.000.wav 30.000 40.000 Car
|
919 |
+
-CZ1LIc8aos_20.000_30.000.wav 20.000 30.000 Car passing by
|
920 |
+
-WgJ-M292Yc_30.000_40.000.wav 30.000 40.000 Car passing by
|
921 |
+
-iAAxJkoqcM_0.000_6.000.wav 0.000 6.000 Car passing by
|
922 |
+
0mQcGLpc8to_30.000_40.000.wav 30.000 40.000 Car passing by
|
923 |
+
1HtGgZnlKjU_30.000_40.000.wav 30.000 40.000 Car passing by
|
924 |
+
2IsAlhq0XFc_30.000_40.000.wav 30.000 40.000 Car passing by
|
925 |
+
2UvEmetE__I_30.000_40.000.wav 30.000 40.000 Car passing by
|
926 |
+
2oHGIzH_XzA_30.000_40.000.wav 30.000 40.000 Car passing by
|
927 |
+
3mor5mPSYoU_7.000_17.000.wav 7.000 17.000 Car passing by
|
928 |
+
8SYLYWR47EE_30.000_40.000.wav 30.000 40.000 Car passing by
|
929 |
+
8rzhhvS0tGc_30.000_40.000.wav 30.000 40.000 Car passing by
|
930 |
+
8v377AXrgac_30.000_40.000.wav 30.000 40.000 Car passing by
|
931 |
+
9lMtTDKyDEk_30.000_40.000.wav 30.000 40.000 Car passing by
|
932 |
+
BWoL8oKoTFI_30.000_40.000.wav 30.000 40.000 Car passing by
|
933 |
+
BsvD806qNM8_10.000_20.000.wav 10.000 20.000 Car passing by
|
934 |
+
C3LLtToB2zA_30.000_40.000.wav 30.000 40.000 Car passing by
|
935 |
+
Dk6b9dVD0i8_6.000_16.000.wav 6.000 16.000 Car passing by
|
936 |
+
Dw1q9rBv7oU_30.000_40.000.wav 30.000 40.000 Car passing by
|
937 |
+
EqFuY_U0Yz0_30.000_40.000.wav 30.000 40.000 Car passing by
|
938 |
+
FjpOboRcrNc_10.000_20.000.wav 10.000 20.000 Car passing by
|
939 |
+
FjyZV8zIJ0k_30.000_40.000.wav 30.000 40.000 Car passing by
|
940 |
+
Fn7eSPVvgCQ_30.000_40.000.wav 30.000 40.000 Car passing by
|
941 |
+
G6A-sT2DOjY_30.000_40.000.wav 30.000 40.000 Car passing by
|
942 |
+
GBXRuYIvhfM_30.000_40.000.wav 30.000 40.000 Car passing by
|
943 |
+
HDEPd5MIaow_30.000_40.000.wav 30.000 40.000 Car passing by
|
944 |
+
HQQxGJKg1iM_30.000_40.000.wav 30.000 40.000 Car passing by
|
945 |
+
If-V0XO-mpo_30.000_40.000.wav 30.000 40.000 Car passing by
|
946 |
+
JtuNiusRRLk_30.000_40.000.wav 30.000 40.000 Car passing by
|
947 |
+
M8BFtmQRHq4_200.000_210.000.wav 200.000 210.000 Car passing by
|
948 |
+
NKPAwhwZmqs_30.000_40.000.wav 30.000 40.000 Car passing by
|
949 |
+
Oa2Os8eOUjs_30.000_40.000.wav 30.000 40.000 Car passing by
|
950 |
+
QcLfJE-YfJY_30.000_40.000.wav 30.000 40.000 Car passing by
|
951 |
+
SkbFczIabRY_30.000_40.000.wav 30.000 40.000 Car passing by
|
952 |
+
VAiH1LX8guk_17.000_27.000.wav 17.000 27.000 Car passing by
|
953 |
+
Yc_WuISxfLI_30.000_40.000.wav 30.000 40.000 Car passing by
|
954 |
+
Yd10enP9ykM_30.000_40.000.wav 30.000 40.000 Car passing by
|
955 |
+
_HGGCwtyNxM_30.000_40.000.wav 30.000 40.000 Car passing by
|
956 |
+
a2U10_mi5as_30.000_40.000.wav 30.000 40.000 Car passing by
|
957 |
+
aB6FDPKAPus_30.000_40.000.wav 30.000 40.000 Car passing by
|
958 |
+
bDFQWubN4x4_30.000_40.000.wav 30.000 40.000 Car passing by
|
959 |
+
cW859JAzVZ0_30.000_40.000.wav 30.000 40.000 Car passing by
|
960 |
+
dDTvjXXFkDg_30.000_40.000.wav 30.000 40.000 Car passing by
|
961 |
+
dfwr8wgZU8M_40.000_50.000.wav 40.000 50.000 Car passing by
|
962 |
+
fJLCT3xDGxA_30.000_40.000.wav 30.000 40.000 Car passing by
|
963 |
+
gc6VlixMHXE_30.000_40.000.wav 30.000 40.000 Car passing by
|
964 |
+
gd_KjDM4fi8_0.000_10.000.wav 0.000 10.000 Car passing by
|
965 |
+
j7OEpDiK3IA_30.000_40.000.wav 30.000 40.000 Car passing by
|
966 |
+
jZxusrD28rM_30.000_40.000.wav 30.000 40.000 Car passing by
|
967 |
+
llZBUsAwRWc_30.000_40.000.wav 30.000 40.000 Car passing by
|
968 |
+
m_dCO5bBCic_26.000_36.000.wav 26.000 36.000 Car passing by
|
969 |
+
qDQX7Xi3GsQ_30.000_40.000.wav 30.000 40.000 Car passing by
|
970 |
+
qxLdv8u_Ujw_0.000_5.000.wav 0.000 5.000 Car passing by
|
971 |
+
reP-OOWiLWU_30.000_40.000.wav 30.000 40.000 Car passing by
|
972 |
+
s4jG5ZJYCvQ_30.000_40.000.wav 30.000 40.000 Car passing by
|
973 |
+
s5s3aR8Z7I8_350.000_360.000.wav 350.000 360.000 Car passing by
|
974 |
+
uUyB4q7jgn4_30.000_40.000.wav 30.000 40.000 Car passing by
|
975 |
+
vPDXFKcdaS4_0.000_10.000.wav 0.000 10.000 Car passing by
|
976 |
+
wD4QouhX8zo_30.000_40.000.wav 30.000 40.000 Car passing by
|
977 |
+
xqH9TpH6Xy0_0.000_10.000.wav 0.000 10.000 Car passing by
|
978 |
+
zd67ihUZ1u4_25.000_35.000.wav 25.000 35.000 Car passing by
|
979 |
+
-3z5mFRgbxc_30.000_40.000.wav 30.000 40.000 Bus
|
980 |
+
0N9EN0BEjP0_430.000_440.000.wav 430.000 440.000 Bus
|
981 |
+
0lPcHRhXlWk_30.000_40.000.wav 30.000 40.000 Bus
|
982 |
+
1E1evA4T_Tk_30.000_40.000.wav 30.000 40.000 Bus
|
983 |
+
1hIg-Lsvc7Q_30.000_40.000.wav 30.000 40.000 Bus
|
984 |
+
6-yQsEH2WYA_30.000_40.000.wav 30.000 40.000 Bus
|
985 |
+
6Y8wSI1l-Lw_30.000_40.000.wav 30.000 40.000 Bus
|
986 |
+
7T04388Ijk8_30.000_40.000.wav 30.000 40.000 Bus
|
987 |
+
8E7okHnCcTA_30.000_40.000.wav 30.000 40.000 Bus
|
988 |
+
8oEdgb8iXYA_1.000_11.000.wav 1.000 11.000 Bus
|
989 |
+
AdpNSGX2_Pk_10.000_20.000.wav 10.000 20.000 Bus
|
990 |
+
AwJ8orGuOXg_2.000_12.000.wav 2.000 12.000 Bus
|
991 |
+
BS1fqEDAvh0_330.000_340.000.wav 330.000 340.000 Bus
|
992 |
+
CoFbRc1OxFU_9.000_19.000.wav 9.000 19.000 Bus
|
993 |
+
DRqKOlP8BmU_110.000_120.000.wav 110.000 120.000 Bus
|
994 |
+
DYcXvyBFc5w_30.000_40.000.wav 30.000 40.000 Bus
|
995 |
+
DYdalOQnx1Y_30.000_40.000.wav 30.000 40.000 Bus
|
996 |
+
DkwFXd5nYLE_40.000_50.000.wav 40.000 50.000 Bus
|
997 |
+
FBMR3pW9H9o_30.000_40.000.wav 30.000 40.000 Bus
|
998 |
+
FEGa4e6RAlw_30.000_40.000.wav 30.000 40.000 Bus
|
999 |
+
Ge_KWS-0098_30.000_40.000.wav 30.000 40.000 Bus
|
1000 |
+
HxMoMMrA6Eo_30.000_40.000.wav 30.000 40.000 Bus
|
1001 |
+
I7esm6vqqZ4_30.000_40.000.wav 30.000 40.000 Bus
|
1002 |
+
JLj11umr1CE_0.000_10.000.wav 0.000 10.000 Bus
|
1003 |
+
JwAhcHHF2qg_30.000_40.000.wav 30.000 40.000 Bus
|
1004 |
+
LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Bus
|
1005 |
+
LzZ_nxuZ8Co_30.000_40.000.wav 30.000 40.000 Bus
|
1006 |
+
LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Bus
|
1007 |
+
Nyi9_-u6-w0_30.000_40.000.wav 30.000 40.000 Bus
|
1008 |
+
O_SKumO328I_30.000_40.000.wav 30.000 40.000 Bus
|
1009 |
+
Owg_XU9XmRM_30.000_40.000.wav 30.000 40.000 Bus
|
1010 |
+
P94rcZSuTT8_30.000_40.000.wav 30.000 40.000 Bus
|
1011 |
+
PP741kd2vRM_30.000_40.000.wav 30.000 40.000 Bus
|
1012 |
+
Qna9qrV8_go_30.000_40.000.wav 30.000 40.000 Bus
|
1013 |
+
Qt7FJkuqWPE_30.000_40.000.wav 30.000 40.000 Bus
|
1014 |
+
UcQ7cVukaxY_21.000_31.000.wav 21.000 31.000 Bus
|
1015 |
+
W8fIlauyJkk_30.000_40.000.wav 30.000 40.000 Bus
|
1016 |
+
WDn851XbWTk_30.000_40.000.wav 30.000 40.000 Bus
|
1017 |
+
WvquSD2PcCE_30.000_40.000.wav 30.000 40.000 Bus
|
1018 |
+
a9B_HA3y8WQ_30.000_40.000.wav 30.000 40.000 Bus
|
1019 |
+
cEEoKQ38fHY_30.000_40.000.wav 30.000 40.000 Bus
|
1020 |
+
er1vQ-nse_g_30.000_40.000.wav 30.000 40.000 Bus
|
1021 |
+
fLvM4bbpg6w_0.000_10.000.wav 0.000 10.000 Bus
|
1022 |
+
fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Bus
|
1023 |
+
gxVhAVNjSU0_30.000_40.000.wav 30.000 40.000 Bus
|
1024 |
+
jaSK_t8QP1E_30.000_40.000.wav 30.000 40.000 Bus
|
1025 |
+
ji_YCMygNHQ_8.000_18.000.wav 8.000 18.000 Bus
|
1026 |
+
kNKfoDp0uUw_30.000_40.000.wav 30.000 40.000 Bus
|
1027 |
+
kdDgTDfo9HY_100.000_110.000.wav 100.000 110.000 Bus
|
1028 |
+
lHP0q2sQzPQ_30.000_40.000.wav 30.000 40.000 Bus
|
1029 |
+
mGG8rop4Jig_30.000_40.000.wav 30.000 40.000 Bus
|
1030 |
+
oHKTmTLEy68_11.000_21.000.wav 11.000 21.000 Bus
|
1031 |
+
tAfucDIyRiM_30.000_40.000.wav 30.000 40.000 Bus
|
1032 |
+
tQd0vFueRKs_30.000_40.000.wav 30.000 40.000 Bus
|
1033 |
+
ucICmff0K-Q_30.000_40.000.wav 30.000 40.000 Bus
|
1034 |
+
x-2Abohj8VY_30.000_40.000.wav 30.000 40.000 Bus
|
1035 |
+
xFr2xX6PulQ_70.000_80.000.wav 70.000 80.000 Bus
|
1036 |
+
yfSBqp5IZSM_10.000_20.000.wav 10.000 20.000 Bus
|
1037 |
+
-2sE5CH8Wb8_30.000_40.000.wav 30.000 40.000 Truck
|
1038 |
+
-BY64_p-vtM_30.000_40.000.wav 30.000 40.000 Truck
|
1039 |
+
-fJsZm3YRc0_30.000_40.000.wav 30.000 40.000 Truck
|
1040 |
+
-t-htrAtNvM_30.000_40.000.wav 30.000 40.000 Truck
|
1041 |
+
-zNEcuo28oE_30.000_40.000.wav 30.000 40.000 Truck
|
1042 |
+
01WuUBxFBp4_30.000_40.000.wav 30.000 40.000 Truck
|
1043 |
+
077aWlQn6XI_30.000_40.000.wav 30.000 40.000 Truck
|
1044 |
+
0Ga7T-2e490_17.000_27.000.wav 17.000 27.000 Truck
|
1045 |
+
0N9EN0BEjP0_430.000_440.000.wav 430.000 440.000 Truck
|
1046 |
+
10aF24rMeu0_30.000_40.000.wav 30.000 40.000 Truck
|
1047 |
+
2HZcxlRs-hg_30.000_40.000.wav 30.000 40.000 Truck
|
1048 |
+
2Jpg_KvJWL0_30.000_40.000.wav 30.000 40.000 Truck
|
1049 |
+
2Tmi7EqpGZQ_0.000_10.000.wav 0.000 10.000 Truck
|
1050 |
+
4DlKNmVcoek_20.000_30.000.wav 20.000 30.000 Truck
|
1051 |
+
4MRzQbAIyV4_90.000_100.000.wav 90.000 100.000 Truck
|
1052 |
+
4Tpy1lsfcSM_30.000_40.000.wav 30.000 40.000 Truck
|
1053 |
+
4ep09nZl3LA_30.000_40.000.wav 30.000 40.000 Truck
|
1054 |
+
5DW8WjxxCag_30.000_40.000.wav 30.000 40.000 Truck
|
1055 |
+
5DjZHCumLfs_11.000_21.000.wav 11.000 21.000 Truck
|
1056 |
+
5QP1Tc3XbDc_30.000_40.000.wav 30.000 40.000 Truck
|
1057 |
+
5V0xKS-FGMk_30.000_40.000.wav 30.000 40.000 Truck
|
1058 |
+
5fLzQegwHUg_30.000_40.000.wav 30.000 40.000 Truck
|
1059 |
+
6HL_DKWK-WA_10.000_20.000.wav 10.000 20.000 Truck
|
1060 |
+
6VQGk8IrV-4_30.000_40.000.wav 30.000 40.000 Truck
|
1061 |
+
6Y8bKS6KLeE_30.000_40.000.wav 30.000 40.000 Truck
|
1062 |
+
6xEHP-C-ZuU_30.000_40.000.wav 30.000 40.000 Truck
|
1063 |
+
6yyToq9cW9A_60.000_70.000.wav 60.000 70.000 Truck
|
1064 |
+
7Gua0-UrKIw_30.000_40.000.wav 30.000 40.000 Truck
|
1065 |
+
7nglQSmcjAk_30.000_40.000.wav 30.000 40.000 Truck
|
1066 |
+
81DteAPIhoE_30.000_40.000.wav 30.000 40.000 Truck
|
1067 |
+
84E9i9_ELBs_30.000_40.000.wav 30.000 40.000 Truck
|
1068 |
+
8jblPMBafKE_30.000_40.000.wav 30.000 40.000 Truck
|
1069 |
+
8k17D6qiuqI_30.000_40.000.wav 30.000 40.000 Truck
|
1070 |
+
9EsgN-WS2qY_30.000_40.000.wav 30.000 40.000 Truck
|
1071 |
+
9LJnjmcRcb8_280.000_290.000.wav 280.000 290.000 Truck
|
1072 |
+
9yhMtJ50sys_30.000_40.000.wav 30.000 40.000 Truck
|
1073 |
+
A9KMqwqLboE_30.000_40.000.wav 30.000 40.000 Truck
|
1074 |
+
ARIVxBOc0BQ_40.000_50.000.wav 40.000 50.000 Truck
|
1075 |
+
AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Truck
|
1076 |
+
BQVXzH6YK8g_30.000_40.000.wav 30.000 40.000 Truck
|
1077 |
+
CnYWJp2bknU_50.000_60.000.wav 50.000 60.000 Truck
|
1078 |
+
DRqKOlP8BmU_110.000_120.000.wav 110.000 120.000 Truck
|
1079 |
+
DXlTakKvLzg_30.000_40.000.wav 30.000 40.000 Truck
|
1080 |
+
DkVfro9iq80_30.000_40.000.wav 30.000 40.000 Truck
|
1081 |
+
Dmy4EjohxxU_60.000_70.000.wav 60.000 70.000 Truck
|
1082 |
+
DvMFQ64YwcI_30.000_40.000.wav 30.000 40.000 Truck
|
1083 |
+
FEoMTMxzn3U_30.000_40.000.wav 30.000 40.000 Truck
|
1084 |
+
GTk_6JDmtCY_230.000_240.000.wav 230.000 240.000 Truck
|
1085 |
+
HDEPd5MIaow_30.000_40.000.wav 30.000 40.000 Truck
|
1086 |
+
HQkLVac7z9Q_70.000_80.000.wav 70.000 80.000 Truck
|
1087 |
+
I4VDcVTE4YA_30.000_40.000.wav 30.000 40.000 Truck
|
1088 |
+
IxlvxvG8zOE_110.000_120.000.wav 110.000 120.000 Truck
|
1089 |
+
JLzD44Im1Ec_30.000_40.000.wav 30.000 40.000 Truck
|
1090 |
+
K4Hcb00hTTY_30.000_40.000.wav 30.000 40.000 Truck
|
1091 |
+
L2M3xanqQP8_30.000_40.000.wav 30.000 40.000 Truck
|
1092 |
+
LA5TekLaIPI_10.000_20.000.wav 10.000 20.000 Truck
|
1093 |
+
LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Truck
|
1094 |
+
MWTTe0M9vi4_30.000_40.000.wav 30.000 40.000 Truck
|
1095 |
+
Nkqx09b-xyI_70.000_80.000.wav 70.000 80.000 Truck
|
1096 |
+
NqzZbJJl3E4_30.000_40.000.wav 30.000 40.000 Truck
|
1097 |
+
OPd0cz1hRqc_30.000_40.000.wav 30.000 40.000 Truck
|
1098 |
+
PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Truck
|
1099 |
+
PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Truck
|
1100 |
+
PO1eaJ7tQOg_180.000_190.000.wav 180.000 190.000 Truck
|
1101 |
+
PSt0xAYgf4g_0.000_10.000.wav 0.000 10.000 Truck
|
1102 |
+
Pef6g19i5iI_30.000_40.000.wav 30.000 40.000 Truck
|
1103 |
+
Q1CMSV81_ws_30.000_40.000.wav 30.000 40.000 Truck
|
1104 |
+
SiBIYAiIajM_30.000_40.000.wav 30.000 40.000 Truck
|
1105 |
+
T6oYCFRafPs_30.000_40.000.wav 30.000 40.000 Truck
|
1106 |
+
WdubBeFntYQ_460.000_470.000.wav 460.000 470.000 Truck
|
1107 |
+
_ZiJA6phEq8_30.000_40.000.wav 30.000 40.000 Truck
|
1108 |
+
_jfv_ziZWII_60.000_70.000.wav 60.000 70.000 Truck
|
1109 |
+
acvV6yYNc7Y_30.000_40.000.wav 30.000 40.000 Truck
|
1110 |
+
bQSaQ0iX_vk_30.000_40.000.wav 30.000 40.000 Truck
|
1111 |
+
bhxN5w03yS0_30.000_40.000.wav 30.000 40.000 Truck
|
1112 |
+
ckt7YEGcSoY_30.000_40.000.wav 30.000 40.000 Truck
|
1113 |
+
eIkUuCRE_0U_30.000_40.000.wav 30.000 40.000 Truck
|
1114 |
+
gxVhAVNjSU0_30.000_40.000.wav 30.000 40.000 Truck
|
1115 |
+
hDVNQOJCvOk_30.000_40.000.wav 30.000 40.000 Truck
|
1116 |
+
ieZVo7W3BQ4_30.000_40.000.wav 30.000 40.000 Truck
|
1117 |
+
ikmE_kRvDAc_30.000_40.000.wav 30.000 40.000 Truck
|
1118 |
+
jwZTKNsbf58_70.000_80.000.wav 70.000 80.000 Truck
|
1119 |
+
kH6fFjIZkB0_30.000_40.000.wav 30.000 40.000 Truck
|
1120 |
+
kr8ssbrDDMY_30.000_40.000.wav 30.000 40.000 Truck
|
1121 |
+
lp66EaEOOoU_30.000_40.000.wav 30.000 40.000 Truck
|
1122 |
+
n4o1r8Ai66o_30.000_40.000.wav 30.000 40.000 Truck
|
1123 |
+
nDtrUUc2J2U_0.000_10.000.wav 0.000 10.000 Truck
|
1124 |
+
nMaSkwx6cHE_30.000_40.000.wav 30.000 40.000 Truck
|
1125 |
+
p70IcMwsW9M_30.000_40.000.wav 30.000 40.000 Truck
|
1126 |
+
pJ1fore8JbQ_30.000_40.000.wav 30.000 40.000 Truck
|
1127 |
+
pt-J_L-OFI8_0.000_10.000.wav 0.000 10.000 Truck
|
1128 |
+
rdanJP7Usrg_30.000_40.000.wav 30.000 40.000 Truck
|
1129 |
+
srTX18ikXkE_10.000_20.000.wav 10.000 20.000 Truck
|
1130 |
+
tuplsUUDXKw_30.000_40.000.wav 30.000 40.000 Truck
|
1131 |
+
x6vuWsdeS3s_30.000_40.000.wav 30.000 40.000 Truck
|
1132 |
+
xMClk12ouB8_30.000_40.000.wav 30.000 40.000 Truck
|
1133 |
+
ycqDMKTrvLY_30.000_40.000.wav 30.000 40.000 Truck
|
1134 |
+
yk5LqHTtHLo_30.000_40.000.wav 30.000 40.000 Truck
|
1135 |
+
yrscqyUOIlI_30.000_40.000.wav 30.000 40.000 Truck
|
1136 |
+
zM3chsL-B7U_30.000_40.000.wav 30.000 40.000 Truck
|
1137 |
+
06si40RVDco_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1138 |
+
0DzsPL-xElE_20.000_30.000.wav 20.000 30.000 Motorcycle
|
1139 |
+
145N68nh4m0_120.000_130.000.wav 120.000 130.000 Motorcycle
|
1140 |
+
16vw4K9qJnY_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1141 |
+
21QlKF17ipc_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1142 |
+
3LulQoOXNB0_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1143 |
+
45JHcLU57B8_20.000_30.000.wav 20.000 30.000 Motorcycle
|
1144 |
+
4NZkW-XaIa4_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1145 |
+
506I6LfdDuk_50.000_60.000.wav 50.000 60.000 Motorcycle
|
1146 |
+
6MCy1lh4qaw_20.000_30.000.wav 20.000 30.000 Motorcycle
|
1147 |
+
6R8cO4ARzkY_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1148 |
+
6taAP7SFewI_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1149 |
+
7g6aZTBe2xE_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1150 |
+
9HcahqYUVoc_90.000_100.000.wav 90.000 100.000 Motorcycle
|
1151 |
+
9N1iw5Vdim8_20.000_30.000.wav 20.000 30.000 Motorcycle
|
1152 |
+
ANWU9Hiy_5k_40.000_50.000.wav 40.000 50.000 Motorcycle
|
1153 |
+
BTNz6NftP34_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1154 |
+
BxnLAGsByCI_10.000_20.000.wav 10.000 20.000 Motorcycle
|
1155 |
+
CZgx_6XaEkg_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1156 |
+
D3BJuOwltoI_10.000_20.000.wav 10.000 20.000 Motorcycle
|
1157 |
+
FgN9v1jYqjA_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1158 |
+
HQ8eR2lvjSE_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1159 |
+
Mb-GyQEKoEc_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1160 |
+
Pair_NsHdTc_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1161 |
+
UFIBEBkm7ao_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1162 |
+
UWz5OIijWM4_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1163 |
+
WLX3Db60418_20.000_30.000.wav 20.000 30.000 Motorcycle
|
1164 |
+
X5Xs8Y1cJK0_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1165 |
+
ZGf0vrZStwI_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1166 |
+
ZfkO1HlI0zM_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1167 |
+
bhtB2Zgh9Q8_110.000_120.000.wav 110.000 120.000 Motorcycle
|
1168 |
+
d-m8eXCpeDg_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1169 |
+
d21IwtH2oHI_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1170 |
+
dhaKGPCgtfw_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1171 |
+
ee-0JGvEIng_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1172 |
+
epGDNMrsQb8_40.000_50.000.wav 40.000 50.000 Motorcycle
|
1173 |
+
ezUkPETm6cs_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1174 |
+
f724u5z_UDw_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1175 |
+
gGmWm1i6pVo_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1176 |
+
i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Motorcycle
|
1177 |
+
iMp8nODaotA_580.000_590.000.wav 580.000 590.000 Motorcycle
|
1178 |
+
lVW2CqsHJ4Y_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1179 |
+
lj7hzmz19-M_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1180 |
+
mX45CiTjf8I_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1181 |
+
mbLiZ_jpgeY_20.000_30.000.wav 20.000 30.000 Motorcycle
|
1182 |
+
owZDBEq6WdU_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1183 |
+
pNMBIqvbyB4_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1184 |
+
po-tnKZAzdg_40.000_50.000.wav 40.000 50.000 Motorcycle
|
1185 |
+
qAQuljp-atA_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1186 |
+
r0Oll28wmXs_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1187 |
+
sAMjMyCdGOc_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1188 |
+
vHlqKDR7ggA_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1189 |
+
wPfv8ifzzyg_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1190 |
+
wyhurCZbKQU_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1191 |
+
xQTPEQDb0Gg_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1192 |
+
xTPmoYwgKf4_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1193 |
+
xXGIKM4daMU_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1194 |
+
xZ8hQliZqhg_160.000_170.000.wav 160.000 170.000 Motorcycle
|
1195 |
+
xuMBy2NoROI_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1196 |
+
z_8yGVO1qws_30.000_40.000.wav 30.000 40.000 Motorcycle
|
1197 |
+
-BaVEk1zS2g_50.000_60.000.wav 50.000 60.000 Train
|
1198 |
+
-Q4fBQ4egrs_0.000_10.000.wav 0.000 10.000 Train
|
1199 |
+
-QxSFr1cYuQ_20.000_30.000.wav 20.000 30.000 Train
|
1200 |
+
-ZdReI9dL6M_530.000_540.000.wav 530.000 540.000 Train
|
1201 |
+
0YIyGEM0yG0_550.000_560.000.wav 550.000 560.000 Train
|
1202 |
+
1Mk2MJDhLJQ_20.000_30.000.wav 20.000 30.000 Train
|
1203 |
+
2nejPPEWqJ8_320.000_330.000.wav 320.000 330.000 Train
|
1204 |
+
3ACjUf9QpAQ_30.000_40.000.wav 30.000 40.000 Train
|
1205 |
+
3RfrTU1p5SA_500.000_510.000.wav 500.000 510.000 Train
|
1206 |
+
3YJewEC-NWo_30.000_40.000.wav 30.000 40.000 Train
|
1207 |
+
3ZZDuYU2HM4_150.000_160.000.wav 150.000 160.000 Train
|
1208 |
+
3fPX1LaGwJo_60.000_70.000.wav 60.000 70.000 Train
|
1209 |
+
4_gyCWuPxRg_170.000_180.000.wav 170.000 180.000 Train
|
1210 |
+
4l4vGrMD4Tw_550.000_560.000.wav 550.000 560.000 Train
|
1211 |
+
4oT0bxldS80_30.000_40.000.wav 30.000 40.000 Train
|
1212 |
+
4t7Mi3pnSA4_210.000_220.000.wav 210.000 220.000 Train
|
1213 |
+
53oq_Otm_XI_30.000_40.000.wav 30.000 40.000 Train
|
1214 |
+
6OgSNQOTw2U_30.000_40.000.wav 30.000 40.000 Train
|
1215 |
+
6_TGlFO0DCk_10.000_20.000.wav 10.000 20.000 Train
|
1216 |
+
7KdSGBzXvz8_420.000_430.000.wav 420.000 430.000 Train
|
1217 |
+
7W_kcu0CJqI_310.000_320.000.wav 310.000 320.000 Train
|
1218 |
+
8IaInXpdd9M_0.000_10.000.wav 0.000 10.000 Train
|
1219 |
+
8nU1aVscJec_30.000_40.000.wav 30.000 40.000 Train
|
1220 |
+
9LQEZJPNVpw_30.000_40.000.wav 30.000 40.000 Train
|
1221 |
+
9NT6gEiqpWA_30.000_40.000.wav 30.000 40.000 Train
|
1222 |
+
AFhll08KM98_30.000_40.000.wav 30.000 40.000 Train
|
1223 |
+
AHom7lBbtoY_30.000_40.000.wav 30.000 40.000 Train
|
1224 |
+
AK0kZUDk294_2.000_12.000.wav 2.000 12.000 Train
|
1225 |
+
AKPC4rEGoyI_30.000_40.000.wav 30.000 40.000 Train
|
1226 |
+
APsvUzw7bWA_60.000_70.000.wav 60.000 70.000 Train
|
1227 |
+
AshwkKUV07s_23.000_33.000.wav 23.000 33.000 Train
|
1228 |
+
BI2Tol64na0_30.000_40.000.wav 30.000 40.000 Train
|
1229 |
+
BmS2NiuT2c0_160.000_170.000.wav 160.000 170.000 Train
|
1230 |
+
CCX_4cW_SAU_0.000_10.000.wav 0.000 10.000 Train
|
1231 |
+
D_nXtMgbPNY_30.000_40.000.wav 30.000 40.000 Train
|
1232 |
+
F-JFxERdA2w_30.000_40.000.wav 30.000 40.000 Train
|
1233 |
+
FoIBRxw0tyE_30.000_40.000.wav 30.000 40.000 Train
|
1234 |
+
G958vjLYBcI_110.000_120.000.wav 110.000 120.000 Train
|
1235 |
+
GFQnh84kNwU_30.000_40.000.wav 30.000 40.000 Train
|
1236 |
+
GKc8PCTen8Q_310.000_320.000.wav 310.000 320.000 Train
|
1237 |
+
I4qODX0fypE_30.000_40.000.wav 30.000 40.000 Train
|
1238 |
+
IIIxN_ziy_I_60.000_70.000.wav 60.000 70.000 Train
|
1239 |
+
IdqEbjujFb8_30.000_40.000.wav 30.000 40.000 Train
|
1240 |
+
K-i81KrH8BQ_30.000_40.000.wav 30.000 40.000 Train
|
1241 |
+
K9pSRLw6FNc_40.000_50.000.wav 40.000 50.000 Train
|
1242 |
+
KPyYUly5xCc_90.000_100.000.wav 90.000 100.000 Train
|
1243 |
+
L3a132_uApg_50.000_60.000.wav 50.000 60.000 Train
|
1244 |
+
LK4b2eJpy24_30.000_40.000.wav 30.000 40.000 Train
|
1245 |
+
LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Train
|
1246 |
+
MCYY8tJsnfY_7.000_17.000.wav 7.000 17.000 Train
|
1247 |
+
MDF2vsjm8jU_10.000_20.000.wav 10.000 20.000 Train
|
1248 |
+
MMfiWJVftMA_60.000_70.000.wav 60.000 70.000 Train
|
1249 |
+
MYzVHespZ-E_30.000_40.000.wav 30.000 40.000 Train
|
1250 |
+
Mbe4rlNiM84_0.000_7.000.wav 0.000 7.000 Train
|
1251 |
+
MczH_PWBNeI_360.000_370.000.wav 360.000 370.000 Train
|
1252 |
+
Mfkif49LLc4_30.000_40.000.wav 30.000 40.000 Train
|
1253 |
+
MwSbYICrYj8_290.000_300.000.wav 290.000 300.000 Train
|
1254 |
+
PJUy17bXlhc_40.000_50.000.wav 40.000 50.000 Train
|
1255 |
+
QDTbchu0LrU_30.000_40.000.wav 30.000 40.000 Train
|
1256 |
+
QZJ5WAYIUh8_70.000_80.000.wav 70.000 80.000 Train
|
1257 |
+
QrAoRSA13bM_30.000_40.000.wav 30.000 40.000 Train
|
1258 |
+
RN-_agT8_Cg_0.000_10.000.wav 0.000 10.000 Train
|
1259 |
+
R_Lpb-51Kl4_30.000_40.000.wav 30.000 40.000 Train
|
1260 |
+
Rhvy7V4F95Q_40.000_50.000.wav 40.000 50.000 Train
|
1261 |
+
Rq-22Cycrpg_30.000_40.000.wav 30.000 40.000 Train
|
1262 |
+
RrlgSfQrqQc_20.000_30.000.wav 20.000 30.000 Train
|
1263 |
+
RwBKGPEg6uA_340.000_350.000.wav 340.000 350.000 Train
|
1264 |
+
T73runykdnE_25.000_35.000.wav 25.000 35.000 Train
|
1265 |
+
T8M6W4yOzI4_30.000_40.000.wav 30.000 40.000 Train
|
1266 |
+
Tmm4H6alHCE_30.000_40.000.wav 30.000 40.000 Train
|
1267 |
+
TyTORMEourg_270.000_280.000.wav 270.000 280.000 Train
|
1268 |
+
UQx0EMXtLZA_60.000_70.000.wav 60.000 70.000 Train
|
1269 |
+
UZx7OAgRMRY_90.000_100.000.wav 90.000 100.000 Train
|
1270 |
+
UerX5Bv2hcs_70.000_80.000.wav 70.000 80.000 Train
|
1271 |
+
UxSUGCvpskM_340.000_350.000.wav 340.000 350.000 Train
|
1272 |
+
V2hln47cP78_130.000_140.000.wav 130.000 140.000 Train
|
1273 |
+
VIe_Qkg5RJI_130.000_140.000.wav 130.000 140.000 Train
|
1274 |
+
WDn851XbWTk_30.000_40.000.wav 30.000 40.000 Train
|
1275 |
+
WFdpQCtpBB4_30.000_40.000.wav 30.000 40.000 Train
|
1276 |
+
XAUtk9lwzU8_30.000_40.000.wav 30.000 40.000 Train
|
1277 |
+
XDTlBb3aYqo_30.000_40.000.wav 30.000 40.000 Train
|
1278 |
+
XKvLkIM8dck_40.000_50.000.wav 40.000 50.000 Train
|
1279 |
+
XQbeLJYzY9k_90.000_100.000.wav 90.000 100.000 Train
|
1280 |
+
XW8pSKLyr0o_20.000_30.000.wav 20.000 30.000 Train
|
1281 |
+
XeYiNanFS_M_120.000_130.000.wav 120.000 130.000 Train
|
1282 |
+
Y10I9JSvJuQ_30.000_40.000.wav 30.000 40.000 Train
|
1283 |
+
YDGf-razgyU_250.000_260.000.wav 250.000 260.000 Train
|
1284 |
+
YFD1Qrlskrg_60.000_70.000.wav 60.000 70.000 Train
|
1285 |
+
Y_jwEflLthg_190.000_200.000.wav 190.000 200.000 Train
|
1286 |
+
Y_ynIwm3qm0_370.000_380.000.wav 370.000 380.000 Train
|
1287 |
+
Zy0goYEHPHU_30.000_40.000.wav 30.000 40.000 Train
|
1288 |
+
_dkeW6lqmq4_30.000_40.000.wav 30.000 40.000 Train
|
1289 |
+
aNO2KEXBCOk_30.000_40.000.wav 30.000 40.000 Train
|
1290 |
+
aXsUHAKbyLs_30.000_40.000.wav 30.000 40.000 Train
|
1291 |
+
ahct5yzUtdE_20.000_30.000.wav 20.000 30.000 Train
|
1292 |
+
arevYmB0qGg_30.000_40.000.wav 30.000 40.000 Train
|
1293 |
+
bCGtzspNbNo_30.000_40.000.wav 30.000 40.000 Train
|
1294 |
+
bI6wPI9kAm8_70.000_80.000.wav 70.000 80.000 Train
|
1295 |
+
bpdCMWWiB_0_30.000_40.000.wav 30.000 40.000 Train
|
1296 |
+
cdrjKqyDrak_420.000_430.000.wav 420.000 430.000 Train
|
1297 |
+
d1o334I5X_k_30.000_40.000.wav 30.000 40.000 Train
|
1298 |
+
dSzZWgbJ378_30.000_40.000.wav 30.000 40.000 Train
|
1299 |
+
eRclX9l0F_c_150.000_160.000.wav 150.000 160.000 Train
|
1300 |
+
fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Train
|
1301 |
+
fWVfi9pAh_4_10.000_20.000.wav 10.000 20.000 Train
|
1302 |
+
fztkF47lVQg_0.000_10.000.wav 0.000 10.000 Train
|
1303 |
+
g0ICxHjC9Uc_30.000_40.000.wav 30.000 40.000 Train
|
1304 |
+
g2scd3YVgwQ_30.000_40.000.wav 30.000 40.000 Train
|
1305 |
+
g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Train
|
1306 |
+
g9JVq7wfDIo_30.000_40.000.wav 30.000 40.000 Train
|
1307 |
+
gKMpowHeyKc_30.000_40.000.wav 30.000 40.000 Train
|
1308 |
+
gTFCK9TuLOQ_30.000_40.000.wav 30.000 40.000 Train
|
1309 |
+
gU0mD2fSh4c_500.000_510.000.wav 500.000 510.000 Train
|
1310 |
+
gkH_Zxasn8o_40.000_50.000.wav 40.000 50.000 Train
|
1311 |
+
gvnM4kK4r70_10.000_20.000.wav 10.000 20.000 Train
|
1312 |
+
hH_M56EnnDk_30.000_40.000.wav 30.000 40.000 Train
|
1313 |
+
hVvtTC9AmNs_30.000_40.000.wav 30.000 40.000 Train
|
1314 |
+
hYqzr_rIIAw_30.000_40.000.wav 30.000 40.000 Train
|
1315 |
+
hdYQzH2E-e4_310.000_320.000.wav 310.000 320.000 Train
|
1316 |
+
iZgzRfa-xPQ_30.000_40.000.wav 30.000 40.000 Train
|
1317 |
+
j9Z63H5hvrQ_0.000_10.000.wav 0.000 10.000 Train
|
1318 |
+
jbW2ew8VMfU_50.000_60.000.wav 50.000 60.000 Train
|
1319 |
+
jlz7r-NSUuA_50.000_60.000.wav 50.000 60.000 Train
|
1320 |
+
k0vRZm7ZnQk_280.000_290.000.wav 280.000 290.000 Train
|
1321 |
+
k8H8rn4NaSM_0.000_10.000.wav 0.000 10.000 Train
|
1322 |
+
kbfkq3TuAe0_470.000_480.000.wav 470.000 480.000 Train
|
1323 |
+
lf1Sblrda3A_560.000_570.000.wav 560.000 570.000 Train
|
1324 |
+
m4DS9-5Gkds_30.000_40.000.wav 30.000 40.000 Train
|
1325 |
+
m5HeCy87QYY_380.000_390.000.wav 380.000 390.000 Train
|
1326 |
+
nKM4MUAsVzg_100.000_110.000.wav 100.000 110.000 Train
|
1327 |
+
nY1gcEMzsWI_10.000_20.000.wav 10.000 20.000 Train
|
1328 |
+
nfY_zkJceDw_30.000_40.000.wav 30.000 40.000 Train
|
1329 |
+
oogrnx-_LBA_60.000_70.000.wav 60.000 70.000 Train
|
1330 |
+
pW5SI1ZKUpA_30.000_40.000.wav 30.000 40.000 Train
|
1331 |
+
pbOZLMrJy0A_0.000_10.000.wav 0.000 10.000 Train
|
1332 |
+
pxmrmtEnROk_30.000_40.000.wav 30.000 40.000 Train
|
1333 |
+
q7zzKHFWGkg_30.000_40.000.wav 30.000 40.000 Train
|
1334 |
+
qu8vVFWKszA_30.000_40.000.wav 30.000 40.000 Train
|
1335 |
+
r6mHSfFkY_8_30.000_40.000.wav 30.000 40.000 Train
|
1336 |
+
rNNPQ9DD4no_30.000_40.000.wav 30.000 40.000 Train
|
1337 |
+
rSrBDAgLUoI_460.000_470.000.wav 460.000 470.000 Train
|
1338 |
+
stdjjG6Y5IU_30.000_40.000.wav 30.000 40.000 Train
|
1339 |
+
t_lFhyZaZR0_150.000_160.000.wav 150.000 160.000 Train
|
1340 |
+
txXSE7kgrc8_30.000_40.000.wav 30.000 40.000 Train
|
1341 |
+
uZfsEDo3elY_20.000_30.000.wav 20.000 30.000 Train
|
1342 |
+
umcnfA9veOw_160.000_170.000.wav 160.000 170.000 Train
|
1343 |
+
uysTr0SfhLI_10.000_20.000.wav 10.000 20.000 Train
|
1344 |
+
wM9wNgY8d4g_150.000_160.000.wav 150.000 160.000 Train
|
1345 |
+
xabrKa79prM_30.000_40.000.wav 30.000 40.000 Train
|
1346 |
+
xshKOSEF_6o_0.000_10.000.wav 0.000 10.000 Train
|
1347 |
+
yBVxtq9k8Sg_0.000_10.000.wav 0.000 10.000 Train
|
1348 |
+
yH1r2Bblluw_240.000_250.000.wav 240.000 250.000 Train
|
1349 |
+
yywGJu6jp8U_30.000_40.000.wav 30.000 40.000 Train
|
1350 |
+
z5uKFGeTtNg_30.000_40.000.wav 30.000 40.000 Train
|
audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv
ADDED
@@ -0,0 +1,606 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train horn
|
2 |
+
-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train horn
|
3 |
+
-GCwoyCnYsY_0.000_10.000.wav 0.000 10.000 Train horn
|
4 |
+
-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train horn
|
5 |
+
-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train horn
|
6 |
+
-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train horn
|
7 |
+
-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train horn
|
8 |
+
-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Train horn
|
9 |
+
-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train horn
|
10 |
+
-u9BxBNcrw4_30.000_40.000.wav 30.000 40.000 Train horn
|
11 |
+
-zqW9xCZd80_260.000_270.000.wav 260.000 270.000 Train horn
|
12 |
+
02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train horn
|
13 |
+
0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train horn
|
14 |
+
0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train horn
|
15 |
+
0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train horn
|
16 |
+
0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train horn
|
17 |
+
0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train horn
|
18 |
+
10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train horn
|
19 |
+
1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train horn
|
20 |
+
1S5WKCcf-wU_40.000_50.000.wav 40.000 50.000 Train horn
|
21 |
+
1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train horn
|
22 |
+
1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train horn
|
23 |
+
1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train horn
|
24 |
+
1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train horn
|
25 |
+
1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train horn
|
26 |
+
26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train horn
|
27 |
+
2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train horn
|
28 |
+
2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train horn
|
29 |
+
2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train horn
|
30 |
+
2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train horn
|
31 |
+
-8baTnilyjs_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
32 |
+
-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
33 |
+
-jG26jT3fP8_230.000_240.000.wav 230.000 240.000 Air horn, truck horn
|
34 |
+
-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Air horn, truck horn
|
35 |
+
-v7cUxke-f4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
36 |
+
-yeWlsEpcpA_15.000_25.000.wav 15.000 25.000 Air horn, truck horn
|
37 |
+
04KOunVOkSA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
38 |
+
08y2LHhxmsM_400.000_410.000.wav 400.000 410.000 Air horn, truck horn
|
39 |
+
0G73yqtBwgE_11.000_21.000.wav 11.000 21.000 Air horn, truck horn
|
40 |
+
0UPY7ws-VFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
|
41 |
+
0euD32aKYUs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
|
42 |
+
1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
43 |
+
1iRgwn7p0DA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
44 |
+
1myTsHAIvYc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
45 |
+
1z0XoG6GEv4_420.000_430.000.wav 420.000 430.000 Air horn, truck horn
|
46 |
+
26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Air horn, truck horn
|
47 |
+
2KmSuPb9gwA_24.000_34.000.wav 24.000 34.000 Air horn, truck horn
|
48 |
+
2Vy5NCEkg2I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
49 |
+
2ZciT0XrifM_0.000_8.000.wav 0.000 8.000 Air horn, truck horn
|
50 |
+
2jOzX06bzuA_16.000_26.000.wav 16.000 26.000 Air horn, truck horn
|
51 |
+
35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
52 |
+
3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Air horn, truck horn
|
53 |
+
3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Air horn, truck horn
|
54 |
+
3rGOv4evODE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
|
55 |
+
42U7xIucU68_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
|
56 |
+
46r7mO2k6zY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
57 |
+
4EBnb2DN3Yg_13.000_23.000.wav 13.000 23.000 Air horn, truck horn
|
58 |
+
4NTjS5pFfSc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
59 |
+
4bvfOnX7BIE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
60 |
+
4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
|
61 |
+
-ajCLjpfGKI_83.000_93.000.wav 83.000 93.000 Car alarm
|
62 |
+
-hLSc9aPOms_13.000_23.000.wav 13.000 23.000 Car alarm
|
63 |
+
-rgDWfvxxqw_30.000_40.000.wav 30.000 40.000 Car alarm
|
64 |
+
0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Car alarm
|
65 |
+
0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car alarm
|
66 |
+
0ZPafgZftWk_80.000_90.000.wav 80.000 90.000 Car alarm
|
67 |
+
0npLQ4LzD0c_40.000_50.000.wav 40.000 50.000 Car alarm
|
68 |
+
17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Car alarm
|
69 |
+
3HxQ83IMyw4_70.000_80.000.wav 70.000 80.000 Car alarm
|
70 |
+
3z05luLEc_Q_0.000_10.000.wav 0.000 10.000 Car alarm
|
71 |
+
4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Car alarm
|
72 |
+
4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car alarm
|
73 |
+
4h01lBkTVQY_18.000_28.000.wav 18.000 28.000 Car alarm
|
74 |
+
5-SzZotiaBU_30.000_40.000.wav 30.000 40.000 Car alarm
|
75 |
+
54PbkldEp9M_30.000_40.000.wav 30.000 40.000 Car alarm
|
76 |
+
5P6YYsMaIH4_30.000_40.000.wav 30.000 40.000 Car alarm
|
77 |
+
5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car alarm
|
78 |
+
7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Car alarm
|
79 |
+
7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car alarm
|
80 |
+
7NZ0kMj2HSI_54.000_64.000.wav 54.000 64.000 Car alarm
|
81 |
+
7RQpt1_1ZzU_30.000_40.000.wav 30.000 40.000 Car alarm
|
82 |
+
7ee54nr6jG8_30.000_40.000.wav 30.000 40.000 Car alarm
|
83 |
+
8OajsyPSNt8_40.000_50.000.wav 40.000 50.000 Car alarm
|
84 |
+
9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car alarm
|
85 |
+
9fzeD7CeI7Y_110.000_120.000.wav 110.000 120.000 Car alarm
|
86 |
+
9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car alarm
|
87 |
+
A-GNszKtjJc_93.000_103.000.wav 93.000 103.000 Car alarm
|
88 |
+
A437a4Y_xag_230.000_240.000.wav 230.000 240.000 Car alarm
|
89 |
+
APMPW2YI-Zk_20.000_30.000.wav 20.000 30.000 Car alarm
|
90 |
+
AR-KmtlXg4Y_70.000_80.000.wav 70.000 80.000 Car alarm
|
91 |
+
-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
92 |
+
-6d-zxMvC5E_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
93 |
+
-6qSMlbJJ58_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
94 |
+
-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
95 |
+
-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
96 |
+
-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
97 |
+
-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
98 |
+
-AXDeY-N2_M_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
99 |
+
-B1uzsLG0Dk_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
100 |
+
-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
101 |
+
-Em3OpyaefM_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
102 |
+
-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
103 |
+
-SP7KWmTRUU_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
104 |
+
-h4or05bj_I_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
105 |
+
-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
106 |
+
-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
107 |
+
-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
108 |
+
-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
109 |
+
-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
110 |
+
-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
111 |
+
-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
112 |
+
03xMfqt4fZI_24.000_34.000.wav 24.000 34.000 Reversing beeps
|
113 |
+
0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
114 |
+
0FQo-2xRJ0E_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
115 |
+
0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
116 |
+
0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Reversing beeps
|
117 |
+
0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
118 |
+
0P-YGHC5cBU_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
119 |
+
0QKet-tdquc_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
120 |
+
0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Reversing beeps
|
121 |
+
-5px8DVPl8A_28.000_38.000.wav 28.000 38.000 Bicycle
|
122 |
+
-D08wyQwDPQ_10.000_20.000.wav 10.000 20.000 Bicycle
|
123 |
+
-F1_Gh78vJ0_30.000_40.000.wav 30.000 40.000 Bicycle
|
124 |
+
-FZQIkX44Pk_10.000_20.000.wav 10.000 20.000 Bicycle
|
125 |
+
-FsvS99nWTc_30.000_40.000.wav 30.000 40.000 Bicycle
|
126 |
+
-Holdef_BZ0_30.000_40.000.wav 30.000 40.000 Bicycle
|
127 |
+
-Inn26beF70_30.000_40.000.wav 30.000 40.000 Bicycle
|
128 |
+
-Jq9HNSs_ns_14.000_24.000.wav 14.000 24.000 Bicycle
|
129 |
+
-KlN_AXMM0Q_30.000_40.000.wav 30.000 40.000 Bicycle
|
130 |
+
-NCcqKWiGus_30.000_40.000.wav 30.000 40.000 Bicycle
|
131 |
+
-NNC_TqWfGw_30.000_40.000.wav 30.000 40.000 Bicycle
|
132 |
+
-OGFiXvmldM_30.000_40.000.wav 30.000 40.000 Bicycle
|
133 |
+
-RFpDUZhN-g_13.000_23.000.wav 13.000 23.000 Bicycle
|
134 |
+
-XUfeRTw3b4_0.000_6.000.wav 0.000 6.000 Bicycle
|
135 |
+
-XoATxJ-Qcg_30.000_40.000.wav 30.000 40.000 Bicycle
|
136 |
+
-bFNxvFwDts_470.000_480.000.wav 470.000 480.000 Bicycle
|
137 |
+
-e5PokL6Cyo_30.000_40.000.wav 30.000 40.000 Bicycle
|
138 |
+
-fNyOf9zIU0_30.000_40.000.wav 30.000 40.000 Bicycle
|
139 |
+
-fhpkRyZL90_30.000_40.000.wav 30.000 40.000 Bicycle
|
140 |
+
-fo3m0hiZbg_30.000_40.000.wav 30.000 40.000 Bicycle
|
141 |
+
-ikJkNwcmkA_27.000_37.000.wav 27.000 37.000 Bicycle
|
142 |
+
-k2nMcxAjWE_30.000_40.000.wav 30.000 40.000 Bicycle
|
143 |
+
-k80ibA-fyw_30.000_40.000.wav 30.000 40.000 Bicycle
|
144 |
+
-lBcEVa_NKw_30.000_40.000.wav 30.000 40.000 Bicycle
|
145 |
+
-mQyAYU_Bd4_50.000_60.000.wav 50.000 60.000 Bicycle
|
146 |
+
-ngrinYHF4c_30.000_40.000.wav 30.000 40.000 Bicycle
|
147 |
+
-nqm_RJ2xj8_40.000_50.000.wav 40.000 50.000 Bicycle
|
148 |
+
-oAw5iTeT1g_40.000_50.000.wav 40.000 50.000 Bicycle
|
149 |
+
-p2EMzpTE38_4.000_14.000.wav 4.000 14.000 Bicycle
|
150 |
+
-qmfWP_yzn4_30.000_40.000.wav 30.000 40.000 Bicycle
|
151 |
+
-0DIFwkUpjQ_50.000_60.000.wav 50.000 60.000 Skateboard
|
152 |
+
-53qltVyjpc_180.000_190.000.wav 180.000 190.000 Skateboard
|
153 |
+
-5y4jb9eUWs_110.000_120.000.wav 110.000 120.000 Skateboard
|
154 |
+
-81kolkG8M0_0.000_8.000.wav 0.000 8.000 Skateboard
|
155 |
+
-9dwTSq6JZg_70.000_80.000.wav 70.000 80.000 Skateboard
|
156 |
+
-9oKZsjjf_0_20.000_30.000.wav 20.000 30.000 Skateboard
|
157 |
+
-AFGfu5zOzQ_30.000_40.000.wav 30.000 40.000 Skateboard
|
158 |
+
-DHGwygUsQc_30.000_40.000.wav 30.000 40.000 Skateboard
|
159 |
+
-DkuTmIs7_Q_30.000_40.000.wav 30.000 40.000 Skateboard
|
160 |
+
-E1E17R7UBA_260.000_270.000.wav 260.000 270.000 Skateboard
|
161 |
+
-E1aIXhB4YU_30.000_40.000.wav 30.000 40.000 Skateboard
|
162 |
+
-McJLXNN3-o_50.000_60.000.wav 50.000 60.000 Skateboard
|
163 |
+
-N7nQ4CXGsY_170.000_180.000.wav 170.000 180.000 Skateboard
|
164 |
+
-O5vrHFRzcY_30.000_40.000.wav 30.000 40.000 Skateboard
|
165 |
+
-Plh9jAN_Eo_0.000_2.000.wav 0.000 2.000 Skateboard
|
166 |
+
-Qd_dXTbgK0_30.000_40.000.wav 30.000 40.000 Skateboard
|
167 |
+
-aVZ-H92M_s_0.000_4.000.wav 0.000 4.000 Skateboard
|
168 |
+
-cd-Zn8qFxU_90.000_100.000.wav 90.000 100.000 Skateboard
|
169 |
+
-esP4loyvjM_60.000_70.000.wav 60.000 70.000 Skateboard
|
170 |
+
-iB3a71aPew_30.000_40.000.wav 30.000 40.000 Skateboard
|
171 |
+
-lZapwtvwlg_0.000_10.000.wav 0.000 10.000 Skateboard
|
172 |
+
-mxMaMJCXL8_180.000_190.000.wav 180.000 190.000 Skateboard
|
173 |
+
-nYGTw9Sypg_20.000_30.000.wav 20.000 30.000 Skateboard
|
174 |
+
-oS19KshdlM_30.000_40.000.wav 30.000 40.000 Skateboard
|
175 |
+
-s6uxc77NWo_40.000_50.000.wav 40.000 50.000 Skateboard
|
176 |
+
-sCrXS2kJlA_30.000_40.000.wav 30.000 40.000 Skateboard
|
177 |
+
-saCvPTdQ7s_30.000_40.000.wav 30.000 40.000 Skateboard
|
178 |
+
-sb-knLiDic_20.000_30.000.wav 20.000 30.000 Skateboard
|
179 |
+
-tSwRvqaKWg_90.000_100.000.wav 90.000 100.000 Skateboard
|
180 |
+
-x_jV34hVq4_30.000_40.000.wav 30.000 40.000 Skateboard
|
181 |
+
--ljM2Kojag_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
182 |
+
-4F1TX-T6T4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
183 |
+
-7HVWUwyMig_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
184 |
+
-9pUUT-6o8U_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
185 |
+
-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
|
186 |
+
-LGTb-xyjzA_11.000_21.000.wav 11.000 21.000 Ambulance (siren)
|
187 |
+
-Y1qiiugnk8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
188 |
+
-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
189 |
+
-ZeMV790MXE_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
|
190 |
+
-d-T8Y9-TOg_17.000_27.000.wav 17.000 27.000 Ambulance (siren)
|
191 |
+
-dcrL5JLmvo_11.000_21.000.wav 11.000 21.000 Ambulance (siren)
|
192 |
+
-fCSO8SVWZU_6.000_16.000.wav 6.000 16.000 Ambulance (siren)
|
193 |
+
-fGFQTGd2nA_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
|
194 |
+
-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
|
195 |
+
-jnQgpHubNI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
196 |
+
-k6p9n9y22Q_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
197 |
+
-kr4SUjnm88_29.000_39.000.wav 29.000 39.000 Ambulance (siren)
|
198 |
+
-lyPnABQhCI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
199 |
+
-od8LQAVgno_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
200 |
+
-pVEgzu95Nc_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
201 |
+
-w-9yF465IY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
202 |
+
-woquFRnQk8_16.000_26.000.wav 16.000 26.000 Ambulance (siren)
|
203 |
+
-xz75wUCln8_50.000_60.000.wav 50.000 60.000 Ambulance (siren)
|
204 |
+
-yGElLHdkEI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
205 |
+
-yPSgCn9AWo_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
206 |
+
-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
207 |
+
00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
208 |
+
02u3P99INjs_8.000_18.000.wav 8.000 18.000 Ambulance (siren)
|
209 |
+
06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
|
210 |
+
0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
211 |
+
-0Eem_FuIto_15.000_25.000.wav 15.000 25.000 Fire engine, fire truck (siren)
|
212 |
+
-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
213 |
+
-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
214 |
+
-4B435WQvag_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren)
|
215 |
+
-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren)
|
216 |
+
-8uyNBFbdFc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
217 |
+
-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
218 |
+
-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Fire engine, fire truck (siren)
|
219 |
+
-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Fire engine, fire truck (siren)
|
220 |
+
-QBo1W2w8II_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
221 |
+
-QX-ddNtUvE_24.000_34.000.wav 24.000 34.000 Fire engine, fire truck (siren)
|
222 |
+
-RlUu1el2G4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
223 |
+
-SkO97C81Ms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
224 |
+
-T8QHPXfIC4_13.000_23.000.wav 13.000 23.000 Fire engine, fire truck (siren)
|
225 |
+
-USiTjZoh88_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
226 |
+
-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
227 |
+
-Z3ByS_RCwI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
228 |
+
-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
229 |
+
-cOjJ0Nvtlw_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren)
|
230 |
+
-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Fire engine, fire truck (siren)
|
231 |
+
-eYUCWGQ_wU_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
232 |
+
-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
|
233 |
+
-hplTh4SGvs_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren)
|
234 |
+
-nPhg6Eu4b4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
235 |
+
-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
236 |
+
-oEGuMg8hT4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
237 |
+
-pvaJ4DwtRg_3.000_13.000.wav 3.000 13.000 Fire engine, fire truck (siren)
|
238 |
+
-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
239 |
+
-sJn3uUxpH8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
240 |
+
-sfn1NDHWJI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
241 |
+
-09rxiqNNEs_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
242 |
+
-3qh-WFUV2U_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
243 |
+
-4JG_Ag99hY_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
244 |
+
-60NmEaP0is_0.000_10.000.wav 0.000 10.000 Civil defense siren
|
245 |
+
-6cTEqIcics_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
246 |
+
-6iVBmb5PZU_40.000_50.000.wav 40.000 50.000 Civil defense siren
|
247 |
+
-6qp8NjWffE_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
248 |
+
-75iY1j3MeY_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
249 |
+
-E3Yju3lrRo_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
250 |
+
-FHSBdx5A3g_40.000_50.000.wav 40.000 50.000 Civil defense siren
|
251 |
+
-JhSzxTdcwY_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
252 |
+
-OtNDK_Hxp8_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
253 |
+
-S3_I0RiG3g_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
254 |
+
-YMXgDKKAwU_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
255 |
+
-c7XoYM-SSY_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
256 |
+
-j8EeIX9ynk_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
257 |
+
-t478yabOQw_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
258 |
+
-uIyMR9luvg_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
259 |
+
-wgP6ua-t4k_40.000_50.000.wav 40.000 50.000 Civil defense siren
|
260 |
+
-zGAb18JxmI_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
261 |
+
03NLMEMi8-I_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
262 |
+
0552YhBdeXo_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
263 |
+
06TM6z3NvuY_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
264 |
+
0CUi0oGUzjU_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
265 |
+
0GpUFFJNFH8_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
266 |
+
0H_WUo2srs0_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
267 |
+
0HvYkBXQ44A_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
268 |
+
0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
269 |
+
0JKcTVpby0I_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
270 |
+
0PhU-PIsUMw_40.000_50.000.wav 40.000 50.000 Civil defense siren
|
271 |
+
-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
272 |
+
-1U98XBTyB4_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
273 |
+
-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Police car (siren)
|
274 |
+
-6WqJCSmkCw_70.000_80.000.wav 70.000 80.000 Police car (siren)
|
275 |
+
-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Police car (siren)
|
276 |
+
-AFASmp1fpk_6.000_16.000.wav 6.000 16.000 Police car (siren)
|
277 |
+
-F2lk9A8B8M_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
278 |
+
-GPv09qi9A8_120.000_130.000.wav 120.000 130.000 Police car (siren)
|
279 |
+
-Hi-WpRGUpc_9.000_19.000.wav 9.000 19.000 Police car (siren)
|
280 |
+
-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Police car (siren)
|
281 |
+
-MfBpxtGQmE_20.000_30.000.wav 20.000 30.000 Police car (siren)
|
282 |
+
-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
283 |
+
-UCf_-3yzWU_290.000_300.000.wav 290.000 300.000 Police car (siren)
|
284 |
+
-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Police car (siren)
|
285 |
+
-XRiLbb3Syo_2.000_12.000.wav 2.000 12.000 Police car (siren)
|
286 |
+
-XrpzGb6xCU_190.000_200.000.wav 190.000 200.000 Police car (siren)
|
287 |
+
-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
288 |
+
-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
289 |
+
-_8fdnv6Crg_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
290 |
+
-az6BooRLxw_40.000_50.000.wav 40.000 50.000 Police car (siren)
|
291 |
+
-bs3c27rEtc_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
292 |
+
-dBTGdL4RFs_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
293 |
+
-gKNRXbpAKs_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
294 |
+
-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Police car (siren)
|
295 |
+
-haSUR_IUto_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
296 |
+
-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
297 |
+
-lWs7_49gss_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
298 |
+
-lhnhB4rbGw_3.000_13.000.wav 3.000 13.000 Police car (siren)
|
299 |
+
-rkJeBBmiTQ_60.000_70.000.wav 60.000 70.000 Police car (siren)
|
300 |
+
-rs7FPxzc6w_8.000_18.000.wav 8.000 18.000 Police car (siren)
|
301 |
+
-20uudT97E0_30.000_40.000.wav 30.000 40.000 Screaming
|
302 |
+
-3bGlOhRkAo_140.000_150.000.wav 140.000 150.000 Screaming
|
303 |
+
-4pUrlMafww_1.000_11.000.wav 1.000 11.000 Screaming
|
304 |
+
-7R0ybQQAHg_60.000_70.000.wav 60.000 70.000 Screaming
|
305 |
+
-7gojlG6bE4_30.000_40.000.wav 30.000 40.000 Screaming
|
306 |
+
-GI5PbO6j50_30.000_40.000.wav 30.000 40.000 Screaming
|
307 |
+
-MuIRudOtxw_30.000_40.000.wav 30.000 40.000 Screaming
|
308 |
+
-WfQBr42ymw_30.000_40.000.wav 30.000 40.000 Screaming
|
309 |
+
-YOjIgYspsY_30.000_40.000.wav 30.000 40.000 Screaming
|
310 |
+
-g_AcRVFfXU_30.000_40.000.wav 30.000 40.000 Screaming
|
311 |
+
-gb5uvwsRpI_30.000_40.000.wav 30.000 40.000 Screaming
|
312 |
+
-iAwqlQ3TEk_0.000_3.000.wav 0.000 3.000 Screaming
|
313 |
+
-nJoxcmxz5g_30.000_40.000.wav 30.000 40.000 Screaming
|
314 |
+
-pwgypWE-J8_30.000_40.000.wav 30.000 40.000 Screaming
|
315 |
+
-pzasCR0kpc_30.000_40.000.wav 30.000 40.000 Screaming
|
316 |
+
-sUgHKZQKYc_30.000_40.000.wav 30.000 40.000 Screaming
|
317 |
+
-uazzQEmQ7c_0.000_10.000.wav 0.000 10.000 Screaming
|
318 |
+
-vHJU1wDRsY_30.000_40.000.wav 30.000 40.000 Screaming
|
319 |
+
0-RnTXpp8Q0_30.000_40.000.wav 30.000 40.000 Screaming
|
320 |
+
09YQukdYVI4_30.000_40.000.wav 30.000 40.000 Screaming
|
321 |
+
0Ees8KFCUXM_30.000_40.000.wav 30.000 40.000 Screaming
|
322 |
+
0EymGuYWkFk_30.000_40.000.wav 30.000 40.000 Screaming
|
323 |
+
0Nw1OyTsaAo_30.000_40.000.wav 30.000 40.000 Screaming
|
324 |
+
0YnOMAls83g_30.000_40.000.wav 30.000 40.000 Screaming
|
325 |
+
0_gyUQkLCY8_30.000_40.000.wav 30.000 40.000 Screaming
|
326 |
+
0_hnDV2SHBI_7.000_17.000.wav 7.000 17.000 Screaming
|
327 |
+
0cqEaAkbrbI_80.000_90.000.wav 80.000 90.000 Screaming
|
328 |
+
0hC044mDsWA_30.000_40.000.wav 30.000 40.000 Screaming
|
329 |
+
0kQANiakiH0_30.000_40.000.wav 30.000 40.000 Screaming
|
330 |
+
0rVBXpbgO8s_30.000_40.000.wav 30.000 40.000 Screaming
|
331 |
+
---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car
|
332 |
+
--330hg-Ocw_30.000_40.000.wav 30.000 40.000 Car
|
333 |
+
--8puiAGLhs_30.000_40.000.wav 30.000 40.000 Car
|
334 |
+
--9VR_F7CtY_30.000_40.000.wav 30.000 40.000 Car
|
335 |
+
--F70LWypIg_30.000_40.000.wav 30.000 40.000 Car
|
336 |
+
--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car
|
337 |
+
--QvRbvnbUE_30.000_40.000.wav 30.000 40.000 Car
|
338 |
+
--SeOZy3Yik_30.000_40.000.wav 30.000 40.000 Car
|
339 |
+
--Zz7BgxSUg_30.000_40.000.wav 30.000 40.000 Car
|
340 |
+
--e0Vu_ruTc_30.000_40.000.wav 30.000 40.000 Car
|
341 |
+
--iFD6IyQW8_30.000_40.000.wav 30.000 40.000 Car
|
342 |
+
--jGnLqFsQ4_24.000_34.000.wav 24.000 34.000 Car
|
343 |
+
--jc0NAxK8M_30.000_40.000.wav 30.000 40.000 Car
|
344 |
+
--v1WjOJv-w_150.000_160.000.wav 150.000 160.000 Car
|
345 |
+
--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car
|
346 |
+
--yaQA8d1dI_6.000_16.000.wav 6.000 16.000 Car
|
347 |
+
--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car
|
348 |
+
-0-jXXldDOU_10.000_20.000.wav 10.000 20.000 Car
|
349 |
+
-03ld83JliM_29.000_39.000.wav 29.000 39.000 Car
|
350 |
+
-0B-egfXU7E_30.000_40.000.wav 30.000 40.000 Car
|
351 |
+
-0Bkyt8iZ1I_8.000_18.000.wav 8.000 18.000 Car
|
352 |
+
-0CIk-OOp7Y_30.000_40.000.wav 30.000 40.000 Car
|
353 |
+
-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car
|
354 |
+
-0CY5NWBHyY_20.000_30.000.wav 20.000 30.000 Car
|
355 |
+
-0HsrVfb5vc_20.000_30.000.wav 20.000 30.000 Car
|
356 |
+
-0I89-H0AFo_26.000_36.000.wav 26.000 36.000 Car
|
357 |
+
-0P6VDQ1YDs_80.000_90.000.wav 80.000 90.000 Car
|
358 |
+
-0PrEsytvc0_30.000_40.000.wav 30.000 40.000 Car
|
359 |
+
-0RqnaXZu_E_30.000_40.000.wav 30.000 40.000 Car
|
360 |
+
-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car
|
361 |
+
---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car passing by
|
362 |
+
--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car passing by
|
363 |
+
--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car passing by
|
364 |
+
--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car passing by
|
365 |
+
--zbPxnl27o_20.000_30.000.wav 20.000 30.000 Car passing by
|
366 |
+
-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car passing by
|
367 |
+
-0MnD7jBvkE_0.000_4.000.wav 0.000 4.000 Car passing by
|
368 |
+
-0U3c4PN8sc_30.000_40.000.wav 30.000 40.000 Car passing by
|
369 |
+
-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car passing by
|
370 |
+
-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car passing by
|
371 |
+
-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car passing by
|
372 |
+
-15nPYi2v1g_30.000_40.000.wav 30.000 40.000 Car passing by
|
373 |
+
-19pq3HJoBM_30.000_40.000.wav 30.000 40.000 Car passing by
|
374 |
+
-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car passing by
|
375 |
+
-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car passing by
|
376 |
+
-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car passing by
|
377 |
+
-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car passing by
|
378 |
+
-2-luek6dI8_30.000_40.000.wav 30.000 40.000 Car passing by
|
379 |
+
-21-RfxQscI_30.000_40.000.wav 30.000 40.000 Car passing by
|
380 |
+
-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car passing by
|
381 |
+
-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car passing by
|
382 |
+
-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car passing by
|
383 |
+
-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car passing by
|
384 |
+
-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car passing by
|
385 |
+
-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car passing by
|
386 |
+
-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car passing by
|
387 |
+
-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car passing by
|
388 |
+
-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car passing by
|
389 |
+
-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car passing by
|
390 |
+
-3exNVlj92w_30.000_40.000.wav 30.000 40.000 Car passing by
|
391 |
+
--0w1YA1Hm4_30.000_40.000.wav 30.000 40.000 Bus
|
392 |
+
-0_vEaaXndY_11.000_21.000.wav 11.000 21.000 Bus
|
393 |
+
-5GcZwBvBdI_30.000_40.000.wav 30.000 40.000 Bus
|
394 |
+
-5digoPWn6U_8.000_18.000.wav 8.000 18.000 Bus
|
395 |
+
-79l4w4DsYM_30.000_40.000.wav 30.000 40.000 Bus
|
396 |
+
-7B4pbkIEas_30.000_40.000.wav 30.000 40.000 Bus
|
397 |
+
-8YTu7ZGA2w_30.000_40.000.wav 30.000 40.000 Bus
|
398 |
+
-93IM29_8rs_14.000_24.000.wav 14.000 24.000 Bus
|
399 |
+
-9GhPxGkpio_26.000_36.000.wav 26.000 36.000 Bus
|
400 |
+
-9J9xs7LM9Y_25.000_35.000.wav 25.000 35.000 Bus
|
401 |
+
-AY_lZLYJR8_8.000_18.000.wav 8.000 18.000 Bus
|
402 |
+
-AdQBgtN_4E_30.000_40.000.wav 30.000 40.000 Bus
|
403 |
+
-BxfsWlPUPY_30.000_40.000.wav 30.000 40.000 Bus
|
404 |
+
-CgCr8Eknm0_14.000_24.000.wav 14.000 24.000 Bus
|
405 |
+
-CnsvTDIXdE_20.000_30.000.wav 20.000 30.000 Bus
|
406 |
+
-CpMlnGhxEU_0.000_9.000.wav 0.000 9.000 Bus
|
407 |
+
-DP_cv0x_Ng_30.000_40.000.wav 30.000 40.000 Bus
|
408 |
+
-FEXRjcryZE_30.000_40.000.wav 30.000 40.000 Bus
|
409 |
+
-Fp2-w-iLiE_20.000_30.000.wav 20.000 30.000 Bus
|
410 |
+
-GLk6G9U09A_30.000_40.000.wav 30.000 40.000 Bus
|
411 |
+
-Ga9sSkpngg_30.000_40.000.wav 30.000 40.000 Bus
|
412 |
+
-H8V23dZoLo_0.000_10.000.wav 0.000 10.000 Bus
|
413 |
+
-HeQfwKbFzg_30.000_40.000.wav 30.000 40.000 Bus
|
414 |
+
-HzzEuFBiDU_30.000_40.000.wav 30.000 40.000 Bus
|
415 |
+
-I4INTpMKT4_30.000_40.000.wav 30.000 40.000 Bus
|
416 |
+
-II-7qJxKPc_21.000_31.000.wav 21.000 31.000 Bus
|
417 |
+
-LnpzyfTkF8_30.000_40.000.wav 30.000 40.000 Bus
|
418 |
+
-OgRshQfsi8_30.000_40.000.wav 30.000 40.000 Bus
|
419 |
+
-P53lJ1ViWk_30.000_40.000.wav 30.000 40.000 Bus
|
420 |
+
-PvNUvEov4Q_30.000_40.000.wav 30.000 40.000 Bus
|
421 |
+
--12UOziMF0_30.000_40.000.wav 30.000 40.000 Truck
|
422 |
+
--73E04RpiQ_0.000_9.000.wav 0.000 9.000 Truck
|
423 |
+
--J947HxQVM_0.000_9.000.wav 0.000 9.000 Truck
|
424 |
+
--bD1DVKlzQ_30.000_40.000.wav 30.000 40.000 Truck
|
425 |
+
--ivFZu-hlc_30.000_40.000.wav 30.000 40.000 Truck
|
426 |
+
--wuU7kzB5o_30.000_40.000.wav 30.000 40.000 Truck
|
427 |
+
-0B_CYyG5Dg_30.000_40.000.wav 30.000 40.000 Truck
|
428 |
+
-0JqTq_4jaE_40.000_50.000.wav 40.000 50.000 Truck
|
429 |
+
-0MrEZKJ5MQ_30.000_40.000.wav 30.000 40.000 Truck
|
430 |
+
-0awng26xQ8_30.000_40.000.wav 30.000 40.000 Truck
|
431 |
+
-0dq1Vg9rd8_30.000_40.000.wav 30.000 40.000 Truck
|
432 |
+
-0wkq7CUYME_310.000_320.000.wav 310.000 320.000 Truck
|
433 |
+
-14RXdkqYuI_30.000_40.000.wav 30.000 40.000 Truck
|
434 |
+
-1B3CzpiW1M_30.000_40.000.wav 30.000 40.000 Truck
|
435 |
+
-1Q21cZhHDE_30.000_40.000.wav 30.000 40.000 Truck
|
436 |
+
-1ZXXnBXJ6c_8.000_18.000.wav 8.000 18.000 Truck
|
437 |
+
-1s0DWApvT8_30.000_40.000.wav 30.000 40.000 Truck
|
438 |
+
-1s84_2Vn4g_30.000_40.000.wav 30.000 40.000 Truck
|
439 |
+
-26ansJluVo_30.000_40.000.wav 30.000 40.000 Truck
|
440 |
+
-2EscdO0l-A_30.000_40.000.wav 30.000 40.000 Truck
|
441 |
+
-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Truck
|
442 |
+
-2NBZUCcvm0_30.000_40.000.wav 30.000 40.000 Truck
|
443 |
+
-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Truck
|
444 |
+
-2vmprMUw10_30.000_40.000.wav 30.000 40.000 Truck
|
445 |
+
-2x4TB8VWvE_18.000_28.000.wav 18.000 28.000 Truck
|
446 |
+
-39q4y0tt-g_30.000_40.000.wav 30.000 40.000 Truck
|
447 |
+
-3N5rjPrNCc_190.000_200.000.wav 190.000 200.000 Truck
|
448 |
+
-3NcUIyJtFY_30.000_40.000.wav 30.000 40.000 Truck
|
449 |
+
-3PplV0ErOk_30.000_40.000.wav 30.000 40.000 Truck
|
450 |
+
-3gSkrDKNSA_27.000_37.000.wav 27.000 37.000 Truck
|
451 |
+
--p-rk_HBuU_30.000_40.000.wav 30.000 40.000 Motorcycle
|
452 |
+
-1WK72M4xeg_220.000_230.000.wav 220.000 230.000 Motorcycle
|
453 |
+
-1XfuJcdvfg_30.000_40.000.wav 30.000 40.000 Motorcycle
|
454 |
+
-3XWBAmjmaQ_11.000_21.000.wav 11.000 21.000 Motorcycle
|
455 |
+
-4-87UgJcUw_70.000_80.000.wav 70.000 80.000 Motorcycle
|
456 |
+
-4D3Gkyisyc_30.000_40.000.wav 30.000 40.000 Motorcycle
|
457 |
+
-5k5GyHd2So_4.000_14.000.wav 4.000 14.000 Motorcycle
|
458 |
+
-6A2L1U9b5Y_54.000_64.000.wav 54.000 64.000 Motorcycle
|
459 |
+
-6Yfati1N10_80.000_90.000.wav 80.000 90.000 Motorcycle
|
460 |
+
-7_o_GhpZpM_12.000_22.000.wav 12.000 22.000 Motorcycle
|
461 |
+
-7rZwMK6uSs_70.000_80.000.wav 70.000 80.000 Motorcycle
|
462 |
+
-85f5DKKfSo_30.000_40.000.wav 30.000 40.000 Motorcycle
|
463 |
+
-9Smdrt5zwk_40.000_50.000.wav 40.000 50.000 Motorcycle
|
464 |
+
-9gZLVDKpnE_30.000_40.000.wav 30.000 40.000 Motorcycle
|
465 |
+
-BGebo8V4XY_30.000_40.000.wav 30.000 40.000 Motorcycle
|
466 |
+
-DdiduB5B_w_190.000_200.000.wav 190.000 200.000 Motorcycle
|
467 |
+
-HIPq7T3eFI_11.000_21.000.wav 11.000 21.000 Motorcycle
|
468 |
+
-H_3oEkKe0M_50.000_60.000.wav 50.000 60.000 Motorcycle
|
469 |
+
-HmuMoykRqA_500.000_510.000.wav 500.000 510.000 Motorcycle
|
470 |
+
-IMRE_psvtI_30.000_40.000.wav 30.000 40.000 Motorcycle
|
471 |
+
-Ie4LSPDEF4_6.000_16.000.wav 6.000 16.000 Motorcycle
|
472 |
+
-J0F29UCZiA_70.000_80.000.wav 70.000 80.000 Motorcycle
|
473 |
+
-KFCJ7ydu2E_0.000_10.000.wav 0.000 10.000 Motorcycle
|
474 |
+
-KmDAgYb0Uo_100.000_110.000.wav 100.000 110.000 Motorcycle
|
475 |
+
-P7iW3WzNfc_400.000_410.000.wav 400.000 410.000 Motorcycle
|
476 |
+
-QMAKXzIGx4_10.000_20.000.wav 10.000 20.000 Motorcycle
|
477 |
+
-S-5z2vYtxw_10.000_20.000.wav 10.000 20.000 Motorcycle
|
478 |
+
-SlL0NZh51w_30.000_40.000.wav 30.000 40.000 Motorcycle
|
479 |
+
-US2mpJxbj4_30.000_40.000.wav 30.000 40.000 Motorcycle
|
480 |
+
-VO-C9C0uqY_1.000_11.000.wav 1.000 11.000 Motorcycle
|
481 |
+
--H_-CEB2wA_30.000_40.000.wav 30.000 40.000 Train
|
482 |
+
-1VsFy0eVJs_30.000_40.000.wav 30.000 40.000 Train
|
483 |
+
-1X7kpLnOpM_60.000_70.000.wav 60.000 70.000 Train
|
484 |
+
-3FIglJti0s_30.000_40.000.wav 30.000 40.000 Train
|
485 |
+
-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train
|
486 |
+
-6KOEEiAf9s_19.000_29.000.wav 19.000 29.000 Train
|
487 |
+
-97l_c6PToE_30.000_40.000.wav 30.000 40.000 Train
|
488 |
+
-9S5Z-uciLo_70.000_80.000.wav 70.000 80.000 Train
|
489 |
+
-CkgGfKepO4_140.000_150.000.wav 140.000 150.000 Train
|
490 |
+
-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train
|
491 |
+
-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train
|
492 |
+
-JpQivta6MQ_20.000_30.000.wav 20.000 30.000 Train
|
493 |
+
-K9oTZj3mVQ_30.000_40.000.wav 30.000 40.000 Train
|
494 |
+
-KjE40DlSdU_0.000_10.000.wav 0.000 10.000 Train
|
495 |
+
-NrFtZ_xxFU_30.000_40.000.wav 30.000 40.000 Train
|
496 |
+
-PYRamK58Ss_0.000_10.000.wav 0.000 10.000 Train
|
497 |
+
-P_XDJt4p_s_30.000_40.000.wav 30.000 40.000 Train
|
498 |
+
-Pjylzex7oc_350.000_360.000.wav 350.000 360.000 Train
|
499 |
+
-QHuZGmIy_I_30.000_40.000.wav 30.000 40.000 Train
|
500 |
+
-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train
|
501 |
+
-RXKRoRPWXg_30.000_40.000.wav 30.000 40.000 Train
|
502 |
+
-VH414svzI0_30.000_40.000.wav 30.000 40.000 Train
|
503 |
+
-WFdYxE-PYI_30.000_40.000.wav 30.000 40.000 Train
|
504 |
+
-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train
|
505 |
+
-XcC-UlbcRA_30.000_40.000.wav 30.000 40.000 Train
|
506 |
+
-Y2cD8xvCHI_30.000_40.000.wav 30.000 40.000 Train
|
507 |
+
-ZKZkMHe3cY_70.000_80.000.wav 70.000 80.000 Train
|
508 |
+
-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train
|
509 |
+
-aZ7XC4LG2A_30.000_40.000.wav 30.000 40.000 Train
|
510 |
+
-abVemAm9HM_430.000_440.000.wav 430.000 440.000 Train
|
511 |
+
1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
|
512 |
+
7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Ambulance (siren)
|
513 |
+
-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
514 |
+
00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
515 |
+
0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
516 |
+
3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Fire engine, fire truck (siren)
|
517 |
+
4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
|
518 |
+
35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Civil defense siren
|
519 |
+
06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Police car (siren)
|
520 |
+
0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
521 |
+
0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
522 |
+
17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Police car (siren)
|
523 |
+
4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Police car (siren)
|
524 |
+
-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car
|
525 |
+
-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Car
|
526 |
+
-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car
|
527 |
+
-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car
|
528 |
+
-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car
|
529 |
+
-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car
|
530 |
+
-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car
|
531 |
+
-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car
|
532 |
+
-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Car
|
533 |
+
-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car
|
534 |
+
-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car
|
535 |
+
-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car
|
536 |
+
-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car
|
537 |
+
-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car
|
538 |
+
-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car
|
539 |
+
-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car
|
540 |
+
-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car
|
541 |
+
-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car
|
542 |
+
-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Car
|
543 |
+
-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Car
|
544 |
+
-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Car
|
545 |
+
-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Car
|
546 |
+
06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Car
|
547 |
+
0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Car
|
548 |
+
0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car
|
549 |
+
4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car
|
550 |
+
5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car
|
551 |
+
7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car
|
552 |
+
9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car
|
553 |
+
9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car
|
554 |
+
-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Car passing by
|
555 |
+
9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car passing by
|
556 |
+
-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Bus
|
557 |
+
-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Truck
|
558 |
+
-4B435WQvag_20.000_30.000.wav 20.000 30.000 Truck
|
559 |
+
-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Truck
|
560 |
+
-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Truck
|
561 |
+
-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Truck
|
562 |
+
-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Truck
|
563 |
+
-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Truck
|
564 |
+
-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Truck
|
565 |
+
-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Truck
|
566 |
+
-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Truck
|
567 |
+
-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Truck
|
568 |
+
-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Truck
|
569 |
+
-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Truck
|
570 |
+
-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Truck
|
571 |
+
-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Truck
|
572 |
+
-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Truck
|
573 |
+
-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Truck
|
574 |
+
-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Truck
|
575 |
+
-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Truck
|
576 |
+
-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Truck
|
577 |
+
-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Truck
|
578 |
+
-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Truck
|
579 |
+
-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Truck
|
580 |
+
-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Truck
|
581 |
+
0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Truck
|
582 |
+
0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Truck
|
583 |
+
0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Truck
|
584 |
+
0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Truck
|
585 |
+
0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Truck
|
586 |
+
3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Truck
|
587 |
+
-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train
|
588 |
+
02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train
|
589 |
+
0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train
|
590 |
+
0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train
|
591 |
+
0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train
|
592 |
+
0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train
|
593 |
+
0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train
|
594 |
+
10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train
|
595 |
+
1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train
|
596 |
+
1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train
|
597 |
+
1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train
|
598 |
+
1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train
|
599 |
+
1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train
|
600 |
+
1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train
|
601 |
+
26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train
|
602 |
+
2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train
|
603 |
+
2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train
|
604 |
+
2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train
|
605 |
+
2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train
|
606 |
+
3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Train
|
audio_detection/audio_infer/metadata/class_labels_indices.csv
ADDED
@@ -0,0 +1,528 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
index,mid,display_name
|
2 |
+
0,/m/09x0r,"Speech"
|
3 |
+
1,/m/05zppz,"Male speech, man speaking"
|
4 |
+
2,/m/02zsn,"Female speech, woman speaking"
|
5 |
+
3,/m/0ytgt,"Child speech, kid speaking"
|
6 |
+
4,/m/01h8n0,"Conversation"
|
7 |
+
5,/m/02qldy,"Narration, monologue"
|
8 |
+
6,/m/0261r1,"Babbling"
|
9 |
+
7,/m/0brhx,"Speech synthesizer"
|
10 |
+
8,/m/07p6fty,"Shout"
|
11 |
+
9,/m/07q4ntr,"Bellow"
|
12 |
+
10,/m/07rwj3x,"Whoop"
|
13 |
+
11,/m/07sr1lc,"Yell"
|
14 |
+
12,/m/04gy_2,"Battle cry"
|
15 |
+
13,/t/dd00135,"Children shouting"
|
16 |
+
14,/m/03qc9zr,"Screaming"
|
17 |
+
15,/m/02rtxlg,"Whispering"
|
18 |
+
16,/m/01j3sz,"Laughter"
|
19 |
+
17,/t/dd00001,"Baby laughter"
|
20 |
+
18,/m/07r660_,"Giggle"
|
21 |
+
19,/m/07s04w4,"Snicker"
|
22 |
+
20,/m/07sq110,"Belly laugh"
|
23 |
+
21,/m/07rgt08,"Chuckle, chortle"
|
24 |
+
22,/m/0463cq4,"Crying, sobbing"
|
25 |
+
23,/t/dd00002,"Baby cry, infant cry"
|
26 |
+
24,/m/07qz6j3,"Whimper"
|
27 |
+
25,/m/07qw_06,"Wail, moan"
|
28 |
+
26,/m/07plz5l,"Sigh"
|
29 |
+
27,/m/015lz1,"Singing"
|
30 |
+
28,/m/0l14jd,"Choir"
|
31 |
+
29,/m/01swy6,"Yodeling"
|
32 |
+
30,/m/02bk07,"Chant"
|
33 |
+
31,/m/01c194,"Mantra"
|
34 |
+
32,/t/dd00003,"Male singing"
|
35 |
+
33,/t/dd00004,"Female singing"
|
36 |
+
34,/t/dd00005,"Child singing"
|
37 |
+
35,/t/dd00006,"Synthetic singing"
|
38 |
+
36,/m/06bxc,"Rapping"
|
39 |
+
37,/m/02fxyj,"Humming"
|
40 |
+
38,/m/07s2xch,"Groan"
|
41 |
+
39,/m/07r4k75,"Grunt"
|
42 |
+
40,/m/01w250,"Whistling"
|
43 |
+
41,/m/0lyf6,"Breathing"
|
44 |
+
42,/m/07mzm6,"Wheeze"
|
45 |
+
43,/m/01d3sd,"Snoring"
|
46 |
+
44,/m/07s0dtb,"Gasp"
|
47 |
+
45,/m/07pyy8b,"Pant"
|
48 |
+
46,/m/07q0yl5,"Snort"
|
49 |
+
47,/m/01b_21,"Cough"
|
50 |
+
48,/m/0dl9sf8,"Throat clearing"
|
51 |
+
49,/m/01hsr_,"Sneeze"
|
52 |
+
50,/m/07ppn3j,"Sniff"
|
53 |
+
51,/m/06h7j,"Run"
|
54 |
+
52,/m/07qv_x_,"Shuffle"
|
55 |
+
53,/m/07pbtc8,"Walk, footsteps"
|
56 |
+
54,/m/03cczk,"Chewing, mastication"
|
57 |
+
55,/m/07pdhp0,"Biting"
|
58 |
+
56,/m/0939n_,"Gargling"
|
59 |
+
57,/m/01g90h,"Stomach rumble"
|
60 |
+
58,/m/03q5_w,"Burping, eructation"
|
61 |
+
59,/m/02p3nc,"Hiccup"
|
62 |
+
60,/m/02_nn,"Fart"
|
63 |
+
61,/m/0k65p,"Hands"
|
64 |
+
62,/m/025_jnm,"Finger snapping"
|
65 |
+
63,/m/0l15bq,"Clapping"
|
66 |
+
64,/m/01jg02,"Heart sounds, heartbeat"
|
67 |
+
65,/m/01jg1z,"Heart murmur"
|
68 |
+
66,/m/053hz1,"Cheering"
|
69 |
+
67,/m/028ght,"Applause"
|
70 |
+
68,/m/07rkbfh,"Chatter"
|
71 |
+
69,/m/03qtwd,"Crowd"
|
72 |
+
70,/m/07qfr4h,"Hubbub, speech noise, speech babble"
|
73 |
+
71,/t/dd00013,"Children playing"
|
74 |
+
72,/m/0jbk,"Animal"
|
75 |
+
73,/m/068hy,"Domestic animals, pets"
|
76 |
+
74,/m/0bt9lr,"Dog"
|
77 |
+
75,/m/05tny_,"Bark"
|
78 |
+
76,/m/07r_k2n,"Yip"
|
79 |
+
77,/m/07qf0zm,"Howl"
|
80 |
+
78,/m/07rc7d9,"Bow-wow"
|
81 |
+
79,/m/0ghcn6,"Growling"
|
82 |
+
80,/t/dd00136,"Whimper (dog)"
|
83 |
+
81,/m/01yrx,"Cat"
|
84 |
+
82,/m/02yds9,"Purr"
|
85 |
+
83,/m/07qrkrw,"Meow"
|
86 |
+
84,/m/07rjwbb,"Hiss"
|
87 |
+
85,/m/07r81j2,"Caterwaul"
|
88 |
+
86,/m/0ch8v,"Livestock, farm animals, working animals"
|
89 |
+
87,/m/03k3r,"Horse"
|
90 |
+
88,/m/07rv9rh,"Clip-clop"
|
91 |
+
89,/m/07q5rw0,"Neigh, whinny"
|
92 |
+
90,/m/01xq0k1,"Cattle, bovinae"
|
93 |
+
91,/m/07rpkh9,"Moo"
|
94 |
+
92,/m/0239kh,"Cowbell"
|
95 |
+
93,/m/068zj,"Pig"
|
96 |
+
94,/t/dd00018,"Oink"
|
97 |
+
95,/m/03fwl,"Goat"
|
98 |
+
96,/m/07q0h5t,"Bleat"
|
99 |
+
97,/m/07bgp,"Sheep"
|
100 |
+
98,/m/025rv6n,"Fowl"
|
101 |
+
99,/m/09b5t,"Chicken, rooster"
|
102 |
+
100,/m/07st89h,"Cluck"
|
103 |
+
101,/m/07qn5dc,"Crowing, cock-a-doodle-doo"
|
104 |
+
102,/m/01rd7k,"Turkey"
|
105 |
+
103,/m/07svc2k,"Gobble"
|
106 |
+
104,/m/09ddx,"Duck"
|
107 |
+
105,/m/07qdb04,"Quack"
|
108 |
+
106,/m/0dbvp,"Goose"
|
109 |
+
107,/m/07qwf61,"Honk"
|
110 |
+
108,/m/01280g,"Wild animals"
|
111 |
+
109,/m/0cdnk,"Roaring cats (lions, tigers)"
|
112 |
+
110,/m/04cvmfc,"Roar"
|
113 |
+
111,/m/015p6,"Bird"
|
114 |
+
112,/m/020bb7,"Bird vocalization, bird call, bird song"
|
115 |
+
113,/m/07pggtn,"Chirp, tweet"
|
116 |
+
114,/m/07sx8x_,"Squawk"
|
117 |
+
115,/m/0h0rv,"Pigeon, dove"
|
118 |
+
116,/m/07r_25d,"Coo"
|
119 |
+
117,/m/04s8yn,"Crow"
|
120 |
+
118,/m/07r5c2p,"Caw"
|
121 |
+
119,/m/09d5_,"Owl"
|
122 |
+
120,/m/07r_80w,"Hoot"
|
123 |
+
121,/m/05_wcq,"Bird flight, flapping wings"
|
124 |
+
122,/m/01z5f,"Canidae, dogs, wolves"
|
125 |
+
123,/m/06hps,"Rodents, rats, mice"
|
126 |
+
124,/m/04rmv,"Mouse"
|
127 |
+
125,/m/07r4gkf,"Patter"
|
128 |
+
126,/m/03vt0,"Insect"
|
129 |
+
127,/m/09xqv,"Cricket"
|
130 |
+
128,/m/09f96,"Mosquito"
|
131 |
+
129,/m/0h2mp,"Fly, housefly"
|
132 |
+
130,/m/07pjwq1,"Buzz"
|
133 |
+
131,/m/01h3n,"Bee, wasp, etc."
|
134 |
+
132,/m/09ld4,"Frog"
|
135 |
+
133,/m/07st88b,"Croak"
|
136 |
+
134,/m/078jl,"Snake"
|
137 |
+
135,/m/07qn4z3,"Rattle"
|
138 |
+
136,/m/032n05,"Whale vocalization"
|
139 |
+
137,/m/04rlf,"Music"
|
140 |
+
138,/m/04szw,"Musical instrument"
|
141 |
+
139,/m/0fx80y,"Plucked string instrument"
|
142 |
+
140,/m/0342h,"Guitar"
|
143 |
+
141,/m/02sgy,"Electric guitar"
|
144 |
+
142,/m/018vs,"Bass guitar"
|
145 |
+
143,/m/042v_gx,"Acoustic guitar"
|
146 |
+
144,/m/06w87,"Steel guitar, slide guitar"
|
147 |
+
145,/m/01glhc,"Tapping (guitar technique)"
|
148 |
+
146,/m/07s0s5r,"Strum"
|
149 |
+
147,/m/018j2,"Banjo"
|
150 |
+
148,/m/0jtg0,"Sitar"
|
151 |
+
149,/m/04rzd,"Mandolin"
|
152 |
+
150,/m/01bns_,"Zither"
|
153 |
+
151,/m/07xzm,"Ukulele"
|
154 |
+
152,/m/05148p4,"Keyboard (musical)"
|
155 |
+
153,/m/05r5c,"Piano"
|
156 |
+
154,/m/01s0ps,"Electric piano"
|
157 |
+
155,/m/013y1f,"Organ"
|
158 |
+
156,/m/03xq_f,"Electronic organ"
|
159 |
+
157,/m/03gvt,"Hammond organ"
|
160 |
+
158,/m/0l14qv,"Synthesizer"
|
161 |
+
159,/m/01v1d8,"Sampler"
|
162 |
+
160,/m/03q5t,"Harpsichord"
|
163 |
+
161,/m/0l14md,"Percussion"
|
164 |
+
162,/m/02hnl,"Drum kit"
|
165 |
+
163,/m/0cfdd,"Drum machine"
|
166 |
+
164,/m/026t6,"Drum"
|
167 |
+
165,/m/06rvn,"Snare drum"
|
168 |
+
166,/m/03t3fj,"Rimshot"
|
169 |
+
167,/m/02k_mr,"Drum roll"
|
170 |
+
168,/m/0bm02,"Bass drum"
|
171 |
+
169,/m/011k_j,"Timpani"
|
172 |
+
170,/m/01p970,"Tabla"
|
173 |
+
171,/m/01qbl,"Cymbal"
|
174 |
+
172,/m/03qtq,"Hi-hat"
|
175 |
+
173,/m/01sm1g,"Wood block"
|
176 |
+
174,/m/07brj,"Tambourine"
|
177 |
+
175,/m/05r5wn,"Rattle (instrument)"
|
178 |
+
176,/m/0xzly,"Maraca"
|
179 |
+
177,/m/0mbct,"Gong"
|
180 |
+
178,/m/016622,"Tubular bells"
|
181 |
+
179,/m/0j45pbj,"Mallet percussion"
|
182 |
+
180,/m/0dwsp,"Marimba, xylophone"
|
183 |
+
181,/m/0dwtp,"Glockenspiel"
|
184 |
+
182,/m/0dwt5,"Vibraphone"
|
185 |
+
183,/m/0l156b,"Steelpan"
|
186 |
+
184,/m/05pd6,"Orchestra"
|
187 |
+
185,/m/01kcd,"Brass instrument"
|
188 |
+
186,/m/0319l,"French horn"
|
189 |
+
187,/m/07gql,"Trumpet"
|
190 |
+
188,/m/07c6l,"Trombone"
|
191 |
+
189,/m/0l14_3,"Bowed string instrument"
|
192 |
+
190,/m/02qmj0d,"String section"
|
193 |
+
191,/m/07y_7,"Violin, fiddle"
|
194 |
+
192,/m/0d8_n,"Pizzicato"
|
195 |
+
193,/m/01xqw,"Cello"
|
196 |
+
194,/m/02fsn,"Double bass"
|
197 |
+
195,/m/085jw,"Wind instrument, woodwind instrument"
|
198 |
+
196,/m/0l14j_,"Flute"
|
199 |
+
197,/m/06ncr,"Saxophone"
|
200 |
+
198,/m/01wy6,"Clarinet"
|
201 |
+
199,/m/03m5k,"Harp"
|
202 |
+
200,/m/0395lw,"Bell"
|
203 |
+
201,/m/03w41f,"Church bell"
|
204 |
+
202,/m/027m70_,"Jingle bell"
|
205 |
+
203,/m/0gy1t2s,"Bicycle bell"
|
206 |
+
204,/m/07n_g,"Tuning fork"
|
207 |
+
205,/m/0f8s22,"Chime"
|
208 |
+
206,/m/026fgl,"Wind chime"
|
209 |
+
207,/m/0150b9,"Change ringing (campanology)"
|
210 |
+
208,/m/03qjg,"Harmonica"
|
211 |
+
209,/m/0mkg,"Accordion"
|
212 |
+
210,/m/0192l,"Bagpipes"
|
213 |
+
211,/m/02bxd,"Didgeridoo"
|
214 |
+
212,/m/0l14l2,"Shofar"
|
215 |
+
213,/m/07kc_,"Theremin"
|
216 |
+
214,/m/0l14t7,"Singing bowl"
|
217 |
+
215,/m/01hgjl,"Scratching (performance technique)"
|
218 |
+
216,/m/064t9,"Pop music"
|
219 |
+
217,/m/0glt670,"Hip hop music"
|
220 |
+
218,/m/02cz_7,"Beatboxing"
|
221 |
+
219,/m/06by7,"Rock music"
|
222 |
+
220,/m/03lty,"Heavy metal"
|
223 |
+
221,/m/05r6t,"Punk rock"
|
224 |
+
222,/m/0dls3,"Grunge"
|
225 |
+
223,/m/0dl5d,"Progressive rock"
|
226 |
+
224,/m/07sbbz2,"Rock and roll"
|
227 |
+
225,/m/05w3f,"Psychedelic rock"
|
228 |
+
226,/m/06j6l,"Rhythm and blues"
|
229 |
+
227,/m/0gywn,"Soul music"
|
230 |
+
228,/m/06cqb,"Reggae"
|
231 |
+
229,/m/01lyv,"Country"
|
232 |
+
230,/m/015y_n,"Swing music"
|
233 |
+
231,/m/0gg8l,"Bluegrass"
|
234 |
+
232,/m/02x8m,"Funk"
|
235 |
+
233,/m/02w4v,"Folk music"
|
236 |
+
234,/m/06j64v,"Middle Eastern music"
|
237 |
+
235,/m/03_d0,"Jazz"
|
238 |
+
236,/m/026z9,"Disco"
|
239 |
+
237,/m/0ggq0m,"Classical music"
|
240 |
+
238,/m/05lls,"Opera"
|
241 |
+
239,/m/02lkt,"Electronic music"
|
242 |
+
240,/m/03mb9,"House music"
|
243 |
+
241,/m/07gxw,"Techno"
|
244 |
+
242,/m/07s72n,"Dubstep"
|
245 |
+
243,/m/0283d,"Drum and bass"
|
246 |
+
244,/m/0m0jc,"Electronica"
|
247 |
+
245,/m/08cyft,"Electronic dance music"
|
248 |
+
246,/m/0fd3y,"Ambient music"
|
249 |
+
247,/m/07lnk,"Trance music"
|
250 |
+
248,/m/0g293,"Music of Latin America"
|
251 |
+
249,/m/0ln16,"Salsa music"
|
252 |
+
250,/m/0326g,"Flamenco"
|
253 |
+
251,/m/0155w,"Blues"
|
254 |
+
252,/m/05fw6t,"Music for children"
|
255 |
+
253,/m/02v2lh,"New-age music"
|
256 |
+
254,/m/0y4f8,"Vocal music"
|
257 |
+
255,/m/0z9c,"A capella"
|
258 |
+
256,/m/0164x2,"Music of Africa"
|
259 |
+
257,/m/0145m,"Afrobeat"
|
260 |
+
258,/m/02mscn,"Christian music"
|
261 |
+
259,/m/016cjb,"Gospel music"
|
262 |
+
260,/m/028sqc,"Music of Asia"
|
263 |
+
261,/m/015vgc,"Carnatic music"
|
264 |
+
262,/m/0dq0md,"Music of Bollywood"
|
265 |
+
263,/m/06rqw,"Ska"
|
266 |
+
264,/m/02p0sh1,"Traditional music"
|
267 |
+
265,/m/05rwpb,"Independent music"
|
268 |
+
266,/m/074ft,"Song"
|
269 |
+
267,/m/025td0t,"Background music"
|
270 |
+
268,/m/02cjck,"Theme music"
|
271 |
+
269,/m/03r5q_,"Jingle (music)"
|
272 |
+
270,/m/0l14gg,"Soundtrack music"
|
273 |
+
271,/m/07pkxdp,"Lullaby"
|
274 |
+
272,/m/01z7dr,"Video game music"
|
275 |
+
273,/m/0140xf,"Christmas music"
|
276 |
+
274,/m/0ggx5q,"Dance music"
|
277 |
+
275,/m/04wptg,"Wedding music"
|
278 |
+
276,/t/dd00031,"Happy music"
|
279 |
+
277,/t/dd00032,"Funny music"
|
280 |
+
278,/t/dd00033,"Sad music"
|
281 |
+
279,/t/dd00034,"Tender music"
|
282 |
+
280,/t/dd00035,"Exciting music"
|
283 |
+
281,/t/dd00036,"Angry music"
|
284 |
+
282,/t/dd00037,"Scary music"
|
285 |
+
283,/m/03m9d0z,"Wind"
|
286 |
+
284,/m/09t49,"Rustling leaves"
|
287 |
+
285,/t/dd00092,"Wind noise (microphone)"
|
288 |
+
286,/m/0jb2l,"Thunderstorm"
|
289 |
+
287,/m/0ngt1,"Thunder"
|
290 |
+
288,/m/0838f,"Water"
|
291 |
+
289,/m/06mb1,"Rain"
|
292 |
+
290,/m/07r10fb,"Raindrop"
|
293 |
+
291,/t/dd00038,"Rain on surface"
|
294 |
+
292,/m/0j6m2,"Stream"
|
295 |
+
293,/m/0j2kx,"Waterfall"
|
296 |
+
294,/m/05kq4,"Ocean"
|
297 |
+
295,/m/034srq,"Waves, surf"
|
298 |
+
296,/m/06wzb,"Steam"
|
299 |
+
297,/m/07swgks,"Gurgling"
|
300 |
+
298,/m/02_41,"Fire"
|
301 |
+
299,/m/07pzfmf,"Crackle"
|
302 |
+
300,/m/07yv9,"Vehicle"
|
303 |
+
301,/m/019jd,"Boat, Water vehicle"
|
304 |
+
302,/m/0hsrw,"Sailboat, sailing ship"
|
305 |
+
303,/m/056ks2,"Rowboat, canoe, kayak"
|
306 |
+
304,/m/02rlv9,"Motorboat, speedboat"
|
307 |
+
305,/m/06q74,"Ship"
|
308 |
+
306,/m/012f08,"Motor vehicle (road)"
|
309 |
+
307,/m/0k4j,"Car"
|
310 |
+
308,/m/0912c9,"Vehicle horn, car horn, honking"
|
311 |
+
309,/m/07qv_d5,"Toot"
|
312 |
+
310,/m/02mfyn,"Car alarm"
|
313 |
+
311,/m/04gxbd,"Power windows, electric windows"
|
314 |
+
312,/m/07rknqz,"Skidding"
|
315 |
+
313,/m/0h9mv,"Tire squeal"
|
316 |
+
314,/t/dd00134,"Car passing by"
|
317 |
+
315,/m/0ltv,"Race car, auto racing"
|
318 |
+
316,/m/07r04,"Truck"
|
319 |
+
317,/m/0gvgw0,"Air brake"
|
320 |
+
318,/m/05x_td,"Air horn, truck horn"
|
321 |
+
319,/m/02rhddq,"Reversing beeps"
|
322 |
+
320,/m/03cl9h,"Ice cream truck, ice cream van"
|
323 |
+
321,/m/01bjv,"Bus"
|
324 |
+
322,/m/03j1ly,"Emergency vehicle"
|
325 |
+
323,/m/04qvtq,"Police car (siren)"
|
326 |
+
324,/m/012n7d,"Ambulance (siren)"
|
327 |
+
325,/m/012ndj,"Fire engine, fire truck (siren)"
|
328 |
+
326,/m/04_sv,"Motorcycle"
|
329 |
+
327,/m/0btp2,"Traffic noise, roadway noise"
|
330 |
+
328,/m/06d_3,"Rail transport"
|
331 |
+
329,/m/07jdr,"Train"
|
332 |
+
330,/m/04zmvq,"Train whistle"
|
333 |
+
331,/m/0284vy3,"Train horn"
|
334 |
+
332,/m/01g50p,"Railroad car, train wagon"
|
335 |
+
333,/t/dd00048,"Train wheels squealing"
|
336 |
+
334,/m/0195fx,"Subway, metro, underground"
|
337 |
+
335,/m/0k5j,"Aircraft"
|
338 |
+
336,/m/014yck,"Aircraft engine"
|
339 |
+
337,/m/04229,"Jet engine"
|
340 |
+
338,/m/02l6bg,"Propeller, airscrew"
|
341 |
+
339,/m/09ct_,"Helicopter"
|
342 |
+
340,/m/0cmf2,"Fixed-wing aircraft, airplane"
|
343 |
+
341,/m/0199g,"Bicycle"
|
344 |
+
342,/m/06_fw,"Skateboard"
|
345 |
+
343,/m/02mk9,"Engine"
|
346 |
+
344,/t/dd00065,"Light engine (high frequency)"
|
347 |
+
345,/m/08j51y,"Dental drill, dentist's drill"
|
348 |
+
346,/m/01yg9g,"Lawn mower"
|
349 |
+
347,/m/01j4z9,"Chainsaw"
|
350 |
+
348,/t/dd00066,"Medium engine (mid frequency)"
|
351 |
+
349,/t/dd00067,"Heavy engine (low frequency)"
|
352 |
+
350,/m/01h82_,"Engine knocking"
|
353 |
+
351,/t/dd00130,"Engine starting"
|
354 |
+
352,/m/07pb8fc,"Idling"
|
355 |
+
353,/m/07q2z82,"Accelerating, revving, vroom"
|
356 |
+
354,/m/02dgv,"Door"
|
357 |
+
355,/m/03wwcy,"Doorbell"
|
358 |
+
356,/m/07r67yg,"Ding-dong"
|
359 |
+
357,/m/02y_763,"Sliding door"
|
360 |
+
358,/m/07rjzl8,"Slam"
|
361 |
+
359,/m/07r4wb8,"Knock"
|
362 |
+
360,/m/07qcpgn,"Tap"
|
363 |
+
361,/m/07q6cd_,"Squeak"
|
364 |
+
362,/m/0642b4,"Cupboard open or close"
|
365 |
+
363,/m/0fqfqc,"Drawer open or close"
|
366 |
+
364,/m/04brg2,"Dishes, pots, and pans"
|
367 |
+
365,/m/023pjk,"Cutlery, silverware"
|
368 |
+
366,/m/07pn_8q,"Chopping (food)"
|
369 |
+
367,/m/0dxrf,"Frying (food)"
|
370 |
+
368,/m/0fx9l,"Microwave oven"
|
371 |
+
369,/m/02pjr4,"Blender"
|
372 |
+
370,/m/02jz0l,"Water tap, faucet"
|
373 |
+
371,/m/0130jx,"Sink (filling or washing)"
|
374 |
+
372,/m/03dnzn,"Bathtub (filling or washing)"
|
375 |
+
373,/m/03wvsk,"Hair dryer"
|
376 |
+
374,/m/01jt3m,"Toilet flush"
|
377 |
+
375,/m/012xff,"Toothbrush"
|
378 |
+
376,/m/04fgwm,"Electric toothbrush"
|
379 |
+
377,/m/0d31p,"Vacuum cleaner"
|
380 |
+
378,/m/01s0vc,"Zipper (clothing)"
|
381 |
+
379,/m/03v3yw,"Keys jangling"
|
382 |
+
380,/m/0242l,"Coin (dropping)"
|
383 |
+
381,/m/01lsmm,"Scissors"
|
384 |
+
382,/m/02g901,"Electric shaver, electric razor"
|
385 |
+
383,/m/05rj2,"Shuffling cards"
|
386 |
+
384,/m/0316dw,"Typing"
|
387 |
+
385,/m/0c2wf,"Typewriter"
|
388 |
+
386,/m/01m2v,"Computer keyboard"
|
389 |
+
387,/m/081rb,"Writing"
|
390 |
+
388,/m/07pp_mv,"Alarm"
|
391 |
+
389,/m/07cx4,"Telephone"
|
392 |
+
390,/m/07pp8cl,"Telephone bell ringing"
|
393 |
+
391,/m/01hnzm,"Ringtone"
|
394 |
+
392,/m/02c8p,"Telephone dialing, DTMF"
|
395 |
+
393,/m/015jpf,"Dial tone"
|
396 |
+
394,/m/01z47d,"Busy signal"
|
397 |
+
395,/m/046dlr,"Alarm clock"
|
398 |
+
396,/m/03kmc9,"Siren"
|
399 |
+
397,/m/0dgbq,"Civil defense siren"
|
400 |
+
398,/m/030rvx,"Buzzer"
|
401 |
+
399,/m/01y3hg,"Smoke detector, smoke alarm"
|
402 |
+
400,/m/0c3f7m,"Fire alarm"
|
403 |
+
401,/m/04fq5q,"Foghorn"
|
404 |
+
402,/m/0l156k,"Whistle"
|
405 |
+
403,/m/06hck5,"Steam whistle"
|
406 |
+
404,/t/dd00077,"Mechanisms"
|
407 |
+
405,/m/02bm9n,"Ratchet, pawl"
|
408 |
+
406,/m/01x3z,"Clock"
|
409 |
+
407,/m/07qjznt,"Tick"
|
410 |
+
408,/m/07qjznl,"Tick-tock"
|
411 |
+
409,/m/0l7xg,"Gears"
|
412 |
+
410,/m/05zc1,"Pulleys"
|
413 |
+
411,/m/0llzx,"Sewing machine"
|
414 |
+
412,/m/02x984l,"Mechanical fan"
|
415 |
+
413,/m/025wky1,"Air conditioning"
|
416 |
+
414,/m/024dl,"Cash register"
|
417 |
+
415,/m/01m4t,"Printer"
|
418 |
+
416,/m/0dv5r,"Camera"
|
419 |
+
417,/m/07bjf,"Single-lens reflex camera"
|
420 |
+
418,/m/07k1x,"Tools"
|
421 |
+
419,/m/03l9g,"Hammer"
|
422 |
+
420,/m/03p19w,"Jackhammer"
|
423 |
+
421,/m/01b82r,"Sawing"
|
424 |
+
422,/m/02p01q,"Filing (rasp)"
|
425 |
+
423,/m/023vsd,"Sanding"
|
426 |
+
424,/m/0_ksk,"Power tool"
|
427 |
+
425,/m/01d380,"Drill"
|
428 |
+
426,/m/014zdl,"Explosion"
|
429 |
+
427,/m/032s66,"Gunshot, gunfire"
|
430 |
+
428,/m/04zjc,"Machine gun"
|
431 |
+
429,/m/02z32qm,"Fusillade"
|
432 |
+
430,/m/0_1c,"Artillery fire"
|
433 |
+
431,/m/073cg4,"Cap gun"
|
434 |
+
432,/m/0g6b5,"Fireworks"
|
435 |
+
433,/g/122z_qxw,"Firecracker"
|
436 |
+
434,/m/07qsvvw,"Burst, pop"
|
437 |
+
435,/m/07pxg6y,"Eruption"
|
438 |
+
436,/m/07qqyl4,"Boom"
|
439 |
+
437,/m/083vt,"Wood"
|
440 |
+
438,/m/07pczhz,"Chop"
|
441 |
+
439,/m/07pl1bw,"Splinter"
|
442 |
+
440,/m/07qs1cx,"Crack"
|
443 |
+
441,/m/039jq,"Glass"
|
444 |
+
442,/m/07q7njn,"Chink, clink"
|
445 |
+
443,/m/07rn7sz,"Shatter"
|
446 |
+
444,/m/04k94,"Liquid"
|
447 |
+
445,/m/07rrlb6,"Splash, splatter"
|
448 |
+
446,/m/07p6mqd,"Slosh"
|
449 |
+
447,/m/07qlwh6,"Squish"
|
450 |
+
448,/m/07r5v4s,"Drip"
|
451 |
+
449,/m/07prgkl,"Pour"
|
452 |
+
450,/m/07pqc89,"Trickle, dribble"
|
453 |
+
451,/t/dd00088,"Gush"
|
454 |
+
452,/m/07p7b8y,"Fill (with liquid)"
|
455 |
+
453,/m/07qlf79,"Spray"
|
456 |
+
454,/m/07ptzwd,"Pump (liquid)"
|
457 |
+
455,/m/07ptfmf,"Stir"
|
458 |
+
456,/m/0dv3j,"Boiling"
|
459 |
+
457,/m/0790c,"Sonar"
|
460 |
+
458,/m/0dl83,"Arrow"
|
461 |
+
459,/m/07rqsjt,"Whoosh, swoosh, swish"
|
462 |
+
460,/m/07qnq_y,"Thump, thud"
|
463 |
+
461,/m/07rrh0c,"Thunk"
|
464 |
+
462,/m/0b_fwt,"Electronic tuner"
|
465 |
+
463,/m/02rr_,"Effects unit"
|
466 |
+
464,/m/07m2kt,"Chorus effect"
|
467 |
+
465,/m/018w8,"Basketball bounce"
|
468 |
+
466,/m/07pws3f,"Bang"
|
469 |
+
467,/m/07ryjzk,"Slap, smack"
|
470 |
+
468,/m/07rdhzs,"Whack, thwack"
|
471 |
+
469,/m/07pjjrj,"Smash, crash"
|
472 |
+
470,/m/07pc8lb,"Breaking"
|
473 |
+
471,/m/07pqn27,"Bouncing"
|
474 |
+
472,/m/07rbp7_,"Whip"
|
475 |
+
473,/m/07pyf11,"Flap"
|
476 |
+
474,/m/07qb_dv,"Scratch"
|
477 |
+
475,/m/07qv4k0,"Scrape"
|
478 |
+
476,/m/07pdjhy,"Rub"
|
479 |
+
477,/m/07s8j8t,"Roll"
|
480 |
+
478,/m/07plct2,"Crushing"
|
481 |
+
479,/t/dd00112,"Crumpling, crinkling"
|
482 |
+
480,/m/07qcx4z,"Tearing"
|
483 |
+
481,/m/02fs_r,"Beep, bleep"
|
484 |
+
482,/m/07qwdck,"Ping"
|
485 |
+
483,/m/07phxs1,"Ding"
|
486 |
+
484,/m/07rv4dm,"Clang"
|
487 |
+
485,/m/07s02z0,"Squeal"
|
488 |
+
486,/m/07qh7jl,"Creak"
|
489 |
+
487,/m/07qwyj0,"Rustle"
|
490 |
+
488,/m/07s34ls,"Whir"
|
491 |
+
489,/m/07qmpdm,"Clatter"
|
492 |
+
490,/m/07p9k1k,"Sizzle"
|
493 |
+
491,/m/07qc9xj,"Clicking"
|
494 |
+
492,/m/07rwm0c,"Clickety-clack"
|
495 |
+
493,/m/07phhsh,"Rumble"
|
496 |
+
494,/m/07qyrcz,"Plop"
|
497 |
+
495,/m/07qfgpx,"Jingle, tinkle"
|
498 |
+
496,/m/07rcgpl,"Hum"
|
499 |
+
497,/m/07p78v5,"Zing"
|
500 |
+
498,/t/dd00121,"Boing"
|
501 |
+
499,/m/07s12q4,"Crunch"
|
502 |
+
500,/m/028v0c,"Silence"
|
503 |
+
501,/m/01v_m0,"Sine wave"
|
504 |
+
502,/m/0b9m1,"Harmonic"
|
505 |
+
503,/m/0hdsk,"Chirp tone"
|
506 |
+
504,/m/0c1dj,"Sound effect"
|
507 |
+
505,/m/07pt_g0,"Pulse"
|
508 |
+
506,/t/dd00125,"Inside, small room"
|
509 |
+
507,/t/dd00126,"Inside, large room or hall"
|
510 |
+
508,/t/dd00127,"Inside, public space"
|
511 |
+
509,/t/dd00128,"Outside, urban or manmade"
|
512 |
+
510,/t/dd00129,"Outside, rural or natural"
|
513 |
+
511,/m/01b9nn,"Reverberation"
|
514 |
+
512,/m/01jnbd,"Echo"
|
515 |
+
513,/m/096m7z,"Noise"
|
516 |
+
514,/m/06_y0by,"Environmental noise"
|
517 |
+
515,/m/07rgkc5,"Static"
|
518 |
+
516,/m/06xkwv,"Mains hum"
|
519 |
+
517,/m/0g12c5,"Distortion"
|
520 |
+
518,/m/08p9q4,"Sidetone"
|
521 |
+
519,/m/07szfh9,"Cacophony"
|
522 |
+
520,/m/0chx_,"White noise"
|
523 |
+
521,/m/0cj0r,"Pink noise"
|
524 |
+
522,/m/07p_0gm,"Throbbing"
|
525 |
+
523,/m/01jwx6,"Vibration"
|
526 |
+
524,/m/07c52,"Television"
|
527 |
+
525,/m/06bz3,"Radio"
|
528 |
+
526,/m/07hvw1,"Field recording"
|
audio_detection/audio_infer/pytorch/__pycache__/models.cpython-38.pyc
ADDED
Binary file (24.6 kB). View file
|
|
audio_detection/audio_infer/pytorch/__pycache__/pytorch_utils.cpython-38.pyc
ADDED
Binary file (7.3 kB). View file
|
|
audio_detection/audio_infer/pytorch/evaluate.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn import metrics
|
2 |
+
|
3 |
+
from pytorch_utils import forward
|
4 |
+
|
5 |
+
|
6 |
+
class Evaluator(object):
    """Computes per-class evaluation statistics (AP and AUC) for a model."""

    def __init__(self, model):
        """Evaluator.

        Args:
          model: object
        """
        self.model = model

    def evaluate(self, data_loader):
        """Forward evaluation data and calculate statistics.

        Args:
          data_loader: object

        Returns:
          statistics: dict,
              {'average_precision': (classes_num,), 'auc': (classes_num,)}
        """
        # Run the model over the whole loader, keeping targets alongside
        # predictions so sklearn metrics can be computed afterwards.
        outputs = forward(
            model=self.model,
            generator=data_loader,
            return_target=True)

        predictions = outputs['clipwise_output']    # (audios_num, classes_num)
        ground_truth = outputs['target']            # (audios_num, classes_num)

        # average=None yields one score per class rather than a single mean.
        per_class_ap = metrics.average_precision_score(
            ground_truth, predictions, average=None)
        per_class_auc = metrics.roc_auc_score(
            ground_truth, predictions, average=None)

        return {'average_precision': per_class_ap, 'auc': per_class_auc}
|
audio_detection/audio_infer/pytorch/finetune_template.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
|
4 |
+
import numpy as np
|
5 |
+
import argparse
|
6 |
+
import h5py
|
7 |
+
import math
|
8 |
+
import time
|
9 |
+
import logging
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
|
12 |
+
import torch
|
13 |
+
torch.backends.cudnn.benchmark=True
|
14 |
+
torch.manual_seed(0)
|
15 |
+
import torch.nn as nn
|
16 |
+
import torch.nn.functional as F
|
17 |
+
import torch.optim as optim
|
18 |
+
import torch.utils.data
|
19 |
+
|
20 |
+
from utilities import get_filename
|
21 |
+
from models import *
|
22 |
+
import config
|
23 |
+
|
24 |
+
|
25 |
+
class Transfer_Cnn14(nn.Module):
    """Transfer-learning wrapper: pretrained Cnn14 backbone + new linear head."""

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
        fmax, classes_num, freeze_base):
        """Classifier for a new task using pretrained Cnn14 as a sub module.
        """
        super(Transfer_Cnn14, self).__init__()
        # The pretrained backbone was trained on the 527 AudioSet classes.
        audioset_classes_num = 527
        self.base = Cnn14(sample_rate, window_size, hop_size, mel_bins, fmin,
            fmax, audioset_classes_num)

        # Transfer to another task layer
        self.fc_transfer = nn.Linear(2048, classes_num, bias=True)

        if freeze_base:
            # Freeze AudioSet pretrained layers
            for p in self.base.parameters():
                p.requires_grad = False

        self.init_weights()

    def init_weights(self):
        # Only the new head needs initialization; the base keeps its weights.
        init_layer(self.fc_transfer)

    def load_from_pretrain(self, pretrained_checkpoint_path):
        # Load backbone weights only; the transfer head stays freshly
        # initialized.
        state = torch.load(pretrained_checkpoint_path)
        self.base.load_state_dict(state['model'])

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, data_length)
        """
        output_dict = self.base(input, mixup_lambda)
        logits = self.fc_transfer(output_dict['embedding'])
        # Overwrite the backbone's clipwise output with the new task's
        # log-softmax scores.
        output_dict['clipwise_output'] = torch.log_softmax(logits, dim=-1)
        return output_dict
|
63 |
+
|
64 |
+
|
65 |
+
def train(args):
    """Build a Transfer_Cnn14-style model from CLI arguments, optionally load
    pretrained backbone weights, and wrap it for (multi-)GPU use.

    NOTE(review): despite the name, this template only constructs and loads
    the model; no training loop is run here.

    Args:
      args: argparse.Namespace with sample_rate, window_size, hop_size,
        mel_bins, fmin, fmax, model_type, pretrained_checkpoint_path,
        freeze_base, cuda.
    """

    # Arguments & parameters
    sample_rate = args.sample_rate
    window_size = args.window_size
    hop_size = args.hop_size
    mel_bins = args.mel_bins
    fmin = args.fmin
    fmax = args.fmax
    model_type = args.model_type
    pretrained_checkpoint_path = args.pretrained_checkpoint_path
    freeze_base = args.freeze_base
    device = 'cuda' if (args.cuda and torch.cuda.is_available()) else 'cpu'

    classes_num = config.classes_num
    # Only attempt to load weights when a checkpoint path was supplied.
    pretrain = True if pretrained_checkpoint_path else False

    # Model
    # SECURITY: eval() instantiates whatever class name is passed via
    # --model_type. Only run with trusted command-line input.
    Model = eval(model_type)
    model = Model(sample_rate, window_size, hop_size, mel_bins, fmin, fmax,
        classes_num, freeze_base)

    # Load pretrained model
    if pretrain:
        logging.info('Load pretrained model from {}'.format(pretrained_checkpoint_path))
        model.load_from_pretrain(pretrained_checkpoint_path)

    # Parallel
    print('GPU number: {}'.format(torch.cuda.device_count()))
    model = torch.nn.DataParallel(model)

    if 'cuda' in device:
        model.to(device)

    # NOTE(review): printed even when no pretrained checkpoint was loaded.
    print('Load pretrained model successfully!')
|
100 |
+
|
101 |
+
|
102 |
+
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Example of parser. ')
    subparsers = parser.add_subparsers(dest='mode')

    # Train
    train_parser = subparsers.add_parser('train')
    # All audio front-end settings are required integers.
    for flag in ('--sample_rate', '--window_size', '--hop_size',
            '--mel_bins', '--fmin', '--fmax'):
        train_parser.add_argument(flag, type=int, required=True)
    train_parser.add_argument('--model_type', type=str, required=True)
    train_parser.add_argument('--pretrained_checkpoint_path', type=str)
    train_parser.add_argument('--freeze_base', action='store_true', default=False)
    train_parser.add_argument('--cuda', action='store_true', default=False)

    # Parse arguments
    args = parser.parse_args()
    args.filename = get_filename(__file__)

    if args.mode == 'train':
        train(args)
    else:
        raise Exception('Error argument!')
|
audio_detection/audio_infer/pytorch/inference.py
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
|
4 |
+
import numpy as np
|
5 |
+
import argparse
|
6 |
+
import librosa
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from utilities import create_folder, get_filename
|
11 |
+
from models import *
|
12 |
+
from pytorch_utils import move_data_to_device
|
13 |
+
import config
|
14 |
+
|
15 |
+
def audio_tagging(args):
    """Inference audio tagging result of an audio clip.

    Loads a checkpointed model, runs one forward pass over the full clip and
    prints the top-10 class probabilities.

    Args:
      args: argparse.Namespace with sample_rate, window_size, hop_size,
        mel_bins, fmin, fmax, model_type, checkpoint_path, audio_path, cuda.

    Returns:
      (clipwise_output, labels): per-class probabilities with shape
      (classes_num,) and the matching label names from config.
    """

    # Arguments & parameters
    sample_rate = args.sample_rate
    window_size = args.window_size
    hop_size = args.hop_size
    mel_bins = args.mel_bins
    fmin = args.fmin
    fmax = args.fmax
    model_type = args.model_type
    checkpoint_path = args.checkpoint_path
    audio_path = args.audio_path
    device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')

    classes_num = config.classes_num
    labels = config.labels

    # Model
    # SECURITY: eval() instantiates the class named by --model_type.
    # Only run with trusted command-line input.
    Model = eval(model_type)
    model = Model(sample_rate=sample_rate, window_size=window_size,
        hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
        classes_num=classes_num)

    # map_location lets a GPU-trained checkpoint load on CPU.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model'])

    # Parallel
    if 'cuda' in str(device):
        model.to(device)
        print('GPU number: {}'.format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    else:
        print('Using CPU.')

    # Load audio, resampled to the model's rate and mixed down to mono.
    (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)

    waveform = waveform[None, :]    # (1, audio_length) — batch of one clip
    waveform = move_data_to_device(waveform, device)

    # Forward
    with torch.no_grad():
        model.eval()
        batch_output_dict = model(waveform, None)

    clipwise_output = batch_output_dict['clipwise_output'].data.cpu().numpy()[0]
    """(classes_num,)"""

    # Indices of classes sorted by descending probability.
    sorted_indexes = np.argsort(clipwise_output)[::-1]

    # Print audio tagging top probabilities
    # NOTE(review): assumes at least 10 classes; config.classes_num is
    # presumably >= 10 — confirm against config.
    for k in range(10):
        print('{}: {:.3f}'.format(np.array(labels)[sorted_indexes[k]],
            clipwise_output[sorted_indexes[k]]))

    # Print embedding shape if the model exposes one.
    if 'embedding' in batch_output_dict.keys():
        embedding = batch_output_dict['embedding'].data.cpu().numpy()[0]
        print('embedding: {}'.format(embedding.shape))

    return clipwise_output, labels
|
78 |
+
|
79 |
+
|
80 |
+
def sound_event_detection(args):
    """Inference sound event detection result of an audio clip.

    Runs the model once over the clip, takes the framewise output, and saves
    a two-panel figure (log spectrogram + top-k class activations) under
    ``results/<clip_name>.png``.

    Args:
      args: argparse.Namespace with sample_rate, window_size, hop_size,
        mel_bins, fmin, fmax, model_type, checkpoint_path, audio_path, cuda.

    Returns:
      (framewise_output, labels): per-frame class scores with shape
      (time_steps, classes_num) and the label names from config.
    """

    # Arguments & parameters
    sample_rate = args.sample_rate
    window_size = args.window_size
    hop_size = args.hop_size
    mel_bins = args.mel_bins
    fmin = args.fmin
    fmax = args.fmax
    model_type = args.model_type
    checkpoint_path = args.checkpoint_path
    audio_path = args.audio_path
    device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')

    classes_num = config.classes_num
    labels = config.labels
    # One model frame per STFT hop.
    frames_per_second = sample_rate // hop_size

    # Paths
    fig_path = os.path.join('results', '{}.png'.format(get_filename(audio_path)))
    create_folder(os.path.dirname(fig_path))

    # Model
    # SECURITY: eval() instantiates the class named by --model_type.
    # Only run with trusted command-line input.
    Model = eval(model_type)
    model = Model(sample_rate=sample_rate, window_size=window_size,
        hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
        classes_num=classes_num)

    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model'])

    # Parallel
    print('GPU number: {}'.format(torch.cuda.device_count()))
    model = torch.nn.DataParallel(model)

    if 'cuda' in str(device):
        model.to(device)

    # Load audio
    (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)

    waveform = waveform[None, :]    # (1, audio_length)
    waveform = move_data_to_device(waveform, device)

    # Forward
    with torch.no_grad():
        model.eval()
        batch_output_dict = model(waveform, None)

    framewise_output = batch_output_dict['framewise_output'].data.cpu().numpy()[0]
    """(time_steps, classes_num)"""

    print('Sound event detection result (time_steps x classes_num): {}'.format(
        framewise_output.shape))

    # Classes ranked by their peak framewise score across the clip.
    sorted_indexes = np.argsort(np.max(framewise_output, axis=0))[::-1]

    top_k = 10  # Show top results
    top_result_mat = framewise_output[:, sorted_indexes[0 : top_k]]
    """(time_steps, top_k)"""

    # Plot result: spectrogram on top, top-k class activations below.
    stft = librosa.core.stft(y=waveform[0].data.cpu().numpy(), n_fft=window_size,
        hop_length=hop_size, window='hann', center=True)
    frames_num = stft.shape[-1]

    fig, axs = plt.subplots(2, 1, sharex=True, figsize=(10, 4))
    axs[0].matshow(np.log(np.abs(stft)), origin='lower', aspect='auto', cmap='jet')
    axs[0].set_ylabel('Frequency bins')
    axs[0].set_title('Log spectrogram')
    # Scores are presumably in [0, 1]; vmin/vmax pin the color scale.
    axs[1].matshow(top_result_mat.T, origin='upper', aspect='auto', cmap='jet', vmin=0, vmax=1)
    # One x tick per second of audio.
    axs[1].xaxis.set_ticks(np.arange(0, frames_num, frames_per_second))
    axs[1].xaxis.set_ticklabels(np.arange(0, frames_num / frames_per_second))
    axs[1].yaxis.set_ticks(np.arange(0, top_k))
    axs[1].yaxis.set_ticklabels(np.array(labels)[sorted_indexes[0 : top_k]])
    axs[1].yaxis.grid(color='k', linestyle='solid', linewidth=0.3, alpha=0.3)
    axs[1].set_xlabel('Seconds')
    axs[1].xaxis.set_ticks_position('bottom')

    plt.tight_layout()
    plt.savefig(fig_path)
    print('Save sound event detection visualization to {}'.format(fig_path))

    return framewise_output, labels
|
166 |
+
|
167 |
+
|
168 |
+
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Example of parser. ')
    subparsers = parser.add_subparsers(dest='mode')

    # Both modes take exactly the same set of options, so build the two
    # subparsers in one loop instead of duplicating every add_argument call.
    for mode_name in ('audio_tagging', 'sound_event_detection'):
        sub = subparsers.add_parser(mode_name)
        sub.add_argument('--sample_rate', type=int, default=32000)
        sub.add_argument('--window_size', type=int, default=1024)
        sub.add_argument('--hop_size', type=int, default=320)
        sub.add_argument('--mel_bins', type=int, default=64)
        sub.add_argument('--fmin', type=int, default=50)
        sub.add_argument('--fmax', type=int, default=14000)
        sub.add_argument('--model_type', type=str, required=True)
        sub.add_argument('--checkpoint_path', type=str, required=True)
        sub.add_argument('--audio_path', type=str, required=True)
        sub.add_argument('--cuda', action='store_true', default=False)

    args = parser.parse_args()

    if args.mode == 'audio_tagging':
        audio_tagging(args)
    elif args.mode == 'sound_event_detection':
        sound_event_detection(args)
    else:
        raise Exception('Error argument!')
|
audio_detection/audio_infer/pytorch/losses.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
|
4 |
+
|
5 |
+
def clip_bce(output_dict, target_dict):
    """Binary crossentropy loss.

    Compares the model's clip-level probabilities against multi-hot targets.
    """
    predictions = output_dict['clipwise_output']
    targets = target_dict['target']
    return F.binary_cross_entropy(predictions, targets)
|
10 |
+
|
11 |
+
|
12 |
+
def get_loss_func(loss_type):
    """Map a loss-type name to its loss function.

    Args:
      loss_type: str, currently only 'clip_bce' is supported.

    Returns:
      Callable taking (output_dict, target_dict) and returning a scalar loss.

    Raises:
      ValueError: if ``loss_type`` is not a recognized loss name.
    """
    if loss_type == 'clip_bce':
        return clip_bce
    # Fail loudly instead of silently returning None, which would otherwise
    # surface later as a confusing "'NoneType' is not callable" at the call
    # site in the training loop.
    raise ValueError('Unknown loss_type: {}'.format(loss_type))
|
audio_detection/audio_infer/pytorch/main.py
ADDED
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
|
4 |
+
import numpy as np
|
5 |
+
import argparse
|
6 |
+
import time
|
7 |
+
import logging
|
8 |
+
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
import torch.optim as optim
|
13 |
+
import torch.utils.data
|
14 |
+
|
15 |
+
from utilities import (create_folder, get_filename, create_logging, Mixup,
|
16 |
+
StatisticsContainer)
|
17 |
+
from models import (PVT, PVT2, PVT_lr, PVT_nopretrain, PVT_2layer, Cnn14, Cnn14_no_specaug, Cnn14_no_dropout,
|
18 |
+
Cnn6, Cnn10, ResNet22, ResNet38, ResNet54, Cnn14_emb512, Cnn14_emb128,
|
19 |
+
Cnn14_emb32, MobileNetV1, MobileNetV2, LeeNet11, LeeNet24, DaiNet19,
|
20 |
+
Res1dNet31, Res1dNet51, Wavegram_Cnn14, Wavegram_Logmel_Cnn14,
|
21 |
+
Wavegram_Logmel128_Cnn14, Cnn14_16k, Cnn14_8k, Cnn14_mel32, Cnn14_mel128,
|
22 |
+
Cnn14_mixup_time_domain, Cnn14_DecisionLevelMax, Cnn14_DecisionLevelAtt, Cnn6_Transformer, GLAM, GLAM2, GLAM3, Cnn4, EAT)
|
23 |
+
#from models_test import (PVT_test)
|
24 |
+
#from models1 import (PVT1)
|
25 |
+
#from models_vig import (VIG, VIG2)
|
26 |
+
#from models_vvt import (VVT)
|
27 |
+
#from models2 import (MPVIT, MPVIT2)
|
28 |
+
#from models_reshape import (PVT_reshape, PVT_tscam)
|
29 |
+
#from models_swin import (Swin, Swin_nopretrain)
|
30 |
+
#from models_swin2 import (Swin2)
|
31 |
+
#from models_van import (Van, Van_tiny)
|
32 |
+
#from models_focal import (Focal)
|
33 |
+
#from models_cross import (Cross)
|
34 |
+
#from models_cov import (Cov)
|
35 |
+
#from models_cnn import (Cnn_light)
|
36 |
+
#from models_twins import (Twins)
|
37 |
+
#from models_cmt import (Cmt, Cmt1)
|
38 |
+
#from models_shunted import (Shunted)
|
39 |
+
#from models_quadtree import (Quadtree, Quadtree2, Quadtree_nopretrain)
|
40 |
+
#from models_davit import (Davit_tscam, Davit, Davit_nopretrain)
|
41 |
+
from pytorch_utils import (move_data_to_device, count_parameters, count_flops,
|
42 |
+
do_mixup)
|
43 |
+
from data_generator import (AudioSetDataset, TrainSampler, BalancedTrainSampler,
|
44 |
+
AlternateTrainSampler, EvaluateSampler, collate_fn)
|
45 |
+
from evaluate import Evaluator
|
46 |
+
import config
|
47 |
+
from losses import get_loss_func
|
48 |
+
|
49 |
+
|
50 |
+
def train(args):
    """Train AudioSet tagging model.

    Builds the model named by ``args.model_type``, sets up (possibly class-
    balanced) HDF5-backed data loaders, optionally resumes from a checkpoint,
    then runs the training loop with periodic evaluation, checkpointing and
    LR scheduling every 2000 iterations.

    Args:
      workspace: str
      data_type: 'balanced_train' | 'full_train'
      sample_rate: int
      window_size: int
      hop_size: int
      mel_bins: int
      fmin: int
      fmax: int
      model_type: str
      loss_type: 'clip_bce'
      balanced: 'none' | 'balanced' | 'alternate'
      augmentation: 'none' | 'mixup'
      batch_size: int
      learning_rate: float
      resume_iteration: int
      early_stop: int
      cuda: bool
      filename: str, used to namespace checkpoint/statistics/log paths.
    """

    # Arguments & parameters
    workspace = args.workspace
    data_type = args.data_type
    sample_rate = args.sample_rate
    window_size = args.window_size
    hop_size = args.hop_size
    mel_bins = args.mel_bins
    fmin = args.fmin
    fmax = args.fmax
    model_type = args.model_type
    loss_type = args.loss_type
    balanced = args.balanced
    augmentation = args.augmentation
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    resume_iteration = args.resume_iteration
    early_stop = args.early_stop
    device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
    filename = args.filename

    num_workers = 8
    clip_samples = config.clip_samples
    classes_num = config.classes_num
    loss_func = get_loss_func(loss_type)

    # Paths
    black_list_csv = None

    train_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes',
        '{}.h5'.format(data_type))

    eval_bal_indexes_hdf5_path = os.path.join(workspace,
        'hdf5s', 'indexes', 'balanced_train.h5')

    eval_test_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes',
        'eval.h5')

    # All artifact paths encode the full hyper-parameter configuration so
    # different runs never collide.
    checkpoints_dir = os.path.join(workspace, 'checkpoints', filename,
        'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
        sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
        'data_type={}'.format(data_type), model_type,
        'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
        'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size))
    create_folder(checkpoints_dir)

    statistics_path = os.path.join(workspace, 'statistics', filename,
        'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
        sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
        'data_type={}'.format(data_type), model_type,
        'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
        'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
        'statistics.pkl')
    create_folder(os.path.dirname(statistics_path))

    logs_dir = os.path.join(workspace, 'logs', filename,
        'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
        sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
        'data_type={}'.format(data_type), model_type,
        'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
        'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size))

    create_logging(logs_dir, filemode='w')
    logging.info(args)

    # Collapse the torch.device to a plain string used for .to() below.
    if 'cuda' in str(device):
        logging.info('Using GPU.')
        device = 'cuda'
    else:
        logging.info('Using CPU. Set --cuda flag to use GPU.')
        device = 'cpu'

    # Model
    # SECURITY: eval() instantiates whatever class name was passed as
    # --model_type; only run with trusted command-line input.
    Model = eval(model_type)
    model = Model(sample_rate=sample_rate, window_size=window_size,
        hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
        classes_num=classes_num)
    total = sum(p.numel() for p in model.parameters())
    print("Total params: %.2fM" % (total/1e6))
    logging.info("Total params: %.2fM" % (total/1e6))
    #params_num = count_parameters(model)
    # flops_num = count_flops(model, clip_samples)
    #logging.info('Parameters num: {}'.format(params_num))
    # logging.info('Flops num: {:.3f} G'.format(flops_num / 1e9))

    # Dataset will be used by DataLoader later. Dataset takes a meta as input
    # and return a waveform and a target.
    dataset = AudioSetDataset(sample_rate=sample_rate)

    # Train sampler
    # NOTE(review): an unexpected `balanced` value would leave Sampler
    # undefined and raise a NameError below.
    if balanced == 'none':
        Sampler = TrainSampler
    elif balanced == 'balanced':
        Sampler = BalancedTrainSampler
    elif balanced == 'alternate':
        Sampler = AlternateTrainSampler

    # Mixup needs two clips per training example, hence the doubled batch.
    train_sampler = Sampler(
        indexes_hdf5_path=train_indexes_hdf5_path,
        batch_size=batch_size * 2 if 'mixup' in augmentation else batch_size,
        black_list_csv=black_list_csv)

    # Evaluate sampler
    eval_bal_sampler = EvaluateSampler(
        indexes_hdf5_path=eval_bal_indexes_hdf5_path, batch_size=batch_size)

    eval_test_sampler = EvaluateSampler(
        indexes_hdf5_path=eval_test_indexes_hdf5_path, batch_size=batch_size)

    # Data loader
    train_loader = torch.utils.data.DataLoader(dataset=dataset,
        batch_sampler=train_sampler, collate_fn=collate_fn,
        num_workers=num_workers, pin_memory=True)

    eval_bal_loader = torch.utils.data.DataLoader(dataset=dataset,
        batch_sampler=eval_bal_sampler, collate_fn=collate_fn,
        num_workers=num_workers, pin_memory=True)

    eval_test_loader = torch.utils.data.DataLoader(dataset=dataset,
        batch_sampler=eval_test_sampler, collate_fn=collate_fn,
        num_workers=num_workers, pin_memory=True)
    # Mixup Beta-distribution alpha.
    mix=0.5
    if 'mixup' in augmentation:
        mixup_augmenter = Mixup(mixup_alpha=mix)
        print(mix)
        logging.info(mix)

    # Evaluator
    evaluator = Evaluator(model=model)

    # Statistics
    statistics_container = StatisticsContainer(statistics_path)

    # Optimizer
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.05, amsgrad=True)
    # Plateau scheduler driven by test mAP (mode='max'), stepped every 2000
    # iterations in the loop below.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4, min_lr=1e-06, verbose=True)
    train_bgn_time = time.time()

    # Resume training
    if resume_iteration > 0:
        resume_checkpoint_path = os.path.join(workspace, 'checkpoints', filename,
            'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
                sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
            'data_type={}'.format(data_type), model_type,
            'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
            'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
            '{}_iterations.pth'.format(resume_iteration))

        logging.info('Loading checkpoint {}'.format(resume_checkpoint_path))
        checkpoint = torch.load(resume_checkpoint_path)
        model.load_state_dict(checkpoint['model'])
        train_sampler.load_state_dict(checkpoint['sampler'])
        statistics_container.load_state_dict(resume_iteration)
        iteration = checkpoint['iteration']

    else:
        iteration = 0

    # Parallel
    print('GPU number: {}'.format(torch.cuda.device_count()))
    model = torch.nn.DataParallel(model)

    if 'cuda' in str(device):
        model.to(device)

    # Optimizer/scheduler state must be restored AFTER the model is moved to
    # the device, so optimizer tensors land on the right device.
    if resume_iteration:
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        print(optimizer.state_dict()['param_groups'][0]['lr'])

    time1 = time.time()

    for batch_data_dict in train_loader:
        """batch_data_dict: {
            'audio_name': (batch_size [*2 if mixup],), 
            'waveform': (batch_size [*2 if mixup], clip_samples), 
            'target': (batch_size [*2 if mixup], classes_num), 
            (ifexist) 'mixup_lambda': (batch_size * 2,)}
        """

        # Evaluate
        if (iteration % 2000 == 0 and iteration >= resume_iteration) or (iteration == 0):
            train_fin_time = time.time()

            bal_statistics = evaluator.evaluate(eval_bal_loader)
            test_statistics = evaluator.evaluate(eval_test_loader)

            logging.info('Validate bal mAP: {:.3f}'.format(
                np.mean(bal_statistics['average_precision'])))

            logging.info('Validate test mAP: {:.3f}'.format(
                np.mean(test_statistics['average_precision'])))

            statistics_container.append(iteration, bal_statistics, data_type='bal')
            statistics_container.append(iteration, test_statistics, data_type='test')
            statistics_container.dump()

            train_time = train_fin_time - train_bgn_time
            validate_time = time.time() - train_fin_time

            logging.info(
                'iteration: {}, train time: {:.3f} s, validate time: {:.3f} s'
                ''.format(iteration, train_time, validate_time))

            logging.info('------------------------------------')

            train_bgn_time = time.time()

        # Save model
        if iteration % 2000 == 0:
            # .module: unwrap DataParallel so the saved weights load into a
            # plain (non-parallel) model.
            checkpoint = {
                'iteration': iteration,
                'model': model.module.state_dict(),
                'sampler': train_sampler.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()}

            checkpoint_path = os.path.join(
                checkpoints_dir, '{}_iterations.pth'.format(iteration))

            torch.save(checkpoint, checkpoint_path)
            logging.info('Model saved to {}'.format(checkpoint_path))

        # Mixup lambda
        if 'mixup' in augmentation:
            batch_data_dict['mixup_lambda'] = mixup_augmenter.get_lambda(
                batch_size=len(batch_data_dict['waveform']))

        # Move data to device
        for key in batch_data_dict.keys():
            batch_data_dict[key] = move_data_to_device(batch_data_dict[key], device)

        # Forward
        model.train()

        if 'mixup' in augmentation:
            batch_output_dict = model(batch_data_dict['waveform'],
                batch_data_dict['mixup_lambda'])
            """{'clipwise_output': (batch_size, classes_num), ...}"""

            # Targets are mixed with the same lambdas used inside the model.
            batch_target_dict = {'target': do_mixup(batch_data_dict['target'],
                batch_data_dict['mixup_lambda'])}
            """{'target': (batch_size, classes_num)}"""
        else:
            batch_output_dict = model(batch_data_dict['waveform'], None)
            """{'clipwise_output': (batch_size, classes_num), ...}"""

            batch_target_dict = {'target': batch_data_dict['target']}
            """{'target': (batch_size, classes_num)}"""

        # Loss
        loss = loss_func(batch_output_dict, batch_target_dict)
        # Backward
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        if iteration % 10 == 0:
            print(iteration, loss)
            #print('--- Iteration: {}, train time: {:.3f} s / 10 iterations ---'\
            #    .format(iteration, time.time() - time1))
            #time1 = time.time()

        # Scheduler is driven by the test mAP computed in the evaluate branch
        # above; both branches share the same iteration % 2000 == 0 condition,
        # so test_statistics is defined whenever this runs.
        if iteration % 2000 == 0:
            scheduler.step(np.mean(test_statistics['average_precision']))
            print(optimizer.state_dict()['param_groups'][0]['lr'])
            logging.info(optimizer.state_dict()['param_groups'][0]['lr'])

        # Stop learning
        if iteration == early_stop:
            break

        iteration += 1
|
345 |
+
|
346 |
+
|
347 |
+
if __name__ == '__main__':

    # Command-line entry point. Only the 'train' sub-command exists; any
    # other (or missing) mode is rejected below.
    parser = argparse.ArgumentParser(description='Example of parser. ')
    subparsers = parser.add_subparsers(dest='mode')

    parser_train = subparsers.add_parser('train')
    # Paths / data selection.
    parser_train.add_argument('--workspace', type=str, required=True)
    parser_train.add_argument('--data_type', type=str, default='full_train', choices=['balanced_train', 'full_train'])
    # Audio front-end (STFT / log-mel) settings.
    parser_train.add_argument('--sample_rate', type=int, default=32000)
    parser_train.add_argument('--window_size', type=int, default=1024)
    parser_train.add_argument('--hop_size', type=int, default=320)
    parser_train.add_argument('--mel_bins', type=int, default=64)
    parser_train.add_argument('--fmin', type=int, default=50)
    parser_train.add_argument('--fmax', type=int, default=14000)
    # Model / optimization settings.
    parser_train.add_argument('--model_type', type=str, required=True)
    parser_train.add_argument('--loss_type', type=str, default='clip_bce', choices=['clip_bce'])
    parser_train.add_argument('--balanced', type=str, default='balanced', choices=['none', 'balanced', 'alternate'])
    parser_train.add_argument('--augmentation', type=str, default='mixup', choices=['none', 'mixup'])
    parser_train.add_argument('--batch_size', type=int, default=32)
    parser_train.add_argument('--learning_rate', type=float, default=1e-3)
    parser_train.add_argument('--resume_iteration', type=int, default=0)
    parser_train.add_argument('--early_stop', type=int, default=1000000)
    parser_train.add_argument('--cuda', action='store_true', default=False)

    args = parser.parse_args()
    # Record this script's name on args so downstream logging/checkpoint
    # paths can be grouped per entry file.
    args.filename = get_filename(__file__)

    if args.mode == 'train':
        train(args)

    else:
        raise Exception('Error argument!')
|
audio_detection/audio_infer/pytorch/models.py
ADDED
@@ -0,0 +1,951 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
|
5 |
+
from torchlibrosa.augmentation import SpecAugmentation
|
6 |
+
|
7 |
+
from audio_infer.pytorch.pytorch_utils import do_mixup, interpolate, pad_framewise_output
|
8 |
+
import os
|
9 |
+
import sys
|
10 |
+
import math
|
11 |
+
import numpy as np
|
12 |
+
|
13 |
+
import torch
|
14 |
+
import torch.nn as nn
|
15 |
+
import torch.nn.functional as F
|
16 |
+
from torch.nn.parameter import Parameter
|
17 |
+
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
|
18 |
+
from torchlibrosa.augmentation import SpecAugmentation
|
19 |
+
from audio_infer.pytorch.pytorch_utils import do_mixup
|
20 |
+
import torch.utils.checkpoint as checkpoint
|
21 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
22 |
+
import warnings
|
23 |
+
from functools import partial
|
24 |
+
#from mmdet.models.builder import BACKBONES
|
25 |
+
from mmdet.utils import get_root_logger
|
26 |
+
from mmcv.runner import load_checkpoint
|
27 |
+
os.environ['TORCH_HOME'] = '../pretrained_models'
|
28 |
+
from copy import deepcopy
|
29 |
+
from timm.models.helpers import load_pretrained
|
30 |
+
from torch.cuda.amp import autocast
|
31 |
+
from collections import OrderedDict
|
32 |
+
import io
|
33 |
+
import re
|
34 |
+
from mmcv.runner import _load_checkpoint, load_state_dict
|
35 |
+
import mmcv.runner
|
36 |
+
import copy
|
37 |
+
import random
|
38 |
+
from einops import rearrange
|
39 |
+
from einops.layers.torch import Rearrange, Reduce
|
40 |
+
from torch import nn, einsum
|
41 |
+
|
42 |
+
|
43 |
+
def load_checkpoint(model,
                    filename,
                    map_location=None,
                    strict=False,
                    logger=None,
                    revise_keys=[(r'^module\.', '')]):
    """Load a checkpoint from a file or URI and adapt it to this model.

    Beyond plain mmcv checkpoint loading, this variant collapses the
    3-channel (RGB) pretrained patch-embedding kernel to a single input
    channel by summing over the input-channel dim, so ImageNet PVT weights
    can initialize a 1-channel spectrogram model. It also strips
    ``backbone.`` prefixes left by detection-style checkpoints.

    Args:
        model (Module): Module to load the checkpoint into.
        filename (str): Local filepath, URL, ``torchvision://xxx`` or
            ``open-mmlab://xxx`` (see mmcv's ``_load_checkpoint``).
        map_location (str): Same as :func:`torch.load`.
        strict (bool): Whether to allow different params for the model and
            checkpoint.
        logger (:mod:`logging.Logger` or None): Logger for error messages.
        revise_keys (list): (pattern, replacement) regex pairs applied to
            the state_dict keys. Default strips the 'module.' prefix.
            NOTE: this default is a mutable list; it is never mutated here,
            so sharing it across calls is safe.

    Returns:
        dict or OrderedDict: The loaded checkpoint.

    Raises:
        RuntimeError: If the loaded checkpoint is not a dict.
    """
    checkpoint = _load_checkpoint(filename, map_location, logger)

    # Validate BEFORE touching the contents. (Previously this isinstance
    # check ran after indexing the checkpoint, so it could never fire — a
    # non-dict would already have raised TypeError.)
    if not isinstance(checkpoint, dict):
        raise RuntimeError(
            f'No state_dict found in checkpoint file {filename}')

    # Adapt the RGB patch-embedding kernel to 1-channel input. Assumes the
    # key lives at the top level of the checkpoint (as in the released PVT
    # weights) — TODO confirm for wrapped checkpoints.
    checkpoint['patch_embed1.proj.weight'] = torch.nn.Parameter(
        torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))

    # Get the state_dict from the checkpoint (some checkpoints nest it).
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    else:
        state_dict = checkpoint

    # Strip prefixes from state_dict keys, preserving any _metadata the
    # OrderedDict rebuilds would otherwise drop.
    metadata = getattr(state_dict, '_metadata', OrderedDict())
    for p, r in revise_keys:
        state_dict = OrderedDict(
            {re.sub(p, r, k): v
             for k, v in state_dict.items()})
    # Detection checkpoints store backbone weights under 'backbone.'.
    state_dict = OrderedDict({k.replace('backbone.', ''): v for k, v in state_dict.items()})
    # Keep metadata in state_dict.
    state_dict._metadata = metadata

    # Load the (possibly non-strict) state_dict into the model.
    load_state_dict(model, state_dict, strict, logger)
    return checkpoint
|
96 |
+
|
97 |
+
def init_layer(layer):
    """Xavier-uniform init for a Linear/Conv weight; zero its bias if present."""
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias.data.fill_(0.)
|
104 |
+
|
105 |
+
|
106 |
+
def init_bn(bn):
    """Reset a BatchNorm layer to the identity affine transform (weight=1, bias=0)."""
    bn.weight.data.fill_(1.)
    bn.bias.data.fill_(0.)
|
110 |
+
|
111 |
+
|
112 |
+
|
113 |
+
|
114 |
+
class TimeShift(nn.Module):
    """Randomly roll the input along dim 2 (time) during training.

    The shift is drawn from N(mean, std) and truncated to an int on each
    forward call; at eval time the input passes through unchanged.
    """

    def __init__(self, mean, std):
        super().__init__()
        self.mean = mean
        self.std = std

    def forward(self, x):
        if not self.training:
            return x
        offset = int(torch.empty(1).normal_(self.mean, self.std).item())
        return torch.roll(x, offset, dims=2)
|
125 |
+
|
126 |
+
class LinearSoftPool(nn.Module):
    """Linear softmax pooling: weights each frame probability by itself.

    From "A Comparison of Five Multiple Instance Learning Pooling Functions
    for Sound Event Detection with Weak Labeling"
    (https://arxiv.org/abs/1810.09050). Note ``logits`` is accepted for
    interface compatibility but not used.
    """

    def __init__(self, pooldim=1):
        super().__init__()
        self.pooldim = pooldim

    def forward(self, logits, time_decision):
        dim = self.pooldim
        weighted_sum = (time_decision * time_decision).sum(dim)
        return weighted_sum / time_decision.sum(dim)
|
140 |
+
|
141 |
+
class PVT(nn.Module):
    """Sound-event model: log-mel front end + PVTv2 backbone (trained from scratch).

    forward() returns per-frame class probabilities ('framewise_output') and
    a per-clip score ('clipwise_output') obtained by average-pooling the
    frame probabilities over time.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
        fmax, classes_num):

        super(PVT, self).__init__()

        # STFT / log-mel settings shared by the two extractors below.
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor (frozen, not trained).
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor (frozen, not trained).
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Training-time augmentations: random roll along time + SpecAugment.
        self.time_shift = TimeShift(0, 10)
        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        # BatchNorm over the 64 mel bins (applied with the freq axis swapped
        # into the channel dim — see the transpose pair in forward()).
        self.bn0 = nn.BatchNorm2d(64)
        # PVTv2-B2-sized backbone; tdim=1001 presumably matches the number of
        # spectrogram frames for a full clip — TODO confirm against the data
        # pipeline. No pretrained weights here (URL commented out).
        self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
                                fdim=64,
                                patch_size=7,
                                stride=4,
                                in_chans=1,
                                num_classes=classes_num,
                                embed_dims=[64, 128, 320, 512],
                                depths=[3, 4, 6, 3],
                                num_heads=[1, 2, 5, 8],
                                mlp_ratios=[8, 8, 4, 4],
                                qkv_bias=True,
                                qk_scale=None,
                                drop_rate=0.0,
                                drop_path_rate=0.1,
                                sr_ratios=[8, 4, 2, 1],
                                norm_layer=partial(nn.LayerNorm, eps=1e-6),
                                num_stages=4,
                                #pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
                                )
        #self.temp_pool = LinearSoftPool()
        # Time-average pooling of frame probabilities -> clip probability.
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.fc_audioset = nn.Linear(512, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):
        init_bn(self.bn0)
        init_layer(self.fc_audioset)

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, times_steps, freq_bins)"""

        # Frame scores are upsampled back by this factor; presumably equals
        # the backbone's total temporal downsampling — TODO confirm.
        interpolate_ratio = 32

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        frames_num = x.shape[2]
        # Swap mel bins into the channel dim so bn0 normalizes per-bin,
        # then swap back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.time_shift(x)
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)
        #print(x.shape) #torch.Size([10, 1, 1001, 64])
        x = self.pvt_transformer(x)
        #print(x.shape) #torch.Size([10, 800, 128])
        # Collapse the remaining frequency axis of the backbone features.
        x = torch.mean(x, dim=3)

        # (batch, channels, time) -> (batch, time, channels) for the classifier.
        x = x.transpose(1, 2).contiguous()
        framewise_output = torch.sigmoid(self.fc_audioset(x))
        #clipwise_output = torch.mean(framewise_output, dim=1)
        #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
        # Clip score = time-average of frame probabilities.
        x = framewise_output.transpose(1, 2).contiguous()
        x = self.avgpool(x)
        clipwise_output = torch.flatten(x, 1)
        #print(framewise_output.shape) #torch.Size([10, 100, 17])
        # Upsample frame scores back toward the input frame rate.
        # NOTE(review): frames_num is computed above but no longer used since
        # pad_framewise_output is commented out.
        framewise_output = interpolate(framewise_output, interpolate_ratio)
        #framewise_output = framewise_output[:,:1000,:]
        #framewise_output = pad_framewise_output(framewise_output, frames_num)
        output_dict = {'framewise_output': framewise_output,
                       'clipwise_output': clipwise_output}

        return output_dict
|
238 |
+
|
239 |
+
class PVT2(nn.Module):
    """Variant of PVT using ImageNet-pretrained PVTv2-B2 weights.

    Differs from PVT in forward(): the time-shift augmentation is disabled
    and the clip score is a plain torch.mean over frame probabilities
    instead of AdaptiveAvgPool1d.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
        fmax, classes_num):

        super(PVT2, self).__init__()

        # STFT / log-mel settings shared by the two extractors below.
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor (frozen, not trained).
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor (frozen, not trained).
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Defined but unused in forward() (the call is commented out there).
        self.time_shift = TimeShift(0, 10)
        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        # Per-mel-bin BatchNorm (see transpose pair in forward()).
        self.bn0 = nn.BatchNorm2d(64)
        # PVTv2-B2 backbone initialized from the released pretrained weights.
        self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
                                fdim=64,
                                patch_size=7,
                                stride=4,
                                in_chans=1,
                                num_classes=classes_num,
                                embed_dims=[64, 128, 320, 512],
                                depths=[3, 4, 6, 3],
                                num_heads=[1, 2, 5, 8],
                                mlp_ratios=[8, 8, 4, 4],
                                qkv_bias=True,
                                qk_scale=None,
                                drop_rate=0.0,
                                drop_path_rate=0.1,
                                sr_ratios=[8, 4, 2, 1],
                                norm_layer=partial(nn.LayerNorm, eps=1e-6),
                                num_stages=4,
                                pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
                                )
        #self.temp_pool = LinearSoftPool()
        self.fc_audioset = nn.Linear(512, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):
        init_bn(self.bn0)
        init_layer(self.fc_audioset)

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, times_steps, freq_bins)"""

        # Upsampling factor for frame scores; presumably the backbone's
        # total temporal stride — TODO confirm.
        interpolate_ratio = 32

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        frames_num = x.shape[2]
        # Per-mel-bin BatchNorm via the transpose trick.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            #x = self.time_shift(x)
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)
        #print(x.shape) #torch.Size([10, 1, 1001, 64])
        x = self.pvt_transformer(x)
        #print(x.shape) #torch.Size([10, 800, 128])
        # Collapse the remaining frequency axis of the backbone features.
        x = torch.mean(x, dim=3)

        x = x.transpose(1, 2).contiguous()
        framewise_output = torch.sigmoid(self.fc_audioset(x))
        # Clip score = mean of frame probabilities over time.
        clipwise_output = torch.mean(framewise_output, dim=1)
        #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
        #print(framewise_output.shape) #torch.Size([10, 100, 17])
        # NOTE(review): frames_num is unused since pad_framewise_output is
        # commented out.
        framewise_output = interpolate(framewise_output, interpolate_ratio)
        #framewise_output = framewise_output[:,:1000,:]
        #framewise_output = pad_framewise_output(framewise_output, frames_num)
        output_dict = {'framewise_output': framewise_output,
                       'clipwise_output': clipwise_output}

        return output_dict
|
332 |
+
|
333 |
+
class PVT_2layer(nn.Module):
    """Lightweight PVT variant using only the first 2 backbone stages.

    Smaller backbone (embed dims 64/128, depths 3/4), so the classifier is
    128-d and the temporal upsampling factor drops to 8.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
        fmax, classes_num):

        super(PVT_2layer, self).__init__()

        # STFT / log-mel settings shared by the two extractors below.
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor (frozen, not trained).
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor (frozen, not trained).
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Training-time augmentations: random roll along time + SpecAugment.
        self.time_shift = TimeShift(0, 10)
        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        # Per-mel-bin BatchNorm (see transpose pair in forward()).
        self.bn0 = nn.BatchNorm2d(64)
        # 2-stage PVTv2 backbone, initialized from the B2 pretrained weights
        # (only the matching stages can load — presumably loaded non-strict;
        # TODO confirm in PyramidVisionTransformerV2).
        self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
                                fdim=64,
                                patch_size=7,
                                stride=4,
                                in_chans=1,
                                num_classes=classes_num,
                                embed_dims=[64, 128],
                                depths=[3, 4],
                                num_heads=[1, 2],
                                mlp_ratios=[8, 8],
                                qkv_bias=True,
                                qk_scale=None,
                                drop_rate=0.0,
                                drop_path_rate=0.1,
                                sr_ratios=[8, 4],
                                norm_layer=partial(nn.LayerNorm, eps=1e-6),
                                num_stages=2,
                                pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
                                )
        #self.temp_pool = LinearSoftPool()
        # Time-average pooling of frame probabilities -> clip probability.
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.fc_audioset = nn.Linear(128, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):
        init_bn(self.bn0)
        init_layer(self.fc_audioset)

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, times_steps, freq_bins)"""

        # Only 2 backbone stages -> temporal stride 8 instead of 32.
        interpolate_ratio = 8

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        frames_num = x.shape[2]
        # Per-mel-bin BatchNorm via the transpose trick.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.time_shift(x)
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)
        #print(x.shape) #torch.Size([10, 1, 1001, 64])
        x = self.pvt_transformer(x)
        #print(x.shape) #torch.Size([10, 800, 128])
        # Collapse the remaining frequency axis of the backbone features.
        x = torch.mean(x, dim=3)

        x = x.transpose(1, 2).contiguous()
        framewise_output = torch.sigmoid(self.fc_audioset(x))
        #clipwise_output = torch.mean(framewise_output, dim=1)
        #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
        # Clip score = time-average of frame probabilities.
        x = framewise_output.transpose(1, 2).contiguous()
        x = self.avgpool(x)
        clipwise_output = torch.flatten(x, 1)
        #print(framewise_output.shape) #torch.Size([10, 100, 17])
        # NOTE(review): frames_num is unused since pad_framewise_output is
        # commented out.
        framewise_output = interpolate(framewise_output, interpolate_ratio)
        #framewise_output = framewise_output[:,:1000,:]
        #framewise_output = pad_framewise_output(framewise_output, frames_num)
        output_dict = {'framewise_output': framewise_output,
                       'clipwise_output': clipwise_output}

        return output_dict
|
430 |
+
|
431 |
+
class PVT_lr(nn.Module):
    """PVT variant pooling clip scores with LinearSoftPool instead of averaging.

    Uses the pretrained PVTv2-B2 backbone; the clip score is a linear-softmax
    pool of the frame probabilities, clamped to a valid probability range.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
        fmax, classes_num):

        super(PVT_lr, self).__init__()

        # STFT / log-mel settings shared by the two extractors below.
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor (frozen, not trained).
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor (frozen, not trained).
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Training-time augmentations: random roll along time + SpecAugment.
        self.time_shift = TimeShift(0, 10)
        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        # Per-mel-bin BatchNorm (see transpose pair in forward()).
        self.bn0 = nn.BatchNorm2d(64)
        # PVTv2-B2 backbone initialized from the released pretrained weights.
        self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
                                fdim=64,
                                patch_size=7,
                                stride=4,
                                in_chans=1,
                                num_classes=classes_num,
                                embed_dims=[64, 128, 320, 512],
                                depths=[3, 4, 6, 3],
                                num_heads=[1, 2, 5, 8],
                                mlp_ratios=[8, 8, 4, 4],
                                qkv_bias=True,
                                qk_scale=None,
                                drop_rate=0.0,
                                drop_path_rate=0.1,
                                sr_ratios=[8, 4, 2, 1],
                                norm_layer=partial(nn.LayerNorm, eps=1e-6),
                                num_stages=4,
                                pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
                                )
        # Linear-softmax temporal pooling for the clip score.
        self.temp_pool = LinearSoftPool()
        self.fc_audioset = nn.Linear(512, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):
        init_bn(self.bn0)
        init_layer(self.fc_audioset)

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, times_steps, freq_bins)"""

        # Upsampling factor for frame scores; presumably the backbone's
        # total temporal stride — TODO confirm.
        interpolate_ratio = 32

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        frames_num = x.shape[2]
        # Per-mel-bin BatchNorm via the transpose trick.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.time_shift(x)
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)
        #print(x.shape) #torch.Size([10, 1, 1001, 64])
        x = self.pvt_transformer(x)
        #print(x.shape) #torch.Size([10, 800, 128])
        # Collapse the remaining frequency axis of the backbone features.
        x = torch.mean(x, dim=3)

        x = x.transpose(1, 2).contiguous()
        framewise_output = torch.sigmoid(self.fc_audioset(x))
        # Clip score via linear-softmax pooling over time; the first argument
        # (x) is ignored by LinearSoftPool.forward. Clamp keeps the result a
        # valid probability for BCE loss.
        clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
        #print(framewise_output.shape) #torch.Size([10, 100, 17])
        # NOTE(review): frames_num is unused since pad_framewise_output is
        # commented out.
        framewise_output = interpolate(framewise_output, interpolate_ratio)
        #framewise_output = framewise_output[:,:1000,:]
        #framewise_output = pad_framewise_output(framewise_output, frames_num)
        output_dict = {'framewise_output': framewise_output,
                       'clipwise_output': clipwise_output}

        return output_dict
|
523 |
+
|
524 |
+
|
525 |
+
class PVT_nopretrain(nn.Module):
    """PVT_lr without pretrained backbone weights.

    Same architecture and LinearSoftPool clip pooling as PVT_lr, but the
    pretrained URL is commented out, and the upsampled frame output is
    truncated to the first 1000 frames.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
        fmax, classes_num):

        super(PVT_nopretrain, self).__init__()

        # STFT / log-mel settings shared by the two extractors below.
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor (frozen, not trained).
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor (frozen, not trained).
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Training-time augmentations: random roll along time + SpecAugment.
        self.time_shift = TimeShift(0, 10)
        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        # Per-mel-bin BatchNorm (see transpose pair in forward()).
        self.bn0 = nn.BatchNorm2d(64)
        # PVTv2-B2-sized backbone trained from scratch (pretrained URL
        # intentionally commented out).
        self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
                                fdim=64,
                                patch_size=7,
                                stride=4,
                                in_chans=1,
                                num_classes=classes_num,
                                embed_dims=[64, 128, 320, 512],
                                depths=[3, 4, 6, 3],
                                num_heads=[1, 2, 5, 8],
                                mlp_ratios=[8, 8, 4, 4],
                                qkv_bias=True,
                                qk_scale=None,
                                drop_rate=0.0,
                                drop_path_rate=0.1,
                                sr_ratios=[8, 4, 2, 1],
                                norm_layer=partial(nn.LayerNorm, eps=1e-6),
                                num_stages=4,
                                #pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
                                )
        # Linear-softmax temporal pooling for the clip score.
        self.temp_pool = LinearSoftPool()
        self.fc_audioset = nn.Linear(512, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):
        init_bn(self.bn0)
        init_layer(self.fc_audioset)

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, times_steps, freq_bins)"""

        # Upsampling factor for frame scores; presumably the backbone's
        # total temporal stride — TODO confirm.
        interpolate_ratio = 32

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
        frames_num = x.shape[2]
        # Per-mel-bin BatchNorm via the transpose trick.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.time_shift(x)
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)
        #print(x.shape) #torch.Size([10, 1, 1001, 64])
        x = self.pvt_transformer(x)
        #print(x.shape) #torch.Size([10, 800, 128])
        # Collapse the remaining frequency axis of the backbone features.
        x = torch.mean(x, dim=3)

        x = x.transpose(1, 2).contiguous()
        framewise_output = torch.sigmoid(self.fc_audioset(x))
        # Clip score via linear-softmax pooling over time; the first argument
        # (x) is ignored by LinearSoftPool.forward.
        clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
        #print(framewise_output.shape) #torch.Size([10, 100, 17])
        framewise_output = interpolate(framewise_output, interpolate_ratio)
        # Trim the upsampled output to 1000 frames (drops the overshoot from
        # interpolation; presumably matches the label resolution — TODO confirm).
        framewise_output = framewise_output[:,:1000,:]
        #framewise_output = pad_framewise_output(framewise_output, frames_num)
        output_dict = {'framewise_output': framewise_output,
                       'clipwise_output': clipwise_output}

        return output_dict
|
617 |
+
|
618 |
+
|
619 |
+
class Mlp(nn.Module):
    """PVTv2 feed-forward block: fc1 -> depthwise conv -> activation -> fc2.

    When ``linear`` is set, a ReLU follows fc1 (the linear-attention variant
    of PVTv2). ``H``/``W`` carry the spatial layout needed by the depthwise
    convolution.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False):
        super().__init__()
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)
        self.linear = linear
        if self.linear:
            self.relu = nn.ReLU()
        self.apply(self._init_weights)

    def _init_weights(self, m):
        # Truncated-normal for linear weights with zero bias, identity
        # LayerNorm, fan-out-scaled normal for conv weights.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = (m.kernel_size[0] * m.kernel_size[1] * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        out = self.fc1(x)
        if self.linear:
            out = self.relu(out)
        out = self.dwconv(out, H, W)
        out = self.drop(self.act(out))
        out = self.drop(self.fc2(out))
        return out
|
659 |
+
|
660 |
+
|
661 |
+
class Attention(nn.Module):
    """Spatial-reduction attention (SRA) from PVTv2.

    Queries come from the full token sequence; keys/values are computed from a
    spatially reduced copy of the tokens:
      * not `linear`, sr_ratio > 1: strided conv downsampling by `sr_ratio`;
      * not `linear`, sr_ratio == 1: plain multi-head self-attention;
      * `linear`: adaptive-average-pool to 7x7 + 1x1 conv + GELU
        ("linear" complexity variant).
    """
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # Scaled dot-product factor; overridable via qk_scale.
        self.scale = qk_scale or head_dim ** -0.5

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        # Keys and values are produced by one projection of width 2 * dim.
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.linear = linear
        self.sr_ratio = sr_ratio
        if not linear:
            if sr_ratio > 1:
                # Downsample tokens by sr_ratio in both spatial dims for K/V.
                self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
                self.norm = nn.LayerNorm(dim)
        else:
            # Linear variant: fixed 7x7 pooled K/V regardless of input size.
            self.pool = nn.AdaptiveAvgPool2d(7)
            self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1)
            self.norm = nn.LayerNorm(dim)
            self.act = nn.GELU()
        self.apply(self._init_weights)

    def _init_weights(self, m):
        # Same init scheme as the other PVTv2 modules in this file.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """x: (B, N, C) with N == H * W; returns (B, N, C)."""
        B, N, C = x.shape
        # (B, N, C) -> (B, num_heads, N, head_dim)
        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)

        if not self.linear:
            if self.sr_ratio > 1:
                # Restore 2-D layout, reduce spatially, then project to K/V.
                x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
                x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
                x_ = self.norm(x_)
                kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
            else:
                kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        else:
            x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
            x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1)
            x_ = self.norm(x_)
            x_ = self.act(x_)
            kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        # kv: (2, B, num_heads, N_reduced, head_dim)
        k, v = kv[0], kv[1]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        # Weighted sum of values, heads re-merged into channel dim.
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x
|
734 |
+
|
735 |
+
|
736 |
+
class Pooling(nn.Module):
    """Token mixer from PoolFormer: local average pooling minus identity.

    Args:
        pool_size: side length of the square average-pooling window.
    """

    def __init__(self, pool_size=3):
        super().__init__()
        # Shape-preserving pooling; padded positions are excluded from the
        # average (count_include_pad=False) so borders are not biased low.
        self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2,
                                 count_include_pad=False)

    def forward(self, x):
        # Subtracting the input leaves only the locally-mixed residual.
        pooled = self.pool(x)
        return pooled - x
|
748 |
+
|
749 |
+
class Block(nn.Module):
    """One PVTv2 transformer layer: pre-norm SRA attention + pre-norm MLP,
    each wrapped in a residual connection with optional stochastic depth.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear)
        #self.norm3 = norm_layer(dim)
        #self.token_mixer = Pooling(pool_size=3)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        # Same init scheme as the other PVTv2 modules in this file.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """x: (B, N, C) tokens with N == H * W; returns the same shape."""
        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
        x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
        return x
|
787 |
+
|
788 |
+
|
789 |
+
class OverlapPatchEmbed(nn.Module):
    """ Image to Patch Embedding

    Overlapping patch embedding via a strided convolution, followed by
    LayerNorm over the flattened token sequence. `tdim`/`fdim` are the
    expected input height/width (time/frequency for spectrogram inputs).
    """

    def __init__(self, tdim, fdim, patch_size=7, stride=4, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = (tdim, fdim)
        patch_size = to_2tuple(patch_size)

        self.img_size = img_size
        self.patch_size = patch_size
        # Nominal output grid; forward() re-reads the actual H, W from the
        # conv output, so these are informational only.
        self.H, self.W = img_size[0] // stride, img_size[1] // stride
        self.num_patches = self.H * self.W
        # NOTE(review): upstream PVTv2 pads with patch_size // 2 ("same"-ish);
        # this file uses patch_size // 3 — presumably deliberate for the audio
        # input geometry, but worth confirming. Changing it would invalidate
        # the shipped checkpoint, so it is left as-is.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
                              padding=(patch_size[0] // 3, patch_size[1] // 3))
        self.norm = nn.LayerNorm(embed_dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        # Same init scheme as the other PVTv2 modules in this file.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        """x: (B, in_chans, H_in, W_in) -> ((B, H*W, embed_dim), H, W)."""
        x = self.proj(x)
        _, _, H, W = x.shape
        # (B, C, H, W) -> (B, H*W, C) token sequence.
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)

        return x, H, W
|
830 |
+
|
831 |
+
|
832 |
+
class PyramidVisionTransformerV2(nn.Module):
    """PVTv2 backbone (classification head removed) used as a feature
    extractor for audio spectrograms.

    Each stage = OverlapPatchEmbed + depths[i] Blocks + LayerNorm; the token
    sequence is reshaped back to (B, C, H, W) between stages. Only the first
    `num_stages` (default 2) of the configured 4 stages are built.
    Assumes the input is a (B, in_chans, tdim, fdim) spectrogram-like
    tensor — TODO confirm against the caller.
    """
    def __init__(self, tdim=1001, fdim=64, patch_size=16, stride=4, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
                 attn_drop_rate=0., drop_path_rate=0.1, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3],
                 sr_ratios=[8, 4, 2, 1], num_stages=2, linear=False, pretrained=None):
        super().__init__()
        # self.num_classes = num_classes
        self.depths = depths
        self.num_stages = num_stages
        self.linear = linear

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        cur = 0

        for i in range(num_stages):
            # NOTE(review): for i > 0 the fdim argument is computed from tdim
            # (tdim // 2**(i+1)) — looks like a copy-paste slip, but it is
            # harmless: OverlapPatchEmbed.forward() uses the runtime shapes,
            # not these nominal dims.
            patch_embed = OverlapPatchEmbed(tdim=tdim if i == 0 else tdim // (2 ** (i + 1)),
                                            fdim=fdim if i == 0 else tdim // (2 ** (i + 1)),
                                            patch_size=7 if i == 0 else 3,
                                            stride=stride if i == 0 else 2,
                                            in_chans=in_chans if i == 0 else embed_dims[i - 1],
                                            embed_dim=embed_dims[i])
            block = nn.ModuleList([Block(
                dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer,
                sr_ratio=sr_ratios[i], linear=linear)
                for j in range(depths[i])])
            norm = norm_layer(embed_dims[i])
            cur += depths[i]

            # Stages are registered as patch_embed1/block1/norm1, etc.
            setattr(self, f"patch_embed{i + 1}", patch_embed)
            setattr(self, f"block{i + 1}", block)
            setattr(self, f"norm{i + 1}", norm)
        #self.n = nn.Linear(125, 250, bias=True)
        # classification head
        # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()
        self.apply(self._init_weights)
        self.init_weights(pretrained)

    def _init_weights(self, m):
        # Same init scheme as the other PVTv2 modules in this file.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def init_weights(self, pretrained=None):
        """Optionally load a checkpoint; relies on get_root_logger /
        load_checkpoint being in scope (presumably mmcv — verify imports)."""
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)

    def freeze_patch_emb(self):
        # Freezes only the first stage's patch embedding.
        self.patch_embed1.requires_grad = False

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'}  # has pos_embed may be better

    def get_classifier(self):
        # NOTE(review): self.head is never created (commented out in
        # __init__), so calling this raises AttributeError.
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        # NOTE(review): self.embed_dim is never set in __init__; calling this
        # raises AttributeError.
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        """Run all stages; returns a (B, C_last, H, W) feature map."""
        B = x.shape[0]

        for i in range(self.num_stages):
            patch_embed = getattr(self, f"patch_embed{i + 1}")
            block = getattr(self, f"block{i + 1}")
            norm = getattr(self, f"norm{i + 1}")
            x, H, W = patch_embed(x)
            #print(x.shape)
            for blk in block:
                x = blk(x, H, W)
            #print(x.shape)
            x = norm(x)
            #if i != self.num_stages - 1:
            # Back to (B, C, H, W) so the next stage's conv embedding works.
            x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
            #print(x.shape)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        # x = self.head(x)

        return x
|
928 |
+
|
929 |
+
class DWConv(nn.Module):
    """Depthwise 3x3 convolution applied to a flattened token sequence."""

    def __init__(self, dim=768):
        super(DWConv, self).__init__()
        # groups=dim makes the convolution depthwise: one 3x3 filter per
        # channel, padding 1 keeps the spatial size unchanged.
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, x, H, W):
        """x: (B, N, C) with N == H * W; returns the same shape."""
        batch, _, channels = x.shape
        # (B, N, C) -> (B, C, H, W): restore the 2-D layout for the conv.
        grid = x.transpose(1, 2).view(batch, channels, H, W)
        grid = self.dwconv(grid)
        # (B, C, H, W) -> (B, N, C): back to a token sequence.
        return grid.flatten(2).transpose(1, 2)
|
941 |
+
|
942 |
+
|
943 |
+
def _conv_filter(state_dict, patch_size=16):
|
944 |
+
""" convert patch embedding weight from manual patchify + linear proj to conv"""
|
945 |
+
out_dict = {}
|
946 |
+
for k, v in state_dict.items():
|
947 |
+
if 'patch_embed.proj.weight' in k:
|
948 |
+
v = v.reshape((v.shape[0], 3, patch_size, patch_size))
|
949 |
+
out_dict[k] = v
|
950 |
+
|
951 |
+
return out_dict
|
audio_detection/audio_infer/pytorch/pytorch_utils.py
ADDED
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import time
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
|
6 |
+
|
7 |
+
def move_data_to_device(x, device):
    """Convert an array-like with a `.dtype` to a tensor on `device`.

    Float dtypes become float32 tensors, int dtypes become int64 tensors.
    Anything else (e.g. bool) is returned unchanged and NOT moved.
    """
    dtype_name = str(x.dtype)
    if 'float' in dtype_name:
        tensor = torch.Tensor(x)
    elif 'int' in dtype_name:
        tensor = torch.LongTensor(x)
    else:
        # Unsupported dtype: hand the input back as-is.
        return x
    return tensor.to(device)
|
16 |
+
|
17 |
+
|
18 |
+
def do_mixup(x, mixup_lambda):
    """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes
    (1, 3, 5, ...).

    Args:
      x: (batch_size * 2, ...)
      mixup_lambda: (batch_size * 2,)

    Returns:
      out: (batch_size, ...)
    """
    # transpose(0, -1) moves the batch axis last so the per-sample lambdas
    # broadcast; transpose back after combining the pairs.
    even = x[0::2].transpose(0, -1) * mixup_lambda[0::2]
    odd = x[1::2].transpose(0, -1) * mixup_lambda[1::2]
    return (even + odd).transpose(0, -1)
|
32 |
+
|
33 |
+
|
34 |
+
def append_to_dict(dict, key, value):
    """Append `value` to the list stored under `key`, creating the list on
    first use.

    NOTE: the parameter name `dict` shadows the builtin; it is kept only so
    the signature stays backward-compatible with existing callers.
    """
    # setdefault replaces the explicit `key in dict` check + branch with a
    # single lookup.
    dict.setdefault(key, []).append(value)
|
39 |
+
|
40 |
+
|
41 |
+
def forward(model, generator, return_input=False, 
    return_target=False):
    """Forward data to a model.

    Args:
      model: object
      generator: object, yields dicts with at least 'waveform' and
        'audio_name' (optionally 'target')
      return_input: bool, also collect the raw waveforms
      return_target: bool, also collect the targets when present

    Returns:
      output_dict with keys:
        audio_name: (audios_num,)
        clipwise_output: (audios_num, classes_num)
        (if exist) segmentwise_output: (audios_num, segments_num, classes_num)
        (if exist) framewise_output: (audios_num, frames_num, classes_num)
        (optional) waveform: (audios_num, segment_samples)
        (optional) target: (audios_num, classes_num)
    """
    output_dict = {}
    device = next(model.parameters()).device
    time1 = time.time()

    # Forward data to a model in mini-batches.
    # (Fix: removed a leftover `print(n)` that spammed stdout every batch.)
    for n, batch_data_dict in enumerate(generator):
        batch_waveform = move_data_to_device(batch_data_dict['waveform'], device)

        with torch.no_grad():
            model.eval()
            batch_output = model(batch_waveform)

        append_to_dict(output_dict, 'audio_name', batch_data_dict['audio_name'])

        append_to_dict(output_dict, 'clipwise_output', 
            batch_output['clipwise_output'].data.cpu().numpy())

        # Optional model outputs are only collected when the model emits them.
        if 'segmentwise_output' in batch_output.keys():
            append_to_dict(output_dict, 'segmentwise_output', 
                batch_output['segmentwise_output'].data.cpu().numpy())

        if 'framewise_output' in batch_output.keys():
            append_to_dict(output_dict, 'framewise_output', 
                batch_output['framewise_output'].data.cpu().numpy())

        if return_input:
            append_to_dict(output_dict, 'waveform', batch_data_dict['waveform'])

        if return_target:
            if 'target' in batch_data_dict.keys():
                append_to_dict(output_dict, 'target', batch_data_dict['target'])

        # Periodic timing report (every 10 mini-batches).
        if n % 10 == 0:
            print(' --- Inference time: {:.3f} s / 10 iterations ---'.format(
                time.time() - time1))
            time1 = time.time()

    # Concatenate the per-batch pieces into whole-dataset arrays.
    for key in output_dict.keys():
        output_dict[key] = np.concatenate(output_dict[key], axis=0)

    return output_dict
|
101 |
+
|
102 |
+
|
103 |
+
def interpolate(x, ratio):
    """Interpolate data in time domain. This is used to compensate the
    resolution reduction in downsampling of a CNN.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, ratio to interpolate

    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    # Each time step is duplicated `ratio` times consecutively — identical
    # to inserting a size-`ratio` axis and flattening it back out.
    return torch.repeat_interleave(x, ratio, dim=1)
|
118 |
+
|
119 |
+
|
120 |
+
def pad_framewise_output(framewise_output, frames_num):
    """Pad framewise_output to the same length as input frames. The pad value
    is the same as the value of the last frame.

    Args:
      framewise_output: (batch_size, frames_num, classes_num)
      frames_num: int, number of frames to pad

    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    missing = frames_num - framewise_output.shape[1]
    # Repeat the final frame to fill the missing tail.
    last_frame = framewise_output[:, -1:, :]
    padding = last_frame.repeat(1, missing, 1)
    # (batch_size, frames_num, classes_num)
    return torch.cat((framewise_output, padding), dim=1)
|
138 |
+
|
139 |
+
|
140 |
+
def count_parameters(model):
    """Return the number of trainable (requires_grad) parameters in `model`."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
|
142 |
+
|
143 |
+
|
144 |
+
def count_flops(model, audio_length):
    """Count flops. Code modified from others' implementation.

    Registers forward hooks on every leaf module, runs one dummy forward pass
    with a (1, audio_length) input, and sums the per-layer flop estimates.

    Fixes vs. original: removed the duplicated `device = device = ...`
    assignment, and pooling1d_hook now appends to its own list (the old code
    appended to list_pooling2d; the total was still correct since both lists
    are summed, but the per-type breakdown was wrong).
    """
    multiply_adds = True
    list_conv2d=[]
    def conv2d_hook(self, input, output):
        batch_size, input_channels, input_height, input_width = input[0].size()
        output_channels, output_height, output_width = output[0].size()

        # A multiply-add counts as 2 ops when multiply_adds is True.
        kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups) * (2 if multiply_adds else 1)
        bias_ops = 1 if self.bias is not None else 0

        params = output_channels * (kernel_ops + bias_ops)
        flops = batch_size * params * output_height * output_width

        list_conv2d.append(flops)

    list_conv1d=[]
    def conv1d_hook(self, input, output):
        batch_size, input_channels, input_length = input[0].size()
        output_channels, output_length = output[0].size()

        kernel_ops = self.kernel_size[0] * (self.in_channels / self.groups) * (2 if multiply_adds else 1)
        bias_ops = 1 if self.bias is not None else 0

        params = output_channels * (kernel_ops + bias_ops)
        flops = batch_size * params * output_length

        list_conv1d.append(flops)

    list_linear=[]
    def linear_hook(self, input, output):
        # Inputs with >2 dims are treated as batch_size 1.
        batch_size = input[0].size(0) if input[0].dim() == 2 else 1

        weight_ops = self.weight.nelement() * (2 if multiply_adds else 1)
        bias_ops = self.bias.nelement()

        flops = batch_size * (weight_ops + bias_ops)
        list_linear.append(flops)

    list_bn=[]
    def bn_hook(self, input, output):
        # Scale + shift: 2 ops per element.
        list_bn.append(input[0].nelement() * 2)

    list_relu=[]
    def relu_hook(self, input, output):
        list_relu.append(input[0].nelement() * 2)

    list_pooling2d=[]
    def pooling2d_hook(self, input, output):
        batch_size, input_channels, input_height, input_width = input[0].size()
        output_channels, output_height, output_width = output[0].size()

        # NOTE(review): assumes self.kernel_size is an int (square window);
        # a tuple kernel_size would need kernel_size[0] * kernel_size[1].
        kernel_ops = self.kernel_size * self.kernel_size
        bias_ops = 0
        params = output_channels * (kernel_ops + bias_ops)
        flops = batch_size * params * output_height * output_width

        list_pooling2d.append(flops)

    list_pooling1d=[]
    def pooling1d_hook(self, input, output):
        batch_size, input_channels, input_length = input[0].size()
        output_channels, output_length = output[0].size()

        kernel_ops = self.kernel_size[0]
        bias_ops = 0

        params = output_channels * (kernel_ops + bias_ops)
        flops = batch_size * params * output_length

        list_pooling1d.append(flops)

    def foo(net):
        """Recursively register the matching hook on every leaf module."""
        childrens = list(net.children())
        if not childrens:
            if isinstance(net, nn.Conv2d):
                net.register_forward_hook(conv2d_hook)
            elif isinstance(net, nn.Conv1d):
                net.register_forward_hook(conv1d_hook)
            elif isinstance(net, nn.Linear):
                net.register_forward_hook(linear_hook)
            elif isinstance(net, nn.BatchNorm2d) or isinstance(net, nn.BatchNorm1d):
                net.register_forward_hook(bn_hook)
            elif isinstance(net, nn.ReLU):
                net.register_forward_hook(relu_hook)
            elif isinstance(net, nn.AvgPool2d) or isinstance(net, nn.MaxPool2d):
                net.register_forward_hook(pooling2d_hook)
            elif isinstance(net, nn.AvgPool1d) or isinstance(net, nn.MaxPool1d):
                net.register_forward_hook(pooling1d_hook)
            else:
                print('Warning: flop of module {} is not counted!'.format(net))
            return
        for c in childrens:
            foo(c)

    # Register hook
    foo(model)

    device = next(model.parameters()).device
    input = torch.rand(1, audio_length).to(device)

    out = model(input)

    total_flops = sum(list_conv2d) + sum(list_conv1d) + sum(list_linear) + \
        sum(list_bn) + sum(list_relu) + sum(list_pooling2d) + sum(list_pooling1d)

    return total_flops
|
audio_detection/audio_infer/results/YDlWd7Wmdi1E.png
ADDED
audio_detection/audio_infer/useful_ckpts/audio_detection.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f909808f17d424dc29063a21953ff2be103489518a4f60a6c649d2e3e7d3e81
|
3 |
+
size 441042195
|
audio_detection/audio_infer/utils/__pycache__/config.cpython-38.pyc
ADDED
Binary file (6.33 kB). View file
|
|
audio_detection/audio_infer/utils/config.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
import csv

sample_rate = 32000
clip_samples = sample_rate * 10 # Audio clips are 10-second

# Load label
# NOTE(review): this file I/O runs at import time and the path is relative to
# the current working directory — importing this module from elsewhere fails
# unless the process is started from the repo root.
with open('./audio_detection/audio_infer/metadata/class_labels_indices.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    lines = list(reader)

labels = []
ids = []    # Each label has a unique id such as "/m/068hy"
# Skip the CSV header row; column 1 is the id, column 2 the display label.
for i1 in range(1, len(lines)):
    id = lines[i1][1]  # shadows builtin `id` (harmless at module scope)
    label = lines[i1][2]
    ids.append(id)
    labels.append(label)

classes_num = len(labels)

# Bidirectional label <-> index and id <-> index lookup tables.
lb_to_ix = {label : i for i, label in enumerate(labels)}
ix_to_lb = {i : label for i, label in enumerate(labels)}

id_to_ix = {id : i for i, id in enumerate(ids)}
ix_to_id = {i : id for i, id in enumerate(ids)}
|
27 |
+
|
28 |
+
full_samples_per_class = np.array([
|
29 |
+
937432, 16344, 7822, 10271, 2043, 14420, 733, 1511,
|
30 |
+
1258, 424, 1751, 704, 369, 590, 1063, 1375,
|
31 |
+
5026, 743, 853, 1648, 714, 1497, 1251, 2139,
|
32 |
+
1093, 133, 224, 39469, 6423, 407, 1559, 4546,
|
33 |
+
6826, 7464, 2468, 549, 4063, 334, 587, 238,
|
34 |
+
1766, 691, 114, 2153, 236, 209, 421, 740,
|
35 |
+
269, 959, 137, 4192, 485, 1515, 655, 274,
|
36 |
+
69, 157, 1128, 807, 1022, 346, 98, 680,
|
37 |
+
890, 352, 4169, 2061, 1753, 9883, 1339, 708,
|
38 |
+
37857, 18504, 12864, 2475, 2182, 757, 3624, 677,
|
39 |
+
1683, 3583, 444, 1780, 2364, 409, 4060, 3097,
|
40 |
+
3143, 502, 723, 600, 230, 852, 1498, 1865,
|
41 |
+
1879, 2429, 5498, 5430, 2139, 1761, 1051, 831,
|
42 |
+
2401, 2258, 1672, 1711, 987, 646, 794, 25061,
|
43 |
+
5792, 4256, 96, 8126, 2740, 752, 513, 554,
|
44 |
+
106, 254, 1592, 556, 331, 615, 2841, 737,
|
45 |
+
265, 1349, 358, 1731, 1115, 295, 1070, 972,
|
46 |
+
174, 937780, 112337, 42509, 49200, 11415, 6092, 13851,
|
47 |
+
2665, 1678, 13344, 2329, 1415, 2244, 1099, 5024,
|
48 |
+
9872, 10948, 4409, 2732, 1211, 1289, 4807, 5136,
|
49 |
+
1867, 16134, 14519, 3086, 19261, 6499, 4273, 2790,
|
50 |
+
8820, 1228, 1575, 4420, 3685, 2019, 664, 324,
|
51 |
+
513, 411, 436, 2997, 5162, 3806, 1389, 899,
|
52 |
+
8088, 7004, 1105, 3633, 2621, 9753, 1082, 26854,
|
53 |
+
3415, 4991, 2129, 5546, 4489, 2850, 1977, 1908,
|
54 |
+
1719, 1106, 1049, 152, 136, 802, 488, 592,
|
55 |
+
2081, 2712, 1665, 1128, 250, 544, 789, 2715,
|
56 |
+
8063, 7056, 2267, 8034, 6092, 3815, 1833, 3277,
|
57 |
+
8813, 2111, 4662, 2678, 2954, 5227, 1472, 2591,
|
58 |
+
3714, 1974, 1795, 4680, 3751, 6585, 2109, 36617,
|
59 |
+
6083, 16264, 17351, 3449, 5034, 3931, 2599, 4134,
|
60 |
+
3892, 2334, 2211, 4516, 2766, 2862, 3422, 1788,
|
61 |
+
2544, 2403, 2892, 4042, 3460, 1516, 1972, 1563,
|
62 |
+
1579, 2776, 1647, 4535, 3921, 1261, 6074, 2922,
|
63 |
+
3068, 1948, 4407, 712, 1294, 1019, 1572, 3764,
|
64 |
+
5218, 975, 1539, 6376, 1606, 6091, 1138, 1169,
|
65 |
+
7925, 3136, 1108, 2677, 2680, 1383, 3144, 2653,
|
66 |
+
1986, 1800, 1308, 1344, 122231, 12977, 2552, 2678,
|
67 |
+
7824, 768, 8587, 39503, 3474, 661, 430, 193,
|
68 |
+
1405, 1442, 3588, 6280, 10515, 785, 710, 305,
|
69 |
+
206, 4990, 5329, 3398, 1771, 3022, 6907, 1523,
|
70 |
+
8588, 12203, 666, 2113, 7916, 434, 1636, 5185,
|
71 |
+
1062, 664, 952, 3490, 2811, 2749, 2848, 15555,
|
72 |
+
363, 117, 1494, 1647, 5886, 4021, 633, 1013,
|
73 |
+
5951, 11343, 2324, 243, 372, 943, 734, 242,
|
74 |
+
3161, 122, 127, 201, 1654, 768, 134, 1467,
|
75 |
+
642, 1148, 2156, 1368, 1176, 302, 1909, 61,
|
76 |
+
223, 1812, 287, 422, 311, 228, 748, 230,
|
77 |
+
1876, 539, 1814, 737, 689, 1140, 591, 943,
|
78 |
+
353, 289, 198, 490, 7938, 1841, 850, 457,
|
79 |
+
814, 146, 551, 728, 1627, 620, 648, 1621,
|
80 |
+
2731, 535, 88, 1736, 736, 328, 293, 3170,
|
81 |
+
344, 384, 7640, 433, 215, 715, 626, 128,
|
82 |
+
3059, 1833, 2069, 3732, 1640, 1508, 836, 567,
|
83 |
+
2837, 1151, 2068, 695, 1494, 3173, 364, 88,
|
84 |
+
188, 740, 677, 273, 1533, 821, 1091, 293,
|
85 |
+
647, 318, 1202, 328, 532, 2847, 526, 721,
|
86 |
+
370, 258, 956, 1269, 1641, 339, 1322, 4485,
|
87 |
+
286, 1874, 277, 757, 1393, 1330, 380, 146,
|
88 |
+
377, 394, 318, 339, 1477, 1886, 101, 1435,
|
89 |
+
284, 1425, 686, 621, 221, 117, 87, 1340,
|
90 |
+
201, 1243, 1222, 651, 1899, 421, 712, 1016,
|
91 |
+
1279, 124, 351, 258, 7043, 368, 666, 162,
|
92 |
+
7664, 137, 70159, 26179, 6321, 32236, 33320, 771,
|
93 |
+
1169, 269, 1103, 444, 364, 2710, 121, 751,
|
94 |
+
1609, 855, 1141, 2287, 1940, 3943, 289])
|
audio_detection/audio_infer/utils/crash.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys

class ExceptionHook:
    """sys.excepthook replacement: on the first uncaught exception, builds an
    IPython FormattedTB that prints a plain-mode traceback and drops into pdb
    (call_pdb=1). The IPython import is deferred until an exception actually
    escapes, so IPython is only required in that case.
    """
    instance = None  # lazily-created ultratb.FormattedTB
    def __call__(self, *args, **kwargs):
        if self.instance is None:
            from IPython.core import ultratb
            self.instance = ultratb.FormattedTB(mode='Plain',
                 color_scheme='Linux', call_pdb=1)
        return self.instance(*args, **kwargs)

# Install the hook as a module-import side effect.
sys.excepthook = ExceptionHook()
|
audio_detection/audio_infer/utils/create_black_list.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import csv
|
3 |
+
import os
|
4 |
+
|
5 |
+
from utilities import create_folder
|
6 |
+
|
7 |
+
|
8 |
+
def dcase2017task4(args):
    """Create black list. Black list is a list of audio ids that will be 
    skipped in training.

    Reads the DCASE 2017 Task 4 weak-label testing and evaluation CSVs and
    writes the union of their unique YouTube ids to
    <workspace>/black_list/dcase2017task4.csv, one id per line.

    Args:
      args: argparse namespace with a `workspace` attribute.
    """
    # Augments & parameters
    workspace = args.workspace

    # Black list from DCASE 2017 Task 4
    test_weak_csv = 'metadata/black_list/groundtruth_weak_label_testing_set.csv'
    evaluation_weak_csv = 'metadata/black_list/groundtruth_weak_label_evaluation_set.csv'

    black_list_csv = os.path.join(workspace, 'black_list', 'dcase2017task4.csv')
    create_folder(os.path.dirname(black_list_csv))

    def get_id_sets(csv_path):
        """Return the unique YouTube ids (first 11 chars of each file name)."""
        with open(csv_path, 'r') as fr:
            reader = csv.reader(fr, delimiter='\t')
            lines = list(reader)

        ids_set = []

        for line in lines:
            # line: ['-5QrBL6MzLg_60.000_70.000.wav', '60.000', '70.000', 'Train horn']
            ids_set.append(line[0][0 : 11])

        ids_set = list(set(ids_set))
        return ids_set

    test_ids_set = get_id_sets(test_weak_csv)
    evaluation_ids_set = get_id_sets(evaluation_weak_csv)

    full_ids_set = test_ids_set + evaluation_ids_set

    # Write black list. `with` guarantees the handle is closed and the data
    # flushed (the original opened the file and never closed it).
    with open(black_list_csv, 'w') as fw:
        for id in full_ids_set:
            fw.write('{}\n'.format(id))

    print('Write black list to {}'.format(black_list_csv))
|
49 |
+
|
50 |
+
|
51 |
+
if __name__ == '__main__':
|
52 |
+
parser = argparse.ArgumentParser(description='')
|
53 |
+
subparsers = parser.add_subparsers(dest='mode')
|
54 |
+
|
55 |
+
parser_dcase2017task4 = subparsers.add_parser('dcase2017task4')
|
56 |
+
parser_dcase2017task4.add_argument('--workspace', type=str, required=True)
|
57 |
+
|
58 |
+
args = parser.parse_args()
|
59 |
+
|
60 |
+
if args.mode == 'dcase2017task4':
|
61 |
+
dcase2017task4(args)
|
62 |
+
|
63 |
+
else:
|
64 |
+
raise Exception('Error argument!')
|
audio_detection/audio_infer/utils/create_indexes.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import argparse
|
3 |
+
import csv
|
4 |
+
import os
|
5 |
+
import glob
|
6 |
+
import datetime
|
7 |
+
import time
|
8 |
+
import logging
|
9 |
+
import h5py
|
10 |
+
import librosa
|
11 |
+
|
12 |
+
from utilities import create_folder, get_sub_filepaths
|
13 |
+
import config
|
14 |
+
|
15 |
+
|
16 |
+
def create_indexes(args):
    """Create indexes for a dataloader to read for training. When users have
    a new task and their own data, they need to create similar indexes. The
    indexes contain meta information of "where to find the data for training".

    Args:
        args.waveforms_hdf5_path: str, path of the packed waveforms hdf5 to index.
        args.indexes_hdf5_path: str, path to write out the indexes hdf5.
    """
    # Arguments & parameters
    waveforms_hdf5_path = args.waveforms_hdf5_path
    indexes_hdf5_path = args.indexes_hdf5_path

    # Paths
    create_folder(os.path.dirname(indexes_hdf5_path))

    with h5py.File(waveforms_hdf5_path, 'r') as hr:
        with h5py.File(indexes_hdf5_path, 'w') as hw:
            audios_num = len(hr['audio_name'])
            hw.create_dataset('audio_name', data=hr['audio_name'][:], dtype='S20')
            # Bug fix: np.bool was deprecated in NumPy 1.20 and removed in
            # 1.24; the builtin bool maps to the same h5py storage type.
            hw.create_dataset('target', data=hr['target'][:], dtype=bool)
            # Every row records which waveforms hdf5 it came from, so partial
            # indexes can later be combined into one file.
            hw.create_dataset('hdf5_path', data=[waveforms_hdf5_path.encode()] * audios_num, dtype='S200')
            hw.create_dataset('index_in_hdf5', data=np.arange(audios_num), dtype=np.int32)

    print('Write to {}'.format(indexes_hdf5_path))
|
38 |
+
|
39 |
+
|
40 |
+
def combine_full_indexes(args):
    """Combine all balanced and unbalanced indexes hdf5s to a single hdf5. This
    combined indexes hdf5 is used for training with full data (~20k balanced
    audio clips + ~1.9m unbalanced audio clips).

    Args:
        args.indexes_hdf5s_dir: str, directory containing partial indexes hdf5s.
        args.full_indexes_hdf5_path: str, path to write out the combined hdf5.
    """
    # Arguments & parameters
    indexes_hdf5s_dir = args.indexes_hdf5s_dir
    full_indexes_hdf5_path = args.full_indexes_hdf5_path

    classes_num = config.classes_num

    # Paths: only combine training splits; skip already-combined ('full_train')
    # and debug ('mini') files.
    paths = get_sub_filepaths(indexes_hdf5s_dir)
    paths = [path for path in paths if (
        'train' in path and 'full_train' not in path and 'mini' not in path)]

    print('Total {} hdf5 to combine.'.format(len(paths)))

    with h5py.File(full_indexes_hdf5_path, 'w') as full_hf:
        # Datasets start empty (maxshape=None) and are resized as each
        # partial hdf5 is appended below.
        full_hf.create_dataset(
            name='audio_name',
            shape=(0,),
            maxshape=(None,),
            dtype='S20')

        # Bug fix: np.bool was removed in NumPy 1.24; builtin bool is the
        # supported equivalent spelling.
        full_hf.create_dataset(
            name='target',
            shape=(0, classes_num),
            maxshape=(None, classes_num),
            dtype=bool)

        full_hf.create_dataset(
            name='hdf5_path',
            shape=(0,),
            maxshape=(None,),
            dtype='S200')

        full_hf.create_dataset(
            name='index_in_hdf5',
            shape=(0,),
            maxshape=(None,),
            dtype=np.int32)

        for path in paths:
            with h5py.File(path, 'r') as part_hf:
                print(path)
                n = len(full_hf['audio_name'][:])
                new_n = n + len(part_hf['audio_name'][:])

                # Grow each dataset and append the partial file's rows.
                full_hf['audio_name'].resize((new_n,))
                full_hf['audio_name'][n : new_n] = part_hf['audio_name'][:]

                full_hf['target'].resize((new_n, classes_num))
                full_hf['target'][n : new_n] = part_hf['target'][:]

                full_hf['hdf5_path'].resize((new_n,))
                full_hf['hdf5_path'][n : new_n] = part_hf['hdf5_path'][:]

                full_hf['index_in_hdf5'].resize((new_n,))
                full_hf['index_in_hdf5'][n : new_n] = part_hf['index_in_hdf5'][:]

    print('Write combined full hdf5 to {}'.format(full_indexes_hdf5_path))
|
103 |
+
|
104 |
+
|
105 |
+
if __name__ == '__main__':
    # CLI with two sub-commands: 'create_indexes' and 'combine_full_indexes'.
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='mode')

    create_parser = subparsers.add_parser('create_indexes')
    create_parser.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path of packed waveforms hdf5.')
    create_parser.add_argument('--indexes_hdf5_path', type=str, required=True, help='Path to write out indexes hdf5.')

    combine_parser = subparsers.add_parser('combine_full_indexes')
    combine_parser.add_argument('--indexes_hdf5s_dir', type=str, required=True, help='Directory containing indexes hdf5s to be combined.')
    combine_parser.add_argument('--full_indexes_hdf5_path', type=str, required=True, help='Path to write out full indexes hdf5 file.')

    args = parser.parse_args()

    # Dispatch table keeps the mode -> handler mapping in one place.
    handlers = {
        'create_indexes': create_indexes,
        'combine_full_indexes': combine_full_indexes,
    }
    if args.mode not in handlers:
        raise Exception('Incorrect arguments!')
    handlers[args.mode](args)
|
audio_detection/audio_infer/utils/data_generator.py
ADDED
@@ -0,0 +1,421 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import h5py
|
3 |
+
import csv
|
4 |
+
import time
|
5 |
+
import logging
|
6 |
+
|
7 |
+
from utilities import int16_to_float32
|
8 |
+
|
9 |
+
|
10 |
+
def read_black_list(black_list_csv):
    """Read audio names from a black list csv.

    Args:
        black_list_csv: str, path of the black list csv; the first column of
            each row holds an audio id.

    Returns:
        list of str, packed audio file names, e.g. ['Y-5QrBL6MzLg.wav', ...]
    """
    with open(black_list_csv, 'r') as fr:
        rows = [row for row in csv.reader(fr)]

    # Prepend 'Y' and append '.wav' to match the packed audio file names.
    return ['Y{}.wav'.format(row[0]) for row in rows]
|
19 |
+
|
20 |
+
|
21 |
+
class AudioSetDataset(object):
    def __init__(self, sample_rate=32000):
        """Map the meta of an audio clip to its waveform and target. This
        class is used by a DataLoader.

        Args:
            sample_rate: int, one of 32000 / 16000 / 8000.
        """
        self.sample_rate = sample_rate

    def __getitem__(self, meta):
        """Load waveform and target of one audio clip.

        Args:
            meta: {
                'hdf5_path': str,
                'index_in_hdf5': int}

        Returns:
            data_dict: {
                'audio_name': str,
                'waveform': (clip_samples,),
                'target': (classes_num,)}
        """
        with h5py.File(meta['hdf5_path'], 'r') as hf:
            idx = meta['index_in_hdf5']
            audio_name = hf['audio_name'][idx].decode()
            # Waveforms are stored as int16; convert back to float32 then
            # decimate to the requested sample rate.
            waveform = self.resample(int16_to_float32(hf['waveform'][idx]))
            target = hf['target'][idx].astype(np.float32)

        return {'audio_name': audio_name, 'waveform': waveform, 'target': target}

    def resample(self, waveform):
        """Decimate a 32 kHz waveform down to the configured sample rate.

        Args:
            waveform: (clip_samples,)

        Returns:
            (resampled_clip_samples,)
        """
        # Integer decimation only: keep every 2nd / 4th sample.
        if self.sample_rate == 32000:
            return waveform
        if self.sample_rate == 16000:
            return waveform[0 :: 2]
        if self.sample_rate == 8000:
            return waveform[0 :: 4]
        raise Exception('Incorrect sample rate!')
|
72 |
+
|
73 |
+
|
74 |
+
class Base(object):
    def __init__(self, indexes_hdf5_path, batch_size, black_list_csv, random_seed):
        """Base class of train samplers: loads the index metadata and the
        optional black list shared by all sampler subclasses.

        Args:
            indexes_hdf5_path: string
            batch_size: int
            black_list_csv: string
            random_seed: int
        """
        self.batch_size = batch_size
        self.random_state = np.random.RandomState(random_seed)

        # Black list (empty when no csv is given).
        self.black_list_names = \
            read_black_list(black_list_csv) if black_list_csv else []
        logging.info('Black list samples: {}'.format(len(self.black_list_names)))

        # Load the full target matrix up front; timed because it can be large.
        load_time = time.time()
        with h5py.File(indexes_hdf5_path, 'r') as hf:
            self.audio_names = [name.decode() for name in hf['audio_name'][:]]
            self.hdf5_paths = [path.decode() for path in hf['hdf5_path'][:]]
            self.indexes_in_hdf5 = hf['index_in_hdf5'][:]
            self.targets = hf['target'][:].astype(np.float32)

        (self.audios_num, self.classes_num) = self.targets.shape
        logging.info('Training number: {}'.format(self.audios_num))
        logging.info('Load target time: {:.3f} s'.format(time.time() - load_time))
|
107 |
+
|
108 |
+
|
109 |
+
class TrainSampler(Base):
    def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
        random_seed=1234):
        """Uniform random sampler. Generates batch meta for training.

        Args:
            indexes_hdf5_path: string
            batch_size: int
            black_list_csv: string
            random_seed: int
        """
        super(TrainSampler, self).__init__(indexes_hdf5_path, batch_size,
            black_list_csv, random_seed)

        # All training indexes in a single shuffled permutation.
        self.indexes = np.arange(self.audios_num)
        self.random_state.shuffle(self.indexes)

        self.pointer = 0

    def __iter__(self):
        """Infinitely yield batch meta for training.

        Returns:
            batch_meta: e.g.: [
                {'hdf5_path': string, 'index_in_hdf5': int},
                ...]
        """
        while True:
            batch_meta = []
            while len(batch_meta) < self.batch_size:
                index = self.indexes[self.pointer]
                self.pointer += 1

                # End of an epoch: reshuffle and restart from the beginning.
                if self.pointer >= self.audios_num:
                    self.pointer = 0
                    self.random_state.shuffle(self.indexes)

                # Black-listed clips are skipped and do not count toward the batch.
                if self.audio_names[index] not in self.black_list_names:
                    batch_meta.append({
                        'hdf5_path': self.hdf5_paths[index],
                        'index_in_hdf5': self.indexes_in_hdf5[index]})

            yield batch_meta

    def state_dict(self):
        """Snapshot of the sampler position, for checkpointing."""
        return {
            'indexes': self.indexes,
            'pointer': self.pointer}

    def load_state_dict(self, state):
        """Restore a snapshot produced by state_dict()."""
        self.indexes = state['indexes']
        self.pointer = state['pointer']
|
172 |
+
|
173 |
+
|
174 |
+
class BalancedTrainSampler(Base):
    def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
        random_seed=1234):
        """Balanced sampler. Generates batch meta for training with data
        equally sampled from the different sound classes.

        Args:
            indexes_hdf5_path: string
            batch_size: int
            black_list_csv: string
            random_seed: int
        """
        super(BalancedTrainSampler, self).__init__(indexes_hdf5_path,
            batch_size, black_list_csv, random_seed)

        self.samples_num_per_class = np.sum(self.targets, axis=0)
        logging.info('samples_num_per_class: {}'.format(
            self.samples_num_per_class.astype(np.int32)))

        # Per-class training indexes, each list independently shuffled. E.g.:
        # [[0, 11, 12, ...], [3, 4, 15, 16, ...], [7, 8, ...], ...]
        self.indexes_per_class = []
        for k in range(self.classes_num):
            class_indexes = np.where(self.targets[:, k] == 1)[0]
            self.random_state.shuffle(class_indexes)
            self.indexes_per_class.append(class_indexes)

        # Queue of class ids to draw from next; refilled a full shuffled
        # pass at a time.
        self.queue = []
        self.pointers_of_classes = [0] * self.classes_num

    def expand_queue(self, queue):
        """Append one shuffled pass over all class ids to the queue."""
        classes_set = np.arange(self.classes_num).tolist()
        self.random_state.shuffle(classes_set)
        return queue + classes_set

    def __iter__(self):
        """Infinitely yield batch meta for training.

        Returns:
            batch_meta: e.g.: [
                {'hdf5_path': string, 'index_in_hdf5': int},
                ...]
        """
        while True:
            batch_meta = []
            while len(batch_meta) < self.batch_size:
                if not self.queue:
                    self.queue = self.expand_queue(self.queue)

                class_id = self.queue.pop(0)
                pointer = self.pointers_of_classes[class_id]
                self.pointers_of_classes[class_id] += 1
                index = self.indexes_per_class[class_id][pointer]

                # Epoch of this class finished: reshuffle its indexes and
                # reset its pointer.
                if self.pointers_of_classes[class_id] >= self.samples_num_per_class[class_id]:
                    self.pointers_of_classes[class_id] = 0
                    self.random_state.shuffle(self.indexes_per_class[class_id])

                # Black-listed clips are skipped and do not count toward the batch.
                if self.audio_names[index] not in self.black_list_names:
                    batch_meta.append({
                        'hdf5_path': self.hdf5_paths[index],
                        'index_in_hdf5': self.indexes_in_hdf5[index]})

            yield batch_meta

    def state_dict(self):
        """Snapshot of the sampler position, for checkpointing."""
        return {
            'indexes_per_class': self.indexes_per_class,
            'queue': self.queue,
            'pointers_of_classes': self.pointers_of_classes}

    def load_state_dict(self, state):
        """Restore a snapshot produced by state_dict()."""
        self.indexes_per_class = state['indexes_per_class']
        self.queue = state['queue']
        self.pointers_of_classes = state['pointers_of_classes']
|
263 |
+
|
264 |
+
|
265 |
+
class AlternateTrainSampler(Base):
    def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
        random_seed=1234):
        """AlternateSampler is a combination of Sampler and Balanced Sampler.
        AlternateSampler alternately samples data from Sampler and Balanced
        Sampler.

        Args:
            indexes_hdf5_path: string
            batch_size: int
            black_list_csv: string
            random_seed: int
        """
        # NOTE: does not call super().__init__(); all state lives inside the
        # two wrapped samplers, each of which loads the index hdf5 itself.
        self.sampler1 = TrainSampler(indexes_hdf5_path, batch_size,
            black_list_csv, random_seed)

        self.sampler2 = BalancedTrainSampler(indexes_hdf5_path, batch_size,
            black_list_csv, random_seed)

        self.batch_size = batch_size
        self.count = 0

    def __iter__(self):
        """Generate batch meta for training, alternating between the uniform
        sampler (sampler1) and the balanced sampler (sampler2).

        Returns:
            batch_meta: e.g.: [
                {'hdf5_path': string, 'index_in_hdf5': int},
                ...]
        """
        batch_size = self.batch_size

        while True:
            self.count += 1

            # count starts at 0 and is incremented first, so the very first
            # batch (count == 1) comes from the balanced sampler branch below.
            if self.count % 2 == 0:
                batch_meta = []
                i = 0
                while i < batch_size:
                    index = self.sampler1.indexes[self.sampler1.pointer]
                    self.sampler1.pointer += 1

                    # Shuffle indexes and reset pointer
                    if self.sampler1.pointer >= self.sampler1.audios_num:
                        self.sampler1.pointer = 0
                        self.sampler1.random_state.shuffle(self.sampler1.indexes)

                    # If audio in black list then continue
                    if self.sampler1.audio_names[index] in self.sampler1.black_list_names:
                        continue
                    else:
                        batch_meta.append({
                            'hdf5_path': self.sampler1.hdf5_paths[index],
                            'index_in_hdf5': self.sampler1.indexes_in_hdf5[index]})
                        i += 1

            elif self.count % 2 == 1:
                batch_meta = []
                i = 0
                while i < batch_size:
                    if len(self.sampler2.queue) == 0:
                        self.sampler2.queue = self.sampler2.expand_queue(self.sampler2.queue)

                    class_id = self.sampler2.queue.pop(0)
                    pointer = self.sampler2.pointers_of_classes[class_id]
                    self.sampler2.pointers_of_classes[class_id] += 1
                    index = self.sampler2.indexes_per_class[class_id][pointer]

                    # When finish one epoch of a sound class, then shuffle its indexes and reset pointer
                    if self.sampler2.pointers_of_classes[class_id] >= self.sampler2.samples_num_per_class[class_id]:
                        self.sampler2.pointers_of_classes[class_id] = 0
                        self.sampler2.random_state.shuffle(self.sampler2.indexes_per_class[class_id])

                    # If audio in black list then continue
                    if self.sampler2.audio_names[index] in self.sampler2.black_list_names:
                        continue
                    else:
                        batch_meta.append({
                            'hdf5_path': self.sampler2.hdf5_paths[index],
                            'index_in_hdf5': self.sampler2.indexes_in_hdf5[index]})
                        i += 1

            yield batch_meta

    def state_dict(self):
        # Persist both wrapped samplers so training can resume mid-alternation.
        state = {
            'sampler1': self.sampler1.state_dict(),
            'sampler2': self.sampler2.state_dict()}
        return state

    def load_state_dict(self, state):
        self.sampler1.load_state_dict(state['sampler1'])
        self.sampler2.load_state_dict(state['sampler2'])
|
357 |
+
|
358 |
+
|
359 |
+
class EvaluateSampler(object):
    def __init__(self, indexes_hdf5_path, batch_size):
        """Evaluate sampler. Generate batch meta for evaluation.

        Args:
            indexes_hdf5_path: string
            batch_size: int
        """
        self.batch_size = batch_size

        with h5py.File(indexes_hdf5_path, 'r') as hf:
            self.audio_names = [name.decode() for name in hf['audio_name'][:]]
            self.hdf5_paths = [path.decode() for path in hf['hdf5_path'][:]]
            self.indexes_in_hdf5 = hf['index_in_hdf5'][:]
            self.targets = hf['target'][:].astype(np.float32)

        self.audios_num = len(self.audio_names)

    def __iter__(self):
        """Yield evaluation batches in order, without shuffling; the final
        batch may be smaller than batch_size.

        Returns:
            batch_meta: e.g.: [
                {'audio_name': string,
                 'hdf5_path': string,
                 'index_in_hdf5': int,
                 'target': (classes_num,)},
                ...]
        """
        for start in range(0, self.audios_num, self.batch_size):
            stop = min(start + self.batch_size, self.audios_num)
            yield [{
                'audio_name': self.audio_names[index],
                'hdf5_path': self.hdf5_paths[index],
                'index_in_hdf5': self.indexes_in_hdf5[index],
                'target': self.targets[index]}
                for index in range(start, stop)]
|
404 |
+
|
405 |
+
|
406 |
+
def collate_fn(list_data_dict):
    """Collate per-clip data dicts into one dict of batched arrays.

    Args:
        list_data_dict, e.g., [{'audio_name': str, 'waveform': (clip_samples,), ...},
                               {'audio_name': str, 'waveform': (clip_samples,), ...},
                               ...]
    Returns:
        np_data_dict, dict, e.g.,
            {'audio_name': (batch_size,), 'waveform': (batch_size, clip_samples), ...}
    """
    # The keys of the first dict define the batch layout; every entry is
    # stacked along a new leading batch axis.
    return {
        key: np.array([data_dict[key] for data_dict in list_data_dict])
        for key in list_data_dict[0].keys()}
|
audio_detection/audio_infer/utils/dataset.py
ADDED
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import argparse
|
3 |
+
import csv
|
4 |
+
import os
|
5 |
+
import glob
|
6 |
+
import datetime
|
7 |
+
import time
|
8 |
+
import logging
|
9 |
+
import h5py
|
10 |
+
import librosa
|
11 |
+
|
12 |
+
from utilities import (create_folder, get_filename, create_logging,
|
13 |
+
float32_to_int16, pad_or_truncate, read_metadata)
|
14 |
+
import config
|
15 |
+
|
16 |
+
|
17 |
+
def split_unbalanced_csv_to_partial_csvs(args):
    """Split unbalanced csv to part csvs. Each part csv contains up to 50000 ids.

    Args:
        args.unbalanced_csv: str, path of the csv to split.
        args.unbalanced_partial_csvs_dir: str, directory to write part csvs to.
    """
    unbalanced_csv_path = args.unbalanced_csv
    unbalanced_partial_csvs_dir = args.unbalanced_partial_csvs_dir

    create_folder(unbalanced_partial_csvs_dir)

    with open(unbalanced_csv_path, 'r') as f:
        lines = f.readlines()

    lines = lines[3:]   # Remove head info
    audios_num_per_file = 50000

    files_num = int(np.ceil(len(lines) / float(audios_num_per_file)))

    for r in range(files_num):
        start = r * audios_num_per_file
        chunk = lines[start : start + audios_num_per_file]

        out_csv_path = os.path.join(unbalanced_partial_csvs_dir,
            'unbalanced_train_segments_part{:02d}.csv'.format(r))

        with open(out_csv_path, 'w') as f:
            # Keep a three-line header slot so downstream readers can
            # uniformly skip the first three lines.
            f.write('empty\n')
            f.write('empty\n')
            f.write('empty\n')
            f.writelines(chunk)

        print('Write out csv to {}'.format(out_csv_path))
|
49 |
+
|
50 |
+
|
51 |
+
def download_wavs(args):
    """Download videos and extract audio in wav format.

    Args:
        args.csv_path: str, csv listing audio_id, start time and end time.
        args.audios_dir: str, directory to save out extracted wav files.
        args.mini_data: bool, if True only process the first 10 clips.
    """
    # Paths
    csv_path = args.csv_path
    audios_dir = args.audios_dir
    mini_data = args.mini_data

    # Bug fix: the two branches were swapped — mini-data runs logged into the
    # full-dataset directory and vice versa.
    if mini_data:
        logs_dir = '_logs/download_dataset_minidata/{}'.format(get_filename(csv_path))
    else:
        logs_dir = '_logs/download_dataset/{}'.format(get_filename(csv_path))

    create_folder(audios_dir)
    create_folder(logs_dir)
    create_logging(logs_dir, filemode='w')
    logging.info('Download log is saved to {}'.format(logs_dir))

    # Read csv
    with open(csv_path, 'r') as f:
        lines = f.readlines()

    lines = lines[3:]   # Remove csv head info

    if mini_data:
        lines = lines[0 : 10]   # Download partial data for debug

    download_time = time.time()

    # Download
    for (n, line) in enumerate(lines):

        items = line.split(', ')
        audio_id = items[0]
        start_time = float(items[1])
        end_time = float(items[2])
        duration = end_time - start_time

        logging.info('{} {} start_time: {:.1f}, end_time: {:.1f}'.format(
            n, audio_id, start_time, end_time))

        # Download full video of whatever format.
        # NOTE(review): commands are built by string interpolation from csv
        # fields and run through the shell — safe only for trusted csv input.
        video_name = os.path.join(audios_dir, '_Y{}.%(ext)s'.format(audio_id))
        os.system("youtube-dl --quiet -o '{}' -x https://www.youtube.com/watch?v={}"\
            .format(video_name, audio_id))

        video_paths = glob.glob(os.path.join(audios_dir, '_Y' + audio_id + '.*'))

        # If download successful
        if len(video_paths) > 0:
            video_path = video_paths[0]   # Choose one video

            # Add 'Y' to the head because some video ids are started with '-'
            # which will cause problem
            audio_path = os.path.join(audios_dir, 'Y' + audio_id + '.wav')

            # Extract audio in wav format (mono, 32 kHz, clipped to the
            # labelled segment).
            os.system("ffmpeg -loglevel panic -i {} -ac 1 -ar 32000 -ss {} -t 00:00:{} {} "\
                .format(video_path,
                str(datetime.timedelta(seconds=start_time)), duration,
                audio_path))

            # Remove downloaded video
            os.system("rm {}".format(video_path))

            logging.info("Download and convert to {}".format(audio_path))

    logging.info('Download finished! Time spent: {:.3f} s'.format(
        time.time() - download_time))

    logging.info('Logs can be viewed in {}'.format(logs_dir))
|
123 |
+
|
124 |
+
|
125 |
+
def pack_waveforms_to_hdf5(args):
    """Pack waveform and target of several audio clips to a single hdf5 file.
    This can speed up loading and training.

    Args:
        args.audios_dir: str, directory of downloaded wav files.
        args.csv_path: str, metadata csv describing the clips.
        args.waveforms_hdf5_path: str, path to save out the packed hdf5.
        args.mini_data: bool, if True only pack the first 10 clips.
    """
    # Arguments & parameters
    audios_dir = args.audios_dir
    csv_path = args.csv_path
    waveforms_hdf5_path = args.waveforms_hdf5_path
    mini_data = args.mini_data

    clip_samples = config.clip_samples
    classes_num = config.classes_num
    sample_rate = config.sample_rate
    id_to_ix = config.id_to_ix

    # Paths
    if mini_data:
        prefix = 'mini_'
        waveforms_hdf5_path += '.mini'
    else:
        prefix = ''

    create_folder(os.path.dirname(waveforms_hdf5_path))

    logs_dir = '_logs/pack_waveforms_to_hdf5/{}{}'.format(prefix, get_filename(csv_path))
    create_folder(logs_dir)
    create_logging(logs_dir, filemode='w')
    logging.info('Write logs to {}'.format(logs_dir))

    # Read csv file
    meta_dict = read_metadata(csv_path, classes_num, id_to_ix)

    if mini_data:
        mini_num = 10
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][0 : mini_num]

    audios_num = len(meta_dict['audio_name'])

    # Pack waveform to hdf5
    total_time = time.time()

    with h5py.File(waveforms_hdf5_path, 'w') as hf:
        hf.create_dataset('audio_name', shape=((audios_num,)), dtype='S20')
        hf.create_dataset('waveform', shape=((audios_num, clip_samples)), dtype=np.int16)
        # Bug fix: np.bool was removed in NumPy 1.24; builtin bool is the
        # supported spelling and maps to the same h5py storage type.
        hf.create_dataset('target', shape=((audios_num, classes_num)), dtype=bool)
        hf.attrs.create('sample_rate', data=sample_rate, dtype=np.int32)

        # Pack waveform & target of several audio clips to a single hdf5 file
        for n in range(audios_num):
            audio_path = os.path.join(audios_dir, meta_dict['audio_name'][n])

            if os.path.isfile(audio_path):
                logging.info('{} {}'.format(n, audio_path))
                (audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
                # Force a fixed-length clip so every hdf5 row is homogeneous.
                audio = pad_or_truncate(audio, clip_samples)

                hf['audio_name'][n] = meta_dict['audio_name'][n].encode()
                hf['waveform'][n] = float32_to_int16(audio)
                hf['target'][n] = meta_dict['target'][n]
            else:
                # Missing files are logged and skipped; their rows stay zeroed.
                logging.info('{} File does not exist! {}'.format(n, audio_path))

    logging.info('Write to {}'.format(waveforms_hdf5_path))
    logging.info('Pack hdf5 time: {:.3f}'.format(time.time() - total_time))
191 |
+
|
192 |
+
|
193 |
+
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='mode')

    parser_split = subparsers.add_parser('split_unbalanced_csv_to_partial_csvs')
    parser_split.add_argument('--unbalanced_csv', type=str, required=True, help='Path of unbalanced_csv file to read.')
    parser_split.add_argument('--unbalanced_partial_csvs_dir', type=str, required=True, help='Directory to save out split unbalanced partial csv.')

    parser_download_wavs = subparsers.add_parser('download_wavs')
    parser_download_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
    parser_download_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
    # Bug fix: default was True, which made the store_true flag a no-op,
    # contradicted the help text, and disagreed with the pack_waveforms_to_hdf5
    # parser below.
    parser_download_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.')

    parser_pack_wavs = subparsers.add_parser('pack_waveforms_to_hdf5')
    parser_pack_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
    parser_pack_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
    parser_pack_wavs.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path to save out packed hdf5.')
    parser_pack_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.')

    args = parser.parse_args()

    if args.mode == 'split_unbalanced_csv_to_partial_csvs':
        split_unbalanced_csv_to_partial_csvs(args)

    elif args.mode == 'download_wavs':
        download_wavs(args)

    elif args.mode == 'pack_waveforms_to_hdf5':
        pack_waveforms_to_hdf5(args)

    else:
        raise Exception('Incorrect arguments!')
|
audio_detection/audio_infer/utils/plot_for_paper.py
ADDED
@@ -0,0 +1,565 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import numpy as np
|
4 |
+
import argparse
|
5 |
+
import h5py
|
6 |
+
import time
|
7 |
+
import pickle
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
import csv
|
10 |
+
from sklearn import metrics
|
11 |
+
|
12 |
+
from utilities import (create_folder, get_filename, d_prime)
|
13 |
+
import config
|
14 |
+
|
15 |
+
|
16 |
+
def load_statistics(statistics_path):
    """Load pickled training statistics and return per-iteration mAP curves.

    Args:
      statistics_path: str, path of a pickle produced during training; it maps
        'bal' and 'test' to lists of per-iteration dicts that each contain an
        'average_precision' array of shape (classes_num,).

    Returns:
      (bal_map, test_map): two 1-D arrays of length N (number of logged
      iterations), the class-averaged AP on the balanced-train and evaluation
      subsets respectively.
    """
    with open(statistics_path, 'rb') as f:
        statistics_dict = pickle.load(f)

    curves = []
    for subset in ('bal', 'test'):
        # (N, classes_num) AP matrix, then mean over classes -> (N,)
        ap_matrix = np.array(
            [entry['average_precision'] for entry in statistics_dict[subset]])
        curves.append(np.mean(ap_matrix, axis=-1))

    return curves[0], curves[1]
|
25 |
+
|
26 |
+
|
27 |
+
def crop_label(label):
    """Shorten a class label to at most max_len characters at word boundaries.

    Labels short enough are returned unchanged. Longer labels keep as many
    leading whole words as fit into max_len characters.

    Fixes over the original implementation:
      * the cropped result no longer starts with a stray leading space
        (the original accumulated ' ' + word from an empty string);
      * a label whose first word alone exceeds max_len is hard-truncated
        instead of returning an empty string.

    Args:
      label: str, e.g. an AudioSet class name.

    Returns:
      str, the (possibly cropped) label.
    """
    max_len = 16
    if len(label) <= max_len:
        return label

    kept = []
    for word in label.split(' '):
        candidate = ' '.join(kept + [word])
        if len(candidate) > max_len:
            break
        kept.append(word)

    if not kept:
        # First word already longer than the limit: truncate it outright.
        return label[:max_len]
    return ' '.join(kept)
|
40 |
+
|
41 |
+
|
42 |
+
def add_comma(integer):
    """Format an integer with thousands separators. E.g., 1234567 -> '1,234,567'.

    Fixes over the original implementation, which built the string as
    str(i // 1000) + ',' + str(i % 1000) and therefore dropped zero padding
    (12005 -> '12,5') and inserted only a single comma (1234567 -> '1234,567').

    Args:
      integer: int or anything int() accepts.

    Returns:
      str, the comma-grouped representation.
    """
    return '{:,}'.format(int(integer))
|
50 |
+
|
51 |
+
|
52 |
+
def plot_classwise_iteration_map(args):
    """Plot per-class AP versus training iterations for three groups of classes
    (most frequent, mid-frequency, least frequent by training-sample count)
    and save the figure to results/classwise_iteration_map.pdf.

    Args:
      args: argparse.Namespace (unused; kept for the CLI dispatch interface).
    """

    # Paths
    save_out_path = 'results/classwise_iteration_map.pdf'
    create_folder(os.path.dirname(save_out_path))

    # Load statistics
    # NOTE(review): the pickle is assumed to contain a 'test' list of
    # per-iteration dicts with an 'average_precision' array — confirm it was
    # produced by the matching training run.
    statistics_dict = pickle.load(open('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl', 'rb'))

    mAP_mat = np.array([e['average_precision'] for e in statistics_dict['test']])
    mAP_mat = mAP_mat[0 : 300, :]    # 300 * 2000 = 600k iterations
    # Class indexes ordered from most to fewest training samples.
    sorted_indexes = np.argsort(config.full_samples_per_class)[::-1]

    fig, axs = plt.subplots(1, 3, figsize=(20, 5))
    # One panel each for ranks 0-9, 250-259 and 517-526 of the sorted classes.
    ranges = [np.arange(0, 10), np.arange(250, 260), np.arange(517, 527)]
    axs[0].set_ylabel('AP')

    for col in range(0, 3):
        axs[col].set_ylim(0, 1.)
        axs[col].set_xlim(0, 301)
        axs[col].set_xlabel('Iterations')
        axs[col].set_ylabel('AP')
        axs[col].xaxis.set_ticks(np.arange(0, 301, 100))
        # Tick positions count statistics snapshots (every 2000 iterations).
        axs[col].xaxis.set_ticklabels(['0', '200k', '400k', '600k'])
        lines = []
        for _ix in ranges[col]:
            # Legend entry: cropped class name plus its training-sample count.
            _label = crop_label(config.labels[sorted_indexes[_ix]]) + \
                ' ({})'.format(add_comma(config.full_samples_per_class[sorted_indexes[_ix]]))
            line, = axs[col].plot(mAP_mat[:, sorted_indexes[_ix]], label=_label)
            lines.append(line)
        box = axs[col].get_position()
        axs[col].set_position([box.x0, box.y0, box.width * 1., box.height])
        axs[col].legend(handles=lines, bbox_to_anchor=(1., 1.))
        axs[col].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)

    plt.tight_layout(pad=4, w_pad=1, h_pad=1)
    plt.savefig(save_out_path)
    print(save_out_path)
|
90 |
+
|
91 |
+
|
92 |
+
def plot_six_figures(args):
    """Plot six mAP-vs-iterations comparison panels (architectures, data and
    augmentation, embedding size, amount of data, sampling rate, mel bins)
    and save them to results/six_figures.pdf.

    Each curve pair shows the faint balanced-train mAP and the solid
    evaluation mAP loaded from a training-statistics pickle.

    Fixes over the original: plt.tight_layout(0, 1, 0) used positional
    arguments that modern matplotlib rejects (now keywords); unused locals
    removed; the 'cnn14 (50% full)' legend casing and two curves that were
    missing an explicit linewidth made consistent with their siblings.

    Args:
      args: argparse.Namespace (unused; kept for the CLI dispatch interface).
    """
    # Arguments & parameters
    max_plot_iteration = 540000
    iterations = np.arange(0, max_plot_iteration, 2000)

    # Paths
    save_out_path = 'results/six_figures.pdf'
    create_folder(os.path.dirname(save_out_path))

    # Plot
    fig, ax = plt.subplots(2, 3, figsize=(14, 7))
    bal_alpha = 0.3     # faint: mAP on the balanced training subset
    test_alpha = 1.0    # solid: mAP on the evaluation set
    linewidth = 1.

    stat_dir = 'paper_statistics/statistics_sr32000_window1024_hop320_'
    # Panels: (axes position, title, extra legend kwargs,
    #          [(statistics file stem, legend label, color), ...]).
    panels = [
        ((0, 0), '(a) Comparison of architectures', {}, [
            ('mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32', 'Wavegram-Logmel-CNN', 'g'),
            ('mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32', 'CNN14', 'r'),
            ('mel64_fmin50_fmax14000_full_train_MobileNetV1_balanced_mixup_bs32', 'MobileNetV1', 'b')]),
        ((0, 1), '(b) Comparison of training data and augmentation', {'fontsize': 8}, [
            ('mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32', 'CNN14,bal,mixup (1.9m)', 'r'),
            ('mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_timedomain_bs32', 'CNN14,bal,mixup-wav (1.9m)', 'y'),
            ('mel64_fmin50_fmax14000_full_train_Cnn14_balanced_nomixup_bs32', 'CNN14,bal,no-mixup (1.9m)', 'g'),
            ('mel64_fmin50_fmax14000_full_train_Cnn14_nobalanced_nomixup_bs32', 'CNN14,no-bal,no-mixup (1.9m)', 'b'),
            ('mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_mixup_bs32', 'CNN14,bal,mixup (20k)', 'm'),
            ('mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_nomixup_bs32', 'CNN14,bal,no-mixup (20k)', 'k')]),
        ((0, 2), '(c) Comparison of embedding size', {}, [
            ('mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32', 'CNN14,emb=2048', 'r'),
            ('mel64_fmin50_fmax14000_full_train_Cnn14_emb128_balanced_mixup_bs32', 'CNN14,emb=128', 'g'),
            ('mel64_fmin50_fmax14000_full_train_Cnn14_emb32_balanced_mixup_bs32', 'CNN14,emb=32', 'b')]),
        ((1, 0), '(d) Comparison of amount of training data', {}, [
            ('mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32', 'CNN14 (100% full)', 'r'),
            ('mel64_fmin50_fmax14000_0.8full_train_Cnn14_balanced_mixup_bs32', 'CNN14 (80% full)', 'b'),
            ('mel64_fmin50_fmax14000_0.5full_train_Cnn14_balanced_mixup_bs32', 'CNN14 (50% full)', 'g')]),
        ((1, 1), '(e) Comparison of sampling rate', {}, [
            ('mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32', 'CNN14,32kHz', 'r'),
            ('mel64_fmin50_fmax14000_full_train_Cnn14_16k_balanced_mixup_bs32', 'CNN14,16kHz', 'b'),
            ('mel64_fmin50_fmax14000_full_train_Cnn14_8k_balanced_mixup_bs32', 'CNN14,8kHz', 'g')]),
        ((1, 2), '(f) Comparison of mel bins number', {}, [
            ('mel128_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32', 'CNN14,128-melbins', 'g'),
            ('mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32', 'CNN14,64-melbins', 'r'),
            ('mel32_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32', 'CNN14,32-melbins', 'b')]),
    ]

    for (row, col), title, legend_kwargs, curves in panels:
        lines = []
        for stem, label, color in curves:
            bal_map, test_map = load_statistics(stat_dir + stem + '.pkl')
            ax[row, col].plot(bal_map, color=color, alpha=bal_alpha, linewidth=linewidth)
            line, = ax[row, col].plot(test_map, label=label, color=color,
                alpha=test_alpha, linewidth=linewidth)
            lines.append(line)
        ax[row, col].legend(handles=lines, loc=2, **legend_kwargs)
        ax[row, col].set_title(title)

    # Shared axis cosmetics for all six panels.
    for i in range(2):
        for j in range(3):
            ax[i, j].set_ylim(0, 0.8)
            ax[i, j].set_xlim(0, len(iterations))
            ax[i, j].set_xlabel('Iterations')
            ax[i, j].set_ylabel('mAP')
            # One statistics snapshot every 2000 iterations -> 50 ticks = 100k.
            ax[i, j].xaxis.set_ticks(np.arange(0, len(iterations), 50))
            ax[i, j].xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k'])
            ax[i, j].yaxis.set_ticks(np.arange(0, 0.81, 0.05))
            ax[i, j].yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3',
                '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8'])
            ax[i, j].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
            ax[i, j].xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)

    # Keyword arguments: positional pad/h_pad/w_pad were removed from
    # matplotlib's tight_layout.
    plt.tight_layout(pad=0, h_pad=1, w_pad=0)
    plt.savefig(save_out_path)
    print('Save figure to {}'.format(save_out_path))
|
296 |
+
|
297 |
+
|
298 |
+
def plot_complexity_map(args):
    """Scatter evaluation mAP against multiply-add count (in millions) for the
    evaluated architectures and save the figure to results/complexity_mAP.pdf.

    Fixes over the original: the dead plt.figure(figsize=(5, 5)) created an
    unused figure so the intended size never applied to the saved one (size
    now passed to subplots); the unused, misleadingly named
    sorted_indexes = np.sort(flops) was removed; plt.tight_layout(0, 0, 0)
    used positional arguments rejected by modern matplotlib; the x label
    'Multi-load_statisticss (million)' was a search-and-replace artifact.

    Args:
      args: argparse.Namespace (unused; kept for the CLI dispatch interface).
    """
    # Paths
    save_out_path = 'results/complexity_mAP.pdf'
    create_folder(os.path.dirname(save_out_path))

    fig, ax = plt.subplots(1, 1, figsize=(5, 5))

    model_types = np.array(['Cnn6', 'Cnn10', 'Cnn14', 'ResNet22', 'ResNet38', 'ResNet54',
        'MobileNetV1', 'MobileNetV2', 'DaiNet', 'LeeNet', 'LeeNet18',
        'Res1dNet30', 'Res1dNet44', 'Wavegram-CNN', 'Wavegram-\nLogmel-CNN'])
    # Multiply-adds (millions) and evaluation mAP, same order as model_types.
    flops = np.array([21.986, 28.166, 42.220, 30.081, 48.962, 54.563, 3.614, 2.810,
        30.395, 4.741, 26.369, 32.688, 61.833, 44.234, 53.510])
    mAPs = np.array([0.343, 0.380, 0.431, 0.430, 0.434, 0.429, 0.389, 0.383, 0.295,
        0.266, 0.336, 0.365, 0.355, 0.389, 0.439])

    ax.scatter(flops, mAPs)

    # Manual (dx, dy) annotation offsets so labels do not overlap the markers.
    shift = [[-5.5, -0.004], [1, -0.004], [-1, -0.014], [-2, 0.006], [-7, 0.006],
        [1, -0.01], [0.5, 0.004], [-1, -0.014], [1, -0.007], [0.8, -0.008],
        [1, -0.007], [1, 0.002], [-6, -0.015], [1, -0.008], [0.8, 0]]

    for i, model_type in enumerate(model_types):
        ax.annotate(model_type, (flops[i] + shift[i][0], mAPs[i] + shift[i][1]))

    # Connect models of the same family with a line.
    ax.plot(flops[[0, 1, 2]], mAPs[[0, 1, 2]])
    ax.plot(flops[[3, 4, 5]], mAPs[[3, 4, 5]])
    ax.plot(flops[[6, 7]], mAPs[[6, 7]])
    ax.plot(flops[[9, 10]], mAPs[[9, 10]])
    ax.plot(flops[[11, 12]], mAPs[[11, 12]])
    ax.plot(flops[[13, 14]], mAPs[[13, 14]])

    ax.set_xlim(0, 70)
    ax.set_ylim(0.2, 0.5)
    ax.set_xlabel('Multi-adds (million)', fontsize=15)
    ax.set_ylabel('mAP', fontsize=15)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

    plt.tight_layout(pad=0, h_pad=0, w_pad=0)

    plt.savefig(save_out_path)
    print('Write out figure to {}'.format(save_out_path))
|
343 |
+
|
344 |
+
|
345 |
+
def plot_long_fig(args):
    """Plot per-class AP of several systems together with per-class training
    clip counts and label quality as a 4-row figure spanning all classes,
    and save it to results/long_fig.pdf.

    Fixes over the original: plt.tight_layout(0, 0, 0) used positional
    arguments rejected by modern matplotlib (now keywords); the four
    copy-pasted bar/scatter calls are folded into loops.

    Args:
      args: argparse.Namespace (unused; kept for the CLI dispatch interface).
    """
    # Paths
    stats = pickle.load(open('paper_statistics/stats_for_long_fig.pkl', 'rb'))

    save_out_path = 'results/long_fig.pdf'
    create_folder(os.path.dirname(save_out_path))

    # Load meta. Classes are ordered by the pre-computed plotting order.
    N = len(config.labels)
    sorted_indexes = stats['sorted_indexes_for_plot']
    sorted_labels = np.array(config.labels)[sorted_indexes]
    audio_clips_per_class = stats['official_balanced_training_samples'] + stats['official_unbalanced_training_samples']
    audio_clips_per_class = audio_clips_per_class[sorted_indexes]

    # Prepare axes: four rows, each with a log-count left axis (*a) and an
    # AP right axis (*b).
    (ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b) = prepare_plot_long_4_rows(sorted_labels)

    # Plot the number of training samples on every left-hand axis.
    for axa in (ax1a, ax2a, ax3a, ax4a):
        axa.bar(np.arange(N), audio_clips_per_class, alpha=0.3)

    # Load mAP of different systems
    """Average instance system of [1] with an mAP of 0.317.
    [1] Kong, Qiuqiang, Changsong Yu, Yong Xu, Turab Iqbal, Wenwu Wang, and
    Mark D. Plumbley. "Weakly labelled audioset tagging with attention neural
    networks." IEEE/ACM Transactions on Audio, Speech, and Language Processing
    27, no. 11 (2019): 1791-1802."""
    maps_avg_instances = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision']
    maps_avg_instances = maps_avg_instances[sorted_indexes]

    # PANNs Cnn14
    maps_panns_cnn14 = stats['panns_cnn14']['eval']['average_precision']
    maps_panns_cnn14 = maps_panns_cnn14[sorted_indexes]

    # PANNs MobileNetV1
    maps_panns_mobilenetv1 = stats['panns_mobilenetv1']['eval']['average_precision']
    maps_panns_mobilenetv1 = maps_panns_mobilenetv1[sorted_indexes]

    # PANNs Wavegram-Logmel-Cnn14
    maps_panns_wavegram_logmel_cnn14 = stats['panns_wavegram_logmel_cnn14']['eval']['average_precision']
    maps_panns_wavegram_logmel_cnn14 = maps_panns_wavegram_logmel_cnn14[sorted_indexes]

    # Plot per-class AP markers for each system.
    _scatter_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='g')
    _scatter_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='r')
    _scatter_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, s=5, c='b')
    _scatter_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, s=5, c='k')

    linewidth = 0.7
    line0te = _plot_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b,
        c='g', linewidth=linewidth, label='AP with Wavegram-Logmel-CNN')
    line1te = _plot_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, c='r',
        linewidth=linewidth, label='AP with CNN14')
    line2te = _plot_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, c='b',
        linewidth=linewidth, label='AP with MobileNetV1')
    line3te = _plot_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, c='k',
        linewidth=linewidth, label='AP with averaging instances (baseline)')

    # Plot label quality. Entries may be None (quality unknown); values of
    # exactly 1 are nudged down to 0.99 so the marker stays inside the axis.
    label_quality = stats['label_quality']
    sorted_label_quality = np.array(label_quality)[sorted_indexes]
    for k in range(len(sorted_label_quality)):
        if sorted_label_quality[k] and sorted_label_quality[k] == 1:
            sorted_label_quality[k] = 0.99

    # NOTE: elementwise '!= None' (not 'is not None') is intentional — it
    # masks the object array entry by entry.
    known = sorted_label_quality != None
    for axb in (ax1b, ax2b, ax3b):
        axb.scatter(np.arange(N)[known], sorted_label_quality[known],
            s=12, c='r', linewidth=0.8, marker='+')
    line_label_quality = ax4b.scatter(np.arange(N)[known],
        sorted_label_quality[known], s=12, c='r', linewidth=0.8, marker='+',
        label='Label quality')

    # Classes with unknown label quality are drawn as dashes at 0.5.
    unknown_x = np.arange(N)[~known]
    for axb in (ax1b, ax2b, ax3b, ax4b):
        axb.scatter(unknown_x, 0.5 * np.ones(len(unknown_x)),
            s=12, c='r', linewidth=0.8, marker='_')

    plt.legend(handles=[line0te, line1te, line2te, line3te, line_label_quality], fontsize=6, loc=1)
    # Keyword arguments: positional pad/h_pad/w_pad were removed from
    # matplotlib's tight_layout.
    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
    plt.savefig(save_out_path)
    print('Save fig to {}'.format(save_out_path))
|
434 |
+
|
435 |
+
|
436 |
+
def prepare_plot_long_4_rows(sorted_lbs):
    """Create a 4-row figure for the long per-class plot.

    Each row shows a contiguous segment of the class axis: a left y-axis on a
    log scale for clip counts and a twinned right y-axis in [0, 1] for AP.
    Class names are drawn as rotated x tick labels.

    Args:
      sorted_lbs: sequence of class-name strings in plotting order.

    Returns:
      (ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b): the four count axes
      followed by the four twinned AP axes.
    """
    N = len(sorted_lbs)

    f, axes_count = plt.subplots(4, 1, sharey=False, facecolor='w', figsize=(10, 10.5))

    fontsize = 5

    K = 132  # classes per row; the last row takes the remainder
    segments = [(0, K), (K, 2 * K), (2 * K, 3 * K), (3 * K, N)]

    # Trim labels to 25 chars and drop a dangling short last word.
    trimmed = []
    for raw in sorted_lbs:
        short = raw[0:25]
        pieces = short.split(' ')
        if len(pieces[-1]) < 3:
            short = ' '.join(pieces[0:-1])
        trimmed.append(short)

    axes_ap = [a.twinx() for a in axes_count]

    for row, (axa, axb) in enumerate(zip(axes_count, axes_ap)):
        lo, hi = segments[row]
        axa.set_xlim(lo, hi)
        axa.grid(which='major', axis='x', linestyle='-', alpha=0.3)
        axa.set_yscale('log')

        axb.set_ylim(0., 1.)
        axb.set_ylabel('Average precision')
        axb.yaxis.grid(color='grey', linestyle='--', alpha=0.5)

        axa.xaxis.set_ticks(np.arange(lo, hi))
        axa.xaxis.set_ticklabels(trimmed[lo:hi], rotation=90, fontsize=fontsize)
        axa.xaxis.tick_bottom()
        axa.set_ylabel("Number of audio clips")

        # Hide the inner spines so adjacent rows read as one continuous axis.
        if row > 0:
            axa.spines['left'].set_visible(False)
            axb.spines['left'].set_visible(False)
        if row < 3:
            axa.spines['right'].set_visible(False)
            axb.spines['right'].set_visible(False)

    plt.subplots_adjust(hspace=0.8)

    return (axes_count[0], axes_count[1], axes_count[2], axes_count[3],
        axes_ap[0], axes_ap[1], axes_ap[2], axes_ap[3])
|
521 |
+
|
522 |
+
|
523 |
+
def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.):
|
524 |
+
N = len(x)
|
525 |
+
ax.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
|
526 |
+
ax2.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
|
527 |
+
ax3.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
|
528 |
+
ax4.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
|
529 |
+
|
530 |
+
|
531 |
+
def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, label=""):
|
532 |
+
N = len(x)
|
533 |
+
ax.plot(x, c=c, linewidth=linewidth, alpha=alpha)
|
534 |
+
ax2.plot(x, c=c, linewidth=linewidth, alpha=alpha)
|
535 |
+
ax3.plot(x, c=c, linewidth=linewidth, alpha=alpha)
|
536 |
+
line, = ax4.plot(x, c=c, linewidth=linewidth, alpha=alpha, label=label)
|
537 |
+
return line
|
538 |
+
|
539 |
+
|
540 |
+
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='')
    subparsers = parser.add_subparsers(dest='mode')

    # Each plotting mode is a bare sub-command with no extra options.
    for mode_name in ('plot_classwise_iteration_map', 'plot_six_figures',
            'plot_complexity_map', 'plot_long_fig'):
        subparsers.add_parser(mode_name)

    args = parser.parse_args()

    # Dispatch table instead of an if/elif chain; unknown modes still raise.
    dispatch = {
        'plot_classwise_iteration_map': plot_classwise_iteration_map,
        'plot_six_figures': plot_six_figures,
        'plot_complexity_map': plot_complexity_map,
        'plot_long_fig': plot_long_fig,
    }

    handler = dispatch.get(args.mode)
    if handler is None:
        raise Exception('Incorrect argument!')
    handler(args)
|
audio_detection/audio_infer/utils/plot_statistics.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
audio_detection/audio_infer/utils/utilities.py
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
import h5py
|
4 |
+
import soundfile
|
5 |
+
import librosa
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
from scipy import stats
|
9 |
+
import datetime
|
10 |
+
import pickle
|
11 |
+
|
12 |
+
|
13 |
+
def create_folder(fd):
    """Create directory *fd* (including parents) if it does not exist.

    Uses ``exist_ok=True`` instead of the previous check-then-create pattern,
    which could raise FileExistsError when two workers raced on the check.
    """
    os.makedirs(fd, exist_ok=True)
|
16 |
+
|
17 |
+
|
18 |
+
def get_filename(path):
    """Return the base name of *path* without directory or extension.

    Args:
        path: str, any file path (relative or absolute).

    Returns:
        str, e.g. '/data/Yabc.wav' -> 'Yabc'.
    """
    path = os.path.realpath(path)
    # os.path.basename is portable; splitting on '/' breaks on Windows paths.
    na_ext = os.path.basename(path)
    na = os.path.splitext(na_ext)[0]
    return na
|
23 |
+
|
24 |
+
|
25 |
+
def get_sub_filepaths(folder):
    """Recursively collect the full paths of every file below *folder*."""
    return [
        os.path.join(parent, file_name)
        for parent, _, file_names in os.walk(folder)
        for file_name in file_names
    ]
|
32 |
+
|
33 |
+
|
34 |
+
def create_logging(log_dir, filemode):
    """Configure the root logger to write to a fresh file in *log_dir*.

    Args:
        log_dir: str, directory for the log files (created if missing).
        filemode: str, file mode passed to basicConfig, e.g. 'w' or 'a'.

    Returns:
        The ``logging`` module itself (configured as a side effect).
    """
    create_folder(log_dir)
    i1 = 0

    # Pick the first unused sequential file name: 0000.log, 0001.log, ...
    while os.path.isfile(os.path.join(log_dir, '{:04d}.log'.format(i1))):
        i1 += 1

    log_path = os.path.join(log_dir, '{:04d}.log'.format(i1))
    # NOTE(review): basicConfig is a no-op if the root logger already has
    # handlers, so only the first call in a process takes effect -- confirm
    # this is only called once per run.
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename=log_path,
        filemode=filemode)

    # Print to console
    # Mirror INFO-and-above records to the console in addition to the file.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    return logging
|
57 |
+
|
58 |
+
|
59 |
+
def read_metadata(csv_path, classes_num, id_to_ix):
    """Read metadata of AudioSet from a csv file.

    Args:
        csv_path: str, path of an AudioSet segments csv (first 3 lines are
            header and are skipped).
        classes_num: int, total number of classes.
        id_to_ix: dict mapping a label id such as '/m/068hy' to a class index.

    Returns:
        meta_dict: {'audio_name': (audios_num,), 'target': (audios_num, classes_num)}
    """

    with open(csv_path, 'r') as fr:
        lines = fr.readlines()
        lines = lines[3:]   # Remove heads

    audios_num = len(lines)
    # np.bool was removed in NumPy 1.24; the builtin bool is the correct dtype.
    targets = np.zeros((audios_num, classes_num), dtype=bool)
    audio_names = []

    for n, line in enumerate(lines):
        items = line.split(', ')
        """items: ['--4gqARaEJE', '0.000', '10.000', '"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"\n']"""

        audio_name = 'Y{}.wav'.format(items[0])   # Audios are started with an extra 'Y' when downloading
        # The label field is quoted; take the text between the quotes.
        label_ids = items[3].split('"')[1].split(',')

        audio_names.append(audio_name)

        # Target: set a True for every label attached to this clip.
        for id in label_ids:
            ix = id_to_ix[id]
            targets[n, ix] = 1

    meta_dict = {'audio_name': np.array(audio_names), 'target': targets}
    return meta_dict
|
93 |
+
|
94 |
+
|
95 |
+
def float32_to_int16(x):
    """Convert float audio in roughly [-1, 1] to int16 PCM samples."""
    peak = np.max(np.abs(x))
    # Tolerate slight overshoot (e.g. from resampling); anything larger
    # indicates un-normalised input.
    assert peak <= 1.2
    clipped = np.clip(x, -1, 1)
    return (clipped * 32767.).astype(np.int16)
|
99 |
+
|
100 |
+
def int16_to_float32(x):
    """Convert int16 PCM samples back to float32 audio in [-1, 1]."""
    scaled = x / 32767.
    return scaled.astype(np.float32)
|
102 |
+
|
103 |
+
|
104 |
+
def pad_or_truncate(x, audio_length):
    """Pad all audio to specific length."""
    current = len(x)
    if current > audio_length:
        # Too long: keep only the leading audio_length samples.
        return x[:audio_length]
    # Too short (or exact): append zeros up to audio_length.
    padding = np.zeros(audio_length - current)
    return np.concatenate((x, padding), axis=0)
|
110 |
+
|
111 |
+
|
112 |
+
def d_prime(auc):
    """Convert an AUC score to d-prime, assuming unit-variance Gaussians."""
    return stats.norm().ppf(auc) * np.sqrt(2.0)
|
115 |
+
|
116 |
+
|
117 |
+
class Mixup(object):
    """Generates mixup interpolation coefficients.

    Each consecutive pair of batch items receives coefficients ``lam`` and
    ``1 - lam``, where ``lam ~ Beta(mixup_alpha, mixup_alpha)``.
    """

    def __init__(self, mixup_alpha, random_seed=1234):
        """Mixup coefficient generator.
        """
        self.mixup_alpha = mixup_alpha
        self.random_state = np.random.RandomState(random_seed)

    def get_lambda(self, batch_size):
        """Get mixup random coefficients.
        Args:
            batch_size: int
        Returns:
            mixup_lambdas: (batch_size,)
        """
        coefficients = []
        # One Beta draw covers two consecutive batch items.
        for _ in range(0, batch_size, 2):
            lam = self.random_state.beta(self.mixup_alpha, self.mixup_alpha, 1)[0]
            coefficients.extend([lam, 1. - lam])
        return np.array(coefficients)
|
138 |
+
|
139 |
+
|
140 |
+
class StatisticsContainer(object):
    """Accumulates evaluation statistics across training iterations.

    Statistics are kept separately for the balanced-train ('bal') and
    evaluation ('test') splits, can be pickled to disk, and reloaded to
    resume training from a given iteration.
    """

    def __init__(self, statistics_path):
        """Contain statistics of different training iterations.

        Args:
            statistics_path: str, pickle file the statistics are dumped to.
        """
        self.statistics_path = statistics_path

        # Time-stamped backup so an earlier run's dump is never overwritten.
        self.backup_statistics_path = '{}_{}.pkl'.format(
            os.path.splitext(self.statistics_path)[0],
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

        self.statistics_dict = {'bal': [], 'test': []}

    def append(self, iteration, statistics, data_type):
        """Record *statistics* (a dict, mutated to carry the iteration)
        for *iteration* under key 'bal' or 'test'."""
        statistics['iteration'] = iteration
        self.statistics_dict[data_type].append(statistics)

    def dump(self):
        """Pickle the statistics to both the main and the backup path."""
        # Context managers close the handles deterministically; the previous
        # bare open(...) calls leaked them until garbage collection.
        with open(self.statistics_path, 'wb') as f:
            pickle.dump(self.statistics_dict, f)
        with open(self.backup_statistics_path, 'wb') as f:
            pickle.dump(self.statistics_dict, f)
        logging.info(' Dump statistics to {}'.format(self.statistics_path))
        logging.info(' Dump statistics to {}'.format(self.backup_statistics_path))

    def load_state_dict(self, resume_iteration):
        """Reload dumped statistics, keeping only entries whose iteration is
        <= *resume_iteration*."""
        with open(self.statistics_path, 'rb') as f:
            self.statistics_dict = pickle.load(f)

        resume_statistics_dict = {'bal': [], 'test': []}

        for key in self.statistics_dict.keys():
            for statistics in self.statistics_dict[key]:
                if statistics['iteration'] <= resume_iteration:
                    resume_statistics_dict[key].append(statistics)

        self.statistics_dict = resume_statistics_dict
|
audio_detection/target_sound_detection/src/__pycache__/models.cpython-38.pyc
ADDED
Binary file (37.9 kB). View file
|
|
audio_detection/target_sound_detection/src/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (11.1 kB). View file
|
|
audio_detection/target_sound_detection/src/models.py
ADDED
@@ -0,0 +1,1288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# !/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# @Time : 2021/3/9 16:33
|
4 |
+
# @Author : dongchao yang
|
5 |
+
# @File : train.py
|
6 |
+
from itertools import zip_longest
|
7 |
+
import numpy as np
|
8 |
+
from scipy import ndimage
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
import time
|
13 |
+
from torchlibrosa.augmentation import SpecAugmentation
|
14 |
+
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
|
15 |
+
import math
|
16 |
+
from sklearn.cluster import KMeans
|
17 |
+
import os
|
18 |
+
import time
|
19 |
+
from functools import partial
|
20 |
+
# import timm
|
21 |
+
# from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
22 |
+
import warnings
|
23 |
+
from functools import partial
|
24 |
+
# from timm.models.registry import register_model
|
25 |
+
# from timm.models.vision_transformer import _cfg
|
26 |
+
# from mmdet.utils import get_root_logger
|
27 |
+
# from mmcv.runner import load_checkpoint
|
28 |
+
# from mmcv.runner import _load_checkpoint, load_state_dict
|
29 |
+
# import mmcv.runner
|
30 |
+
import copy
|
31 |
+
from collections import OrderedDict
|
32 |
+
import io
|
33 |
+
import re
|
34 |
+
# Debug switch; 0 disables any extra diagnostic behaviour in this module.
DEBUG=0
# Closed set of sound-event class names; a label's position in this list is
# used as its class index by the detection models in this module.
event_labels = ['Alarm', 'Alarm_clock', 'Animal', 'Applause', 'Arrow', 'Artillery_fire',
                'Babbling', 'Baby_laughter', 'Bark', 'Basketball_bounce', 'Battle_cry',
                'Bell', 'Bird', 'Bleat', 'Bouncing', 'Breathing', 'Buzz', 'Camera',
                'Cap_gun', 'Car', 'Car_alarm', 'Cat', 'Caw', 'Cheering', 'Child_singing',
                'Choir', 'Chop', 'Chopping_(food)', 'Clapping', 'Clickety-clack', 'Clicking',
                'Clip-clop', 'Cluck', 'Coin_(dropping)', 'Computer_keyboard', 'Conversation',
                'Coo', 'Cough', 'Cowbell', 'Creak', 'Cricket', 'Croak', 'Crow', 'Crowd', 'DTMF',
                'Dog', 'Door', 'Drill', 'Drip', 'Engine', 'Engine_starting', 'Explosion', 'Fart',
                'Female_singing', 'Filing_(rasp)', 'Finger_snapping', 'Fire', 'Fire_alarm', 'Firecracker',
                'Fireworks', 'Frog', 'Gasp', 'Gears', 'Giggle', 'Glass', 'Glass_shatter', 'Gobble', 'Groan',
                'Growling', 'Hammer', 'Hands', 'Hiccup', 'Honk', 'Hoot', 'Howl', 'Human_sounds', 'Human_voice',
                'Insect', 'Laughter', 'Liquid', 'Machine_gun', 'Male_singing', 'Mechanisms', 'Meow', 'Moo',
                'Motorcycle', 'Mouse', 'Music', 'Oink', 'Owl', 'Pant', 'Pant_(dog)', 'Patter', 'Pig', 'Plop',
                'Pour', 'Power_tool', 'Purr', 'Quack', 'Radio', 'Rain_on_surface', 'Rapping', 'Rattle',
                'Reversing_beeps', 'Ringtone', 'Roar', 'Run', 'Rustle', 'Scissors', 'Scrape', 'Scratch',
                'Screaming', 'Sewing_machine', 'Shout', 'Shuffle', 'Shuffling_cards', 'Singing',
                'Single-lens_reflex_camera', 'Siren', 'Skateboard', 'Sniff', 'Snoring', 'Speech',
                'Speech_synthesizer', 'Spray', 'Squeak', 'Squeal', 'Steam', 'Stir', 'Surface_contact',
                'Tap', 'Tap_dance', 'Telephone_bell_ringing', 'Television', 'Tick', 'Tick-tock', 'Tools',
                'Train', 'Train_horn', 'Train_wheels_squealing', 'Truck', 'Turkey', 'Typewriter', 'Typing',
                'Vehicle', 'Video_game_sound', 'Water', 'Whimper_(dog)', 'Whip', 'Whispering', 'Whistle',
                'Whistling', 'Whoop', 'Wind', 'Writing', 'Yip', 'and_pans', 'bird_song', 'bleep', 'clink',
                'cock-a-doodle-doo', 'crinkling', 'dove', 'dribble', 'eructation', 'faucet', 'flapping_wings',
                'footsteps', 'gunfire', 'heartbeat', 'infant_cry', 'kid_speaking', 'man_speaking', 'mastication',
                'mice', 'river', 'rooster', 'silverware', 'skidding', 'smack', 'sobbing', 'speedboat', 'splatter',
                'surf', 'thud', 'thwack', 'toot', 'truck_horn', 'tweet', 'vroom', 'waterfowl', 'woman_speaking']
|
61 |
+
def load_checkpoint(model,
                    filename,
                    map_location=None,
                    strict=False,
                    logger=None,
                    revise_keys=[(r'^module\.', '')]):
    """Load checkpoint from a file or URI.
    Args:
        model (Module): Module to load checkpoint.
        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
            details.
        map_location (str): Same as :func:`torch.load`.
        strict (bool): Whether to allow different params for the model and
            checkpoint.
        logger (:mod:`logging.Logger` or None): The logger for error message.
        revise_keys (list): A list of customized keywords to modify the
            state_dict in checkpoint. Each item is a (pattern, replacement)
            pair of the regular expression operations. Default: strip
            the prefix 'module.' by [(r'^module\\.', '')].
    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """

    # NOTE(review): `_load_checkpoint` and `load_state_dict` come from
    # mmcv.runner, but the mmcv imports at the top of this file are commented
    # out -- as written, calling this function raises NameError. Confirm the
    # intended environment, or restore the imports.
    # NOTE(review): the mutable default for `revise_keys` is only iterated
    # here, never mutated, so it is harmless in practice.
    checkpoint = _load_checkpoint(filename, map_location, logger)
    '''
    new_proj = torch.nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))
    checkpoint['patch_embed1.proj.weight'] = new_proj.weight
    new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=2).unsqueeze(2).repeat(1,1,3,1))
    checkpoint['patch_embed1.proj.weight'] = new_proj.weight
    new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=3).unsqueeze(3).repeat(1,1,1,3))
    checkpoint['patch_embed1.proj.weight'] = new_proj.weight
    '''
    # Collapse the pretrained patch-embedding conv from 3 input channels to 1
    # by summing over the channel dimension (spectrogram input is mono).
    new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2))
    new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))
    checkpoint['patch_embed1.proj.weight'] = new_proj.weight
    # OrderedDict is a subclass of dict
    if not isinstance(checkpoint, dict):
        raise RuntimeError(
            f'No state_dict found in checkpoint file (unknown)')
    # get state_dict from checkpoint
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    else:
        state_dict = checkpoint

    # strip prefix of state_dict
    metadata = getattr(state_dict, '_metadata', OrderedDict())
    for p, r in revise_keys:
        state_dict = OrderedDict(
            {re.sub(p, r, k): v
             for k, v in state_dict.items()})
    # Also drop any 'backbone.' prefix left by detection-framework wrappers.
    state_dict = OrderedDict({k.replace('backbone.',''):v for k,v in state_dict.items()})
    # Keep metadata in state_dict
    state_dict._metadata = metadata

    # load state_dict
    load_state_dict(model, state_dict, strict, logger)
    return checkpoint
|
121 |
+
|
122 |
+
def init_weights(m):
    """Initialise module parameters in place: Kaiming-normal for conv layers,
    Kaiming-uniform for linear layers, unit weight for BatchNorm2d; biases
    are zeroed whenever present."""
    def _zero_bias(module):
        # Shared bias initialisation for every handled layer type.
        if module.bias is not None:
            nn.init.constant_(module.bias, 0)

    if isinstance(m, (nn.Conv2d, nn.Conv1d)):
        nn.init.kaiming_normal_(m.weight)
        _zero_bias(m)
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.constant_(m.weight, 1)
        _zero_bias(m)
    if isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight)
        _zero_bias(m)
|
135 |
+
def init_layer(layer):
    """Initialize a Linear or Convolutional layer. """
    nn.init.xavier_uniform_(layer.weight)
    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias.data.fill_(0.)
|
141 |
+
|
142 |
+
|
143 |
+
def init_bn(bn):
    """Initialize a Batchnorm layer. """
    bn.weight.data.fill_(1.)
    bn.bias.data.fill_(0.)
|
147 |
+
|
148 |
+
class MaxPool(nn.Module):
    """Max pooling over the configured dimension of the decision tensor.

    The `logits` argument is accepted only so the call signature matches the
    other pooling modules; it is ignored.
    """

    def __init__(self, pooldim=1):
        super().__init__()
        self.pooldim = pooldim

    def forward(self, logits, decision):
        values, _ = torch.max(decision, dim=self.pooldim)
        return values
|
155 |
+
|
156 |
+
|
157 |
+
class LinearSoftPool(nn.Module):
    """LinearSoftPool
    Linear softmax, takes logits and returns a probability, near to the actual maximum value.
    Taken from the paper:
    A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling
    https://arxiv.org/abs/1810.09050
    """
    def __init__(self, pooldim=1):
        super().__init__()
        self.pooldim = pooldim

    def forward(self, logits, time_decision):
        # Self-weighted average: sum(p^2) / sum(p); epsilon guards div-by-zero.
        numerator = (time_decision ** 2).sum(self.pooldim)
        denominator = time_decision.sum(self.pooldim) + 1e-7
        return numerator / denominator
|
171 |
+
|
172 |
+
class ConvBlock(nn.Module):
    """Two 3x3 conv + BN + ReLU stages followed by configurable 2-D pooling."""

    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()

        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1),
                           padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=out_channels,
                               out_channels=out_channels, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.init_weight()

    def init_weight(self):
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for bn in (self.bn1, self.bn2):
            init_bn(bn)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        x = F.relu_(self.bn1(self.conv1(input)))
        x = F.relu_(self.bn2(self.conv2(x)))
        if pool_type == 'max':
            return F.max_pool2d(x, kernel_size=pool_size)
        if pool_type == 'avg':
            return F.avg_pool2d(x, kernel_size=pool_size)
        if pool_type == 'avg+max':
            # The sum keeps both smooth context (avg) and sharp peaks (max).
            return (F.avg_pool2d(x, kernel_size=pool_size)
                    + F.max_pool2d(x, kernel_size=pool_size))
        raise Exception('Incorrect argument!')
|
216 |
+
|
217 |
+
class ConvBlock_GLU(nn.Module):
    """Conv + BN followed by a gated linear unit: the sigmoid of the first
    half of the channels gates the second half, so the output carries
    out_channels // 2 channels."""

    def __init__(self, in_channels, out_channels, kernel_size=(3, 3)):
        super(ConvBlock_GLU, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels,
                               kernel_size=kernel_size, stride=(1, 1),
                               padding=(1, 1), bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.sigmoid = nn.Sigmoid()
        self.init_weight()

    def init_weight(self):
        init_layer(self.conv1)
        init_bn(self.bn1)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        x = self.bn1(self.conv1(input))
        half = x.shape[1] // 2
        gate = self.sigmoid(x[:, :half, :, :])
        value = x[:, half:, :, :]
        x = gate * value
        if pool_type == 'max':
            x = F.max_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg':
            x = F.avg_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg+max':
            x = F.avg_pool2d(x, kernel_size=pool_size) + F.max_pool2d(x, kernel_size=pool_size)
        elif pool_type in ('None', 'LP'):
            # 'None' skips pooling; 'LP' is reserved (nn.LPPool2d) but not
            # implemented, matching the original behaviour.
            pass
        else:
            raise Exception('Incorrect argument!')
        return x
|
254 |
+
|
255 |
+
class Mul_scale_GLU(nn.Module):
    # Multi-scale GLU front end: three parallel GLU conv branches with 1x1,
    # 3x3 and 5x5 kernels are concatenated channel-wise, then refined by a
    # stack of further GLU blocks (each GLU halves its out_channels).
    def __init__(self):
        super(Mul_scale_GLU,self).__init__()
        self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1)) # 1*1
        self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3)) # 3*3
        self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5)) # 5*5
        self.conv_block2 = ConvBlock_GLU(in_channels=96, out_channels=128*2)
        # self.conv_block3 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock_GLU(in_channels=128, out_channels=128*2)
        self.conv_block4 = ConvBlock_GLU(in_channels=128, out_channels=256*2)
        self.conv_block5 = ConvBlock_GLU(in_channels=256, out_channels=256*2)
        self.conv_block6 = ConvBlock_GLU(in_channels=256, out_channels=512*2)
        self.conv_block7 = ConvBlock_GLU(in_channels=512, out_channels=512*2)
        # Pads one extra row/column so the 5x5 branch's smaller map can be
        # brought back to the 3x3 branch's spatial size.
        self.padding = nn.ReplicationPad2d((0,1,0,1))

    def forward(self, input, fi=None):
        """
        Input: (batch_size, data_length)"""
        # Each front-end branch yields 32 channels (GLU halves 64), so the
        # concatenation below has 96 channels.
        x1 = self.conv_block1_1(input, pool_size=(2, 2), pool_type='avg')
        # NOTE(review): the :500 / :32 crop looks tied to a fixed input of
        # ~1000 frames x 64 mel bins -- confirm before using other shapes.
        x1 = x1[:,:,:500,:32]
        #print('x1 ',x1.shape)
        x2 = self.conv_block1_2(input,pool_size=(2,2),pool_type='avg')
        #print('x2 ',x2.shape)
        x3 = self.conv_block1_3(input,pool_size=(2,2),pool_type='avg')
        x3 = self.padding(x3)
        #print('x3 ',x3.shape)
        # assert 1==2
        x = torch.cat([x1,x2],dim=1)
        x = torch.cat([x,x3],dim=1)
        #print('x ',x.shape)
        # Alternate 'None'/'avg' pooling so each pair of blocks downsamples once.
        x = self.conv_block2(x, pool_size=(2, 2), pool_type='None')
        x = self.conv_block3(x,pool_size=(2,2),pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training) #
        #print('x2,3 ',x.shape)
        x = self.conv_block4(x, pool_size=(2, 4), pool_type='None')
        x = self.conv_block5(x,pool_size=(2,4),pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        #print('x4,5 ',x.shape)

        x = self.conv_block6(x, pool_size=(1, 4), pool_type='None')
        x = self.conv_block7(x, pool_size=(1, 4), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        # print('x6,7 ',x.shape)
        # assert 1==2
        return x
|
300 |
+
|
301 |
+
class Cnn14(nn.Module):
    # PANNs CNN14 trunk. The spectrogram / log-mel / SpecAugment modules are
    # constructed here, but note that forward() consumes an already-extracted
    # feature map and never calls them -- presumably extraction happens
    # upstream in the pipeline (verify against the caller).
    def __init__(self, sample_rate=32000, window_size=1024, hop_size=320, mel_bins=64, fmin=50,
        fmax=14000, classes_num=527):

        super(Cnn14, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 128, bias=True)
        self.fc_audioset = nn.Linear(128, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        # Only the linear heads are re-initialised; conv blocks initialise
        # themselves in their own constructors.
        init_layer(self.fc1)
        init_layer(self.fc_audioset)

    def forward(self, input_, mixup_lambda=None):
        """
        Input: (batch_size, data_length)"""
        # Add a channel axis, then run the six conv blocks with dropout;
        # mixup_lambda is currently unused in this forward pass.
        input_ = input_.unsqueeze(1)
        x = self.conv_block1(input_, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block4(x, pool_size=(1, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block5(x, pool_size=(1, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block6(x, pool_size=(1, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        # print(x.shape)
        # x = torch.mean(x, dim=3)
        # Merge (channels, freq) into one feature axis per time step, then
        # project each frame to a 128-d embedding.
        x = x.transpose(1, 2).contiguous().flatten(-2)
        x = self.fc1(x)
        # print(x.shape)
        # assert 1==2
        # (x1,_) = torch.max(x, dim=2)
        # x2 = torch.mean(x, dim=2)
        # x = x1 + x2
        # x = F.dropout(x, p=0.5, training=self.training)
        # x = F.relu_(self.fc1(x))
        # embedding = F.dropout(x, p=0.5, training=self.training)
        return x
|
375 |
+
|
376 |
+
class Cnn10_fi(nn.Module):
    """CNN10 trunk whose four conv blocks can be modulated by FiLM
    parameters: fi[:, 0] provides the per-sample scale (gamma) and
    fi[:, 1] the shift (beta) applied after every block."""

    def __init__(self):
        super(Cnn10_fi, self).__init__()
        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

        # self.fc1 = nn.Linear(512, 512, bias=True)
        # self.fc_audioset = nn.Linear(512, classes_num, bias=True)

        # self.init_weight()

    @staticmethod
    def _film(x, fi):
        """Apply the feature-wise affine modulation gamma * x + beta.

        fi is a (batch, 2) tensor; column 0 is gamma, column 1 is beta,
        each broadcast over the (channel, time, freq) axes of x.
        """
        gamma = fi[:, 0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
        beta = fi[:, 1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x)
        return gamma * x + beta

    def forward(self, input, fi=None):
        """
        Input: (batch_size, data_length)"""
        pool_sizes = [(2, 2), (2, 2), (2, 4), (1, 4)]
        blocks = [self.conv_block1, self.conv_block2,
                  self.conv_block3, self.conv_block4]

        x = input
        for block, pool_size in zip(blocks, pool_sizes):
            x = block(x, pool_size=pool_size, pool_type='avg')
            # `is not None` is the correct identity test; `fi != None`
            # invokes tensor comparison and is fragile across torch versions.
            if fi is not None:
                x = self._film(x, fi)
            x = F.dropout(x, p=0.2, training=self.training)
        return x
|
418 |
+
|
419 |
+
class Cnn10_mul_scale(nn.Module):
    # Multi-scale CNN10 variant: three GLU front-end branches (1x1 / 3x3 /
    # 5x5 kernels) are concatenated, then passed through three plain
    # ConvBlocks. `scale` selects how strongly the time axis is downsampled
    # overall (8 / 4 / 2 / other -> 1).
    def __init__(self,scale=8):
        super(Cnn10_mul_scale, self).__init__()
        self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1))
        self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3))
        self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5))
        self.conv_block2 = ConvBlock(in_channels=96, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.scale = scale
        # Restores the 5x5 branch's spatial size to match the other branches.
        self.padding = nn.ReplicationPad2d((0,1,0,1))

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """
        Input: (batch_size, data_length)"""
        # Per-stage pooling schedule; larger `scale` applies more time pooling.
        if self.scale == 8:
            pool_size1 = (2,2)
            pool_size2 = (2,2)
            pool_size3 = (2,4)
            pool_size4 = (1,4)
        elif self.scale == 4:
            pool_size1 = (2,2)
            pool_size2 = (2,2)
            pool_size3 = (1,4)
            pool_size4 = (1,4)
        elif self.scale == 2:
            pool_size1 = (2,2)
            pool_size2 = (1,2)
            pool_size3 = (1,4)
            pool_size4 = (1,4)
        else:
            pool_size1 = (1,2)
            pool_size2 = (1,2)
            pool_size3 = (1,4)
            pool_size4 = (1,4)
        # print('input ',input.shape)
        x1 = self.conv_block1_1(input, pool_size=pool_size1, pool_type='avg')
        # NOTE(review): the :500 / :32 crop appears tied to a fixed input of
        # ~1000 frames x 64 mel bins -- confirm for other input shapes.
        x1 = x1[:,:,:500,:32]
        #print('x1 ',x1.shape)
        x2 = self.conv_block1_2(input, pool_size=pool_size1, pool_type='avg')
        #print('x2 ',x2.shape)
        x3 = self.conv_block1_3(input, pool_size=pool_size1, pool_type='avg')
        x3 = self.padding(x3)
        #print('x3 ',x3.shape)
        # assert 1==2
        # Truncate all branches to the shortest time length before concat.
        m_i = min(x3.shape[2],min(x1.shape[2],x2.shape[2]))
        #print('m_i ', m_i)
        x = torch.cat([x1[:,:,:m_i,:],x2[:,:, :m_i,:],x3[:,:, :m_i,:]],dim=1)
        # x = torch.cat([x,x3],dim=1)

        # x = self.conv_block1(input, pool_size=pool_size1, pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        return x
|
477 |
+
|
478 |
+
|
479 |
+
class Cnn10(nn.Module):
    """CNN10 front-end; ``scale`` selects the temporal down-sampling factor."""

    # pooling sizes (time, freq) for the four conv blocks, keyed by scale
    _POOLING = {
        8: ((2, 2), (2, 2), (2, 4), (1, 4)),
        4: ((2, 2), (2, 2), (1, 4), (1, 4)),
        2: ((2, 2), (1, 2), (1, 4), (1, 4)),
    }

    def __init__(self, scale=8):
        super(Cnn10, self).__init__()
        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.scale = scale

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """
        Input: (batch_size, data_length)"""
        plan = self._POOLING.get(self.scale,
                                 ((1, 2), (1, 2), (1, 4), (1, 4)))
        blocks = (self.conv_block1, self.conv_block2,
                  self.conv_block3, self.conv_block4)
        x = input
        # each conv block is followed by dropout, matching the training recipe
        for block, pool in zip(blocks, plan):
            x = block(x, pool_size=pool, pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)
        return x
|
519 |
+
|
520 |
+
class MeanPool(nn.Module):
    """Temporal mean pooling: averages ``decision`` over ``pooldim``.

    ``logits`` is accepted only to match the shared pooling interface
    and is ignored.
    """
    def __init__(self, pooldim=1):
        super().__init__()
        self.pooldim = pooldim

    def forward(self, logits, decision):
        return decision.mean(dim=self.pooldim)
|
527 |
+
|
528 |
+
class ResPool(nn.Module):
    """Residual pooling stub.

    Holds a :class:`LinearSoftPool` but defines no ``forward`` of its own,
    so calling an instance falls through to ``nn.Module`` (which raises).
    Apparently unfinished or only used for its ``linPool`` attribute.
    """
    def __init__(self, pooldim=1):
        super().__init__()
        self.pooldim = pooldim
        self.linPool = LinearSoftPool(pooldim=1)
|
533 |
+
|
534 |
+
class AutoExpPool(nn.Module):
    """Exponential auto-pooling with a learned per-class scale ``alpha``.

    Pools ``logits`` over ``pooldim`` using exp(alpha * decision) weights.
    """
    def __init__(self, outputdim=10, pooldim=1):
        super().__init__()
        self.outputdim = outputdim
        # BUG FIX: torch.full((outputdim,), 1) produces an int64 tensor and
        # nn.Parameter requires a floating-point dtype (raises at runtime).
        self.alpha = nn.Parameter(torch.full((outputdim, ), 1.0))
        self.pooldim = pooldim

    def forward(self, logits, decision):
        """logits/decision: (B, T, C); returns (B, C)."""
        scaled = self.alpha * decision  # \alpha * P(Y|x) in the paper
        return (logits * torch.exp(scaled)).sum(
            self.pooldim) / torch.exp(scaled).sum(self.pooldim)
|
545 |
+
|
546 |
+
|
547 |
+
class SoftPool(nn.Module):
    """Self-weighted softmax pooling.

    Each frame of ``decision`` is weighted by softmax(decision / T) along
    ``pooldim``; ``T`` is a fixed temperature. ``logits`` is unused.
    """
    def __init__(self, T=1, pooldim=1):
        super().__init__()
        self.pooldim = pooldim
        self.T = T  # softmax temperature

    def forward(self, logits, decision):
        weights = torch.softmax(decision / self.T, dim=self.pooldim)
        return (decision * weights).sum(dim=self.pooldim)
|
556 |
+
|
557 |
+
|
558 |
+
class AutoPool(nn.Module):
    """Auto-pooling with a learned per-class temperature ``alpha``.

    Frames of ``decision`` are weighted by softmax(alpha * decision)
    along ``pooldim``; ``logits`` is unused.
    """
    def __init__(self, outputdim=10, pooldim=1):
        super().__init__()
        self.outputdim = outputdim
        self.alpha = nn.Parameter(torch.ones(outputdim))
        self.dim = pooldim

    def forward(self, logits, decision):
        scaled = self.alpha * decision  # \alpha * P(Y|x) in the paper
        weight = torch.softmax(scaled, dim=self.dim)
        return (decision * weight).sum(dim=self.dim)  # -> (B, C)
|
570 |
+
|
571 |
+
|
572 |
+
class ExtAttentionPool(nn.Module):
    """Extended attention pooling over time.

    Attention weights are produced by a zero-initialised linear layer over
    ``logits`` (so the initial attention distribution is uniform).
    """
    def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
        super().__init__()
        self.inputdim = inputdim
        self.outputdim = outputdim
        self.pooldim = pooldim
        self.attention = nn.Linear(inputdim, outputdim)
        # zero init -> softmax starts out uniform
        nn.init.zeros_(self.attention.weight)
        nn.init.zeros_(self.attention.bias)
        self.activ = nn.Softmax(dim=self.pooldim)

    def forward(self, logits, decision):
        # Logits of shape (B, T, D), decision of shape (B, T, C)
        w_x = self.activ(self.attention(logits) / self.outputdim)
        # NOTE(review): the product below broadcasts (B, D, 1, T) against
        # (B, T, C, 1); the shapes only line up when D == T — confirm the
        # intended input shapes with the caller.
        h = (logits.permute(0, 2, 1).contiguous().unsqueeze(-2) *
             w_x.unsqueeze(-1)).flatten(-2).contiguous()
        return torch.sum(h, self.pooldim)
|
589 |
+
|
590 |
+
|
591 |
+
class AttentionPool(nn.Module):
    """Attention pooling: frame weights come from a learned linear
    transform of ``logits``; ``decision`` is then weight-averaged over
    ``pooldim``.
    """
    def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
        super().__init__()
        self.inputdim = inputdim
        self.outputdim = outputdim
        self.pooldim = pooldim
        self.transform = nn.Linear(inputdim, outputdim)
        self.activ = nn.Softmax(dim=self.pooldim)
        self.eps = 1e-7  # guards the division when weights sum to ~0

    def forward(self, logits, decision):
        # Input is (B, T, D); clamp keeps the softmax numerically stable
        weights = self.activ(torch.clamp(self.transform(logits), -15, 15))
        weighted_sum = (decision * weights).sum(self.pooldim)
        norm = weights.sum(self.pooldim) + self.eps
        return weighted_sum / norm
|
610 |
+
|
611 |
+
class Block2D(nn.Module):
    """BatchNorm -> bias-free Conv2d -> LeakyReLU(0.1) building block."""

    def __init__(self, cin, cout, kernel_size=3, padding=1):
        super().__init__()
        self.block = nn.Sequential(
            nn.BatchNorm2d(cin),
            nn.Conv2d(cin, cout,
                      kernel_size=kernel_size,
                      padding=padding,
                      bias=False),  # BN provides the shift, so no conv bias
            nn.LeakyReLU(inplace=True, negative_slope=0.1),
        )

    def forward(self, x):
        return self.block(x)
|
625 |
+
|
626 |
+
class AudioCNN(nn.Module):
    """Four-block CNN audio classifier.

    The conv trunk is pooled (mean over frequency, max over time) into a
    128-d embedding; ``forward`` returns (embedding, class logits) while
    ``extract`` returns the embedding only.
    """
    def __init__(self, classes_num):
        super(AudioCNN, self).__init__()
        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.fc1 = nn.Linear(512, 128, bias=True)
        self.fc = nn.Linear(128, classes_num, bias=True)
        self.init_weights()

    def init_weights(self):
        # only the classification head is explicitly re-initialised
        init_layer(self.fc)

    def _encode(self, input):
        """Shared trunk: conv blocks + pooling -> 128-d embedding."""
        x = input[:, None, :, :]  # (B, T, F) -> (B, 1, T, F)
        for block in (self.conv_block1, self.conv_block2,
                      self.conv_block3, self.conv_block4):
            x = block(x, pool_size=(2, 2), pool_type='avg')
        x = torch.mean(x, dim=3)      # average over frequency bins
        (x, _) = torch.max(x, dim=2)  # max over time steps -> (B, 512)
        return self.fc1(x)            # -> (B, 128)

    def forward(self, input):
        '''
        Input: (batch_size, times_steps, freq_bins)'''
        x = self._encode(input)
        return x, self.fc(x)

    def extract(self, input):
        '''Input: (batch_size, times_steps, freq_bins)'''
        return self._encode(input)
|
669 |
+
|
670 |
+
def parse_poolingfunction(poolingfunction_name='mean', **kwargs):
    """parse_poolingfunction
    A helper function to parse any temporal pooling
    Pooling is done on dimension 1

    :param poolingfunction_name: one of 'mean', 'max', 'linear',
        'expalpha', 'soft', 'auto', 'attention' (case-insensitive)
    :param **kwargs: 'inputdim'/'outputdim' forwarded where required
    :raises ValueError: for an unrecognised name (previously fell through
        and silently returned None, crashing later at the call site)
    """
    poolingfunction_name = poolingfunction_name.lower()
    if poolingfunction_name == 'mean':
        return MeanPool(pooldim=1)
    elif poolingfunction_name == 'max':
        return MaxPool(pooldim=1)
    elif poolingfunction_name == 'linear':
        return LinearSoftPool(pooldim=1)
    elif poolingfunction_name == 'expalpha':
        return AutoExpPool(outputdim=kwargs['outputdim'], pooldim=1)
    elif poolingfunction_name == 'soft':
        return SoftPool(pooldim=1)
    elif poolingfunction_name == 'auto':
        return AutoPool(outputdim=kwargs['outputdim'])
    elif poolingfunction_name == 'attention':
        return AttentionPool(inputdim=kwargs['inputdim'],
                             outputdim=kwargs['outputdim'])
    raise ValueError(
        "Unknown pooling function: {!r}".format(poolingfunction_name))
|
694 |
+
class conv1d(nn.Module):
    """1-D convolution + ReLU with TF-style 'VALID'/'SAME' padding names."""

    def __init__(self, nin, nout, kernel_size=3, stride=1, padding='VALID', dilation=1):
        super(conv1d, self).__init__()
        if padding == 'VALID':
            pad = 0
        elif padding == 'SAME':
            # keeps the output length equal to the input length
            # (for odd kernels and stride 1)
            pad = dilation * ((kernel_size - 1) // 2)
        else:
            raise ValueError("Padding Mode Error!")
        self.conv = nn.Conv1d(nin, nout, kernel_size=kernel_size,
                              stride=stride, padding=pad)
        self.act = nn.ReLU()
        self.init_layer(self.conv)

    def init_layer(self, layer, nonlinearity='relu'):
        """Initialize a Linear or Convolutional layer. """
        nn.init.kaiming_normal_(layer.weight, nonlinearity=nonlinearity)
        nn.init.constant_(layer.bias, 0.1)

    def forward(self, x):
        return self.act(self.conv(x))
|
715 |
+
|
716 |
+
class Atten_1(nn.Module):
    """Single-query attention over a context window.

    The frame at index ``context`` acts as the query; every frame supplies
    a key. The attention-weighted mean of the frames is added back to the
    query frame as a residual.
    """
    def __init__(self, input_dim, context=2, dropout_rate=0.2):
        super(Atten_1, self).__init__()
        self._matrix_k = nn.Linear(input_dim, input_dim // 4)
        self._matrix_q = nn.Linear(input_dim, input_dim // 4)
        self.relu = nn.ReLU()
        self.context = context
        self._dropout_layer = nn.Dropout(dropout_rate)
        self.init_layer(self._matrix_k)
        self.init_layer(self._matrix_q)

    def init_layer(self, layer, nonlinearity='leaky_relu'):
        """Initialize a Linear or Convolutional layer. """
        nn.init.kaiming_uniform_(layer.weight, nonlinearity=nonlinearity)
        if getattr(layer, 'bias', None) is not None:
            layer.bias.data.fill_(0.)

    def forward(self, input_x):
        # keys from all frames: (B, T, D/4)
        keys = self._dropout_layer(self.relu(self._matrix_k(input_x)))
        # query from the centre frame only: (B, 1, D/4)
        query = input_x[:, self.context, :][:, None, :]
        query = self._dropout_layer(self.relu(self._matrix_q(query)))
        # scaled dot-product attention scores -> (B, T)
        scores = torch.matmul(
            keys, query.transpose(-2, -1) / math.sqrt(keys.size(-1)))
        scores = scores.squeeze(2)
        alpha = F.softmax(scores, dim=-1)
        # weight every frame and average over time
        expanded = alpha.unsqueeze(2).repeat(1, 1, input_x.shape[2])
        pooled = (expanded * input_x).mean(1)
        # residual connection with the query frame
        return input_x[:, self.context, :] + pooled
|
765 |
+
|
766 |
+
class Fusion(nn.Module):
    """Multiplicative embedding fusion.

    Both inputs are expanded ``n_fac``-fold by 1x1 convolutions, multiplied
    element-wise, then average-pooled along the channel axis back down to
    ``inputdim2`` features.
    """
    def __init__(self, inputdim, inputdim2, n_fac):
        super().__init__()
        self.fuse_layer1 = conv1d(inputdim, inputdim2 * n_fac, 1)
        self.fuse_layer2 = conv1d(inputdim2, inputdim2 * n_fac, 1)
        # pools along the last dimension, undoing the n_fac expansion
        self.avg_pool = nn.AvgPool1d(n_fac, stride=n_fac)

    def forward(self, embedding, mix_embed):
        # conv1d expects (B, C, T): permute in, convolve, permute back
        e = self.fuse_layer1(embedding.permute(0, 2, 1)).permute(0, 2, 1)
        m = self.fuse_layer2(mix_embed.permute(0, 2, 1)).permute(0, 2, 1)
        fused = e * m                 # element-wise product, (B, T, inputdim2*n_fac)
        return self.avg_pool(fused)   # (B, T, inputdim2)
|
785 |
+
|
786 |
+
class CDur_fusion(nn.Module):
    """CDur detector using multiplicative embedding fusion (instead of the
    concat used by :class:`CDur`).

    forward(x, embedding) returns
    (frame-level presence probability, decisions upsampled to input time).
    """
    def __init__(self, inputdim, outputdim, **kwargs):
        super().__init__()
        self.features = nn.Sequential(
            Block2D(1, 32),
            nn.LPPool2d(4, (2, 4)),
            Block2D(32, 128),
            Block2D(128, 128),
            nn.LPPool2d(4, (2, 4)),
            Block2D(128, 128),
            Block2D(128, 128),
            nn.LPPool2d(4, (1, 4)),
            nn.Dropout(0.3),
        )
        # probe the conv stack once to determine its flattened output width
        with torch.no_grad():
            rnn_input_dim = self.features(torch.randn(1, 1, 500, inputdim)).shape
            rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1]

        self.gru = nn.GRU(128, 128, bidirectional=True, batch_first=True)
        # BUG FIX: Fusion.__init__ takes (inputdim, inputdim2, n_fac); the
        # original call Fusion(128, 2) was missing an argument and raised
        # TypeError at construction. Both fused streams are 128-dim here,
        # with an expansion factor of 2.
        self.fusion = Fusion(128, 128, 2)
        self.fc = nn.Linear(256, 256)
        self.outputlayer = nn.Linear(256, outputdim)
        self.features.apply(init_weights)
        self.outputlayer.apply(init_weights)

    def forward(self, x, embedding):
        batch, time, dim = x.shape
        x = x.unsqueeze(1)                               # (B, 1, T, D)
        x = self.features(x)
        x = x.transpose(1, 2).contiguous().flatten(-2)   # (B, T', 128)
        # broadcast the clip-level embedding along time, then fuse
        embedding = embedding.unsqueeze(1).repeat(1, x.shape[1], 1)
        x = self.fusion(embedding, x)
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
            # BUG FIX: the flag was checked but never set, so
            # flatten_parameters ran on every forward pass
            self._flattened = True
        x, _ = self.gru(x)                               # (B, T', 256)
        x = self.fc(x)
        decision_time = torch.softmax(self.outputlayer(x), dim=2)
        # interpolate frame decisions back to the input time resolution
        decision_up = torch.nn.functional.interpolate(
            decision_time.transpose(1, 2),
            time,
            mode='linear',
            align_corners=False).transpose(1, 2)
        return decision_time[:, :, 0], decision_up
|
831 |
+
|
832 |
+
class CDur(nn.Module):
    """CDur duration/detection model.

    CNN features and a per-frame copy of the clip embedding are
    concatenated, run through a BiGRU, and classified per frame;
    decisions are also upsampled back to the input resolution.
    """
    def __init__(self, inputdim, outputdim, time_resolution, **kwargs):
        super().__init__()
        self.features = nn.Sequential(
            Block2D(1, 32),
            nn.LPPool2d(4, (2, 4)),
            Block2D(32, 128),
            Block2D(128, 128),
            nn.LPPool2d(4, (2, 4)),
            Block2D(128, 128),
            Block2D(128, 128),
            nn.LPPool2d(4, (2, 4)),
            nn.Dropout(0.3),
        )
        # probe the conv stack once to determine its flattened output width
        with torch.no_grad():
            probe = self.features(torch.randn(1, 1, 500, inputdim)).shape
            rnn_input_dim = probe[1] * probe[-1]

        self.gru = nn.GRU(256, 256, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(512, 256)
        self.outputlayer = nn.Linear(256, outputdim)
        self.features.apply(init_weights)
        self.outputlayer.apply(init_weights)

    def forward(self, x, embedding, one_hot=None):
        batch, time, dim = x.shape
        x = self.features(x.unsqueeze(1))                # (B, 1, T, D) -> conv maps
        x = x.transpose(1, 2).contiguous().flatten(-2)   # (B, T', C*F')
        # tile the clip-level embedding along time, then concatenate
        embedding = embedding.unsqueeze(1).repeat(1, x.shape[1], 1)
        x = torch.cat((x, embedding), dim=2)             # [B, T', feat + emb]
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = self.fc(x)
        decision_time = torch.softmax(self.outputlayer(x), dim=2)
        # upsample the frame decisions back to the original time axis
        decision_up = torch.nn.functional.interpolate(
            decision_time.transpose(1, 2),
            time,
            mode='linear',
            align_corners=False).transpose(1, 2)
        return decision_time[:, :, 0], decision_up
|
875 |
+
|
876 |
+
class CDur_big(nn.Module):
    """Larger CDur variant: deeper conv trunk (up to 512 channels) and a
    wider BiGRU; otherwise the same concat-embedding pipeline as CDur."""

    def __init__(self, inputdim, outputdim, **kwargs):
        super().__init__()
        self.features = nn.Sequential(
            Block2D(1, 64),
            Block2D(64, 64),
            nn.LPPool2d(4, (2, 2)),
            Block2D(64, 128),
            Block2D(128, 128),
            nn.LPPool2d(4, (2, 2)),
            Block2D(128, 256),
            Block2D(256, 256),
            nn.LPPool2d(4, (2, 4)),
            Block2D(256, 512),
            Block2D(512, 512),
            nn.LPPool2d(4, (1, 4)),
            nn.Dropout(0.3),
        )
        # probe the conv stack to learn its flattened output width
        with torch.no_grad():
            probe = self.features(torch.randn(1, 1, 500, inputdim)).shape
            rnn_input_dim = probe[1] * probe[-1]
        self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(1024, 256)
        self.outputlayer = nn.Linear(256, outputdim)
        self.features.apply(init_weights)
        self.outputlayer.apply(init_weights)

    def forward(self, x, embedding):
        batch, time, dim = x.shape
        x = self.features(x.unsqueeze(1))                # (B, 1, T, D) -> conv maps
        x = x.transpose(1, 2).contiguous().flatten(-2)   # (B, T', 512)
        # tile the clip embedding along time, then concatenate
        embedding = embedding.unsqueeze(1).repeat(1, x.shape[1], 1)
        x = torch.cat((x, embedding), dim=2)
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = self.fc(x)
        decision_time = torch.softmax(self.outputlayer(x), dim=2)
        # upsample frame decisions back to the original time axis
        decision_up = torch.nn.functional.interpolate(
            decision_time.transpose(1, 2),
            time,
            mode='linear',
            align_corners=False).transpose(1, 2)
        return decision_time[:, :, 0], decision_up
|
921 |
+
|
922 |
+
class CDur_GLU(nn.Module):
    """CDur variant whose front-end is the multi-scale GLU CNN
    (:class:`Mul_scale_GLU`); embedding is concatenated per frame."""

    def __init__(self, inputdim, outputdim, **kwargs):
        super().__init__()
        self.features = Mul_scale_GLU()
        self.gru = nn.GRU(640, 512, 1, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(1024, 256)
        self.outputlayer = nn.Linear(256, outputdim)
        # note: only the output layer is re-initialised; the GLU features
        # keep their own initialisation
        self.outputlayer.apply(init_weights)

    def forward(self, x, embedding, one_hot=None):
        batch, time, dim = x.shape
        x = self.features(x.unsqueeze(1))                # (B, 1, T, D) -> conv maps
        x = x.transpose(1, 2).contiguous().flatten(-2)   # (B, T', 512)
        # tile the clip embedding along time, then concatenate
        embedding = embedding.unsqueeze(1).repeat(1, x.shape[1], 1)
        x = torch.cat((x, embedding), dim=2)
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = self.fc(x)
        decision_time = torch.softmax(self.outputlayer(x), dim=2)
        # upsample frame decisions back to the original time axis
        decision_up = torch.nn.functional.interpolate(
            decision_time.transpose(1, 2),
            time,
            mode='linear',
            align_corners=False).transpose(1, 2)
        return decision_time[:, :, 0], decision_up
|
959 |
+
|
960 |
+
class CDur_CNN14(nn.Module):
    """CDur variant with a Cnn10 front-end; ``time_resolution`` selects the
    CNN's temporal down-sampling scale (125 -> 8x, 250 -> 4x, 500 -> 2x,
    anything else -> no down-sampling)."""

    def __init__(self, inputdim, outputdim, time_resolution, **kwargs):
        super().__init__()
        scale = {125: 8, 250: 4, 500: 2}.get(time_resolution, 0)
        self.features = Cnn10(scale)
        # probe the conv stack to learn its flattened output width
        with torch.no_grad():
            probe = self.features(torch.randn(1, 1, 500, inputdim)).shape
            rnn_input_dim = probe[1] * probe[-1]
        self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(1024, 256)
        self.outputlayer = nn.Linear(256, outputdim)
        self.outputlayer.apply(init_weights)

    def forward(self, x, embedding, one_hot=None):
        batch, time, dim = x.shape
        x = self.features(x.unsqueeze(1))                # (B, 1, T, D) -> conv maps
        x = x.transpose(1, 2).contiguous().flatten(-2)   # (B, T', 512)
        # tile the clip embedding along time, then concatenate
        embedding = embedding.unsqueeze(1).repeat(1, x.shape[1], 1)
        x = torch.cat((x, embedding), dim=2)
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = self.fc(x)
        decision_time = torch.softmax(self.outputlayer(x), dim=2)
        # upsample frame decisions back to the original time axis
        decision_up = torch.nn.functional.interpolate(
            decision_time.transpose(1, 2),
            time,
            mode='linear',
            align_corners=False).transpose(1, 2)
        return decision_time[:, :, 0], decision_up
|
1005 |
+
|
1006 |
+
class CDur_CNN_mul_scale(nn.Module):
    """CDur variant with the multi-scale Cnn10 front-end;
    ``time_resolution`` picks the CNN's down-sampling scale."""

    def __init__(self, inputdim, outputdim, time_resolution, **kwargs):
        super().__init__()
        scale = {125: 8, 250: 4, 500: 2}.get(time_resolution, 0)
        self.features = Cnn10_mul_scale(scale)
        self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(1024, 256)
        self.outputlayer = nn.Linear(256, outputdim)
        self.outputlayer.apply(init_weights)

    def forward(self, x, embedding, one_hot=None):
        batch, time, dim = x.shape
        x = self.features(x.unsqueeze(1))                # (B, 1, T, D) -> conv maps
        x = x.transpose(1, 2).contiguous().flatten(-2)   # (B, T', 512)
        # tile the clip embedding along time, then concatenate
        embedding = embedding.unsqueeze(1).repeat(1, x.shape[1], 1)
        x = torch.cat((x, embedding), dim=2)
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = self.fc(x)
        decision_time = torch.softmax(self.outputlayer(x), dim=2)
        # upsample frame decisions back to the original time axis
        decision_up = torch.nn.functional.interpolate(
            decision_time.transpose(1, 2),
            time,
            mode='linear',
            align_corners=False).transpose(1, 2)
        return decision_time[:, :, 0], decision_up
|
1053 |
+
|
1054 |
+
class CDur_CNN_mul_scale_fusion(nn.Module):
    """Multi-scale Cnn10 front-end combined with multiplicative embedding
    fusion (instead of concatenation) before the BiGRU."""

    def __init__(self, inputdim, outputdim, time_resolution, **kwargs):
        super().__init__()
        scale = {125: 8, 250: 4, 500: 2}.get(time_resolution, 0)
        self.features = Cnn10_mul_scale(scale)
        self.gru = nn.GRU(512, 512, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(1024, 256)
        # fuse the 128-d clip embedding with 512-d frame features
        self.fusion = Fusion(128, 512, 2)
        self.outputlayer = nn.Linear(256, outputdim)
        self.outputlayer.apply(init_weights)

    def forward(self, x, embedding, one_hot=None):
        batch, time, dim = x.shape
        x = self.features(x.unsqueeze(1))                # (B, 1, T, D) -> conv maps
        x = x.transpose(1, 2).contiguous().flatten(-2)   # (B, T', 512)
        # tile the clip embedding along time, then fuse multiplicatively
        embedding = embedding.unsqueeze(1).repeat(1, x.shape[1], 1)
        x = self.fusion(embedding, x)
        if not hasattr(self, '_flattened'):
            self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = self.fc(x)
        decision_time = torch.softmax(self.outputlayer(x), dim=2)
        # upsample frame decisions back to the original time axis
        decision_up = torch.nn.functional.interpolate(
            decision_time.transpose(1, 2),
            time,
            mode='linear',
            align_corners=False).transpose(1, 2)
        return decision_time[:, :, 0], decision_up
|
1103 |
+
|
1104 |
+
|
1105 |
+
class RaDur_fusion(nn.Module):
|
1106 |
+
def __init__(self, model_config, inputdim, outputdim, time_resolution, **kwargs):
|
1107 |
+
super().__init__()
|
1108 |
+
self.encoder = Cnn14()
|
1109 |
+
self.detection = CDur_CNN_mul_scale_fusion(inputdim, outputdim, time_resolution)
|
1110 |
+
self.softmax = nn.Softmax(dim=2)
|
1111 |
+
#self.temperature = 5
|
1112 |
+
# if model_config['pre_train']:
|
1113 |
+
# self.encoder.load_state_dict(torch.load(model_config['encoder_path'])['model'])
|
1114 |
+
# self.detection.load_state_dict(torch.load(model_config['CDur_path']))
|
1115 |
+
|
1116 |
+
self.q = nn.Linear(128,128)
|
1117 |
+
self.k = nn.Linear(128,128)
|
1118 |
+
self.q_ee = nn.Linear(128, 128)
|
1119 |
+
self.k_ee = nn.Linear(128, 128)
|
1120 |
+
self.temperature = 11.3 # sqrt(128)
|
1121 |
+
self.att_pool = model_config['att_pool']
|
1122 |
+
self.enhancement = model_config['enhancement']
|
1123 |
+
self.tao = model_config['tao']
|
1124 |
+
self.top = model_config['top']
|
1125 |
+
self.bn = nn.BatchNorm1d(128)
|
1126 |
+
self.EE_fusion = Fusion(128, 128, 4)
|
1127 |
+
|
1128 |
+
def get_w(self,q,k):
|
1129 |
+
q = self.q(q)
|
1130 |
+
k = self.k(k)
|
1131 |
+
q = q.unsqueeze(1)
|
1132 |
+
attn = torch.bmm(q, k.transpose(1, 2))
|
1133 |
+
attn = attn/self.temperature
|
1134 |
+
attn = self.softmax(attn)
|
1135 |
+
return attn
|
1136 |
+
|
1137 |
+
def get_w_ee(self,q,k):
|
1138 |
+
q = self.q_ee(q)
|
1139 |
+
k = self.k_ee(k)
|
1140 |
+
q = q.unsqueeze(1)
|
1141 |
+
attn = torch.bmm(q, k.transpose(1, 2))
|
1142 |
+
attn = attn/self.temperature
|
1143 |
+
attn = self.softmax(attn)
|
1144 |
+
return attn
|
1145 |
+
|
1146 |
+
def attention_pooling(self, embeddings, mean_embedding):
|
1147 |
+
att_pool_w = self.get_w(mean_embedding,embeddings)
|
1148 |
+
embedding = torch.bmm(att_pool_w, embeddings).squeeze(1)
|
1149 |
+
# print(embedding.shape)
|
1150 |
+
# print(att_pool_w.shape)
|
1151 |
+
# print(att_pool_w[0])
|
1152 |
+
# assert 1==2
|
1153 |
+
return embedding
|
1154 |
+
|
1155 |
+
def select_topk_embeddings(self, scores, embeddings, k):
|
1156 |
+
_, idx_DESC = scores.sort(descending=True, dim=1) # 根据分数进行排序
|
1157 |
+
top_k = _[:,:k]
|
1158 |
+
# print('top_k ', top_k)
|
1159 |
+
# top_k = top_k.mean(1)
|
1160 |
+
idx_topk = idx_DESC[:, :k] # 取top_k个
|
1161 |
+
# print('index ', idx_topk)
|
1162 |
+
idx_topk = idx_topk.unsqueeze(2).expand([-1, -1, embeddings.shape[2]])
|
1163 |
+
selected_embeddings = torch.gather(embeddings, 1, idx_topk)
|
1164 |
+
return selected_embeddings,top_k
|
1165 |
+
|
1166 |
+
def sum_with_attention(self, embedding, top_k, selected_embeddings):
|
1167 |
+
# print('embedding ',embedding)
|
1168 |
+
# print('selected_embeddings ',selected_embeddings.shape)
|
1169 |
+
att_1 = self.get_w_ee(embedding, selected_embeddings)
|
1170 |
+
att_1 = att_1.squeeze(1)
|
1171 |
+
#print('att_1 ',att_1.shape)
|
1172 |
+
larger = top_k > self.tao
|
1173 |
+
# print('larger ',larger)
|
1174 |
+
top_k = top_k*larger
|
1175 |
+
# print('top_k ',top_k.shape)
|
1176 |
+
# print('top_k ',top_k)
|
1177 |
+
att_1 = att_1*top_k
|
1178 |
+
#print('att_1 ',att_1.shape)
|
1179 |
+
# assert 1==2
|
1180 |
+
att_2 = att_1.unsqueeze(2).repeat(1,1,128)
|
1181 |
+
Es = selected_embeddings*att_2
|
1182 |
+
return Es
|
1183 |
+
|
1184 |
+
def orcal_EE(self, x, embedding, label):
    """Two-stage detection with event enhancement.

    Stage 1 runs the detector with the raw reference embedding; its frame
    scores pick the top-scoring frames of the mixture, whose embeddings are
    attention-weighted and fused back into the reference embedding.
    Stage 2 re-runs the detector with the enhanced embedding, and the two
    frame-level decisions are blended with a confidence-dependent weight.

    :param x: input feature, (batch, time, dim)
    :param embedding: reference (target sound) embedding, (batch, emb_dim)
    :param label: unused here; kept for interface compatibility
    :returns: blended frame-level decision, (batch, frames, 2)
    """
    batch, time, dim = x.shape

    # Frame-level embeddings of the mixture, batch-normalized over channels.
    mixture_embedding = self.encoder(x) # 8, 125, 128
    mixture_embedding = mixture_embedding.transpose(1,2)
    mixture_embedding = self.bn(mixture_embedding)
    mixture_embedding = mixture_embedding.transpose(1,2)

    # ---- Stage 1: detection with the raw reference embedding ----
    x = x.unsqueeze(1) # (b,1,t,d)
    x = self.detection.features(x) #
    x = x.transpose(1, 2).contiguous().flatten(-2) # flatten the last two dims # (b,125,128)
    embedding_pre = embedding.unsqueeze(1)
    embedding_pre = embedding_pre.repeat(1, x.shape[1], 1)
    f = self.detection.fusion(embedding_pre, x) # the first stage results
    # NOTE: guard flag '_flattened' is never set, so flatten_parameters()
    # runs on every call (harmless, but see forward()).
    if not hasattr(self, '_flattened'):
        self.detection.gru.flatten_parameters()
    f, _ = self.detection.gru(f) # x torch.Size([16, 125, 256])
    f = self.detection.fc(f)
    decision_time = torch.softmax(self.detection.outputlayer(f),dim=2) # x torch.Size([16, 125, 2])

    # Pick the frames the first stage is most confident about (class 0).
    selected_embeddings, top_k = self.select_topk_embeddings(decision_time[:,:,0], mixture_embedding, self.top)

    selected_embeddings = self.sum_with_attention(embedding, top_k, selected_embeddings) # add the weight

    # Fuse the mean of the selected frames with the reference embedding.
    mix_embedding = selected_embeddings.mean(1).unsqueeze(1) #
    mix_embedding = mix_embedding.repeat(1, x.shape[1], 1)
    embedding = embedding.unsqueeze(1)
    embedding = embedding.repeat(1, x.shape[1], 1)
    mix_embedding = self.EE_fusion(mix_embedding, embedding) # fuse via the learned fusion network

    # ---- Stage 2: detection with the enhanced embedding ----
    f_now = self.detection.fusion(mix_embedding, x)
    f_now, _ = self.detection.gru(f_now) # x torch.Size([16, 125, 256])
    f_now = self.detection.fc(f_now)
    decision_time_now = torch.softmax(self.detection.outputlayer(f_now), dim=2) # x torch.Size([16, 125, 2])

    # Blend the two decisions: the higher (thresholded, halved) average
    # top-k confidence, the more weight stage 2 gets.
    top_k = top_k.mean(1) # get avg score,higher score will have more weight
    larger = top_k > self.tao
    top_k = top_k * larger
    top_k = top_k/2.0
    neg_w = top_k.unsqueeze(1).unsqueeze(2)
    neg_w = neg_w.repeat(1, decision_time_now.shape[1], decision_time_now.shape[2])
    pos_w = 1-neg_w
    decision_time_final = decision_time*pos_w + neg_w*decision_time_now
    return decision_time_final
|
1245 |
+
|
1246 |
+
def forward(self, x, ref, label=None):
    """Detect the target sound described by ``ref`` inside the mixture ``x``.

    :param x: mixture features, (batch, time, dim)
    :param ref: reference clip features for the target sound
    :param label: optional labels, forwarded to orcal_EE when enhancement is on
    :returns: (frame decision (B, frames), upsampled decision (B, time, 2), logit)
    """
    batch, time, dim = x.shape
    # Fix: allocate on the input's device instead of hard-coded .cuda(),
    # which crashed on CPU-only machines.
    logit = torch.zeros(1, device=x.device)

    # Build the reference embedding, optionally with attention pooling.
    embeddings = self.encoder(ref)
    mean_embedding = embeddings.mean(1)
    if self.att_pool == True:
        mean_embedding = self.bn(mean_embedding)
        embeddings = embeddings.transpose(1, 2)
        embeddings = self.bn(embeddings)
        embeddings = embeddings.transpose(1, 2)
        embedding = self.attention_pooling(embeddings, mean_embedding)
    else:
        embedding = mean_embedding

    if self.enhancement == True:
        # Event-enhancement path: two-stage decision, then upsample to the
        # original time resolution (e.g. 125 frames -> 501 steps).
        decision_time = self.orcal_EE(x, embedding, label)
        decision_up = torch.nn.functional.interpolate(
            decision_time.transpose(1, 2),  # [16, 2, 125]
            time,                           # 501
            mode='linear',
            align_corners=False).transpose(1, 2)
        return decision_time[:, :, 0], decision_up, logit

    # Plain single-stage path.
    x = x.unsqueeze(1)                                # (b,1,t,d)
    x = self.detection.features(x)
    x = x.transpose(1, 2).contiguous().flatten(-2)    # (b,125,128)
    embedding = embedding.unsqueeze(1)
    embedding = embedding.repeat(1, x.shape[1], 1)
    x = self.detection.fusion(embedding, x)
    if not hasattr(self, '_flattened'):
        self.detection.gru.flatten_parameters()
        # Fix: set the guard so parameters are flattened only once; the
        # original never set it, re-flattening on every forward pass.
        self._flattened = True
    x, _ = self.detection.gru(x)                      # x torch.Size([16, 125, 256])
    x = self.detection.fc(x)
    decision_time = torch.softmax(self.detection.outputlayer(x), dim=2)
    decision_up = torch.nn.functional.interpolate(
        decision_time.transpose(1, 2),
        time,        # 501
        mode='linear',
        align_corners=False).transpose(1, 2)          # upsample 125 -> 501 -> (16,501,2)
    return decision_time[:, :, 0], decision_up, logit
|
audio_detection/target_sound_detection/src/utils.py
ADDED
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# !/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
# @Time : 2021/3/9 16:33
|
4 |
+
# @Author : dongchao yang
|
5 |
+
# @File : train.py
|
6 |
+
|
7 |
+
import collections
import collections.abc
import sys
from pprint import pformat

import numpy as np
import pandas as pd
import scipy
import six
import sklearn.preprocessing as pre
import torch
import tqdm
import yaml
from loguru import logger
from scipy.interpolate import interp1d
|
22 |
+
|
23 |
+
def parse_config_or_kwargs(config_file, **kwargs):
    """parse_config_or_kwargs

    :param config_file: Config file that has parameters, yaml format
    :param **kwargs: Other alternative parameters or overwrites for config
    """
    with open(config_file) as con_read:
        yaml_config = yaml.load(con_read, Loader=yaml.FullLoader)
    # keyword arguments take precedence over values from the file
    return dict(yaml_config, **kwargs)
|
32 |
+
|
33 |
+
|
34 |
+
def find_contiguous_regions(activity_array):
    """Find contiguous True regions in a boolean numpy array.

    Adapted from dcase_util's DecisionEncoder; kept standalone to avoid the
    sndfile import chain that class drags in.

    :param activity_array: 1-d boolean array
    :returns: (n, 2) int array of [onset, offset) index pairs
    """
    # Indices where the value flips (True<->False), shifted to the element
    # after the flip.
    boundaries = np.logical_xor(activity_array[1:], activity_array[:-1]).nonzero()[0] + 1
    if activity_array[0]:
        # Array starts active: region begins at index 0.
        boundaries = np.r_[0, boundaries]
    if activity_array[-1]:
        # Array ends active: region ends at the array length.
        boundaries = np.r_[boundaries, activity_array.size]
    # Pair up onsets and offsets.
    return boundaries.reshape((-1, 2))
|
53 |
+
|
54 |
+
|
55 |
+
def split_train_cv(
        data_frame: pd.DataFrame,
        frac: float = 0.9,
        y=None,  # Only for stratified, computes necessary split
        **kwargs):
    """Split a dataframe into train and cross-validation parts.

    :param data_frame: data to split
    :type data_frame: pd.DataFrame
    :param frac: fraction that goes to the training split
    :type frac: float
    :param y: label matrix, only used by mode='stratified'
    :returns: (train_data, cv_data)
    """
    mode = kwargs.get('mode', None)
    if mode == 'urbansed':  # Filenames are DATA_-1 DATA_-2 etc
        # Group clips that belong to the same original file and split by group
        # so a file never straddles the train/cv boundary.
        data_frame.loc[:, 'id'] = data_frame.groupby(
            data_frame['filename'].str.split('_').apply(
                lambda x: '_'.join(x[:-1]))).ngroup()
        shuffled = np.random.permutation(data_frame['id'].nunique())
        n_train = int(frac * len(shuffled))
        train_data = data_frame[data_frame['id'].isin(shuffled[:n_train])]
        cv_data = data_frame[data_frame['id'].isin(shuffled[n_train:])]
        del train_data['id']
        del cv_data['id']
    elif mode == 'stratified':
        # Multi-label stratified sampling.
        from skmultilearn.model_selection import iterative_train_test_split
        index_train, _, index_cv, _ = iterative_train_test_split(
            data_frame.index.values.reshape(-1, 1), y, test_size=1. - frac)
        train_data = data_frame[data_frame.index.isin(index_train.squeeze())]
        cv_data = data_frame[data_frame.index.isin(index_cv.squeeze())]
    else:
        # Plain random split with a fixed seed for reproducibility.
        train_data = data_frame.sample(frac=frac, random_state=10)
        cv_data = data_frame[~data_frame.index.isin(train_data.index)]
    return train_data, cv_data
|
92 |
+
|
93 |
+
|
94 |
+
|
95 |
+
def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'):
    """pprint_dict

    Render a dict and emit it line by line through *outputfun*.

    :param in_dict: dict to print
    :param outputfun: function to use, defaults to sys.stdout
    :param formatter: 'yaml' (yaml.dump) or 'pretty' (pprint.pformat)
    """
    if formatter == 'pretty':
        rendered = pformat(in_dict)
    elif formatter == 'yaml':
        rendered = yaml.dump(in_dict)
    for line in rendered.split('\n'):
        outputfun(line)
|
106 |
+
|
107 |
+
|
108 |
+
def getfile_outlogger(outputfile):
    """Configure loguru: always log to stderr, and additionally to a file
    when *outputfile* is truthy.

    :param outputfile: path of the log file, or None/'' to skip file logging
    :returns: the configured loguru logger
    """
    log_format = "[<green>{time:YYYY-MM-DD HH:mm:ss}</green>] {message}"
    handlers = [{"sink": sys.stderr, "format": log_format}]
    logger.configure(handlers=handlers)
    if outputfile:
        # enqueue=True makes the file sink safe across processes
        logger.add(outputfile, enqueue=True, format=log_format)
    return logger
|
114 |
+
|
115 |
+
# according label, get encoder
|
116 |
+
def train_labelencoder(labels: pd.Series, sparse=True):
    """Fit a MultiLabelBinarizer on a series of raw labels.

    :param labels: pd.Series of raw labels, e.g. 'Speech,Water' strings,
        numpy arrays, or other iterables of label names
    :param sparse: passed to the encoder as sparse_output
    :returns: the fitted encoder
    :raises ValueError: if the label element type is unsupported
    """
    assert isinstance(labels, pd.Series), "Labels need to be series"
    # Fix: use positional access; labels[0] fails on series whose index does
    # not contain 0 (matches encode_labels below).
    instance = labels.iloc[0]
    if isinstance(instance, six.string_types):
        # In case of using non processed strings, e.g., Vaccum, Speech
        label_array = labels.str.split(',').values.tolist()
    elif isinstance(instance, np.ndarray):
        # Encoder does not like to see numpy array
        label_array = [lab.tolist() for lab in labels]
    elif isinstance(instance, collections.abc.Iterable):
        # Fix: collections.Iterable was removed in Python 3.10.
        label_array = labels
    else:
        # Fix: previously fell through and raised NameError on label_array.
        raise ValueError(
            "Unsupported label element type: {}".format(type(instance)))
    encoder = pre.MultiLabelBinarizer(sparse_output=sparse)
    encoder.fit(label_array)
    return encoder
|
137 |
+
|
138 |
+
|
139 |
+
def encode_labels(labels: pd.Series, encoder=None, sparse=True):
    """Encode raw labels into a many-hot matrix.

    :param labels: pd.Series representing the raw labels e.g., Speech, Water
    :param encoder: optional encoder already fitted; a new one is fitted when None
    :param sparse: sparse_output flag for a newly created encoder
    :returns: (encoded labels (many hot), the encoder)
    :raises ValueError: if the label element type is unsupported
    """
    assert isinstance(labels, pd.Series), "Labels need to be series"
    instance = labels.iloc[0]
    if isinstance(instance, six.string_types):
        # In case of using non processed strings, e.g., Vaccum, Speech
        label_array = labels.str.split(',').values.tolist()
    elif isinstance(instance, np.ndarray):
        # Encoder does not like to see numpy array
        label_array = [lab.tolist() for lab in labels]
    elif isinstance(instance, collections.abc.Iterable):
        # Fix: collections.Iterable was removed in Python 3.10.
        label_array = labels
    else:
        # Fix: previously fell through and raised NameError on label_array.
        raise ValueError(
            "Unsupported label element type: {}".format(type(instance)))
    if encoder is None:  # fix: explicit None check instead of truthiness
        encoder = pre.MultiLabelBinarizer(sparse_output=sparse)
        encoder.fit(label_array)
    labels_encoded = encoder.transform(label_array)  # strings -> many-hot
    return labels_encoded, encoder
|
164 |
+
|
165 |
+
# return pd.arrays.SparseArray(
|
166 |
+
# [row.toarray().ravel() for row in labels_encoded]), encoder
|
167 |
+
|
168 |
+
|
169 |
+
def decode_with_timestamps(events, labels: np.array):
    """decode_with_timestamps
    Decodes the predicted label array (2d) into a list of
    [(Labelname, onset, offset), ...]

    :param events: event name(s); indexed per row for batched input
    :param labels: 1-d or 2-d binary activity array
    :type labels: np.array
    """
    if labels.ndim == 2:
        # Batched: decode each row against its matching event name.
        return [_decode_with_timestamps(events[i], labels[i])
                for i in range(labels.shape[0])]
    return _decode_with_timestamps(events, labels)
|
187 |
+
|
188 |
+
|
189 |
+
def median_filter(x, window_size, threshold=0.5):
    """Binarize predictions, then median-filter along the time axis.

    :param x: input prediction array of shape (B, T, C), (B, T), (T, C)
        or (T,). Input is a sequence of probabilities 0 <= x <= 1
    :param window_size: An integer filter window to use
    :param threshold: Binary thresholding threshold
    :raises ValueError: for inputs with unsupported dimensionality
        (previously this fell through and raised NameError on `size`)
    """
    x = binarize(x, threshold=threshold)  # values become 0/1
    if x.ndim == 3:
        size = (1, window_size, 1)
    elif x.ndim == 2 and x.shape[0] == 1:
        # Assume input is class-specific median filtering
        # E.g, Batch x Time [1, 501]
        size = (1, window_size)
    elif x.ndim == 2 and x.shape[0] > 1:
        # Assume input is standard median pooling, class-independent
        # E.g., Time x Class [501, 10]
        size = (window_size, 1)
    elif x.ndim == 1:
        # Generalization: plain time series previously hit a NameError.
        size = (window_size,)
    else:
        raise ValueError("Unsupported input shape {}".format(x.shape))
    return scipy.ndimage.median_filter(x, size=size)
|
208 |
+
|
209 |
+
|
210 |
+
def _decode_with_timestamps(events, labels):
    """Turn one binary activity row into [(event, onset, offset), ...].

    :param events: event name attached to every region
    :param labels: 1-d binary activity array
    """
    regions = find_contiguous_regions(labels)
    # One tuple per contiguous active region.
    return [(events, onset, offset) for onset, offset in regions]
|
221 |
+
|
222 |
+
def inverse_transform_labels(encoder, pred):
    """Map encoded predictions back to label names.

    :param encoder: fitted encoder exposing inverse_transform
    :param pred: 2-d prediction matrix, or 3-d for batched input
    """
    if pred.ndim == 3:
        # Batched: invert each sample separately.
        return [encoder.inverse_transform(sample) for sample in pred]
    return encoder.inverse_transform(pred)
|
227 |
+
|
228 |
+
|
229 |
+
def binarize(pred, threshold=0.5):
    """Threshold probabilities into 0/1, per-sample for batched input.

    :param pred: 2-d prediction matrix, or 3-d (batch-wise) array
    :param threshold: values strictly above become 1
    """
    if pred.ndim == 3:
        # Batch_wise: binarize each sample, then re-stack.
        return np.array(
            [pre.binarize(sample, threshold=threshold) for sample in pred])
    return pre.binarize(pred, threshold=threshold)
|
236 |
+
|
237 |
+
|
238 |
+
def double_threshold(x, high_thres, low_thres, n_connect=1):
    """double_threshold
    Helper to apply hysteresis (double) thresholding to n-dim arrays.

    :param x: input array, up to 3 dims:
        3d (batch, time, dim), 2d (time, dim), 1d (time)
    :param high_thres: high threshold value
    :param low_thres: Low threshold value
    :param n_connect: Distance of <= n clusters will be merged
    """
    assert x.ndim <= 3, "Whoops something went wrong with the input ({}), check if its <= 3 dims".format(
        x.shape)
    # Time axis sits at 1 for 3d input and at 0 for 1d/2d input.
    apply_dim = 1 if x.ndim == 3 else 0
    return np.apply_along_axis(
        lambda arr: _double_threshold(arr, high_thres, low_thres,
                                      n_connect=n_connect),
        axis=apply_dim,
        arr=x)
|
261 |
+
|
262 |
+
|
263 |
+
def _double_threshold(x, high_thres, low_thres, n_connect=1, return_arr=True):
    """_double_threshold
    Hysteresis thresholding over a 1-d score array: keep every region above
    the low threshold that contains at least one sample above the high
    threshold, then merge regions closer than n_connect.

    :param x: input array, needs to be 1d
    :param high_thres: High threshold over the array
    :param low_thres: Low threshold over the array
    :param n_connect: Postprocessing, maximal distance between clusters to connect
    :param return_arr: if True return a 0/1 array of x's size, otherwise the
        kept (onset, offset) pairs
    """
    assert x.ndim == 1, "Input needs to be 1d"
    high_locations = np.where(x > high_thres)[0]  # indices above the high threshold
    candidate_regions = find_contiguous_regions(x > low_thres)
    # Keep candidates that contain at least one high-threshold hit.
    kept = list(
        filter(
            lambda pair:
            ((pair[0] <= high_locations) & (high_locations <= pair[1])).any(),
            candidate_regions))
    kept = connect_(kept, n_connect)  # bridge gaps of <= n_connect frames
    if not return_arr:
        return kept
    zero_one_arr = np.zeros_like(x, dtype=int)
    for onset, offset in kept:
        zero_one_arr[onset:offset] = 1
    return zero_one_arr
|
291 |
+
|
292 |
+
|
293 |
+
def connect_clusters(x, n=1):
    """Apply connect_clusters_ to 1-d input, or along axis -2 for nd input.

    NOTE(review): axis -2 walks down columns for a 2-d (time, class) array —
    verify that orientation matches the callers' expectations.
    """
    if x.ndim == 1:
        return connect_clusters_(x, n)
    if x.ndim >= 2:
        return np.apply_along_axis(lambda a: connect_clusters_(a, n=n), -2, x)
|
298 |
+
|
299 |
+
|
300 |
+
def connect_clusters_(x, n=1):
    """connect_clusters_
    Connects clustered predictions (0,1) in x with range n

    :param x: Input array. zero-one format
    :param n: Number of frames to skip until connection can be made
    """
    assert x.ndim == 1, "input needs to be 1d"
    merged = connect_(find_contiguous_regions(x), n=n)
    zero_one_arr = np.zeros_like(x, dtype=int)
    for onset, offset in merged:
        zero_one_arr[onset:offset] = 1
    return zero_one_arr
|
314 |
+
|
315 |
+
|
316 |
+
def connect_(pairs, n=1):
    """connect_
    Merge adjacent clusters whose gap is <= n.

    :param pairs: Clusters of iterateables e.g., [(1,5),(7,10)]
    :param n: distance between two clusters
    """
    # len() check on purpose: pairs may be a numpy array, where bool() fails.
    if len(pairs) == 0:
        return []
    merged = []
    start_, end_ = pairs[0]
    for cur_item, next_item in zip(pairs, pairs[1:]):
        end_ = next_item[1]
        if next_item[0] - cur_item[1] > n:
            # Gap too wide: close the running cluster, start a new one.
            merged.append((start_, cur_item[1]))
            start_ = next_item[0]
    merged.append((start_, end_))
    return merged
|
336 |
+
|
337 |
+
|
338 |
+
def predictions_to_time(df, ratio):
    """Scale the onset/offset columns from frame units to time.

    Mutates *df* in place and returns it for convenience.

    :param df: dataframe with 'onset' and 'offset' columns
    :param ratio: seconds per frame
    """
    for col in ('onset', 'offset'):
        df[col] = df[col] * ratio
    return df
|
342 |
+
|
343 |
+
def upgrade_resolution(arr, scale):
    """Linearly interpolate *arr* along axis 0 to *scale* times the
    time resolution.

    :param arr: array whose first axis is time
    :param scale: integer upsampling factor
    :returns: array with arr.shape[0] * scale frames (linearly extrapolated
        past the last original frame)
    """
    # Fix: removed a stray debug print of arr.shape that spammed stdout.
    x = np.arange(0, arr.shape[0])
    f = interp1d(x, arr, kind='linear', axis=0, fill_value='extrapolate')
    scale_x = np.arange(0, arr.shape[0], 1 / scale)
    return f(scale_x)
|
350 |
+
# a = [0.1,0.2,0.3,0.8,0.4,0.1,0.3,0.9,0.4]
|
351 |
+
# a = np.array(a)
|
352 |
+
# b = a>0.2
|
353 |
+
# _double_threshold(a,0.7,0.2)
|
audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5e4525ad12621117c3a0fcfe974fd55e51583cd219106bf510438f4bec4edc18
|
3 |
+
size 140604911
|
audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1331dab1e4c3ac2bc5850156f2000a95fe333bdf06d08ce9b490550726548ab0
|
3 |
+
size 2479
|
audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c9b44e30c4800462c177806bbd7009953d70d531c873e3791ca9aa85375d524d
|
3 |
+
size 343538489
|
audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de482358747778181e4dc530ec61ae94f53ae0b202ac92e99491fe4ceb3cbb1c
|
3 |
+
size 255398
|