keithhon committed
Commit 9e6195f
Parent: b093d90

Upload encoder/preprocess.py with huggingface_hub

Files changed (1)
  1. encoder/preprocess.py +175 -0
encoder/preprocess.py ADDED
@@ -0,0 +1,175 @@
+from multiprocess.pool import ThreadPool
+from encoder.params_data import *
+from encoder.config import librispeech_datasets, anglophone_nationalites
+from datetime import datetime
+from encoder import audio
+from pathlib import Path
+from tqdm import tqdm
+import numpy as np
+
+
+class DatasetLog:
+    """
+    Registers metadata about the dataset in a text file.
+    """
+    def __init__(self, root, name):
+        self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
+        self.sample_data = dict()
+
+        start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
+        self.write_line("Creating dataset %s on %s" % (name, start_time))
+        self.write_line("-----")
+        self._log_params()
+
+    def _log_params(self):
+        from encoder import params_data
+        self.write_line("Parameter values:")
+        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
+            value = getattr(params_data, param_name)
+            self.write_line("\t%s: %s" % (param_name, value))
+        self.write_line("-----")
+
+    def write_line(self, line):
+        self.text_file.write("%s\n" % line)
+
+    def add_sample(self, **kwargs):
+        for param_name, value in kwargs.items():
+            if param_name not in self.sample_data:
+                self.sample_data[param_name] = []
+            self.sample_data[param_name].append(value)
+
+    def finalize(self):
+        self.write_line("Statistics:")
+        for param_name, values in self.sample_data.items():
+            self.write_line("\t%s:" % param_name)
+            self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
+            self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
+        self.write_line("-----")
+        end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
+        self.write_line("Finished on %s" % end_time)
+        self.text_file.close()
+
+
+def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
+    dataset_root = datasets_root.joinpath(dataset_name)
+    if not dataset_root.exists():
+        print("Couldn't find %s, skipping this dataset." % dataset_root)
+        return None, None
+    return dataset_root, DatasetLog(out_dir, dataset_name)
+
+
+def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
+                             skip_existing, logger):
+    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
+
+    # Function to preprocess utterances for one speaker
+    def preprocess_speaker(speaker_dir: Path):
+        # Give a name to the speaker that includes its dataset
+        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
+
+        # Create an output directory with that name, as well as a txt file containing a
+        # reference to each source file.
+        speaker_out_dir = out_dir.joinpath(speaker_name)
+        speaker_out_dir.mkdir(exist_ok=True)
+        sources_fpath = speaker_out_dir.joinpath("_sources.txt")
+
+        # There's a possibility that the preprocessing was interrupted earlier, check if
+        # there already is a sources file.
+        if sources_fpath.exists():
+            try:
+                with sources_fpath.open("r") as sources_file:
+                    existing_fnames = {line.split(",")[0] for line in sources_file}
+            except Exception:
+                existing_fnames = set()
+        else:
+            existing_fnames = set()
+
+        # Gather all audio files for that speaker recursively
+        sources_file = sources_fpath.open("a" if skip_existing else "w")
+        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+            # Check if the target output file already exists
+            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
+            out_fname = out_fname.replace(".%s" % extension, ".npy")
+            if skip_existing and out_fname in existing_fnames:
+                continue
+
+            # Load and preprocess the waveform
+            wav = audio.preprocess_wav(in_fpath)
+            if len(wav) == 0:
+                continue
+
+            # Create the mel spectrogram, discard those that are too short
+            frames = audio.wav_to_mel_spectrogram(wav)
+            if len(frames) < partials_n_frames:
+                continue
+
+            out_fpath = speaker_out_dir.joinpath(out_fname)
+            np.save(out_fpath, frames)
+            logger.add_sample(duration=len(wav) / sampling_rate)
+            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
+
+        sources_file.close()
+
+    # Process the utterances for each speaker
+    with ThreadPool(8) as pool:
+        list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
+                  unit="speakers"))
+    logger.finalize()
+    print("Done preprocessing %s.\n" % dataset_name)
+
+
+def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
+    for dataset_name in librispeech_datasets["train"]["other"]:
+        # Initialize the preprocessing
+        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+        if not dataset_root:
+            continue
+
+        # Preprocess all speakers
+        speaker_dirs = list(dataset_root.glob("*"))
+        _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
+                                 skip_existing, logger)
+
+
+def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
+    # Initialize the preprocessing
+    dataset_name = "VoxCeleb1"
+    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+    if not dataset_root:
+        return
+
+    # Get the contents of the meta file
+    with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
+        metadata = [line.split("\t") for line in metafile][1:]
+
+    # Select the ID and the nationality, filter out non-anglophone speakers
+    nationalities = {line[0]: line[3] for line in metadata}
+    keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
+                        nationality.lower() in anglophone_nationalites]
+    print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
+          (len(keep_speaker_ids), len(nationalities)))
+
+    # Get the speaker directories for anglophone speakers only
+    speaker_dirs = dataset_root.joinpath("wav").glob("*")
+    speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
+                    speaker_dir.name in keep_speaker_ids]
+    print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
+          (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
+
+    # Preprocess all speakers
+    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
+                             skip_existing, logger)
+
+
+def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
+    # Initialize the preprocessing
+    dataset_name = "VoxCeleb2"
+    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
+    if not dataset_root:
+        return
+
+    # Get the speaker directories
+    # Preprocess all speakers
+    speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
+    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a",
+                             skip_existing, logger)
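For context, here is a minimal sketch of how these entry points might be driven from a script. The directory layout below is an assumption, not part of this commit: datasets_root is a placeholder for wherever LibriSpeech, VoxCeleb1 and VoxCeleb2 live, and the output path mirrors the SV2TTS convention this encoder follows.

    from pathlib import Path

    from encoder.preprocess import (preprocess_librispeech, preprocess_voxceleb1,
                                    preprocess_voxceleb2)

    # Placeholder paths (assumptions): adjust to your own dataset layout.
    datasets_root = Path("datasets")
    out_dir = datasets_root.joinpath("SV2TTS", "encoder")
    out_dir.mkdir(parents=True, exist_ok=True)  # DatasetLog opens its log file here

    # Each function prints a notice and returns cleanly if its dataset directory
    # is absent; skip_existing=True resumes an interrupted run by consulting the
    # per-speaker _sources.txt files written during preprocessing.
    for preprocess_fn in (preprocess_librispeech, preprocess_voxceleb1,
                          preprocess_voxceleb2):
        preprocess_fn(datasets_root, out_dir, skip_existing=True)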