Commit: delete reazon_custom_loader.py (81 lines removed). The deleted file's contents follow.
|
|
1 |
-
"""Custom HF data loader to load a large audio dataset from local storage.

Run `reazon_downloader.py` first to download the desired data type
(["tiny", "small", "medium", "large", "all"]) locally.

Credit: https://huggingface.co/datasets/reazon-research/reazonspeech/blob/main/reazonspeech.py

Example:
    ```
    import os
    from datasets import load_dataset

    dataset = load_dataset(
        f"{os.getcwd()}/reazon_custom_loader.py",
        "tiny",
        split="train",
        trust_remote_code=True,
    )
    ```
"""
|
18 |
-
import os
|
19 |
-
from glob import glob
|
20 |
-
|
21 |
-
import datasets
|
22 |
-
from datasets.tasks import AutomaticSpeechRecognition
|
23 |
-
|
24 |
-
# Valid config names; each corresponds to a size variant produced by reazon_downloader.py.
_SIZE = ["tiny", "small", "medium", "large", "all"]
|
25 |
-
|
26 |
-
|
27 |
-
class ReazonSpeechConfig(datasets.BuilderConfig):
    """BuilderConfig for the ReazonSpeech builder; one instance is created per name in _SIZE."""

    def __init__(self, *args, **kwargs):
        # Pure pass-through: all configuration handling is done by the base class.
        super().__init__(*args, **kwargs)
|
31 |
-
|
32 |
-
|
33 |
-
class ReazonSpeech(datasets.GeneratorBasedBuilder):
    """Builder that yields (name, audio, transcription) examples from locally
    downloaded ReazonSpeech tar archives (see `reazon_downloader.py`).
    """

    BUILDER_CONFIGS = [ReazonSpeechConfig(name=name) for name in _SIZE]
    DEFAULT_CONFIG_NAME = "tiny"
    # Larger writer batches amortize serialization overhead for audio bytes.
    DEFAULT_WRITER_BATCH_SIZE = 256

    def _info(self):
        """Describe the schema exposed to `datasets` consumers."""
        return datasets.DatasetInfo(
            task_templates=[AutomaticSpeechRecognition()],
            features=datasets.Features(
                {
                    "name": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16000),
                    "transcription": datasets.Value("string"),
                }
            )
        )

    def _split_generators(self, dl_manager):
        """Locate the manually downloaded tar archives and transcript TSV.

        Expects `reazon_downloader.py` to have placed files under
        ``~/.cache/reazon_manual_download/<config name>/``.
        """
        data_dir = f"{os.path.expanduser('~')}/.cache/reazon_manual_download/{self.config.name}"
        audio_files = glob(f"{data_dir}/*.tar")
        # One lazy archive iterator per tar; audio[i] iterates members of audio_files[i].
        audio = [dl_manager.iter_archive(path) for path in audio_files]
        # NOTE(review): the doubled "<name>.<name>.tsv" pattern presumably mirrors
        # the downloader's output naming — confirm against reazon_downloader.py.
        transcript_file = f"{data_dir}/{self.config.name}.{self.config.name}.tsv"
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"audio_files": audio_files, "transcript_file": transcript_file, "audio": audio},
            ),
        ]

    def _generate_examples(self, audio_files, transcript_file, audio):
        """Yield one example per tar member that has a transcription.

        Fixes vs. the original:
        - ``filename.lstrip("./")`` stripped ANY leading run of '.' and '/'
          characters, corrupting member names such as "..foo" or ".hidden";
          now only the literal "./" tar prefix is removed, once.
        - Transcript lines are split on the first tab only, so a transcription
          containing a tab no longer raises ValueError; blank lines are skipped.
        """
        # hashTable of a file and the associated transcript
        meta = {}
        with open(transcript_file, "r", encoding="utf-8") as fp:
            for line in fp:
                line = line.rstrip("\n")
                if not line:
                    continue  # tolerate blank/trailing lines
                filename, transcription = line.split("\t", 1)
                meta[filename] = transcription

        # iterator over audio
        for i, audio_single_dump in enumerate(audio):
            for filename, file in audio_single_dump:
                # Tar members are commonly stored as "./<name>"; drop exactly
                # that prefix (str.lstrip would eat legitimate leading dots).
                if filename.startswith("./"):
                    filename = filename[2:]
                if filename not in meta:  # skip audio without transcription
                    continue
                yield filename, {
                    "name": filename,
                    "audio": {"path": os.path.join(audio_files[i], filename), "bytes": file.read()},
                    "transcription": meta[filename],
                }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|