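# NOTE: the following lines are web-page residue captured together with the file;
# they are kept as comments so that the module remains valid Python.
# asahi417's picture
# Saving train state of step 2500
# 61a08ca
# raw
# history blame
# 2.85 kB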
"""custom HF data loader to load a large audio dataset from local
- Run `reazon_downloader.py` first to download the desired dataset size (one of ["tiny", "small", "medium", "large", "all"]) to the local disk.
- Credit: https://huggingface.co/datasets/reazon-research/reazonspeech/blob/main/reazonspeech.py
Example:
```
import os
from datasets import load_dataset
dataset = load_dataset(
f"{os.getcwd()}/reazon_custom_loader.py",
"tiny",
split="train",
trust_remote_code=True
)
```
"""
import os
from glob import glob
import datasets
from datasets.tasks import AutomaticSpeechRecognition
# Names of the selectable dataset sizes; each entry is used as a builder config name.
_SIZE = ["tiny", "small", "medium", "large", "all"]
class ReazonSpeechConfig(datasets.BuilderConfig):
    """Builder configuration for one dataset size; adds nothing on top of the base config."""

    def __init__(self, *args, **kwargs):
        """Forward all positional and keyword arguments unchanged to ``datasets.BuilderConfig``."""
        super(ReazonSpeechConfig, self).__init__(*args, **kwargs)
class ReazonSpeech(datasets.GeneratorBasedBuilder):
    """Dataset builder that reads manually downloaded audio archives from the local disk.

    The builder expects ``~/.cache/reazon_manual_download/<config name>/`` to
    contain the audio ``*.tar`` archives together with the transcript file
    ``<config name>.<config name>.tsv``. Only a ``train`` split is produced.
    """

    BUILDER_CONFIGS = [ReazonSpeechConfig(name=name) for name in _SIZE]
    DEFAULT_CONFIG_NAME = "tiny"
    # NOTE(review): presumably kept small because every example carries the raw
    # audio bytes -- confirm against the memory budget of the target machine.
    DEFAULT_WRITER_BATCH_SIZE = 256

    def _info(self):
        """Return the dataset schema: file name, 16 kHz audio and transcription."""
        return datasets.DatasetInfo(
            task_templates=[AutomaticSpeechRecognition()],
            features=datasets.Features(
                {
                    "name": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16000),
                    "transcription": datasets.Value("string"),
                }
            )
        )

    def _split_generators(self, dl_manager):
        """Locate the local archives and the transcript and declare the train split.

        Args:
            dl_manager: download manager supplied by ``datasets``; only its
                ``iter_archive`` method is used, once per tar archive.

        Returns:
            A one-element list holding the ``train`` split generator.
        """
        data_dir = f"{os.path.expanduser('~')}/.cache/reazon_manual_download/{self.config.name}"
        # glob() returns paths in arbitrary, filesystem-dependent order; sorting
        # makes the order of the generated examples reproducible.
        audio_files = sorted(glob(f"{data_dir}/*.tar"))
        audio = [dl_manager.iter_archive(path) for path in audio_files]
        transcript_file = f"{data_dir}/{self.config.name}.{self.config.name}.tsv"
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"audio_files": audio_files, "transcript_file": transcript_file, "audio": audio},
            ),
        ]

    def _generate_examples(self, audio_files, transcript_file, audio):
        """Yield ``(key, example)`` pairs for every archived file that has a transcript.

        Args:
            audio_files: paths of the tar archives, aligned index-by-index with ``audio``.
            transcript_file: path of a UTF-8 file holding one tab-separated
                ``filename``/``transcription`` row per line.
            audio: one iterable per archive, each yielding
                ``(member name, readable file object)`` pairs.

        Yields:
            The member name as key and a dict with ``name``, ``audio``
            (``path`` and raw ``bytes``) and ``transcription``.

        Raises:
            ValueError: if a non-blank transcript line contains no tab separator.
        """
        # Lookup table: audio file name -> transcription.
        meta = {}
        with open(transcript_file, "r", encoding="utf-8") as fp:
            for line_number, line in enumerate(fp, start=1):
                line = line.rstrip("\n")
                if not line:  # tolerate blank lines, e.g. a stray newline at EOF
                    continue
                # Split on the first tab only so a transcription that itself
                # contains a tab is kept intact instead of breaking the unpacking.
                filename, separator, transcription = line.partition("\t")
                if not separator:
                    raise ValueError(
                        f"{transcript_file}:{line_number}: expected a tab-separated "
                        f"'filename<TAB>transcription' row, got {line!r}"
                    )
                meta[filename] = transcription

        # Walk every archive together with the path it was opened from.
        for archive_path, members in zip(audio_files, audio):
            for filename, file in members:
                # Remove only a literal "./" prefix. str.lstrip("./") strips any
                # leading run of "." and "/" characters and would corrupt names
                # such as ".hidden.flac".
                if filename.startswith("./"):
                    filename = filename[2:]
                transcription = meta.get(filename)
                if transcription is None:  # skip audio without transcription
                    continue
                yield filename, {
                    "name": filename,
                    "audio": {"path": os.path.join(archive_path, filename), "bytes": file.read()},
                    "transcription": transcription,
                }