|
"""custom HF data loader to load a large audio dataset from local |
|
- run `reazon_downloader.py` first to download the desired data type (["tiny", "small", "medium", "large", "all"]) locally. |
|
- credit: https://huggingface.co/datasets/reazon-research/reazonspeech/blob/main/reazonspeech.py |
|
|
|
Example: |
|
``` |
|
import os |
|
from datasets import load_dataset |
|
|
|
dataset = load_dataset( |
|
f"{os.getcwd()}/reazon_custom_loader.py", |
|
"tiny", |
|
split="train", |
|
trust_remote_code=True |
|
) |
|
``` |
|
""" |
|
import os |
|
from glob import glob |
|
|
|
import datasets |
|
from datasets.tasks import AutomaticSpeechRecognition |
|
|
|
_SIZE = ["tiny", "small", "medium", "large", "all"] |
|
|
|
|
|
class ReazonSpeechConfig(datasets.BuilderConfig): |
|
|
|
def __init__(self, *args, **kwargs): |
|
super().__init__(*args, **kwargs) |
|
|
|
|
|
class ReazonSpeech(datasets.GeneratorBasedBuilder): |
|
BUILDER_CONFIGS = [ReazonSpeechConfig(name=name) for name in _SIZE] |
|
DEFAULT_CONFIG_NAME = "tiny" |
|
DEFAULT_WRITER_BATCH_SIZE = 256 |
|
|
|
def _info(self): |
|
return datasets.DatasetInfo( |
|
task_templates=[AutomaticSpeechRecognition()], |
|
features=datasets.Features( |
|
{ |
|
"name": datasets.Value("string"), |
|
"audio": datasets.Audio(sampling_rate=16000), |
|
"transcription": datasets.Value("string"), |
|
} |
|
) |
|
) |
|
|
|
def _split_generators(self, dl_manager): |
|
data_dir = f"{os.path.expanduser('~')}/.cache/reazon_manual_download/{self.config.name}" |
|
audio_files = glob(f"{data_dir}/*.tar") |
|
audio = [dl_manager.iter_archive(path) for path in audio_files] |
|
transcript_file = f"{data_dir}/{self.config.name}.{self.config.name}.tsv" |
|
return [ |
|
datasets.SplitGenerator( |
|
name=datasets.Split.TRAIN, |
|
gen_kwargs={"audio_files": audio_files, "transcript_file": transcript_file, "audio": audio}, |
|
), |
|
] |
|
|
|
def _generate_examples(self, audio_files, transcript_file, audio): |
|
|
|
|
|
meta = {} |
|
with open(transcript_file, "r", encoding="utf-8") as fp: |
|
for line in fp: |
|
filename, transcription = line.rstrip("\n").split("\t") |
|
meta[filename] = transcription |
|
|
|
|
|
for i, audio_single_dump in enumerate(audio): |
|
for filename, file in audio_single_dump: |
|
filename = filename.lstrip("./") |
|
if filename not in meta: |
|
continue |
|
yield filename, { |
|
"name": filename, |
|
"audio": {"path": os.path.join(audio_files[i], filename), "bytes": file.read()}, |
|
"transcription": meta[filename], |
|
} |
|
|