anton-l
/

common_voice_generator

Model card Files Files and versions Community

common_voice_generator / dataset_script.py

anton-l HF staff

fix memory issue (#1)

cdccd9a over 2 years ago

raw

history blame

11.4 kB

	# coding=utf-8
	# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	""" Common Voice Dataset"""


	import csv
	import os
	import urllib

	import datasets
	import requests
	from datasets.utils.py_utils import size_str
	from huggingface_hub import HfApi, HfFolder

	from .languages import LANGUAGES
	from .release_stats import STATS

	_CITATION = """\
	@inproceedings{commonvoice:2020,
	author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},
	title = {Common Voice: A Massively-Multilingual Speech Corpus},
	booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
	pages = {4211--4215},
	year = 2020
	}
	"""

	_HOMEPAGE = "https://commonvoice.mozilla.org/en/datasets"

	_LICENSE = "https://creativecommons.org/publicdomain/zero/1.0/"

	_API_URL = "https://commonvoice.mozilla.org/api/v1"


	class CommonVoiceConfig(datasets.BuilderConfig):
	    """Builder configuration for one Common Voice locale.

	    On top of ``name`` and ``version``, the constructor accepts these optional
	    keyword arguments, each stored as an attribute of the same name and
	    defaulting to ``None``: ``language``, ``release_date``, ``num_clips``,
	    ``num_speakers``, ``validated_hr``, ``total_hr`` and ``size_bytes``.
	    ``size_human`` is derived from ``size_bytes`` via ``size_str``. Any other
	    keyword argument is forwarded to ``datasets.BuilderConfig``.
	    """

	    def __init__(self, name, version, **kwargs):
	        # Pop the statistics out of kwargs so that only arguments the parent
	        # class is meant to receive are forwarded to it below.
	        for stat_name in (
	            "language",
	            "release_date",
	            "num_clips",
	            "num_speakers",
	            "validated_hr",
	            "total_hr",
	            "size_bytes",
	        ):
	            setattr(self, stat_name, kwargs.pop(stat_name, None))
	        self.size_human = size_str(self.size_bytes)

	        # Human-readable summary assembled from the statistics above.
	        sentences = [
	            f"Common Voice speech to text dataset in {self.language} released on {self.release_date}. ",
	            f"The dataset comprises {self.validated_hr} hours of validated transcribed speech data ",
	            f"out of {self.total_hr} hours in total from {self.num_speakers} speakers. ",
	            f"The dataset contains {self.num_clips} audio clips and has a size of {self.size_human}.",
	        ]
	        parsed_version = datasets.Version(version)

	        super().__init__(
	            name=name,
	            version=parsed_version,
	            description="".join(sentences),
	            **kwargs,
	        )


	class CommonVoice(datasets.GeneratorBasedBuilder):
	    """Dataset builder for Common Voice, exposing one config per locale in ``STATS``.

	    The release bundle for the selected locale is downloaded as a single
	    archive. Each split reads its own TSV metadata file from that archive and
	    pairs the rows with the audio clips found under the ``clips`` directory.
	    """

	    DEFAULT_CONFIG_NAME = "en"
	    DEFAULT_WRITER_BATCH_SIZE = 1000

	    # One configuration per locale; empty/zero statistics are mapped to None.
	    BUILDER_CONFIGS = [
	        CommonVoiceConfig(
	            name=lang,
	            version=STATS["version"],
	            language=LANGUAGES[lang],
	            release_date=STATS["date"],
	            num_clips=lang_stats["clips"],
	            num_speakers=lang_stats["users"],
	            validated_hr=float(lang_stats["validHrs"]) if lang_stats["validHrs"] else None,
	            total_hr=float(lang_stats["totalHrs"]) if lang_stats["totalHrs"] else None,
	            size_bytes=int(lang_stats["size"]) if lang_stats["size"] else None,
	        )
	        for lang, lang_stats in STATS["locales"].items()
	    ]

	    def _info(self):
	        """Return the ``datasets.DatasetInfo`` (description, features, license, ...)."""
	        total_languages = len(STATS["locales"])
	        total_valid_hours = STATS["totalValidHrs"]
	        description = (
	            "Common Voice is Mozilla's initiative to help teach machines how real people speak. "
	            f"The dataset currently consists of {total_valid_hours} validated hours of speech "
	            f" in {total_languages} languages, but more voices and languages are always added."
	        )
	        features = datasets.Features(
	            {
	                "client_id": datasets.Value("string"),
	                "path": datasets.Value("string"),
	                "audio": datasets.features.Audio(sampling_rate=48_000),
	                "sentence": datasets.Value("string"),
	                "up_votes": datasets.Value("int64"),
	                "down_votes": datasets.Value("int64"),
	                "age": datasets.Value("string"),
	                "gender": datasets.Value("string"),
	                "accent": datasets.Value("string"),
	                "locale": datasets.Value("string"),
	                "segment": datasets.Value("string"),
	            }
	        )

	        return datasets.DatasetInfo(
	            description=description,
	            features=features,
	            supervised_keys=None,
	            homepage=_HOMEPAGE,
	            license=_LICENSE,
	            citation=_CITATION,
	            version=self.config.version,
	            # task_templates=[
	            # AutomaticSpeechRecognition(audio_file_path_column="path", transcription_column="sentence")
	            # ],
	        )

	    def _get_bundle_url(self, locale, url_template):
	        """Ask the Common Voice API for the download URL of a locale's bundle.

	        Args:
	            locale: Locale code substituted for ``{locale}`` in ``url_template``.
	            url_template: Bundle path template taken from the release stats.

	        Returns:
	            The value of the ``"url"`` key of the API's JSON response.

	        Raises:
	            requests.HTTPError: If the API answers with an error status.
	        """
	        # Imported here because the module-level ``import urllib`` does not by
	        # itself guarantee that the ``urllib.parse`` submodule is loaded.
	        from urllib.parse import quote

	        path = url_template.replace("{locale}", locale)
	        # Equivalent of JavaScript's encodeURIComponent(path): "/" is escaped
	        # too, because ``safe`` replaces the default safe set.
	        path = quote(path.encode("utf-8"), safe="~()*!.'")
	        # use_cdn = self.config.size_bytes < 20 * 1024 * 1024 * 1024
	        # response = requests.get(f"{_API_URL}/bucket/dataset/{path}/{use_cdn}", timeout=10.0).json()
	        response = requests.get(f"{_API_URL}/bucket/dataset/{path}", timeout=10.0)
	        # Report HTTP failures as such instead of a KeyError on "url" below.
	        response.raise_for_status()
	        return response.json()["url"]

	    def _log_download(self, locale, bundle_version, auth_token):
	        """Register the download with the Common Voice API.

	        Args:
	            locale: Locale code being downloaded.
	            bundle_version: Bundle identifier sent as the ``dataset`` field.
	            auth_token: Hugging Face token string, or a bool, in which case the
	                locally stored token is read through ``HfFolder``.
	        """
	        if isinstance(auth_token, bool):
	            auth_token = HfFolder().get_token()
	        whoami = HfApi().whoami(auth_token)
	        email = whoami.get("email", "")
	        payload = {"email": email, "locale": locale, "dataset": bundle_version}
	        # A timeout keeps an unresponsive endpoint from blocking the download
	        # forever. The parsed body is not used.
	        requests.post(f"{_API_URL}/{locale}/downloaders", json=payload, timeout=10.0).json()

	    def _split_generators(self, dl_manager):
	        """Download the locale's bundle and return one ``SplitGenerator`` per TSV file.

	        Raises:
	            ConnectionError: If ``use_auth_token`` was left unset (``None``).
	        """
	        hf_auth_token = dl_manager.download_config.use_auth_token
	        if hf_auth_token is None:
	            raise ConnectionError(
	                "Please set use_auth_token=True or use_auth_token='<TOKEN>' to download this dataset"
	            )

	        bundle_url_template = STATS["bundleURLTemplate"]
	        # The first path component of the template names the release bundle.
	        bundle_version = bundle_url_template.split("/")[0]
	        # NOTE(review): presumably keeps the query string of the bundle URL
	        # out of the download cache key - confirm against the datasets docs.
	        dl_manager.download_config.ignore_url_params = True

	        self._log_download(self.config.name, bundle_version, hf_auth_token)
	        archive_path = dl_manager.download(self._get_bundle_url(self.config.name, bundle_url_template))
	        # In streaming mode nothing is extracted; clips are read from the archive only.
	        local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else None

	        # Versions before 5.0.0 are read from the archive root; later ones from
	        # "<bundle_version>/<locale>/".
	        if self.config.version < datasets.Version("5.0.0"):
	            path_to_data = ""
	        else:
	            path_to_data = "/".join([bundle_version, self.config.name])
	        path_to_clips = "/".join([path_to_data, "clips"]) if path_to_data else "clips"

	        # (split name, metadata file) pairs, in the order the splits are returned.
	        split_files = (
	            (datasets.Split.TRAIN, "train.tsv"),
	            (datasets.Split.TEST, "test.tsv"),
	            (datasets.Split.VALIDATION, "dev.tsv"),
	            ("other", "other.tsv"),
	            ("invalidated", "invalidated.tsv"),
	        )
	        return [
	            datasets.SplitGenerator(
	                name=split_name,
	                gen_kwargs={
	                    "local_extracted_archive": local_extracted_archive,
	                    # Every split gets its own iterator over the archive.
	                    "archive_iterator": dl_manager.iter_archive(archive_path),
	                    "metadata_filepath": "/".join([path_to_data, tsv_name]) if path_to_data else tsv_name,
	                    "path_to_clips": path_to_clips,
	                },
	            )
	            for split_name, tsv_name in split_files
	        ]

	    def _generate_examples(
	        self,
	        local_extracted_archive,
	        archive_iterator,
	        metadata_filepath,
	        path_to_clips,
	    ):
	        """Yield ``(key, example)`` pairs for one split.

	        Args:
	            local_extracted_archive: Directory the archive was extracted to, or
	                ``None`` when nothing was extracted (streaming).
	            archive_iterator: Iterable of ``(path, file_object)`` archive members.
	            metadata_filepath: Archive path of this split's TSV file.
	            path_to_clips: Archive path prefix of the audio clips.

	        Raises:
	            AssertionError: If an audio clip is encountered before the split's
	                TSV file.
	        """
	        data_fields = list(self._info().features.keys())
	        metadata = {}
	        metadata_found = False
	        for path, f in archive_iterator:
	            if path == metadata_filepath:
	                metadata_found = True
	                lines = (line.decode("utf-8") for line in f)
	                reader = csv.DictReader(lines, delimiter="\t", quoting=csv.QUOTE_NONE)
	                for row in reader:
	                    if not row["path"].endswith(".mp3"):
	                        row["path"] += ".mp3"
	                    # Keys must match archive member paths, which are always
	                    # "/"-separated; os.path.join would break this on Windows.
	                    row["path"] = "/".join([path_to_clips, row["path"]])
	                    # accent -> accents in CV 8.0
	                    if "accents" in row:
	                        row["accent"] = row.pop("accents")
	                    # if data is incomplete, fill with empty values
	                    for field in data_fields:
	                        row.setdefault(field, "")
	                    metadata[row["path"]] = row
	            elif path.startswith(path_to_clips):
	                # Explicit raise rather than ``assert`` so the check also runs under -O.
	                if not metadata_found:
	                    raise AssertionError("Found audio clips before the metadata TSV file.")
	                # Nothing left to match: stop scanning the archive.
	                if not metadata:
	                    break
	                if path in metadata:
	                    # Drop the row once used so memory is released and the
	                    # loop can stop as soon as every clip has been found.
	                    result = dict(metadata.pop(path))
	                    # set the audio feature and the path to the extracted file
	                    path = os.path.join(local_extracted_archive, path) if local_extracted_archive else path
	                    result["audio"] = {"path": path, "bytes": f.read()}
	                    # set path to None if the audio file doesn't exist locally (i.e. in streaming mode)
	                    result["path"] = path if local_extracted_archive else None

	                    yield path, result