|
|
|
from datasets import load_dataset |
|
from collections import Counter |
|
import json |
|
import os |
|
import tempfile |
|
from transformers import Wav2Vec2CTCTokenizer |
|
|
|
|
|
# Which ESB dataset configuration and split the tokenizer is built from.
dataset_name = "voxpopuli"
split = "train"

# Forwarded to `load_dataset` below.
use_auth_token = True

# Name the finished tokenizer is pushed to the Hub under.
tokenizer_name = f"wav2vec2-ctc-{dataset_name}-tokenizer"

# Threshold in percent: characters whose share of all characters is below
# this value are left out of the vocabulary.
cutoff_freq = 0.01

dataset = load_dataset("esb/datasets", dataset_name, split=split, use_auth_token=use_auth_token)

# Only the transcriptions are needed to build a vocabulary; drop every other column.
dataset = dataset.remove_columns([column for column in dataset.column_names if column != "text"])
|
|
|
|
|
def create_vocabulary_from_data(dataset, word_delimiter_token="|", cutoff_freq=0.0):
    """Build a character-level CTC vocabulary from the ``"text"`` column of ``dataset``.

    Every character of the transcriptions is counted, a frequency table is
    printed, and characters whose share of all characters is below
    ``cutoff_freq`` are dropped. The remaining characters are numbered after
    the four special tokens, most frequent first, so the mapping is identical
    from run to run.

    Args:
        dataset: Object exposing ``map``, ``column_names`` and column access by
            name, with a ``"text"`` column of strings.
        word_delimiter_token: Token that takes over the id of the space
            character. Pass ``None`` to keep the space as-is. If the vocabulary
            contains no space (absent from the text or removed by the cutoff),
            nothing is replaced.
        cutoff_freq: Minimum frequency, in percent of all characters, a
            character needs in order to be kept.

    Returns:
        dict mapping each token to its integer id.
    """

    def extract_all_chars(batch):
        """Count the characters of a batch and return them sorted by frequency."""
        all_text = " ".join(batch["text"])
        # Most frequent first; ties are broken by the character itself so the order is stable.
        sorted_counts = sorted(Counter(all_text).items(), key=lambda item: (-item[1], item[0]))
        # Built with comprehensions rather than zip(*...) so empty text yields empty columns.
        return {
            "vocab": [char for char, _ in sorted_counts],
            "freqs": [freq for _, freq in sorted_counts],
        }

    # batch_size=-1 hands the whole dataset to extract_all_chars as one batch.
    dataset = dataset.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        remove_columns=dataset.column_names,
    )

    vocab, freqs = dataset["vocab"], dataset["freqs"]
    total_num_chars = sum(freqs)
    chars_to_remove = set()

    print("Character Occurences")
    print(f"Total characters in dataset: {total_num_chars}")
    print(50 * "-")
    print(f"{'Char'.rjust(5)} | {'Total occ'.rjust(10)} | {'% of total occ'.rjust(20)} |")
    print(50 * "-")
    for char, freq in zip(vocab, freqs):
        freq_in_percent = freq / total_num_chars * 100
        print(f"{char.rjust(5)} | {str(freq).rjust(10)} | {str(round(freq_in_percent, 3)).rjust(20)} |")
        if freq_in_percent < cutoff_freq:
            chars_to_remove.add(char)
    print(50 * "-")

    # Keep the frequency-sorted order (and drop any duplicates) instead of going
    # through a set, whose iteration order for strings changes between runs.
    vocab = [char for char in dict.fromkeys(vocab) if char not in chars_to_remove]

    # The special tokens always occupy the first ids.
    vocab = ["<pad>", "<s>", "</s>", "<unk>"] + vocab

    vocab_dict = {token: index for index, token in enumerate(vocab)}

    # Let the word delimiter take over the id of the space character. Guarded so
    # that text without spaces does not fail with a KeyError.
    if word_delimiter_token is not None and " " in vocab_dict:
        vocab_dict[word_delimiter_token] = vocab_dict.pop(" ")

    return vocab_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Derive the character vocabulary from the transcriptions.
vocab_dict = create_vocabulary_from_data(dataset, cutoff_freq=cutoff_freq)

# The tokenizer is loaded from a directory, so the vocabulary is first written
# to vocab.json inside a scratch directory that is cleaned up afterwards.
with tempfile.TemporaryDirectory() as tmp:
    vocab_path = os.path.join(tmp, "vocab.json")
    with open(vocab_path, "w") as file:
        file.write(json.dumps(vocab_dict))

    # Must happen while the temporary directory still exists.
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(tmp)

# Publish the tokenizer under the configured name.
tokenizer.push_to_hub(tokenizer_name)
|
|