add some files
Browse files
- Untitled.ipynb +0 -0
- eval.py +137 -0
- eval.sh +6 -0
- inference.ipynb +479 -0
- train_tr.ipynb +0 -0
- vocab.json +1 -1

Untitled.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
eval.py
ADDED
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
import argparse
import re
from typing import Dict

import torch
from datasets import Audio, Dataset, load_dataset, load_metric

from transformers import AutoFeatureExtractor, pipeline


def log_results(result: Dataset, args: Dict[str, str]):
    """DO NOT CHANGE. This function computes and logs the result metrics."""

    log_outputs = args.log_outputs
    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])

    # load metric
    wer = load_metric("wer")
    cer = load_metric("cer")

    # compute metrics
    wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
    cer_result = cer.compute(references=result["target"], predictions=result["prediction"])

    # print & log results
    result_str = f"WER: {wer_result}\n" f"CER: {cer_result}"
    print(result_str)

    with open(f"{dataset_id}_eval_results.txt", "w") as f:
        f.write(result_str)

    # log all results in text file. Possibly interesting for analysis
    if log_outputs is not None:
        pred_file = f"log_{dataset_id}_predictions.txt"
        target_file = f"log_{dataset_id}_targets.txt"

        with open(pred_file, "w") as p, open(target_file, "w") as t:

            # mapping function to write output
            def write_to_file(batch, i):
                p.write(f"{i}" + "\n")
                p.write(batch["prediction"] + "\n")
                t.write(f"{i}" + "\n")
                t.write(batch["target"] + "\n")

            result.map(write_to_file, with_indices=True)


def normalize_text(text: str) -> str:
    """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""

    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training

    text = re.sub(chars_to_ignore_regex, "", text.lower())

    # In addition, we can normalize the target text, e.g. removing new lines characters etc...
    # note that order is important here!
    token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]

    for t in token_sequences_to_ignore:
        text = " ".join(text.split(t))

    return text


def main(args):
    # load dataset
    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

    # for testing: only process the first two examples as a test
    # dataset = dataset.select(range(10))

    # load processor
    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
    sampling_rate = feature_extractor.sampling_rate

    # resample audio
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # load eval pipeline
    if args.device is None:
        args.device = 0 if torch.cuda.is_available() else -1
    asr = pipeline("automatic-speech-recognition", model=args.model_id, device=args.device)

    # map function to decode audio
    def map_to_pred(batch):
        prediction = asr(
            batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
        )

        batch["prediction"] = prediction["text"]
        batch["target"] = normalize_text(batch["sentence"])
        return batch

    # run inference on all examples
    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

    # compute and log_results
    # do not change function below
    log_results(result, args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets",
    )
    parser.add_argument(
        "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
    )
    parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
    parser.add_argument(
        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
    )
    parser.add_argument(
        "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to 1 second."
    )
    parser.add_argument(
        "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
    )
    parser.add_argument(
        "--device",
        type=int,
        default=None,
        help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
    )
    args = parser.parse_args()

    main(args)
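For reference, normalize_text lower-cases a transcription and strips the punctuation listed in chars_to_ignore_regex before collapsing whitespace. A minimal sanity check, assuming eval.py is importable from the working directory (the example sentence is borrowed from the notebook below, purely for illustration):

    from eval import normalize_text  # assumes eval.py is on the import path

    print(normalize_text("Pek çoğu da Roman toplumundan geliyor."))
    # -> 'pek çoğu da roman toplumundan geliyor'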
eval.sh
ADDED
@@ -0,0 +1,6 @@
./eval.py \
    --model_id ./ \
    --dataset "mozilla-foundation/common_voice_8_0" \
    --config ja \
    --split test \
    --log_outputs
inference.ipynb
ADDED
@@ -0,0 +1,479 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3eace62e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForCTC, Wav2Vec2Processor\n",
    "from datasets import load_dataset, load_metric, Audio\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "47d5c062",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "# model = AutoModelForCTC.from_pretrained(\".\").to('cuda')\n",
    "processor = Wav2Vec2Processor.from_pretrained(\".\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "1ffed05d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default-f6158d05a859ae5c\n",
      "Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-f6158d05a859ae5c/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)\n"
     ]
    }
   ],
   "source": [
    "common_voice_test = load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "bb365941",
   "metadata": {},
   "outputs": [],
   "source": [
    "common_voice_test = (common_voice_test\n",
    "                     .remove_columns([\"Unnamed: 0\", \"drop\"])\n",
    "                     .rename_column('text', 'sentence'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34979efb",
   "metadata": {},
   "outputs": [],
   "source": [
    "common_voice_test = common_voice_test.cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66ac6b14",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e135b397",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd)\n"
     ]
    }
   ],
   "source": [
    "common_voice_test = load_dataset(\"common_voice\", \"tr\", split=\"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "9dd4cfd4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'client_id': 'b8fffa3c4745500cd2c5f40a82b65bf1fb2d4c4f8638209a33fe1886fbfffdbd2f93aa43e0bd2026c4643e08aada408165138f75787cee501c4d735aa555a61c',\n",
       " 'path': 'common_voice_tr_17343551.mp3',\n",
       " 'audio': {'path': 'cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_17343551.mp3',\n",
       "  'array': array([0.        , 0.        , 0.        , ..., 0.00157976, 0.00167614,\n",
       "         0.00091976], dtype=float32),\n",
       "  'sampling_rate': 48000},\n",
       " 'sentence': 'Aşırı derecede kapalı bir ortamımız var.',\n",
       " 'up_votes': 2,\n",
       " 'down_votes': 0,\n",
       " 'age': 'thirties',\n",
       " 'gender': 'male',\n",
       " 'accent': 'other',\n",
       " 'locale': 'tr',\n",
       " 'segment': \"''\"}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "common_voice_test[3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "f36c3bcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove unnecessary attributes\n",
    "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "142cffaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "common_voice_test = common_voice_test.cast_column(\"audio\", Audio(sampling_rate=16_000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "b1103455",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['path', 'audio', 'sentence'],\n",
       "    num_rows: 1647\n",
       "})"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "common_voice_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "e2f9be66",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Pek çoğu da Roman toplumundan geliyor.'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "common_voice_test[0]['sentence']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "94a0e9c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "c2bcce8a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'path': 'cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_17341269.mp3',\n",
       " 'array': array([ 0.000000e+00,  0.000000e+00,  0.000000e+00, ...,  8.288735e-06,\n",
       "        -1.994405e-03, -7.770515e-03], dtype=float32),\n",
       " 'sampling_rate': 16000}"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "common_voice_test[0]['audio']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "47d9dd9c",
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).",
     "output_type": "error",
     "traceback": [
      "---------------------------------------------------------------------------",
      "ValueError                                Traceback (most recent call last)",
      "Input In [34], in <module>",
      "----> 1 processor(np.array(common_voice_test[0]['audio'][\"array\"]), sampling_rate=16000)",
      "File /opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py:138, in Wav2Vec2Processor.__call__(self, *args, **kwargs)",
      "--> 138 return self.current_processor(*args, **kwargs)",
      "File /opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2417, in PreTrainedTokenizerBase.__call__(self, text, ...)",
      "-> 2417 raise ValueError(",
      "ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)."
     ]
    }
   ],
   "source": [
    "processor(np.array(common_voice_test[0]['audio'][\"array\"]), sampling_rate=16000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "5f0e5342",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_dataset(batch):\n",
    "    audio = batch[\"audio\"]\n",
    "    \n",
    "    # batched output is \"un-batched\"\n",
    "    batch[\"input_values\"] = processor(np.array(audio[\"array\"]), sampling_rate=audio[\"sampling_rate\"]).input_values[0]\n",
    "    batch[\"input_length\"] = len(batch[\"input_values\"])\n",
    "    \n",
    "    with processor.as_target_processor():\n",
    "        batch[\"labels\"] = processor(batch[\"sentence\"]).input_ids\n",
    "    return batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "6786ed7e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b74fe324f3bd4d98b6366c614fec7991",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0ex [00:00, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "ValueError",
     "evalue": "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).",
     "output_type": "error",
     "traceback": [
      "---------------------------------------------------------------------------",
      "ValueError                                Traceback (most recent call last)",
      "Input In [28], in <module>",
      "----> 1 common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names)",
      "File /opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:2107, in Dataset.map(...)",
      "File /opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:2465, in Dataset._map_single(...)",
      "Input In [27], in prepare_dataset(batch)",
      "----> 5 batch[\"input_values\"] = processor(np.array(audio[\"array\"]), sampling_rate=audio[\"sampling_rate\"]).input_values[0]",
      "File /opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py:138, in Wav2Vec2Processor.__call__(self, *args, **kwargs)",
      "File /opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2417, in PreTrainedTokenizerBase.__call__(self, text, ...)",
      "ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples)."
     ]
    }
   ],
   "source": [
    "common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "81506c80",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'path': 'common_voice_tr_17341269.mp3',\n",
       " 'audio': {'path': 'cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_17341269.mp3',\n",
       "  'array': array([ 0.000000e+00,  0.000000e+00,  0.000000e+00, ...,  8.288735e-06,\n",
       "         -1.994405e-03, -7.770515e-03], dtype=float32),\n",
       "  'sampling_rate': 16000},\n",
       " 'sentence': 'Pek çoğu da Roman toplumundan geliyor.'}"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "common_voice_test[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "603ecd46",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "760f0031",
   "metadata": {},
   "outputs": [],
   "source": [
    "i = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e0355fac",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'input_values'",
     "output_type": "error",
     "traceback": [
      "---------------------------------------------------------------------------",
      "KeyError                                  Traceback (most recent call last)",
      "Input In [14], in <module>",
      "----> 1 input_dict = processor(common_voice_test[i][\"input_values\"], return_tensors=\"pt\", padding=True)",
      "KeyError: 'input_values'"
     ]
    }
   ],
   "source": [
    "input_dict = processor(common_voice_test[i][\"input_values\"], return_tensors=\"pt\", padding=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c0b3603c",
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'input_values'",
     "output_type": "error",
     "traceback": [
      "---------------------------------------------------------------------------",
      "KeyError                                  Traceback (most recent call last)",
      "Input In [13], in <module>",
      "----> 1 input_dict = processor(common_voice_test[i][\"input_values\"], return_tensors=\"pt\", padding=True)",
      "      2 logits = model(input_dict.input_values.to(\"cuda\")).logits",
      "      3 pred_ids = torch.argmax(logits, dim=-1)[0]",
      "KeyError: 'input_values'"
     ]
    }
   ],
   "source": [
    "input_dict = processor(common_voice_test[i][\"input_values\"], return_tensors=\"pt\", padding=True)\n",
    "logits = model(input_dict.input_values.to(\"cuda\")).logits\n",
    "pred_ids = torch.argmax(logits, dim=-1)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "23db2fe7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction:\n",
      "ş\n",
      "\n",
      "Reference:\n",
      "Yine de her iki grup farklı sorunlar çıkarıyor.\n"
     ]
    }
   ],
   "source": [
    "print(\"Prediction:\")\n",
    "print(processor.decode(pred_ids))\n",
    "\n",
    "print(\"\\nReference:\")\n",
    "print(processor.decode(common_voice_test['labels'][i]))\n",
    "# print(common_voice_test_transcription[0][\"sentence\"].lower())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4da2cb6c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f5325dd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
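Both ValueError cells above fail inside Wav2Vec2Processor.__call__: the processor forwarded the raw audio array to its tokenizer (self.current_processor), which only accepts text, and the subsequent KeyError cells fail because the map that would have created "input_values" never completed. A minimal sketch of the decode step this notebook is working toward, assuming the fine-tuned checkpoint in the repo root loads cleanly and the resampled 16 kHz common_voice_test from the cells above is available; calling the feature extractor explicitly sidesteps the dispatch problem:

    import numpy as np
    import torch
    from transformers import AutoModelForCTC, Wav2Vec2Processor

    model = AutoModelForCTC.from_pretrained(".").to("cuda")  # the notebook left this line commented out
    processor = Wav2Vec2Processor.from_pretrained(".")

    sample = common_voice_test[0]  # assumes the 16 kHz dataset prepared earlier in the notebook
    inputs = processor.feature_extractor(  # route audio to the feature extractor, not the tokenizer
        np.array(sample["audio"]["array"]),
        sampling_rate=sample["audio"]["sampling_rate"],
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda")).logits
    pred_ids = torch.argmax(logits, dim=-1)[0]

    print("Prediction:", processor.decode(pred_ids))
    print("Reference: ", sample["sentence"])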
train_tr.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
vocab.json
CHANGED
@@ -1 +1 @@
-{"
+{"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, "I": 9, "J": 10, "K": 11, "L": 12, "M": 13, "N": 14, "O": 15, "P": 16, "Q": 17, "R": 18, "S": 19, "T": 20, "U": 21, "V": 22, "W": 23, "X": 24, "Y": 25, "Z": 26, "a": 27, "b": 28, "c": 29, "d": 30, "e": 31, "f": 32, "g": 33, "h": 34, "i": 35, "j": 36, "k": 37, "l": 38, "m": 39, "n": 40, "o": 41, "p": 42, "r": 43, "s": 44, "t": 45, "u": 46, "v": 47, "w": 48, "x": 49, "y": 50, "z": 51, "\u00c7": 52, "\u00d6": 53, "\u00dc": 54, "\u00e2": 55, "\u00e7": 56, "\u00eb": 57, "\u00ee": 58, "\u00f6": 59, "\u00fc": 60, "\u011f": 61, "\u0130": 62, "\u0131": 63, "\u015e": 64, "\u015f": 65, "|": 0, "[UNK]": 67, "[PAD]": 68}
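The rewritten vocab.json is a character-level CTC vocabulary for Turkish (note ç, ğ, ı, ş and the absence of a lowercase "q"), with "|" as the word delimiter at index 0 and [UNK]/[PAD] appended, which is the layout Wav2Vec2CTCTokenizer expects. A minimal loading sketch, assuming the file sits at the repo root:

    from transformers import Wav2Vec2CTCTokenizer

    tokenizer = Wav2Vec2CTCTokenizer(
        "./vocab.json",            # the character vocabulary committed above
        unk_token="[UNK]",
        pad_token="[PAD]",
        word_delimiter_token="|",
    )
    print(tokenizer.convert_tokens_to_ids(list("merhaba")))
    # -> [39, 31, 43, 34, 27, 28, 27], per the indices in the new vocab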