openai
/

whisper-tiny

Automatic Speech Recognition

hf-asr-leaderboard

Inference Endpoints

Model card Files Files and versions Community

whisper-tiny / special_tokens_map.json

sanchit-gandhi's picture

sanchit-gandhi HF staff

Add missing merge to tokenizer (#40)

169d4a4 verified 9 months ago

2.19 kB

	{
	"additional_special_tokens": [
	"<|endoftext|>",
	"<|startoftranscript|>",
	"<|en|>",
	"<|zh|>",
	"<|de|>",
	"<|es|>",
	"<|ru|>",
	"<|ko|>",
	"<|fr|>",
	"<|ja|>",
	"<|pt|>",
	"<|tr|>",
	"<|pl|>",
	"<|ca|>",
	"<|nl|>",
	"<|ar|>",
	"<|sv|>",
	"<|it|>",
	"<|id|>",
	"<|hi|>",
	"<|fi|>",
	"<|vi|>",
	"<|he|>",
	"<|uk|>",
	"<|el|>",
	"<|ms|>",
	"<|cs|>",
	"<|ro|>",
	"<|da|>",
	"<|hu|>",
	"<|ta|>",
	"<|no|>",
	"<|th|>",
	"<|ur|>",
	"<|hr|>",
	"<|bg|>",
	"<|lt|>",
	"<|la|>",
	"<|mi|>",
	"<|ml|>",
	"<|cy|>",
	"<|sk|>",
	"<|te|>",
	"<|fa|>",
	"<|lv|>",
	"<|bn|>",
	"<|sr|>",
	"<|az|>",
	"<|sl|>",
	"<|kn|>",
	"<|et|>",
	"<|mk|>",
	"<|br|>",
	"<|eu|>",
	"<|is|>",
	"<|hy|>",
	"<|ne|>",
	"<|mn|>",
	"<|bs|>",
	"<|kk|>",
	"<|sq|>",
	"<|sw|>",
	"<|gl|>",
	"<|mr|>",
	"<|pa|>",
	"<|si|>",
	"<|km|>",
	"<|sn|>",
	"<|yo|>",
	"<|so|>",
	"<|af|>",
	"<|oc|>",
	"<|ka|>",
	"<|be|>",
	"<|tg|>",
	"<|sd|>",
	"<|gu|>",
	"<|am|>",
	"<|yi|>",
	"<|lo|>",
	"<|uz|>",
	"<|fo|>",
	"<|ht|>",
	"<|ps|>",
	"<|tk|>",
	"<|nn|>",
	"<|mt|>",
	"<|sa|>",
	"<|lb|>",
	"<|my|>",
	"<|bo|>",
	"<|tl|>",
	"<|mg|>",
	"<|as|>",
	"<|tt|>",
	"<|haw|>",
	"<|ln|>",
	"<|ha|>",
	"<|ba|>",
	"<|jw|>",
	"<|su|>",
	"<|translate|>",
	"<|transcribe|>",
	"<|startoflm|>",
	"<|startofprev|>",
	"<|nocaptions|>",
	"<|notimestamps|>"
	],
	"bos_token": {
	"content": "<|endoftext|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false
	},
	"eos_token": {
	"content": "<|endoftext|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false
	},
	"pad_token": {
	"content": "<|endoftext|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false
	},
	"unk_token": {
	"content": "<|endoftext|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false
	}
	}