{ "architectures": [ "MCTCTForAudioFrameClassification" ], "attention_head_dim": 384, "attention_probs_dropout_prob": 0.3, "bos_token_id": 0, "conv_channels": null, "conv_dropout": 0.3, "conv_glu_dim": 1, "conv_kernel": [ 7 ], "conv_stride": [ 3 ], "ctc_loss_reduction": "sum", "ctc_zero_infinity": false, "eos_token_id": 2, "hidden_act": "relu", "hidden_dropout_prob": 0.3, "hidden_size": 1536, "id2label": { "0": "ab", "1": "ar", "10": "dv", "11": "el", "12": "en", "13": "eo", "14": "es", "15": "et", "16": "eu", "17": "fa", "18": "fi", "19": "fr", "2": "as", "20": "fy-NL", "21": "ga-IE", "22": "hi", "23": "hsb", "24": "hu", "25": "ia", "26": "id", "27": "it", "28": "ja", "29": "ka", "3": "br", "30": "kab", "31": "ky", "32": "lg", "33": "lt", "34": "lv", "35": "mn", "36": "mt", "37": "nl", "38": "or", "39": "pa-IN", "4": "ca", "40": "pl", "41": "pt", "42": "rm-sursilv", "43": "rm-vallader", "44": "ro", "45": "ru", "46": "rw", "47": "sah", "48": "sl", "49": "sv-SE", "5": "cnh", "50": "ta", "51": "th", "52": "tr", "53": "tt", "54": "uk", "55": "vi", "56": "vot", "57": "zh-CN", "58": "zh-HK", "59": "zh-TW", "6": "cs", "7": "cv", "8": "cy", "9": "de" }, "initializer_range": 0.02, "input_channels": 1, "input_feat_per_channel": 80, "intermediate_size": 6144, "label2id": { "ab": 0, "ar": 1, "as": 2, "br": 3, "ca": 4, "cnh": 5, "cs": 6, "cv": 7, "cy": 8, "de": 9, "dv": 10, "el": 11, "en": 12, "eo": 13, "es": 14, "et": 15, "eu": 16, "fa": 17, "fi": 18, "fr": 19, "fy-NL": 20, "ga-IE": 21, "hi": 22, "hsb": 23, "hu": 24, "ia": 25, "id": 26, "it": 27, "ja": 28, "ka": 29, "kab": 30, "ky": 31, "lg": 32, "lt": 33, "lv": 34, "mn": 35, "mt": 36, "nl": 37, "or": 38, "pa-IN": 39, "pl": 40, "pt": 41, "rm-sursilv": 42, "rm-vallader": 43, "ro": 44, "ru": 45, "rw": 46, "sah": 47, "sl": 48, "sv-SE": 49, "ta": 50, "th": 51, "tr": 52, "tt": 53, "uk": 54, "vi": 55, "vot": 56, 
"zh-CN": 57, "zh-HK": 58, "zh-TW": 59 }, "layer_norm_eps": 1e-05, "layerdrop": 0.3, "max_position_embeddings": 920, "model_type": "mctct", "num_attention_heads": 4, "num_conv_layers": 1, "num_hidden_layers": 36, "pad_token_id": 1, "torch_dtype": "float32", "transformers_version": "4.20.0.dev0", "vocab_size": 8065 }