{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-intro",
   "metadata": {},
   "source": [
    "# Add a KenLM n-gram decoder to `chmanoj/xls-r-300m-te`\n",
    "\n",
    "Builds a `pyctcdecode` beam-search decoder from a 3-gram ARPA language model, wraps it together with the fine-tuned processor in a `Wav2Vec2ProcessorWithLM`, saves the result into this repository and pushes it to the Hugging Face Hub.\n",
    "\n",
    "**Prerequisites**\n",
    "\n",
    "- Run the notebook from the root of the local clone of the model repository.\n",
    "- `3gram_correct.arpa` is present in that directory.\n",
    "- The `kenlm` Python package is installed, e.g. `pip install https://github.com/kpu/kenlm/archive/master.zip`.\n",
    "- KenLM is compiled at `../kenlm/build/bin/build_binary`, or the environment variable `KENLM_BUILD_BINARY` points at the executable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d23f1f27-fbf4-4fe5-a7b4-17815b23f283",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "from pathlib import Path\n",
    "\n",
    "from huggingface_hub import Repository\n",
    "from pyctcdecode import build_ctcdecoder\n",
    "from transformers import AutoProcessor, Wav2Vec2ProcessorWithLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e29f7f7-e116-4c65-9c14-ae7e871390bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might need to tune lives in this cell.\n",
    "MODEL_ID = \"chmanoj/xls-r-300m-te\"\n",
    "\n",
    "# The notebook is expected to run from the root of the local model repository.\n",
    "REPO_DIR = Path.cwd()\n",
    "\n",
    "# Text-format n-gram language model used to build the decoder.\n",
    "ARPA_PATH = REPO_DIR / \"3gram_correct.arpa\"\n",
    "\n",
    "# Sub-folder in which the saved processor keeps its language model files.\n",
    "LM_DIR = REPO_DIR / \"language_model\"\n",
    "LM_BINARY_NAME = \"3gram.bin\"\n",
    "\n",
    "# KenLM's build_binary executable; override through the environment if it lives elsewhere.\n",
    "KENLM_BUILD_BINARY = Path(\n",
    "    os.environ.get(\n",
    "        \"KENLM_BUILD_BINARY\",\n",
    "        REPO_DIR.parent / \"kenlm\" / \"build\" / \"bin\" / \"build_binary\",\n",
    "    )\n",
    ")\n",
    "\n",
    "COMMIT_MESSAGE = \"Upload lm-boosted decoder\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-decoder",
   "metadata": {},
   "source": [
    "## Build the LM-boosted decoder\n",
    "\n",
    "The decoder labels are the tokenizer vocabulary, ordered by token id.\n",
    "\n",
    "**TODO:** an earlier run of `build_ctcdecoder` printed the warnings *\"Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\"* and *\"Unigrams and labels don't seem to agree.\"* These have not been investigated yet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdefcb5e-0824-49ef-be73-8788cbb4e2a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "processor = AutoProcessor.from_pretrained(MODEL_ID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef78538d-ca83-4cd3-824d-1b7928f5bc4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort the tokens by id so the label list is in vocabulary-index order.\n",
    "vocab_dict = processor.tokenizer.get_vocab()\n",
    "sorted_vocab_dict = {\n",
    "    k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21f4fb99-1c19-4a0a-9ac0-90dd38645585",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fail early with a readable message instead of an error from inside KenLM.\n",
    "if not ARPA_PATH.is_file():\n",
    "    raise FileNotFoundError(f\"KenLM ARPA file not found: {ARPA_PATH}\")\n",
    "\n",
    "decoder = build_ctcdecoder(\n",
    "    labels=list(sorted_vocab_dict.keys()),\n",
    "    kenlm_model_path=str(ARPA_PATH),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f892aada-710c-4bc2-a11f-c9a35c00870a",
   "metadata": {},
   "outputs": [],
   "source": [
    "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
    "    feature_extractor=processor.feature_extractor,\n",
    "    tokenizer=processor.tokenizer,\n",
    "    decoder=decoder,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-save",
   "metadata": {},
   "source": [
    "## Save the processor and convert the language model to binary\n",
    "\n",
    "Saving writes a copy of the ARPA file into `language_model/`. That text copy is about 2.6 GB, so it is converted to KenLM's binary format and then deleted; only the binary is meant to be uploaded."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f5775eb-aece-41fc-a1eb-8bf6f9b8f429",
   "metadata": {},
   "outputs": [],
   "source": [
    "processor_with_lm.save_pretrained(str(REPO_DIR))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5ea011b-9412-484a-b798-15fb6e338a99",
   "metadata": {},
   "outputs": [],
   "source": [
    "lm_arpa = LM_DIR / ARPA_PATH.name\n",
    "lm_binary = LM_DIR / LM_BINARY_NAME\n",
    "\n",
    "build = subprocess.run(\n",
    "    [str(KENLM_BUILD_BINARY), str(lm_arpa), str(lm_binary)],\n",
    "    stdout=subprocess.PIPE,\n",
    "    stderr=subprocess.STDOUT,\n",
    "    universal_newlines=True,\n",
    ")\n",
    "print(build.stdout)\n",
    "# Raises CalledProcessError on a non-zero exit code, so the ARPA copy is kept on failure.\n",
    "build.check_returncode()\n",
    "\n",
    "# Ship only the binary: remove the text copy once the conversion has succeeded.\n",
    "lm_arpa.unlink()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-push",
   "metadata": {},
   "source": [
    "## Push to the Hub\n",
    "\n",
    "Before pushing, check the following:\n",
    "\n",
    "- **NOTE(review):** the source `3gram_correct.arpa` in the repository root is presumably staged as well unless it is git-ignored. Confirm, or keep it outside the repository.\n",
    "- **NOTE(review):** if the ARPA copy was already committed locally by an earlier, interrupted push, those commits presumably still reference the large file. Verify with `git status` / `git log` and reset them before pushing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3801f28-cdb5-40cd-b1b9-5a00f8f24720",
   "metadata": {},
   "outputs": [],
   "source": [
    "repo = Repository(local_dir=str(REPO_DIR))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dcfe5d2-063f-4b34-9fdd-5f025ef9f699",
   "metadata": {},
   "outputs": [],
   "source": [
    "repo.push_to_hub(commit_message=COMMIT_MESSAGE)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}