{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": "['../src',\n '/Users/m3hrdadfi/Projects/HF/hfflax/hub/wav2vec2-base-persian/notes',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles/lib/python',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python39.zip',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/lib-dynload',\n '',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages',\n '/Users/m3hrdadfi/Projects/Apps/zabanshenas',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages/IPython/extensions',\n '/Users/m3hrdadfi/.ipython']" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sys.path" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "if \"../src\" not in sys.path:\n", " sys.path.insert(0, \"../src\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from normalizer import normalizer" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "سلام بر شما که می‌آیید و می‌آموزید که بی‌آرآیم \n", "کتاب‌هایمان میدانی کجا‌ها ماه‌هاس که کی‌هامون و کیهان دنباله‌هاشون برای بهای هستند \n", "میان‌‌افزار‌های امروزی نرم‌افزار سخت‌افزار امروز نوشت‌افزار‌ها \n", "این کتاب بهترین در نوع شتر آسان‌تر هست \n", "سه چیز هست که از پژوهش در این زمینه آموخته‌ام \n" ] } ], "source": [ "input_text = \"سلام بر شما که میآیید و میآموزید که بیآرآیم\"\n", "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", "\n", "input_text = \"کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند.\"\n", "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", "\n", "input_text = \" میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها\"\n", "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", "\n", "input_text = \"این کتاب بهترین در نوع شتر آسانتر هست\"\n", "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", "\n", "input_text = \"سه چیز هست که از پژوهش در این زمینه آموختهام\"\n", "print(normalizer({\"sentence\": input_text}, return_dict=False))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# !mkdir -p /home/m3hrdadfi/code/data\n", "# %cd /home/m3hrdadfi/code/data\n", "# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n", "# %cd /home/m3hrdadfi/" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# import os\n", "\n", "# lang = \"fa\"\n", "# abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/code/data/{lang}/dataset\", f\"cv{lang}\", lang)\n", "# save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n", "# print(abs_path_to_data)\n", "# print(save_path)\n", "# print()\n", "# !ls {save_path}\n", "# !ls {abs_path_to_data}/*.tsv" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def normalizer_without_batch(text, pruning=False):\n", " try:\n", " batch = 
{\n", " \"sentence\": text\n", " }\n", " text = normalizer(batch, return_dict=False)\n", " \n", " if pruning:\n", " if not len(text.split()) > 3:\n", " text = None\n", " \n", " except:\n", " print(text)\n", " text = None\n", " \n", " return text" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n", "\n", "# print(f\"Step 0: {len(test_df)}\")\n", "\n", "# test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n", "# test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n", "# test_df = test_df.dropna(subset=[\"path\"])\n", "# test_df = test_df.drop(\"status\", 1)\n", "# print(f\"Step 1: {len(test_df)}\")\n", "\n", "# test_df[\"prev_sentence\"] = test_df[\"sentence\"]\n", "# test_df[\"sentence\"] = test_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t))\n", "# test_df = test_df.dropna(subset=[\"sentence\"])\n", "# print(f\"Step 2: {len(test_df)}\")\n", "\n", "# test_df = test_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n", "# test_df = test_df.drop_duplicates(subset=\"path\")\n", "# print(f\"Step 3: {len(test_df)}\")\n", "\n", "# test_df = test_df.reset_index(drop=True)\n", "# test_df.head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# _train_df = pd.concat([\n", "# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n", "# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n", "# ])\n", "# print(len(_train_df))\n", "\n", "# train_df = pd.concat([\n", "# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n", "# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n", "# pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n", "# pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n", "# ])\n", "# print(f\"Step 0: {len(train_df)}\")\n", "\n", "# train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n", "# train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n", "# train_df = train_df.dropna(subset=[\"path\"])\n", "# train_df = train_df.drop(\"status\", 1)\n", "# print(f\"Step 1: {len(train_df)}\")\n", "\n", "# train_df[\"prev_sentence\"] = train_df[\"sentence\"]\n", "# train_df[\"sentence\"] = train_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t, pruning=True))\n", "# train_df = train_df.dropna(subset=[\"sentence\"])\n", "# print(f\"Step 2: {len(train_df)}\")\n", "\n", "# train_df = train_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n", "# train_df = train_df.drop_duplicates(subset=\"path\")\n", "# print(f\"Step 3: {len(train_df)}\")\n", "\n", "# train_df = train_df.sample(frac=1)\n", "# train_df = train_df.reset_index(drop=True)\n", "# train_df.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# from tqdm import tqdm\n", "\n", "# testset_indices = []\n", "\n", "# for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n", "# _id = row[\"path\"]\n", "# finder = train_df[train_df[\"path\"] == _id]\n", "# if len(finder) > 0:\n", "# testset_indices.extend(list(finder.index))\n", "\n", "# testset_indices = list(set(testset_indices))\n", "# print(f\"Found #{len(testset_indices)} test 
data\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# print(len(train_df))\n", "# train_df = train_df.drop(testset_indices)\n", "# print(len(train_df))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# import pandas as pd\n", "\n", "# df = pd.concat([train_df, test_df], axis=0)\n", "# # df = validated_df.copy()\n", "# print(df.info())\n", "# # df[\"sentence\"] = df[\"prev_sentence\"].apply(lambda t: normalizer_without_batch(t))\n", "# # df = df.dropna(subset=[\"sentence\"])\n", "# # df[\"sentence_spell\"] = df[\"sentence\"].apply(lambda t: normalizer({\"sentence\": t}, is_spell_check=True, return_dict=False))\n", "# df = df.reset_index(drop=True)\n", "# print(df.info())\n", "# df.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# import torchaudio\n", "# import librosa\n", "# import IPython.display as ipd\n", "# import numpy as np\n", "\n", "# def load_audio(path):\n", "# speech, sr = torchaudio.load(path)\n", "# speech = speech[0].numpy().squeeze() \n", "# speech = librosa.resample(np.asarray(speech), sr, 16_000)\n", " \n", "# print(speech.shape, sr)\n", " \n", "# ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# main_vocab = [\"ح\", \"چ\", \"ج\", \"ث\", \"ت\", \"پ\", \"ب\", \"آ\", \"ا\", \"ش\", \"س\", \"ژ\", \"ز\", \"ر\", \"ذ\", \"د\", \"خ\", \"ق\", \"ف\", \"غ\", \"ع\", \"ظ\", \"ط\", \"ض\", \"ص\", \"ی\", \"ه\", \"و\", \"ن\", \"م\", \"ل\", \"گ\", \"ک\"]\n", "# text = \" \".join(df[\"sentence\"].values.tolist())\n", "# vocab = list(sorted(set(text)))\n", "\n", "# for v in main_vocab:\n", "# if v not in vocab:\n", "# print(\"v\", v)\n", "\n", "# print(len(main_vocab), len(vocab))\n", "# print(len(vocab), vocab)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# import numpy as np\n", "\n", "\n", "# idx = np.random.randint(0, len(df))\n", "# # idx = 6140\n", "# sample = df.iloc[idx]\n", "# ipd.display(sample)\n", "# # print(sample.iloc[idx][\"prev_sentence\"])\n", "# print()\n", "# print(sample[\"prev_sentence\"])\n", "# print(sample[\"sentence\"])\n", "# print()\n", "# load_audio(sample[\"path\"])" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# new_train_df = train_df.copy()\n", "# new_train_df[\"_path\"] = new_train_df[\"path\"]\n", "# new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n", "# print(new_train_df.info())\n", "# new_train_df.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# new_test_df = test_df.copy()\n", "# new_test_df[\"_path\"] = new_test_df[\"path\"]\n", "# new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n", "# print(new_test_df.info())\n", "# new_test_df.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# import shutil\n", "# from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# !mkdir -p {save_path}/clips\n", "# !mkdir -p {save_path}/augs" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# for 
{ "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n", "# shutil.copy(row[\"_path\"], row[\"path\"])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n", "# shutil.copy(row[\"_path\"], row[\"path\"])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# # aug_train_df = new_train_df.copy()\n", "# aug_train_df = new_train_df.sample(frac=0.1)\n", "# aug_train_df = aug_train_df.reset_index(drop=True)\n", "# aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n", "# aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n", "# print(aug_train_df.info())\n", "# aug_train_df.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# print(aug_train_df.iloc[0][\"_path\"])\n", "# print(aug_train_df.iloc[0][\"path\"])" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# # augmentation\n", "\n", "# from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n", "# import numpy as np\n", "# import soundfile as sf\n", "# import torchaudio\n", "\n", "# augment = Compose([\n", "# # AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n", "# # PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n", "# # Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n", "# AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n", "# TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n", "# PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n", "# ])\n", "\n", "# def augmented_speech_file_to_array_fn(in_path, out_path):\n", "# speech_array, sampling_rate = torchaudio.load(in_path)\n", "# speech_array = speech_array.squeeze().numpy()\n", "# speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n", "# sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# # for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n", "# # augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n", "# !ls" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# # new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n", "# new_train_aug_df = new_train_df.copy()\n", "# new_train_aug_df = new_train_aug_df.sample(frac=1)\n", "# new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n", "# print(new_train_aug_df.info())\n", "# new_train_aug_df.head()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n", "# new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n", "# new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# new_train_df.count()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# new_test_df.count()" ] },
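{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Editor's sketch (hypothetical, not part of the original run): a quick round-trip check\n", "# that the tab-separated CSVs written above reload with the expected number of rows before\n", "# the re-reading section below depends on them. Assumes save_path and the frames from above.\n", "\n", "# for name, frame in [(\"train_no_aug\", new_train_df), (\"train_with_aug\", new_train_aug_df), (\"test\", new_test_df)]:\n", "# reloaded = pd.read_csv(f\"{save_path}/{name}.csv\", sep=\"\\t\")\n", "# assert len(reloaded) == len(frame), f\"row count mismatch in {name}.csv\"" ] },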
import os\n", "# from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n", "# print(train_df.info())\n", "# train_df.head()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "# test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n", "# print(test_df.info())\n", "# test_df.head()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "# non_existed_train = []\n", "\n", "# for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n", "# if not os.path.exists(row[\"path\"]):\n", "# non_existed_train.extends(list(index))\n", "# break" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# import numpy as np\n", "\n", "\n", "# idx = np.random.randint(0, len(train_df))\n", "# # idx = 6140\n", "# sample = train_df.iloc[idx]\n", "# ipd.display(sample)\n", "# # print(sample.iloc[idx][\"prev_sentence\"])\n", "# print()\n", "# print(sample[\"prev_sentence\"])\n", "# print(sample[\"sentence\"])\n", "# print()\n", "# load_audio(sample[\"path\"])" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "# train_df_half = train_df.copy()\n", "# print(train_df_half.shape)\n", "# train_df_half = train_df_half.dropna()\n", "# print(train_df_half.shape)\n", "# train_df_half = train_df_half.drop_duplicates()\n", "# print(train_df_half.shape)\n", "\n", "# train_df_half = train_df_half.sample(frac=0.5)\n", "# train_df_half = train_df_half.reset_index(drop=True)\n", "# print(train_df_half.shape)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "# train_df_half.to_csv(f\"{save_path}/train_no_aug_half.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "transformers", "name": "transformers" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" }, "orig_nbformat": 2 }, "nbformat": 4, "nbformat_minor": 2 }