{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": "['../src',\n '/Users/m3hrdadfi/Projects/HF/hfflax/hub/wav2vec2-base-persian/notes',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles',\n '/Users/m3hrdadfi/.vscode/extensions/ms-toolsai.jupyter-2021.2.603412351/pythonFiles/lib/python',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python39.zip',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/lib-dynload',\n '',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages',\n '/Users/m3hrdadfi/Projects/Apps/zabanshenas',\n '/Users/m3hrdadfi/opt/anaconda3/envs/transformers/lib/python3.9/site-packages/IPython/extensions',\n '/Users/m3hrdadfi/.ipython']" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sys.path" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "if \"../src\" not in sys.path:\n", " sys.path.insert(0, \"../src\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from normalizer import normalizer" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "سلام بر شما که می‌آیید و می‌آموزید که بی‌آرآیم \n", "کتاب‌هایمان میدانی کجا‌ها ماه‌هاس که کی‌هامون و کیهان دنباله‌هاشون برای بهای هستند \n", "میان‌‌افزار‌های امروزی نرم‌افزار سخت‌افزار امروز نوشت‌افزار‌ها \n", "این کتاب بهترین در نوع شتر آسان‌تر هست \n", "سه چیز هست که از پژوهش در این زمینه آموخته‌ام \n" ] } ], "source": [ "input_text = \"سلام بر شما که میآیید و میآموزید که بیآرآیم\"\n", "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", "\n", "input_text = \"کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند.\"\n", "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", "\n", "input_text = \" میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها\"\n", "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", "\n", "input_text = \"این کتاب بهترین در نوع شتر آسانتر هست\"\n", "print(normalizer({\"sentence\": input_text}, return_dict=False))\n", "\n", "input_text = \"سه چیز هست که از پژوهش در این زمینه آموختهام\"\n", "print(normalizer({\"sentence\": input_text}, return_dict=False))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# !mkdir -p /home/m3hrdadfi/code/data\n", "# %cd /home/m3hrdadfi/code/data\n", "# !wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz && tar -xzf fa.tar.gz\n", "# %cd /home/m3hrdadfi/" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# import os\n", "\n", "# lang = \"fa\"\n", "# abs_path_to_data = os.path.join(f\"/home/m3hrdadfi/code/data/{lang}/dataset\", f\"cv{lang}\", lang)\n", "# save_path = \"/\".join(abs_path_to_data.split('/')[:-2])\n", "# print(abs_path_to_data)\n", "# print(save_path)\n", "# print()\n", "# !ls {save_path}\n", "# !ls {abs_path_to_data}/*.tsv" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def normalizer_without_batch(text, pruning=False):\n", " try:\n", " batch = 
{\n", " \"sentence\": text\n", " }\n", " text = normalizer(batch, return_dict=False)\n", " \n", " if pruning:\n", " if not len(text.split()) > 3:\n", " text = None\n", " \n", " except:\n", " print(text)\n", " text = None\n", " \n", " return text" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# test_df = pd.read_csv(f\"{abs_path_to_data}/test.tsv\", sep=\"\\t\")\n", "\n", "# print(f\"Step 0: {len(test_df)}\")\n", "\n", "# test_df[\"path\"] = abs_path_to_data + \"/clips/\" + test_df[\"path\"]\n", "# test_df[\"status\"] = test_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n", "# test_df = test_df.dropna(subset=[\"path\"])\n", "# test_df = test_df.drop(\"status\", 1)\n", "# print(f\"Step 1: {len(test_df)}\")\n", "\n", "# test_df[\"prev_sentence\"] = test_df[\"sentence\"]\n", "# test_df[\"sentence\"] = test_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t))\n", "# test_df = test_df.dropna(subset=[\"sentence\"])\n", "# print(f\"Step 2: {len(test_df)}\")\n", "\n", "# test_df = test_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n", "# test_df = test_df.drop_duplicates(subset=\"path\")\n", "# print(f\"Step 3: {len(test_df)}\")\n", "\n", "# test_df = test_df.reset_index(drop=True)\n", "# test_df.head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# _train_df = pd.concat([\n", "# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n", "# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n", "# ])\n", "# print(len(_train_df))\n", "\n", "# train_df = pd.concat([\n", "# pd.read_csv(f\"{abs_path_to_data}/train.tsv\", sep=\"\\t\"),\n", "# pd.read_csv(f\"{abs_path_to_data}/dev.tsv\", sep=\"\\t\"),\n", "# pd.read_csv(f\"{abs_path_to_data}/validated.tsv\", sep=\"\\t\"),\n", "# pd.read_csv(f\"{abs_path_to_data}/other.tsv\", sep=\"\\t\"),\n", "# ])\n", "# print(f\"Step 0: {len(train_df)}\")\n", "\n", "# train_df[\"path\"] = abs_path_to_data + \"/clips/\" + train_df[\"path\"]\n", "# train_df[\"status\"] = train_df[\"path\"].apply(lambda path: True if os.path.exists(path) else None)\n", "# train_df = train_df.dropna(subset=[\"path\"])\n", "# train_df = train_df.drop(\"status\", 1)\n", "# print(f\"Step 1: {len(train_df)}\")\n", "\n", "# train_df[\"prev_sentence\"] = train_df[\"sentence\"]\n", "# train_df[\"sentence\"] = train_df[\"sentence\"].apply(lambda t: normalizer_without_batch(t, pruning=True))\n", "# train_df = train_df.dropna(subset=[\"sentence\"])\n", "# print(f\"Step 2: {len(train_df)}\")\n", "\n", "# train_df = train_df[[\"prev_sentence\", \"sentence\", \"path\"]]\n", "# train_df = train_df.drop_duplicates(subset=\"path\")\n", "# print(f\"Step 3: {len(train_df)}\")\n", "\n", "# train_df = train_df.sample(frac=1)\n", "# train_df = train_df.reset_index(drop=True)\n", "# train_df.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# from tqdm import tqdm\n", "\n", "# testset_indices = []\n", "\n", "# for index, row in tqdm(test_df.iterrows(), total=len(test_df), position=0):\n", "# _id = row[\"path\"]\n", "# finder = train_df[train_df[\"path\"] == _id]\n", "# if len(finder) > 0:\n", "# testset_indices.extend(list(finder.index))\n", "\n", "# testset_indices = list(set(testset_indices))\n", "# print(f\"Found #{len(testset_indices)} test 
data\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# print(len(train_df))\n", "# train_df = train_df.drop(testset_indices)\n", "# print(len(train_df))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# import pandas as pd\n", "\n", "# df = pd.concat([train_df, test_df], axis=0)\n", "# # df = validated_df.copy()\n", "# print(df.info())\n", "# # df[\"sentence\"] = df[\"prev_sentence\"].apply(lambda t: normalizer_without_batch(t))\n", "# # df = df.dropna(subset=[\"sentence\"])\n", "# # df[\"sentence_spell\"] = df[\"sentence\"].apply(lambda t: normalizer({\"sentence\": t}, is_spell_check=True, return_dict=False))\n", "# df = df.reset_index(drop=True)\n", "# print(df.info())\n", "# df.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# import torchaudio\n", "# import librosa\n", "# import IPython.display as ipd\n", "# import numpy as np\n", "\n", "# def load_audio(path):\n", "# speech, sr = torchaudio.load(path)\n", "# speech = speech[0].numpy().squeeze() \n", "# speech = librosa.resample(np.asarray(speech), sr, 16_000)\n", " \n", "# print(speech.shape, sr)\n", " \n", "# ipd.display(ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# main_vocab = [\"ح\", \"چ\", \"ج\", \"ث\", \"ت\", \"پ\", \"ب\", \"آ\", \"ا\", \"ش\", \"س\", \"ژ\", \"ز\", \"ر\", \"ذ\", \"د\", \"خ\", \"ق\", \"ف\", \"غ\", \"ع\", \"ظ\", \"ط\", \"ض\", \"ص\", \"ی\", \"ه\", \"و\", \"ن\", \"م\", \"ل\", \"گ\", \"ک\"]\n", "# text = \" \".join(df[\"sentence\"].values.tolist())\n", "# vocab = list(sorted(set(text)))\n", "\n", "# for v in main_vocab:\n", "# if v not in vocab:\n", "# print(\"v\", v)\n", "\n", "# print(len(main_vocab), len(vocab))\n", "# print(len(vocab), vocab)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# import numpy as np\n", "\n", "\n", "# idx = np.random.randint(0, len(df))\n", "# # idx = 6140\n", "# sample = df.iloc[idx]\n", "# ipd.display(sample)\n", "# # print(sample.iloc[idx][\"prev_sentence\"])\n", "# print()\n", "# print(sample[\"prev_sentence\"])\n", "# print(sample[\"sentence\"])\n", "# print()\n", "# load_audio(sample[\"path\"])" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# new_train_df = train_df.copy()\n", "# new_train_df[\"_path\"] = new_train_df[\"path\"]\n", "# new_train_df[\"path\"] = new_train_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n", "# print(new_train_df.info())\n", "# new_train_df.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# new_test_df = test_df.copy()\n", "# new_test_df[\"_path\"] = new_test_df[\"path\"]\n", "# new_test_df[\"path\"] = new_test_df[\"path\"].apply(lambda t: os.path.join(\"/home/m3hrdadfi/code/data/fa/dataset/clips\", t.split(\"/\")[-1]))\n", "# print(new_test_df.info())\n", "# new_test_df.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# import shutil\n", "# from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# !mkdir -p {save_path}/clips\n", "# !mkdir -p {save_path}/augs" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# for 
{ "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# for index, row in tqdm(new_train_df.iterrows(), position=0, total=len(new_train_df)):\n", "# shutil.copy(row[\"_path\"], row[\"path\"])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# for index, row in tqdm(new_test_df.iterrows(), position=0, total=len(new_test_df)):\n", "# shutil.copy(row[\"_path\"], row[\"path\"])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# # aug_train_df = new_train_df.copy()\n", "# aug_train_df = new_train_df.sample(frac=0.1)\n", "# aug_train_df = aug_train_df.reset_index(drop=True)\n", "# aug_train_df[\"_path\"] = aug_train_df[\"path\"]\n", "# aug_train_df[\"path\"] = aug_train_df[\"path\"].apply(lambda t: \"/\".join(t.split('.')[:-1]).replace(\"clips\", \"augs\") + \"_aug.mp3.wav\")\n", "# print(aug_train_df.info())\n", "# aug_train_df.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# print(aug_train_df.iloc[0][\"_path\"])\n", "# print(aug_train_df.iloc[0][\"path\"])" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# # augmentation\n", "\n", "# from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain\n", "# import numpy as np\n", "# import soundfile as sf\n", "# import torchaudio\n", "\n", "# augment = Compose([\n", "# # AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n", "# # PitchShift(min_semitones=-1, max_semitones=2, p=0.2),\n", "# # Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.8)\n", "# AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),\n", "# TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),\n", "# PitchShift(min_semitones=-4, max_semitones=4, p=0.5),\n", "# ])\n", "\n", "# def augmented_speech_file_to_array_fn(in_path, out_path):\n", "# speech_array, sampling_rate = torchaudio.load(in_path)\n", "# speech_array = speech_array.squeeze().numpy()\n", "# speech_array = augment(samples=speech_array, sample_rate=sampling_rate)\n", "# sf.write(out_path, speech_array, sampling_rate, \"PCM_24\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# # for index, row in tqdm(aug_train_df.iterrows(), position=0, total=len(aug_train_df)):\n", "# # augmented_speech_file_to_array_fn(row[\"_path\"], row[\"path\"])\n", "# !ls" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# # new_train_aug_df = pd.concat([new_train_df, aug_train_df], axis=0)\n", "# new_train_aug_df = new_train_df.copy()\n", "# new_train_aug_df = new_train_aug_df.sample(frac=1)\n", "# new_train_aug_df = new_train_aug_df.reset_index(drop=True)\n", "# print(new_train_aug_df.info())\n", "# new_train_aug_df.head()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# new_train_df.to_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n", "# new_train_aug_df.to_csv(f\"{save_path}/train_with_aug.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)\n", "# new_test_df.to_csv(f\"{save_path}/test.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# new_train_df.count()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# new_test_df.count()" ] },
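{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Editor's sketch (hypothetical, not part of the original run): a quick round-trip check\n", "# that the tab-separated CSVs written above reload with the expected number of rows before\n", "# the re-reading section below depends on them. Assumes save_path and the frames from above.\n", "\n", "# for name, frame in [(\"train_no_aug\", new_train_df), (\"train_with_aug\", new_train_aug_df), (\"test\", new_test_df)]:\n", "# reloaded = pd.read_csv(f\"{save_path}/{name}.csv\", sep=\"\\t\")\n", "# assert len(reloaded) == len(frame), f\"row count mismatch in {name}.csv\"" ] },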
import os\n", "# from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# train_df = pd.read_csv(f\"{save_path}/train_no_aug.csv\", sep=\"\\t\")\n", "# print(train_df.info())\n", "# train_df.head()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "# test_df = pd.read_csv(f\"{save_path}/test.csv\", sep=\"\\t\")\n", "# print(test_df.info())\n", "# test_df.head()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "# non_existed_train = []\n", "\n", "# for index, row in tqdm(train_df.iterrows(), total=len(train_df), position=0):\n", "# if not os.path.exists(row[\"path\"]):\n", "# non_existed_train.extends(list(index))\n", "# break" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# import numpy as np\n", "\n", "\n", "# idx = np.random.randint(0, len(train_df))\n", "# # idx = 6140\n", "# sample = train_df.iloc[idx]\n", "# ipd.display(sample)\n", "# # print(sample.iloc[idx][\"prev_sentence\"])\n", "# print()\n", "# print(sample[\"prev_sentence\"])\n", "# print(sample[\"sentence\"])\n", "# print()\n", "# load_audio(sample[\"path\"])" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "# train_df_half = train_df.copy()\n", "# print(train_df_half.shape)\n", "# train_df_half = train_df_half.dropna()\n", "# print(train_df_half.shape)\n", "# train_df_half = train_df_half.drop_duplicates()\n", "# print(train_df_half.shape)\n", "\n", "# train_df_half = train_df_half.sample(frac=0.5)\n", "# train_df_half = train_df_half.reset_index(drop=True)\n", "# print(train_df_half.shape)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "# train_df_half.to_csv(f\"{save_path}/train_no_aug_half.csv\", sep=\"\\t\", encoding=\"utf-8\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "transformers", "name": "transformers" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" }, "orig_nbformat": 2 }, "nbformat": 4, "nbformat_minor": 2 }