{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "wl3FpBgqtZ6u"
},
"source": [
"# **Utils**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GsYjBsG4Jc_z"
},
"outputs": [],
"source": [
"!pip install fuzzywuzzy\n",
"!pip install python-Levenshtein\n",
"!pip install torchmetrics\n",
"!pip install nltk"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "GWsNQXOQJVcI"
},
"outputs": [],
"source": [
"import os\n",
"from statistics import mean\n",
"import pandas as pd\n",
"from fuzzywuzzy import fuzz\n",
"import Levenshtein\n",
"from IPython.display import display"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "xc9fDE9MJLgP"
},
"outputs": [],
"source": [
"def calculate_fuzzy_score(reference_text, output_text):\n",
" fuzzy_score = fuzz.token_sort_ratio(reference_text, output_text)\n",
" return fuzzy_score"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"id": "OmRWPXfti2Oz"
},
"outputs": [],
"source": [
"def calculate_cer(reference_text, output_text):\n",
" distance = Levenshtein.distance(reference_text, output_text)\n",
" total_characters = len(reference_text)\n",
" cer = distance / total_characters if total_characters > 0 else float('inf')\n",
" return cer"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"id": "33FgDK8Vi7Yd"
},
"outputs": [],
"source": [
"def calculate_wer(reference_text, output_text):\n",
" reference_words = reference_text.split()\n",
" output_words = output_text.split()\n",
" distance = Levenshtein.distance(' '.join(reference_words), ' '.join(output_words))\n",
" total_words = len(reference_words)\n",
" wer = distance / total_words if total_words > 0 else float('inf')\n",
" return wer"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"id": "Ik74uvonttmS"
},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.translate.bleu_score import sentence_bleu\n",
"\n",
"def calculate_bleu(reference_text, output_text):\n",
" reference_tokens = [reference_text.split()]\n",
" output_tokens = output_text.split()\n",
" bleu_score = sentence_bleu(reference_tokens, output_tokens)\n",
" return bleu_score"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"id": "eTl4ZLNgt-el"
},
"outputs": [],
"source": [
"def calculate_jaccard_index(reference_text, output_text):\n",
" reference_set = set(reference_text)\n",
" output_set = set(output_text)\n",
" intersection = len(reference_set & output_set)\n",
" union = len(reference_set | output_set)\n",
" jaccard_index = intersection / union if union > 0 else 0\n",
" return jaccard_index"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"id": "VkIi5KCGvu42"
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"def calculate_cosine_similarity(text1, text2):\n",
" vectorizer = CountVectorizer().fit_transform([text1, text2])\n",
" vectors = vectorizer.toarray()\n",
" cos_sim = cosine_similarity(vectors)[0][1]\n",
"\n",
" return cos_sim"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"id": "80okNADtm9BJ"
},
"outputs": [],
"source": [
"def print_file_contents(file_path):\n",
" try:\n",
" with open(file_path, 'r', encoding='utf-8') as file:\n",
" contents = file.read()\n",
" print(contents)\n",
" except FileNotFoundError:\n",
" print(f\"Error: The file '{file_path}' was not found.\")\n",
" except Exception as e:\n",
" print(f\"Error: An unexpected error occurred - {e}\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"id": "qa6tfr3ov6lh"
},
"outputs": [],
"source": [
"def read_file(file_path):\n",
" try:\n",
" with open(file_path, 'r', encoding='utf-8') as file:\n",
" content = file.read()\n",
" return content\n",
" except FileNotFoundError:\n",
" print(f\"Error: The file '{file_path}' was not found.\")\n",
" return None\n",
" except Exception as e:\n",
" print(f\"Error: An unexpected error occurred - {e}\")\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UfXhf7e7r9U7"
},
"outputs": [],
"source": [
"starting_font = 12\n",
"increment_font = 4"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"id": "yhq59NQbmXWA"
},
"outputs": [],
"source": [
"def evaluate_ocr_models(num_docs):\n",
" ocr_models = {\n",
" \"gemini_pro\": extract_text_gemini,\n",
" \"gemini_flash\": extract_text_gemini,\n",
" \"opus\": extract_text_opus,\n",
" \"sonnet\": extract_text_sonnet,\n",
" \"haiku\": extract_text_haiku,\n",
" \"gpt4_turbo\": extract_text_gpt,\n",
" \"gpt4o\": extract_text_gpt,\n",
" \"vision\": extract_text_vision,\n",
" # \"marker\": None\n",
" }\n",
"\n",
" evaluation_metrics = {\n",
" \"Fuzzy Score\": calculate_fuzzy_score,\n",
" \"CER\": calculate_cer,\n",
" \"WER\": calculate_wer,\n",
" \"BLEU\": calculate_bleu,\n",
" \"Jaccard Index\": calculate_jaccard_index,\n",
" \"Cosine Similarity\": calculate_cosine_similarity\n",
" }\n",
"\n",
" results = {metric: {model: 0 for model in ocr_models} for metric in evaluation_metrics.keys()}\n",
"\n",
" for model in ocr_models:\n",
" for metric in evaluation_metrics:\n",
" model_metrics = []\n",
" for docs in range(num_docs):\n",
" if model == \"marker\" and os.path.exists(f\"/content/{model}_output_{docs}.md\"):\n",
" score = evaluation_metrics[metric](read_file(f\"/content/reference_{docs}.txt\"), read_file(f\"/content/{model}_output_{docs}.md\"))\n",
" elif os.path.exists(f\"/content/{model}_font_output_{starting_font + docs * increment_font}.txt\"):\n",
" score = evaluation_metrics[metric](read_file(f\"/content/font-reference.txt\"), read_file(f\"/content/{model}_font_output_{starting_font + docs * increment_font}.txt\"))\n",
" else:\n",
" continue\n",
" model_metrics.append(score)\n",
"\n",
" for docs in range(1, 7):\n",
" if model == \"marker\" and os.path.exists(f\"/content/{model}_output_{docs}.md\"):\n",
" score = evaluation_metrics[metric](read_file(f\"/content/reference_{docs}.txt\"), read_file(f\"/content/{model}_output_{docs}.md\"))\n",
" elif os.path.exists(f\"/content/{model}_output_{docs}.txt\"):\n",
" score = evaluation_metrics[metric](read_file(f\"/content/reference_{docs}.txt\"), read_file(f\"/content/{model}_output_{docs}.txt\"))\n",
" else:\n",
" continue\n",
" model_metrics.append(score)\n",
"\n",
" results[metric][model] = mean(model_metrics)\n",
" scores_df = pd.DataFrame(results)\n",
" scores_df.index.name = 'Models'\n",
" return scores_df"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"id": "LW806l1dsw5d"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"\n",
"def evaluate_ocr_models_for_different_languages():\n",
" ocr_models = {\n",
" # \"gemini_flash\": extract_text_gemini,\n",
" # \"opus\": extract_text_opus,\n",
" # \"sonnet\": extract_text_sonnet,\n",
" # \"haiku\": extract_text_haiku,\n",
" # \"gpt4_turbo\": extract_text_gpt,\n",
" # \"gpt4o\": extract_text_gpt,\n",
" \"vision\": extract_text_vision,\n",
" # \"tesseract\": extract_text_tesseract,\n",
" }\n",
"\n",
" evaluation_metrics = {\n",
" \"Fuzzy Score\": calculate_fuzzy_score,\n",
" \"CER\": calculate_cer,\n",
" \"WER\": calculate_wer,\n",
" # \"BLEU\": calculate_bleu,\n",
" \"Jaccard Index\": calculate_jaccard_index,\n",
" \"Cosine Similarity\": calculate_cosine_similarity\n",
" }\n",
"\n",
" results = {model: {metric: [] for metric in evaluation_metrics} for model in ocr_models}\n",
"\n",
" for model in ocr_models:\n",
" for language in languages:\n",
" model_output_path = f\"/content/{model}_{language}_output.txt\"\n",
" reference_path = f\"/content/{language}_reference.txt\"\n",
"\n",
" if os.path.exists(model_output_path):\n",
" for metric in evaluation_metrics:\n",
" score = evaluation_metrics[metric](read_file(reference_path), read_file(model_output_path))\n",
" results[model][metric].append(score)\n",
" else:\n",
" print(reference_path)\n",
" for metric in evaluation_metrics:\n",
" print(model_output_path)\n",
" results[model][metric].append(None)\n",
"\n",
" models_dfs = {}\n",
" for model, metrics_scores in results.items():\n",
" df = pd.DataFrame(metrics_scores)\n",
" df.index = [language for language in languages]\n",
" df.index.name = 'Languages'\n",
" models_dfs[model] = df\n",
"\n",
" return models_dfs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "XXzL1iPNCemd",
"outputId": "b244ef17-5988-4847-90f9-078ce9ed0285"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: opus\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 18,\n \"fields\": [\n {\n \"column\": \"Languages\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"arabic\",\n \"bengali\",\n \"greek\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 21,\n \"min\": 28,\n \"max\": 99,\n \"num_unique_values\": 15,\n \"samples\": [\n 90,\n 28,\n 76\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.29423382595625325,\n \"min\": 0.007773985589685248,\n \"max\": 0.8049417436721575,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.3767572633552015,\n 0.6662768031189084,\n 0.04246440844378989\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.799784066203956,\n \"min\": 0.026106696935300794,\n \"max\": 12.82857142857143,\n \"num_unique_values\": 18,\n \"samples\": [\n 2.3532934131736525,\n 4.792613636363637,\n 0.2553542009884679\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1722359278462858,\n \"min\": 0.4127906976744186,\n \"max\": 0.9814814814814815,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.8627450980392157,\n 0.8153846153846154,\n 0.9710144927536232\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.30157125562544634,\n \"min\": 0.08928163578007509,\n \"max\": 0.9994166695838832,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.875069029090396,\n 0.5306427852649345,\n 0.9927171409278328\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Languages | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" arabic | \n",
" 76 | \n",
" 0.376757 | \n",
" 2.353293 | \n",
" 0.862745 | \n",
" 0.875069 | \n",
"
\n",
" \n",
" bengali | \n",
" 74 | \n",
" 0.666277 | \n",
" 4.792614 | \n",
" 0.815385 | \n",
" 0.530643 | \n",
"
\n",
" \n",
" chinese | \n",
" 62 | \n",
" 0.353923 | \n",
" 9.975610 | \n",
" 0.573086 | \n",
" 0.398227 | \n",
"
\n",
" \n",
" cyrillic | \n",
" 95 | \n",
" 0.040299 | \n",
" 0.242165 | \n",
" 0.963636 | \n",
" 0.987005 | \n",
"
\n",
" \n",
" dutch | \n",
" 94 | \n",
" 0.048046 | \n",
" 0.280783 | \n",
" 0.942857 | \n",
" 0.992913 | \n",
"
\n",
" \n",
" english | \n",
" 97 | \n",
" 0.045094 | \n",
" 0.216625 | \n",
" 0.954545 | \n",
" 0.997309 | \n",
"
\n",
" \n",
" french | \n",
" 99 | \n",
" 0.007774 | \n",
" 0.026107 | \n",
" 0.979592 | \n",
" 0.999417 | \n",
"
\n",
" \n",
" german | \n",
" 96 | \n",
" 0.049643 | \n",
" 0.311236 | \n",
" 0.981481 | \n",
" 0.994753 | \n",
"
\n",
" \n",
" greek | \n",
" 96 | \n",
" 0.042464 | \n",
" 0.255354 | \n",
" 0.971014 | \n",
" 0.992717 | \n",
"
\n",
" \n",
" hebrew | \n",
" 44 | \n",
" 0.804942 | \n",
" 4.868712 | \n",
" 0.555556 | \n",
" 0.292718 | \n",
"
\n",
" \n",
" hindi | \n",
" 90 | \n",
" 0.250994 | \n",
" 1.209375 | \n",
" 0.876923 | \n",
" 0.975469 | \n",
"
\n",
" \n",
" japanese | \n",
" 92 | \n",
" 0.103740 | \n",
" 7.592593 | \n",
" 0.910798 | \n",
" 0.915534 | \n",
"
\n",
" \n",
" korean | \n",
" 28 | \n",
" 0.804641 | \n",
" 3.336268 | \n",
" 0.412791 | \n",
" 0.089282 | \n",
"
\n",
" \n",
" latin | \n",
" 97 | \n",
" 0.027611 | \n",
" 0.177043 | \n",
" 0.962963 | \n",
" 0.985059 | \n",
"
\n",
" \n",
" spanish | \n",
" 99 | \n",
" 0.019810 | \n",
" 0.092405 | \n",
" 0.937500 | \n",
" 0.996633 | \n",
"
\n",
" \n",
" thai | \n",
" 67 | \n",
" 0.463868 | \n",
" 12.828571 | \n",
" 0.678571 | \n",
" 0.659937 | \n",
"
\n",
" \n",
" urdu | \n",
" 49 | \n",
" 0.714204 | \n",
" 3.275132 | \n",
" 0.883721 | \n",
" 0.391585 | \n",
"
\n",
" \n",
" vietnamese | \n",
" 84 | \n",
" 0.132894 | \n",
" 0.561207 | \n",
" 0.960396 | \n",
" 0.949777 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Languages \n",
"arabic 76 0.376757 2.353293 0.862745 0.875069\n",
"bengali 74 0.666277 4.792614 0.815385 0.530643\n",
"chinese 62 0.353923 9.975610 0.573086 0.398227\n",
"cyrillic 95 0.040299 0.242165 0.963636 0.987005\n",
"dutch 94 0.048046 0.280783 0.942857 0.992913\n",
"english 97 0.045094 0.216625 0.954545 0.997309\n",
"french 99 0.007774 0.026107 0.979592 0.999417\n",
"german 96 0.049643 0.311236 0.981481 0.994753\n",
"greek 96 0.042464 0.255354 0.971014 0.992717\n",
"hebrew 44 0.804942 4.868712 0.555556 0.292718\n",
"hindi 90 0.250994 1.209375 0.876923 0.975469\n",
"japanese 92 0.103740 7.592593 0.910798 0.915534\n",
"korean 28 0.804641 3.336268 0.412791 0.089282\n",
"latin 97 0.027611 0.177043 0.962963 0.985059\n",
"spanish 99 0.019810 0.092405 0.937500 0.996633\n",
"thai 67 0.463868 12.828571 0.678571 0.659937\n",
"urdu 49 0.714204 3.275132 0.883721 0.391585\n",
"vietnamese 84 0.132894 0.561207 0.960396 0.949777"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: sonnet\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 18,\n \"fields\": [\n {\n \"column\": \"Languages\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"arabic\",\n \"bengali\",\n \"greek\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 23,\n \"min\": 23,\n \"max\": 99,\n \"num_unique_values\": 16,\n \"samples\": [\n 65,\n 67,\n 98\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3227746192270715,\n \"min\": 0.022942737959802808,\n \"max\": 0.8464106844741235,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.49609497032177446,\n 0.7703703703703704,\n 0.10898379970544919\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8.061524934035456,\n \"min\": 0.11464245175936436,\n \"max\": 24.121951219512194,\n \"num_unique_values\": 18,\n \"samples\": [\n 3.12375249500998,\n 5.488636363636363,\n 0.6869851729818781\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.21471218402735157,\n \"min\": 0.18821603927986907,\n \"max\": 0.9636363636363636,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.6805555555555556,\n 0.5977011494252874,\n 0.7528089887640449\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.340623867114622,\n \"min\": 0.03106848830006,\n \"max\": 0.9988109929760138,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.7658988355718965,\n 0.3788357860891555,\n 0.9797664355295872\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Languages | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" arabic | \n",
" 65 | \n",
" 0.496095 | \n",
" 3.123752 | \n",
" 0.680556 | \n",
" 0.765899 | \n",
"
\n",
" \n",
" bengali | \n",
" 67 | \n",
" 0.770370 | \n",
" 5.488636 | \n",
" 0.597701 | \n",
" 0.378836 | \n",
"
\n",
" \n",
" chinese | \n",
" 23 | \n",
" 0.846411 | \n",
" 24.121951 | \n",
" 0.188216 | \n",
" 0.031068 | \n",
"
\n",
" \n",
" cyrillic | \n",
" 90 | \n",
" 0.117815 | \n",
" 0.710826 | \n",
" 0.698630 | \n",
" 0.963953 | \n",
"
\n",
" \n",
" dutch | \n",
" 91 | \n",
" 0.110134 | \n",
" 0.715765 | \n",
" 0.957746 | \n",
" 0.991618 | \n",
"
\n",
" \n",
" english | \n",
" 98 | \n",
" 0.037843 | \n",
" 0.162469 | \n",
" 0.954545 | \n",
" 0.996665 | \n",
"
\n",
" \n",
" french | \n",
" 99 | \n",
" 0.022943 | \n",
" 0.114642 | \n",
" 0.941176 | \n",
" 0.998811 | \n",
"
\n",
" \n",
" german | \n",
" 94 | \n",
" 0.104686 | \n",
" 0.668539 | \n",
" 0.963636 | \n",
" 0.989543 | \n",
"
\n",
" \n",
" greek | \n",
" 92 | \n",
" 0.108984 | \n",
" 0.686985 | \n",
" 0.752809 | \n",
" 0.979766 | \n",
"
\n",
" \n",
" hebrew | \n",
" 51 | \n",
" 0.826436 | \n",
" 4.969325 | \n",
" 0.556962 | \n",
" 0.319149 | \n",
"
\n",
" \n",
" hindi | \n",
" 77 | \n",
" 0.579168 | \n",
" 2.823958 | \n",
" 0.694118 | \n",
" 0.806080 | \n",
"
\n",
" \n",
" japanese | \n",
" 67 | \n",
" 0.302760 | \n",
" 24.111111 | \n",
" 0.676647 | \n",
" 0.657504 | \n",
"
\n",
" \n",
" korean | \n",
" 28 | \n",
" 0.801266 | \n",
" 3.288732 | \n",
" 0.397183 | \n",
" 0.033994 | \n",
"
\n",
" \n",
" latin | \n",
" 95 | \n",
" 0.050274 | \n",
" 0.317121 | \n",
" 0.962963 | \n",
" 0.978361 | \n",
"
\n",
" \n",
" spanish | \n",
" 98 | \n",
" 0.027187 | \n",
" 0.143038 | \n",
" 0.877551 | \n",
" 0.997151 | \n",
"
\n",
" \n",
" thai | \n",
" 55 | \n",
" 0.660560 | \n",
" 18.328571 | \n",
" 0.747475 | \n",
" 0.388898 | \n",
"
\n",
" \n",
" urdu | \n",
" 52 | \n",
" 0.695227 | \n",
" 3.174603 | \n",
" 0.569231 | \n",
" 0.550558 | \n",
"
\n",
" \n",
" vietnamese | \n",
" 79 | \n",
" 0.243123 | \n",
" 1.062931 | \n",
" 0.872727 | \n",
" 0.893903 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Languages \n",
"arabic 65 0.496095 3.123752 0.680556 0.765899\n",
"bengali 67 0.770370 5.488636 0.597701 0.378836\n",
"chinese 23 0.846411 24.121951 0.188216 0.031068\n",
"cyrillic 90 0.117815 0.710826 0.698630 0.963953\n",
"dutch 91 0.110134 0.715765 0.957746 0.991618\n",
"english 98 0.037843 0.162469 0.954545 0.996665\n",
"french 99 0.022943 0.114642 0.941176 0.998811\n",
"german 94 0.104686 0.668539 0.963636 0.989543\n",
"greek 92 0.108984 0.686985 0.752809 0.979766\n",
"hebrew 51 0.826436 4.969325 0.556962 0.319149\n",
"hindi 77 0.579168 2.823958 0.694118 0.806080\n",
"japanese 67 0.302760 24.111111 0.676647 0.657504\n",
"korean 28 0.801266 3.288732 0.397183 0.033994\n",
"latin 95 0.050274 0.317121 0.962963 0.978361\n",
"spanish 98 0.027187 0.143038 0.877551 0.997151\n",
"thai 55 0.660560 18.328571 0.747475 0.388898\n",
"urdu 52 0.695227 3.174603 0.569231 0.550558\n",
"vietnamese 79 0.243123 1.062931 0.872727 0.893903"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: haiku\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 18,\n \"fields\": [\n {\n \"column\": \"Languages\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"arabic\",\n \"bengali\",\n \"greek\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 34,\n \"min\": 8,\n \"max\": 99,\n \"num_unique_values\": 16,\n \"samples\": [\n 36,\n 45,\n 99\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3595556787059576,\n \"min\": 0.023113528212100613,\n \"max\": 0.9941569282136895,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.7647610121836926,\n 0.8066276803118908,\n 0.7292587137947962\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15.710004299638811,\n \"min\": 0.1070528967254408,\n \"max\": 63.370370370370374,\n \"num_unique_values\": 18,\n \"samples\": [\n 4.874251497005988,\n 5.786931818181818,\n 4.85831960461285\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3191530126934353,\n \"min\": 0.013888888888888888,\n \"max\": 0.9571428571428572,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.4659090909090909,\n 0.5730337078651685,\n 0.3582089552238806\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.43168443386351163,\n \"min\": 0.0,\n \"max\": 0.9983834336252877,\n \"num_unique_values\": 17,\n \"samples\": [\n 0.30342133556900935,\n 0.2608202926447102,\n 0.9983834336252877\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Languages | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" arabic | \n",
" 36 | \n",
" 0.764761 | \n",
" 4.874251 | \n",
" 0.465909 | \n",
" 0.303421 | \n",
"
\n",
" \n",
" bengali | \n",
" 45 | \n",
" 0.806628 | \n",
" 5.786932 | \n",
" 0.573034 | \n",
" 0.260820 | \n",
"
\n",
" \n",
" chinese | \n",
" 14 | \n",
" 0.994157 | \n",
" 28.804878 | \n",
" 0.193059 | \n",
" 0.044445 | \n",
"
\n",
" \n",
" cyrillic | \n",
" 62 | \n",
" 0.489099 | \n",
" 3.149573 | \n",
" 0.566265 | \n",
" 0.800964 | \n",
"
\n",
" \n",
" dutch | \n",
" 91 | \n",
" 0.138048 | \n",
" 0.892980 | \n",
" 0.957143 | \n",
" 0.991231 | \n",
"
\n",
" \n",
" english | \n",
" 99 | \n",
" 0.023114 | \n",
" 0.107053 | \n",
" 0.933333 | \n",
" 0.998383 | \n",
"
\n",
" \n",
" french | \n",
" 82 | \n",
" 0.286500 | \n",
" 1.700341 | \n",
" 0.807018 | \n",
" 0.976977 | \n",
"
\n",
" \n",
" german | \n",
" 93 | \n",
" 0.104860 | \n",
" 0.670787 | \n",
" 0.946429 | \n",
" 0.990169 | \n",
"
\n",
" \n",
" greek | \n",
" 24 | \n",
" 0.729259 | \n",
" 4.858320 | \n",
" 0.358209 | \n",
" 0.248820 | \n",
"
\n",
" \n",
" hebrew | \n",
" 36 | \n",
" 0.786260 | \n",
" 4.754601 | \n",
" 0.537500 | \n",
" 0.159440 | \n",
"
\n",
" \n",
" hindi | \n",
" 8 | \n",
" 0.954821 | \n",
" 4.756250 | \n",
" 0.032258 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" japanese | \n",
" 30 | \n",
" 0.789403 | \n",
" 63.370370 | \n",
" 0.349680 | \n",
" 0.123645 | \n",
"
\n",
" \n",
" korean | \n",
" 20 | \n",
" 0.846414 | \n",
" 3.521127 | \n",
" 0.280443 | \n",
" 0.027006 | \n",
"
\n",
" \n",
" latin | \n",
" 97 | \n",
" 0.039073 | \n",
" 0.256809 | \n",
" 0.894737 | \n",
" 0.982940 | \n",
"
\n",
" \n",
" spanish | \n",
" 98 | \n",
" 0.044046 | \n",
" 0.243038 | \n",
" 0.860000 | \n",
" 0.995875 | \n",
"
\n",
" \n",
" thai | \n",
" 28 | \n",
" 0.857252 | \n",
" 23.671429 | \n",
" 0.611111 | \n",
" 0.153045 | \n",
"
\n",
" \n",
" urdu | \n",
" 8 | \n",
" 0.951121 | \n",
" 4.375661 | \n",
" 0.013889 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" vietnamese | \n",
" 79 | \n",
" 0.496900 | \n",
" 2.204310 | \n",
" 0.932039 | \n",
" 0.915511 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Languages \n",
"arabic 36 0.764761 4.874251 0.465909 0.303421\n",
"bengali 45 0.806628 5.786932 0.573034 0.260820\n",
"chinese 14 0.994157 28.804878 0.193059 0.044445\n",
"cyrillic 62 0.489099 3.149573 0.566265 0.800964\n",
"dutch 91 0.138048 0.892980 0.957143 0.991231\n",
"english 99 0.023114 0.107053 0.933333 0.998383\n",
"french 82 0.286500 1.700341 0.807018 0.976977\n",
"german 93 0.104860 0.670787 0.946429 0.990169\n",
"greek 24 0.729259 4.858320 0.358209 0.248820\n",
"hebrew 36 0.786260 4.754601 0.537500 0.159440\n",
"hindi 8 0.954821 4.756250 0.032258 0.000000\n",
"japanese 30 0.789403 63.370370 0.349680 0.123645\n",
"korean 20 0.846414 3.521127 0.280443 0.027006\n",
"latin 97 0.039073 0.256809 0.894737 0.982940\n",
"spanish 98 0.044046 0.243038 0.860000 0.995875\n",
"thai 28 0.857252 23.671429 0.611111 0.153045\n",
"urdu 8 0.951121 4.375661 0.013889 0.000000\n",
"vietnamese 79 0.496900 2.204310 0.932039 0.915511"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: gpt4_turbo\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 18,\n \"fields\": [\n {\n \"column\": \"Languages\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"arabic\",\n \"bengali\",\n \"greek\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 25,\n \"min\": 21,\n \"max\": 100,\n \"num_unique_values\": 17,\n \"samples\": [\n 55,\n 78,\n 99\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3877421701249285,\n \"min\": 0.0027869709109911165,\n \"max\": 1.2734177215189872,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.6807247735082786,\n 0.7037037037037037,\n 0.08051055473735887\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 10.591607212508988,\n \"min\": 0.008988764044943821,\n \"max\": 37.44444444444444,\n \"num_unique_values\": 18,\n \"samples\": [\n 4.323353293413174,\n 5.065340909090909,\n 0.5090609555189456\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2609830573030153,\n \"min\": 0.013071895424836602,\n \"max\": 1.0,\n \"num_unique_values\": 17,\n \"samples\": [\n 0.8333333333333334,\n 0.6265060240963856,\n 0.9333333333333333\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.33479822045097335,\n \"min\": 0.0,\n \"max\": 0.9999206978588416,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.7219781551814013,\n 0.5410799913297344,\n 0.9886639701758357\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Languages | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" arabic | \n",
" 55 | \n",
" 0.680725 | \n",
" 4.323353 | \n",
" 0.833333 | \n",
" 0.721978 | \n",
"
\n",
" \n",
" bengali | \n",
" 78 | \n",
" 0.703704 | \n",
" 5.065341 | \n",
" 0.626506 | \n",
" 0.541080 | \n",
"
\n",
" \n",
" chinese | \n",
" 23 | \n",
" 0.824708 | \n",
" 23.756098 | \n",
" 0.205575 | \n",
" 0.029604 | \n",
"
\n",
" \n",
" cyrillic | \n",
" 90 | \n",
" 0.095353 | \n",
" 0.584046 | \n",
" 0.697368 | \n",
" 0.963841 | \n",
"
\n",
" \n",
" dutch | \n",
" 96 | \n",
" 0.093724 | \n",
" 0.604143 | \n",
" 0.944444 | \n",
" 0.990596 | \n",
"
\n",
" \n",
" english | \n",
" 99 | \n",
" 0.021074 | \n",
" 0.090680 | \n",
" 0.933333 | \n",
" 0.998749 | \n",
"
\n",
" \n",
" french | \n",
" 95 | \n",
" 0.083239 | \n",
" 0.467650 | \n",
" 0.842105 | \n",
" 0.989760 | \n",
"
\n",
" \n",
" german | \n",
" 100 | \n",
" 0.002787 | \n",
" 0.008989 | \n",
" 1.000000 | \n",
" 0.999921 | \n",
"
\n",
" \n",
" greek | \n",
" 93 | \n",
" 0.080511 | \n",
" 0.509061 | \n",
" 0.930556 | \n",
" 0.988664 | \n",
"
\n",
" \n",
" hebrew | \n",
" 54 | \n",
" 0.857372 | \n",
" 5.174233 | \n",
" 0.681159 | \n",
" 0.379806 | \n",
"
\n",
" \n",
" hindi | \n",
" 72 | \n",
" 0.628320 | \n",
" 3.112500 | \n",
" 0.907692 | \n",
" 0.697304 | \n",
"
\n",
" \n",
" japanese | \n",
" 64 | \n",
" 0.461264 | \n",
" 37.444444 | \n",
" 0.644737 | \n",
" 0.550642 | \n",
"
\n",
" \n",
" korean | \n",
" 21 | \n",
" 1.273418 | \n",
" 5.221831 | \n",
" 0.013072 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" latin | \n",
" 92 | \n",
" 0.191456 | \n",
" 1.387160 | \n",
" 0.896552 | \n",
" 0.927789 | \n",
"
\n",
" \n",
" spanish | \n",
" 99 | \n",
" 0.028030 | \n",
" 0.139241 | \n",
" 0.843137 | \n",
" 0.997357 | \n",
"
\n",
" \n",
" thai | \n",
" 56 | \n",
" 0.852417 | \n",
" 23.771429 | \n",
" 0.728155 | \n",
" 0.360498 | \n",
"
\n",
" \n",
" urdu | \n",
" 52 | \n",
" 0.714204 | \n",
" 3.267196 | \n",
" 0.622951 | \n",
" 0.720733 | \n",
"
\n",
" \n",
" vietnamese | \n",
" 85 | \n",
" 0.246610 | \n",
" 1.063793 | \n",
" 0.933333 | \n",
" 0.947641 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Languages \n",
"arabic 55 0.680725 4.323353 0.833333 0.721978\n",
"bengali 78 0.703704 5.065341 0.626506 0.541080\n",
"chinese 23 0.824708 23.756098 0.205575 0.029604\n",
"cyrillic 90 0.095353 0.584046 0.697368 0.963841\n",
"dutch 96 0.093724 0.604143 0.944444 0.990596\n",
"english 99 0.021074 0.090680 0.933333 0.998749\n",
"french 95 0.083239 0.467650 0.842105 0.989760\n",
"german 100 0.002787 0.008989 1.000000 0.999921\n",
"greek 93 0.080511 0.509061 0.930556 0.988664\n",
"hebrew 54 0.857372 5.174233 0.681159 0.379806\n",
"hindi 72 0.628320 3.112500 0.907692 0.697304\n",
"japanese 64 0.461264 37.444444 0.644737 0.550642\n",
"korean 21 1.273418 5.221831 0.013072 0.000000\n",
"latin 92 0.191456 1.387160 0.896552 0.927789\n",
"spanish 99 0.028030 0.139241 0.843137 0.997357\n",
"thai 56 0.852417 23.771429 0.728155 0.360498\n",
"urdu 52 0.714204 3.267196 0.622951 0.720733\n",
"vietnamese 85 0.246610 1.063793 0.933333 0.947641"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: gpt4o\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 18,\n \"fields\": [\n {\n \"column\": \"Languages\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"arabic\",\n \"bengali\",\n \"greek\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12,\n \"min\": 61,\n \"max\": 100,\n \"num_unique_values\": 13,\n \"samples\": [\n 81,\n 93,\n 96\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2225394916433416,\n \"min\": 0.004305461137548153,\n \"max\": 0.7400805060379528,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.06310527960012496,\n 0.4619883040935672,\n 0.04639175257731959\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.0907817978978644,\n \"min\": 0.003778337531486146,\n \"max\": 5.957142857142857,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.34530938123752497,\n 3.3238636363636362,\n 0.2355848434925865\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.14751181897243867,\n \"min\": 0.5068119891008175,\n \"max\": 0.9782608695652174,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.6712328767123288,\n 0.9016393442622951,\n 0.7282608695652174\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.11483698205261142,\n \"min\": 0.5965830453920575,\n \"max\": 0.9999999999999993,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.9809529877631776,\n 0.8236557439357621,\n 0.9931121552504696\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Languages | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" arabic | \n",
" 96 | \n",
" 0.063105 | \n",
" 0.345309 | \n",
" 0.671233 | \n",
" 0.980953 | \n",
"
\n",
" \n",
" bengali | \n",
" 86 | \n",
" 0.461988 | \n",
" 3.323864 | \n",
" 0.901639 | \n",
" 0.823656 | \n",
"
\n",
" \n",
" chinese | \n",
" 73 | \n",
" 0.194491 | \n",
" 5.365854 | \n",
" 0.759690 | \n",
" 0.596583 | \n",
"
\n",
" \n",
" cyrillic | \n",
" 97 | \n",
" 0.041621 | \n",
" 0.216524 | \n",
" 0.726027 | \n",
" 0.988776 | \n",
"
\n",
" \n",
" dutch | \n",
" 99 | \n",
" 0.030452 | \n",
" 0.177215 | \n",
" 0.906667 | \n",
" 0.998459 | \n",
"
\n",
" \n",
" english | \n",
" 100 | \n",
" 0.004305 | \n",
" 0.003778 | \n",
" 0.976744 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" french | \n",
" 98 | \n",
" 0.047023 | \n",
" 0.264472 | \n",
" 0.923077 | \n",
" 0.999005 | \n",
"
\n",
" \n",
" german | \n",
" 99 | \n",
" 0.028566 | \n",
" 0.141573 | \n",
" 0.946429 | \n",
" 0.997423 | \n",
"
\n",
" \n",
" greek | \n",
" 98 | \n",
" 0.046392 | \n",
" 0.235585 | \n",
" 0.728261 | \n",
" 0.993112 | \n",
"
\n",
" \n",
" hebrew | \n",
" 68 | \n",
" 0.453194 | \n",
" 2.714110 | \n",
" 0.522727 | \n",
" 0.775091 | \n",
"
\n",
" \n",
" hindi | \n",
" 91 | \n",
" 0.348672 | \n",
" 1.659375 | \n",
" 0.797297 | \n",
" 0.956740 | \n",
"
\n",
" \n",
" japanese | \n",
" 93 | \n",
" 0.076135 | \n",
" 5.296296 | \n",
" 0.935714 | \n",
" 0.920010 | \n",
"
\n",
" \n",
" korean | \n",
" 61 | \n",
" 0.525316 | \n",
" 2.133803 | \n",
" 0.506812 | \n",
" 0.822502 | \n",
"
\n",
" \n",
" latin | \n",
" 98 | \n",
" 0.046106 | \n",
" 0.280156 | \n",
" 0.928571 | \n",
" 0.985503 | \n",
"
\n",
" \n",
" spanish | \n",
" 99 | \n",
" 0.007587 | \n",
" 0.022785 | \n",
" 0.978261 | \n",
" 0.997419 | \n",
"
\n",
" \n",
" thai | \n",
" 81 | \n",
" 0.217812 | \n",
" 5.957143 | \n",
" 0.777778 | \n",
" 0.860192 | \n",
"
\n",
" \n",
" urdu | \n",
" 70 | \n",
" 0.740081 | \n",
" 3.375661 | \n",
" 0.745098 | \n",
" 0.777132 | \n",
"
\n",
" \n",
" vietnamese | \n",
" 98 | \n",
" 0.029640 | \n",
" 0.112931 | \n",
" 0.951456 | \n",
" 0.997747 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Languages \n",
"arabic 96 0.063105 0.345309 0.671233 0.980953\n",
"bengali 86 0.461988 3.323864 0.901639 0.823656\n",
"chinese 73 0.194491 5.365854 0.759690 0.596583\n",
"cyrillic 97 0.041621 0.216524 0.726027 0.988776\n",
"dutch 99 0.030452 0.177215 0.906667 0.998459\n",
"english 100 0.004305 0.003778 0.976744 1.000000\n",
"french 98 0.047023 0.264472 0.923077 0.999005\n",
"german 99 0.028566 0.141573 0.946429 0.997423\n",
"greek 98 0.046392 0.235585 0.728261 0.993112\n",
"hebrew 68 0.453194 2.714110 0.522727 0.775091\n",
"hindi 91 0.348672 1.659375 0.797297 0.956740\n",
"japanese 93 0.076135 5.296296 0.935714 0.920010\n",
"korean 61 0.525316 2.133803 0.506812 0.822502\n",
"latin 98 0.046106 0.280156 0.928571 0.985503\n",
"spanish 99 0.007587 0.022785 0.978261 0.997419\n",
"thai 81 0.217812 5.957143 0.777778 0.860192\n",
"urdu 70 0.740081 3.375661 0.745098 0.777132\n",
"vietnamese 98 0.029640 0.112931 0.951456 0.997747"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: vision\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 18,\n \"fields\": [\n {\n \"column\": \"Languages\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"arabic\",\n \"bengali\",\n \"greek\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 24,\n \"min\": 25,\n \"max\": 100,\n \"num_unique_values\": 8,\n \"samples\": [\n 25,\n 75,\n 100\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1993613749398623,\n \"min\": 0.020288206295032234,\n \"max\": 0.6714158504007124,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.030927835051546393,\n 0.0354775828460039,\n 0.03043691703485518\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12.85374771439189,\n \"min\": 0.075,\n \"max\": 52.96296296296296,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.15568862275449102,\n 0.1534090909090909,\n 0.16803953871499178\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.02895347975962444,\n \"min\": 0.88,\n \"max\": 0.9953917050691244,\n \"num_unique_values\": 17,\n \"samples\": [\n 0.98,\n 0.9827586206896551,\n 0.9767441860465116\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3480704012555531,\n \"min\": 0.0042342482556298085,\n \"max\": 1.0000000000000036,\n \"num_unique_values\": 17,\n \"samples\": [\n 1.0000000000000036,\n 1.0000000000000018,\n 0.9999999999999993\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Languages | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" arabic | \n",
" 100 | \n",
" 0.030928 | \n",
" 0.155689 | \n",
" 0.980000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" bengali | \n",
" 100 | \n",
" 0.035478 | \n",
" 0.153409 | \n",
" 0.982759 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" chinese | \n",
" 25 | \n",
" 0.627713 | \n",
" 18.024390 | \n",
" 0.908832 | \n",
" 0.004234 | \n",
"
\n",
" \n",
" cyrillic | \n",
" 100 | \n",
" 0.037657 | \n",
" 0.212251 | \n",
" 0.962963 | \n",
" 0.999906 | \n",
"
\n",
" \n",
" dutch | \n",
" 100 | \n",
" 0.025207 | \n",
" 0.140391 | \n",
" 0.957746 | \n",
" 0.999881 | \n",
"
\n",
" \n",
" english | \n",
" 100 | \n",
" 0.024473 | \n",
" 0.107053 | \n",
" 0.976744 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" french | \n",
" 100 | \n",
" 0.020288 | \n",
" 0.094211 | \n",
" 0.959184 | \n",
" 0.999753 | \n",
"
\n",
" \n",
" german | \n",
" 100 | \n",
" 0.041282 | \n",
" 0.225843 | \n",
" 0.981132 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" greek | \n",
" 100 | \n",
" 0.030437 | \n",
" 0.168040 | \n",
" 0.985294 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" hebrew | \n",
" 98 | \n",
" 0.046806 | \n",
" 0.240491 | \n",
" 0.979167 | \n",
" 0.992660 | \n",
"
\n",
" \n",
" hindi | \n",
" 99 | \n",
" 0.038486 | \n",
" 0.167708 | \n",
" 0.966667 | \n",
" 0.998818 | \n",
"
\n",
" \n",
" japanese | \n",
" 29 | \n",
" 0.671416 | \n",
" 52.962963 | \n",
" 0.949398 | \n",
" 0.031592 | \n",
"
\n",
" \n",
" korean | \n",
" 75 | \n",
" 0.173840 | \n",
" 0.686620 | \n",
" 0.995392 | \n",
" 0.448421 | \n",
"
\n",
" \n",
" latin | \n",
" 100 | \n",
" 0.027611 | \n",
" 0.169261 | \n",
" 0.981132 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" spanish | \n",
" 99 | \n",
" 0.060906 | \n",
" 0.336709 | \n",
" 0.880000 | \n",
" 0.997281 | \n",
"
\n",
" \n",
" thai | \n",
" 59 | \n",
" 0.200000 | \n",
" 5.142857 | \n",
" 0.974684 | \n",
" 0.373060 | \n",
"
\n",
" \n",
" urdu | \n",
" 96 | \n",
" 0.038528 | \n",
" 0.153439 | \n",
" 0.975000 | \n",
" 0.954732 | \n",
"
\n",
" \n",
" vietnamese | \n",
" 99 | \n",
" 0.021310 | \n",
" 0.075000 | \n",
" 0.989899 | \n",
" 0.998874 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Languages \n",
"arabic 100 0.030928 0.155689 0.980000 1.000000\n",
"bengali 100 0.035478 0.153409 0.982759 1.000000\n",
"chinese 25 0.627713 18.024390 0.908832 0.004234\n",
"cyrillic 100 0.037657 0.212251 0.962963 0.999906\n",
"dutch 100 0.025207 0.140391 0.957746 0.999881\n",
"english 100 0.024473 0.107053 0.976744 1.000000\n",
"french 100 0.020288 0.094211 0.959184 0.999753\n",
"german 100 0.041282 0.225843 0.981132 1.000000\n",
"greek 100 0.030437 0.168040 0.985294 1.000000\n",
"hebrew 98 0.046806 0.240491 0.979167 0.992660\n",
"hindi 99 0.038486 0.167708 0.966667 0.998818\n",
"japanese 29 0.671416 52.962963 0.949398 0.031592\n",
"korean 75 0.173840 0.686620 0.995392 0.448421\n",
"latin 100 0.027611 0.169261 0.981132 1.000000\n",
"spanish 99 0.060906 0.336709 0.880000 0.997281\n",
"thai 59 0.200000 5.142857 0.974684 0.373060\n",
"urdu 96 0.038528 0.153439 0.975000 0.954732\n",
"vietnamese 99 0.021310 0.075000 0.989899 0.998874"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: tesseract\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 18,\n \"fields\": [\n {\n \"column\": \"Languages\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"arabic\",\n \"bengali\",\n \"greek\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 34,\n \"min\": 7,\n \"max\": 94,\n \"num_unique_values\": 14,\n \"samples\": [\n 24,\n 88,\n 18\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3945193578603726,\n \"min\": 0.045885475919605616,\n \"max\": 1.0244879786286731,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.8706654170571696,\n 0.8631578947368421,\n 0.8576337751595483\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 20.218444781185273,\n \"min\": 0.188422247446084,\n \"max\": 84.37037037037037,\n \"num_unique_values\": 18,\n \"samples\": [\n 5.500998003992016,\n 6.215909090909091,\n 5.660626029654036\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.34968392223635286,\n \"min\": 0.014336917562724014,\n \"max\": 0.8936170212765957,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.09174311926605505,\n 0.027777777777777776,\n 0.13385826771653545\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4392089361824662,\n \"min\": 0.0,\n \"max\": 0.9768747457106202,\n \"num_unique_values\": 12,\n \"samples\": [\n 0.02020536020703633,\n 0.913915025058188,\n 0.0006574697271499722\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Languages | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" arabic | \n",
" 18 | \n",
" 0.870665 | \n",
" 5.500998 | \n",
" 0.091743 | \n",
" 0.000657 | \n",
"
\n",
" \n",
" bengali | \n",
" 18 | \n",
" 0.863158 | \n",
" 6.215909 | \n",
" 0.027778 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" chinese | \n",
" 10 | \n",
" 0.991653 | \n",
" 28.609756 | \n",
" 0.025974 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" cyrillic | \n",
" 15 | \n",
" 0.881964 | \n",
" 5.547009 | \n",
" 0.051282 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" dutch | \n",
" 86 | \n",
" 0.089156 | \n",
" 0.455696 | \n",
" 0.853333 | \n",
" 0.967344 | \n",
"
\n",
" \n",
" english | \n",
" 94 | \n",
" 0.052799 | \n",
" 0.225441 | \n",
" 0.893617 | \n",
" 0.976875 | \n",
"
\n",
" \n",
" french | \n",
" 93 | \n",
" 0.045885 | \n",
" 0.188422 | \n",
" 0.807692 | \n",
" 0.966458 | \n",
"
\n",
" \n",
" german | \n",
" 76 | \n",
" 0.172792 | \n",
" 1.053933 | \n",
" 0.770492 | \n",
" 0.753715 | \n",
"
\n",
" \n",
" greek | \n",
" 16 | \n",
" 0.857634 | \n",
" 5.660626 | \n",
" 0.133858 | \n",
" 0.004431 | \n",
"
\n",
" \n",
" hebrew | \n",
" 21 | \n",
" 0.857372 | \n",
" 5.159509 | \n",
" 0.178218 | \n",
" 0.004796 | \n",
"
\n",
" \n",
" hindi | \n",
" 18 | \n",
" 0.876177 | \n",
" 4.308333 | \n",
" 0.032258 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" japanese | \n",
" 15 | \n",
" 1.024488 | \n",
" 84.370370 | \n",
" 0.025862 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" korean | \n",
" 24 | \n",
" 0.886920 | \n",
" 3.642606 | \n",
" 0.014337 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" latin | \n",
" 75 | \n",
" 0.131545 | \n",
" 0.817121 | \n",
" 0.774194 | \n",
" 0.888726 | \n",
"
\n",
" \n",
" spanish | \n",
" 88 | \n",
" 0.093151 | \n",
" 0.470886 | \n",
" 0.622951 | \n",
" 0.913915 | \n",
"
\n",
" \n",
" thai | \n",
" 7 | \n",
" 0.944020 | \n",
" 26.264286 | \n",
" 0.163934 | \n",
" 0.020205 | \n",
"
\n",
" \n",
" urdu | \n",
" 10 | \n",
" 0.939620 | \n",
" 4.293651 | \n",
" 0.022472 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" vietnamese | \n",
" 56 | \n",
" 0.397714 | \n",
" 1.718103 | \n",
" 0.327586 | \n",
" 0.119540 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Languages \n",
"arabic 18 0.870665 5.500998 0.091743 0.000657\n",
"bengali 18 0.863158 6.215909 0.027778 0.000000\n",
"chinese 10 0.991653 28.609756 0.025974 0.000000\n",
"cyrillic 15 0.881964 5.547009 0.051282 0.000000\n",
"dutch 86 0.089156 0.455696 0.853333 0.967344\n",
"english 94 0.052799 0.225441 0.893617 0.976875\n",
"french 93 0.045885 0.188422 0.807692 0.966458\n",
"german 76 0.172792 1.053933 0.770492 0.753715\n",
"greek 16 0.857634 5.660626 0.133858 0.004431\n",
"hebrew 21 0.857372 5.159509 0.178218 0.004796\n",
"hindi 18 0.876177 4.308333 0.032258 0.000000\n",
"japanese 15 1.024488 84.370370 0.025862 0.000000\n",
"korean 24 0.886920 3.642606 0.014337 0.000000\n",
"latin 75 0.131545 0.817121 0.774194 0.888726\n",
"spanish 88 0.093151 0.470886 0.622951 0.913915\n",
"thai 7 0.944020 26.264286 0.163934 0.020205\n",
"urdu 10 0.939620 4.293651 0.022472 0.000000\n",
"vietnamese 56 0.397714 1.718103 0.327586 0.119540"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
}
],
"source": [
"# Score every OCR model on every language, then show one table per model.\n",
"evaluation_results = evaluate_ocr_models_for_different_languages()\n",
"\n",
"for model in evaluation_results:\n",
"    df = evaluation_results[model]\n",
"    print(f\"Model: {model}\")\n",
"    display(df)\n",
"    print(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"id": "4YwFT6BjEhwj"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def plot_languages(models_dfs, evaluation_metric):\n",
"    \"\"\"Draw one bar chart per OCR model for a single evaluation metric.\n",
"\n",
"    Args:\n",
"        models_dfs: Mapping of model name to a DataFrame. The DataFrame index\n",
"            supplies the x-axis categories and its columns are metric names.\n",
"        evaluation_metric: Name of the column to plot, e.g. \"Fuzzy Score\".\n",
"\n",
"    A model whose DataFrame lacks the metric gets a 'Metric Not Available'\n",
"    placeholder panel instead of bars. The figure is displayed with\n",
"    plt.show(); nothing is returned.\n",
"    \"\"\"\n",
"    models = list(models_dfs.keys())\n",
"\n",
"    # Keep the 4x2 layout as a minimum so existing figures look the same, but\n",
"    # add rows when there are more than eight models so no panel index can\n",
"    # run past the grid.\n",
"    n_cols = 2\n",
"    n_rows = max(4, -(-len(models) // n_cols))  # ceil division\n",
"    fig, axs = plt.subplots(n_rows, n_cols, figsize=(25, 5 * n_rows))\n",
"    fig.suptitle(f'{evaluation_metric} for Different OCR Models', fontsize=16)\n",
"\n",
"    # Row-major flattening: panel i sits at row i // 2, column i % 2.\n",
"    panels = axs.ravel()\n",
"\n",
"    for ax, model in zip(panels, models):\n",
"        df = models_dfs[model]\n",
"\n",
"        if evaluation_metric in df.columns:\n",
"            ax.bar(df.index, df[evaluation_metric], color=\"#5675a8\")\n",
"        else:\n",
"            ax.text(0.5, 0.5, 'Metric Not Available', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)\n",
"\n",
"        ax.set_title(model)\n",
"        ax.set_xlabel('Languages')\n",
"        ax.set_ylabel(evaluation_metric)\n",
"\n",
"    # Drop every panel that has no model, not just the first six slots, so no\n",
"    # empty frames are left at the bottom of the figure.\n",
"    for unused_ax in panels[len(models):]:\n",
"        fig.delaxes(unused_ax)\n",
"\n",
"    plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Leave room for the suptitle\n",
"    plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 816
},
"id": "HfI2PsqsEmuE",
"outputId": "0e643da5-f1b0-402e-882a-6f6a17ba4209"
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plot_languages(evaluation_results, \"Fuzzy Score\")"
]
},
{
"cell_type": "code",
"source": [
"plot_languages(evaluation_results, \"Fuzzy Score\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 943
},
"collapsed": true,
"id": "kFi32CB1Bvt8",
"outputId": "7e4cbfef-0d86-4849-fb23-09a921742d28"
},
"execution_count": 31,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# Score every OCR model on every language, then show one table per model.\n",
"evaluation_results = evaluate_ocr_models_for_different_languages()\n",
"\n",
"for model in evaluation_results:\n",
"    df = evaluation_results[model]\n",
"    print(f\"Model: {model}\")\n",
"    display(df)\n",
"    print(\"\\n\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 697
},
"collapsed": true,
"id": "UuWDxHtjBy06",
"outputId": "0ced273e-cc3e-43d4-cbd1-2026d925fb5b"
},
"execution_count": 28,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Model: vision\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Languages \n",
"arabic 100 0.030928 0.155689 0.980000 1.000000\n",
"bengali 100 0.035478 0.153409 0.982759 1.000000\n",
"chinese 24 0.604341 17.268293 0.980057 0.006081\n",
"cyrillic 100 0.037657 0.212251 0.962963 0.999906\n",
"dutch 100 0.025207 0.140391 0.957746 0.999881\n",
"english 100 0.024473 0.107053 0.976744 1.000000\n",
"french 100 0.020288 0.094211 0.959184 0.999753\n",
"german 100 0.041282 0.225843 0.981132 1.000000\n",
"greek 100 0.030437 0.168040 0.985294 1.000000\n",
"hebrew 98 0.046806 0.240491 0.979167 0.992660\n",
"hindi 99 0.038486 0.167708 0.966667 0.998818\n",
"japanese 29 0.674087 53.259259 0.946988 0.034352\n",
"korean 75 0.173840 0.686620 0.995392 0.448421\n",
"latin 100 0.027611 0.169261 0.981132 1.000000\n",
"spanish 99 0.060906 0.336709 0.880000 0.997281\n",
"thai 59 0.200000 5.142857 0.974684 0.373060\n",
"urdu 96 0.038528 0.153439 0.975000 0.954732\n",
"vietnamese 99 0.021310 0.075000 0.989899 0.998874"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Languages | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" arabic | \n",
" 100 | \n",
" 0.030928 | \n",
" 0.155689 | \n",
" 0.980000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" bengali | \n",
" 100 | \n",
" 0.035478 | \n",
" 0.153409 | \n",
" 0.982759 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" chinese | \n",
" 24 | \n",
" 0.604341 | \n",
" 17.268293 | \n",
" 0.980057 | \n",
" 0.006081 | \n",
"
\n",
" \n",
" cyrillic | \n",
" 100 | \n",
" 0.037657 | \n",
" 0.212251 | \n",
" 0.962963 | \n",
" 0.999906 | \n",
"
\n",
" \n",
" dutch | \n",
" 100 | \n",
" 0.025207 | \n",
" 0.140391 | \n",
" 0.957746 | \n",
" 0.999881 | \n",
"
\n",
" \n",
" english | \n",
" 100 | \n",
" 0.024473 | \n",
" 0.107053 | \n",
" 0.976744 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" french | \n",
" 100 | \n",
" 0.020288 | \n",
" 0.094211 | \n",
" 0.959184 | \n",
" 0.999753 | \n",
"
\n",
" \n",
" german | \n",
" 100 | \n",
" 0.041282 | \n",
" 0.225843 | \n",
" 0.981132 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" greek | \n",
" 100 | \n",
" 0.030437 | \n",
" 0.168040 | \n",
" 0.985294 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" hebrew | \n",
" 98 | \n",
" 0.046806 | \n",
" 0.240491 | \n",
" 0.979167 | \n",
" 0.992660 | \n",
"
\n",
" \n",
" hindi | \n",
" 99 | \n",
" 0.038486 | \n",
" 0.167708 | \n",
" 0.966667 | \n",
" 0.998818 | \n",
"
\n",
" \n",
" japanese | \n",
" 29 | \n",
" 0.674087 | \n",
" 53.259259 | \n",
" 0.946988 | \n",
" 0.034352 | \n",
"
\n",
" \n",
" korean | \n",
" 75 | \n",
" 0.173840 | \n",
" 0.686620 | \n",
" 0.995392 | \n",
" 0.448421 | \n",
"
\n",
" \n",
" latin | \n",
" 100 | \n",
" 0.027611 | \n",
" 0.169261 | \n",
" 0.981132 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" spanish | \n",
" 99 | \n",
" 0.060906 | \n",
" 0.336709 | \n",
" 0.880000 | \n",
" 0.997281 | \n",
"
\n",
" \n",
" thai | \n",
" 59 | \n",
" 0.200000 | \n",
" 5.142857 | \n",
" 0.974684 | \n",
" 0.373060 | \n",
"
\n",
" \n",
" urdu | \n",
" 96 | \n",
" 0.038528 | \n",
" 0.153439 | \n",
" 0.975000 | \n",
" 0.954732 | \n",
"
\n",
" \n",
" vietnamese | \n",
" 99 | \n",
" 0.021310 | \n",
" 0.075000 | \n",
" 0.989899 | \n",
" 0.998874 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 18,\n \"fields\": [\n {\n \"column\": \"Languages\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"arabic\",\n \"bengali\",\n \"greek\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 24,\n \"min\": 24,\n \"max\": 100,\n \"num_unique_values\": 8,\n \"samples\": [\n 24,\n 75,\n 100\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.19634747342347822,\n \"min\": 0.020288206295032234,\n \"max\": 0.674087266251113,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.030927835051546393,\n 0.0354775828460039,\n 0.03043691703485518\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12.873922557853087,\n \"min\": 0.075,\n \"max\": 53.25925925925926,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.15568862275449102,\n 0.1534090909090909,\n 0.16803953871499178\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.02545892435154941,\n \"min\": 0.88,\n \"max\": 0.9953917050691244,\n \"num_unique_values\": 17,\n \"samples\": [\n 0.98,\n 0.9827586206896551,\n 0.9767441860465116\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.34744659110653675,\n \"min\": 0.006081103291140964,\n \"max\": 1.0000000000000036,\n \"num_unique_values\": 17,\n \"samples\": [\n 1.0000000000000036,\n 1.0000000000000018,\n 0.9999999999999993\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LRy9xCV6O3Nb"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"\n",
"def evaluate_ocr_models_for_different_fonts(num_docs, starting_font, increment_font):\n",
"    \"\"\"Score each OCR model's saved output against the font reference text.\n",
"\n",
"    For every model and every font size from 1 to 11, the file\n",
"    /content/{model}_font_output_{font_size}.txt is compared with\n",
"    /content/font-reference.txt using each evaluation metric. A missing\n",
"    output file is printed once and recorded as None for every metric.\n",
"\n",
"    Args:\n",
"        num_docs: Currently unused; kept so existing calls keep working.\n",
"        starting_font: Currently unused; kept so existing calls keep working.\n",
"        increment_font: Currently unused; kept so existing calls keep working.\n",
"\n",
"    Returns:\n",
"        dict mapping model name to a DataFrame indexed by 'Font Size' with one\n",
"        column per metric.\n",
"    \"\"\"\n",
"    # Only the keys are used below (to build the output file names); the\n",
"    # extractor functions themselves are never called here.\n",
"    ocr_models = {\n",
"        # \"gemini_flash\": extract_text_gemini,\n",
"        \"opus\": extract_text_opus,\n",
"        \"sonnet\": extract_text_sonnet,\n",
"        \"haiku\": extract_text_haiku,\n",
"        \"gpt4_turbo\": extract_text_gpt,\n",
"        \"gpt4o\": extract_text_gpt,\n",
"        \"vision\": extract_text_vision,\n",
"        \"tesseract\": extract_text_tesseract,\n",
"    }\n",
"\n",
"    evaluation_metrics = {\n",
"        \"Fuzzy Score\": calculate_fuzzy_score,\n",
"        \"CER\": calculate_cer,\n",
"        \"WER\": calculate_wer,\n",
"        # \"BLEU\": calculate_bleu,\n",
"        \"Jaccard Index\": calculate_jaccard_index,\n",
"        \"Cosine Similarity\": calculate_cosine_similarity\n",
"    }\n",
"\n",
"    reference_path = \"/content/font-reference.txt\"\n",
"    # NOTE: the sizes are fixed to 1..11. An earlier variant derived them as\n",
"    # starting_font + i * increment_font for i in range(num_docs).\n",
"    font_sizes = list(range(1, 12))\n",
"\n",
"    results = {model: {metric: [] for metric in evaluation_metrics} for model in ocr_models}\n",
"\n",
"    # Loaded on first use so a run with no output files never touches the\n",
"    # reference file, and then reused instead of re-reading it per metric.\n",
"    reference_text = None\n",
"\n",
"    for model in ocr_models:\n",
"        for font_size in font_sizes:\n",
"            model_output_path = f\"/content/{model}_font_output_{font_size}.txt\"\n",
"\n",
"            if os.path.exists(model_output_path):\n",
"                if reference_text is None:\n",
"                    reference_text = read_file(reference_path)\n",
"                output_text = read_file(model_output_path)\n",
"                for metric, score_fn in evaluation_metrics.items():\n",
"                    results[model][metric].append(score_fn(reference_text, output_text))\n",
"            else:\n",
"                # Report the missing file once, not once per metric.\n",
"                print(model_output_path)\n",
"                for metric in evaluation_metrics:\n",
"                    results[model][metric].append(None)\n",
"\n",
"    models_dfs = {}\n",
"    for model, metrics_scores in results.items():\n",
"        df = pd.DataFrame(metrics_scores)\n",
"        df.index = pd.Index(font_sizes, name='Font Size')\n",
"        models_dfs[model] = df\n",
"\n",
"    return models_dfs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "H0Stery6Q7cu",
"outputId": "b9c8b11e-b662-4c96-83d4-026f270377aa"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/content/vision_font_output_1.txt\n",
"/content/vision_font_output_1.txt\n",
"/content/vision_font_output_1.txt\n",
"/content/vision_font_output_1.txt\n",
"/content/vision_font_output_1.txt\n",
"Model: opus\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"Font Size\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 1,\n \"max\": 11,\n \"num_unique_values\": 11,\n \"samples\": [\n 6,\n 1,\n 10\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 26,\n \"min\": 11,\n \"max\": 83,\n \"num_unique_values\": 9,\n \"samples\": [\n 83,\n 54,\n 81\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2706902059592143,\n \"min\": 0.20877659574468085,\n \"max\": 0.9401595744680851,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.5332446808510638,\n 0.9391622340425532,\n 0.20877659574468085\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.6112781316192542,\n \"min\": 1.188976377952756,\n \"max\": 5.56496062992126,\n \"num_unique_values\": 11,\n \"samples\": [\n 3.1003937007874014,\n 5.559055118110236,\n 1.188976377952756\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1278999198278638,\n \"min\": 0.5777777777777777,\n \"max\": 0.9148936170212766,\n \"num_unique_values\": 9,\n \"samples\": [\n 0.8541666666666666,\n 0.6461538461538462,\n 0.8333333333333334\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2254302455083485,\n \"min\": 0.40280813892384887,\n \"max\": 0.9686748398131518,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.7377493548436327,\n 0.5343168969237784,\n 0.9646151323331051\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Font Size | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 11 | \n",
" 0.939162 | \n",
" 5.559055 | \n",
" 0.577778 | \n",
" 0.534317 | \n",
"
\n",
" \n",
" 2 | \n",
" 11 | \n",
" 0.940160 | \n",
" 5.564961 | \n",
" 0.577778 | \n",
" 0.402808 | \n",
"
\n",
" \n",
" 3 | \n",
" 54 | \n",
" 0.744016 | \n",
" 4.385827 | \n",
" 0.646154 | \n",
" 0.671715 | \n",
"
\n",
" \n",
" 4 | \n",
" 49 | \n",
" 0.659574 | \n",
" 3.866142 | \n",
" 0.734694 | \n",
" 0.411806 | \n",
"
\n",
" \n",
" 5 | \n",
" 58 | \n",
" 0.545213 | \n",
" 3.179134 | \n",
" 0.653846 | \n",
" 0.713956 | \n",
"
\n",
" \n",
" 6 | \n",
" 56 | \n",
" 0.533245 | \n",
" 3.100394 | \n",
" 0.770833 | \n",
" 0.737749 | \n",
"
\n",
" \n",
" 7 | \n",
" 81 | \n",
" 0.244681 | \n",
" 1.429134 | \n",
" 0.833333 | \n",
" 0.957964 | \n",
"
\n",
" \n",
" 8 | \n",
" 77 | \n",
" 0.281250 | \n",
" 1.645669 | \n",
" 0.914894 | \n",
" 0.941545 | \n",
"
\n",
" \n",
" 9 | \n",
" 81 | \n",
" 0.235040 | \n",
" 1.364173 | \n",
" 0.854167 | \n",
" 0.963042 | \n",
"
\n",
" \n",
" 10 | \n",
" 83 | \n",
" 0.208777 | \n",
" 1.188976 | \n",
" 0.914894 | \n",
" 0.964615 | \n",
"
\n",
" \n",
" 11 | \n",
" 78 | \n",
" 0.501995 | \n",
" 2.950787 | \n",
" 0.860000 | \n",
" 0.968675 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Font Size \n",
"1 11 0.939162 5.559055 0.577778 0.534317\n",
"2 11 0.940160 5.564961 0.577778 0.402808\n",
"3 54 0.744016 4.385827 0.646154 0.671715\n",
"4 49 0.659574 3.866142 0.734694 0.411806\n",
"5 58 0.545213 3.179134 0.653846 0.713956\n",
"6 56 0.533245 3.100394 0.770833 0.737749\n",
"7 81 0.244681 1.429134 0.833333 0.957964\n",
"8 77 0.281250 1.645669 0.914894 0.941545\n",
"9 81 0.235040 1.364173 0.854167 0.963042\n",
"10 83 0.208777 1.188976 0.914894 0.964615\n",
"11 78 0.501995 2.950787 0.860000 0.968675"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: sonnet\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"Font Size\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 1,\n \"max\": 11,\n \"num_unique_values\": 11,\n \"samples\": [\n 6,\n 1,\n 10\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 25,\n \"min\": 13,\n \"max\": 81,\n \"num_unique_values\": 11,\n \"samples\": [\n 43,\n 14,\n 73\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.26048711871268654,\n \"min\": 0.20944148936170212,\n \"max\": 0.9305186170212766,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.691156914893617,\n 0.9271941489361702,\n 0.3134973404255319\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.5598515803234427,\n \"min\": 1.218503937007874,\n \"max\": 5.50984251968504,\n \"num_unique_values\": 11,\n \"samples\": [\n 4.057086614173229,\n 5.49015748031496,\n 1.8228346456692914\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.07769381021431758,\n \"min\": 0.6,\n \"max\": 0.8367346938775511,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.8367346938775511,\n 0.6666666666666666,\n 0.6862745098039216\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3092924518668363,\n \"min\": 0.13849155265525223,\n \"max\": 0.964876185576174,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.13849155265525223,\n 0.21952362028805444,\n 0.9251568793533181\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Font Size | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 14 | \n",
" 0.927194 | \n",
" 5.490157 | \n",
" 0.644444 | \n",
" 0.219524 | \n",
"
\n",
" \n",
" 2 | \n",
" 15 | \n",
" 0.915891 | \n",
" 5.423228 | \n",
" 0.666667 | \n",
" 0.355524 | \n",
"
\n",
" \n",
" 3 | \n",
" 13 | \n",
" 0.930519 | \n",
" 5.509843 | \n",
" 0.600000 | \n",
" 0.404004 | \n",
"
\n",
" \n",
" 4 | \n",
" 52 | \n",
" 0.754654 | \n",
" 4.435039 | \n",
" 0.633333 | \n",
" 0.529081 | \n",
"
\n",
" \n",
" 5 | \n",
" 53 | \n",
" 0.677859 | \n",
" 3.944882 | \n",
" 0.696429 | \n",
" 0.421371 | \n",
"
\n",
" \n",
" 6 | \n",
" 43 | \n",
" 0.691157 | \n",
" 4.057087 | \n",
" 0.686275 | \n",
" 0.138492 | \n",
"
\n",
" \n",
" 7 | \n",
" 51 | \n",
" 0.564495 | \n",
" 3.281496 | \n",
" 0.700000 | \n",
" 0.737316 | \n",
"
\n",
" \n",
" 8 | \n",
" 69 | \n",
" 0.441157 | \n",
" 2.555118 | \n",
" 0.765957 | \n",
" 0.894934 | \n",
"
\n",
" \n",
" 9 | \n",
" 75 | \n",
" 0.343750 | \n",
" 1.944882 | \n",
" 0.836735 | \n",
" 0.945566 | \n",
"
\n",
" \n",
" 10 | \n",
" 73 | \n",
" 0.313497 | \n",
" 1.822835 | \n",
" 0.722222 | \n",
" 0.925157 | \n",
"
\n",
" \n",
" 11 | \n",
" 81 | \n",
" 0.209441 | \n",
" 1.218504 | \n",
" 0.836735 | \n",
" 0.964876 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Font Size \n",
"1 14 0.927194 5.490157 0.644444 0.219524\n",
"2 15 0.915891 5.423228 0.666667 0.355524\n",
"3 13 0.930519 5.509843 0.600000 0.404004\n",
"4 52 0.754654 4.435039 0.633333 0.529081\n",
"5 53 0.677859 3.944882 0.696429 0.421371\n",
"6 43 0.691157 4.057087 0.686275 0.138492\n",
"7 51 0.564495 3.281496 0.700000 0.737316\n",
"8 69 0.441157 2.555118 0.765957 0.894934\n",
"9 75 0.343750 1.944882 0.836735 0.945566\n",
"10 73 0.313497 1.822835 0.722222 0.925157\n",
"11 81 0.209441 1.218504 0.836735 0.964876"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: haiku\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"Font Size\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 1,\n \"max\": 11,\n \"num_unique_values\": 11,\n \"samples\": [\n 6,\n 1,\n 10\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 29,\n \"min\": 7,\n \"max\": 88,\n \"num_unique_values\": 11,\n \"samples\": [\n 65,\n 9,\n 77\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2583607660330754,\n \"min\": 0.20079787234042554,\n \"max\": 0.961436170212766,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.5728058510638298,\n 0.953125,\n 0.3949468085106383\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.5375984125960058,\n \"min\": 1.1751968503937007,\n \"max\": 5.692913385826771,\n \"num_unique_values\": 11,\n \"samples\": [\n 3.358267716535433,\n 5.643700787401575,\n 2.3188976377952755\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.11353233506269551,\n \"min\": 0.5111111111111111,\n \"max\": 0.875,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.8571428571428571,\n 0.5777777777777777,\n 0.6222222222222222\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.27283869866102467,\n \"min\": 0.23374491359409186,\n \"max\": 0.9838809725569844,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.8651734765851289,\n 0.23374491359409186,\n 0.9531800031135179\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Font Size | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 9 | \n",
" 0.953125 | \n",
" 5.643701 | \n",
" 0.600000 | \n",
" 0.233745 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 0.955785 | \n",
" 5.657480 | \n",
" 0.577778 | \n",
" 0.318580 | \n",
"
\n",
" \n",
" 3 | \n",
" 7 | \n",
" 0.961436 | \n",
" 5.692913 | \n",
" 0.511111 | \n",
" 0.398063 | \n",
"
\n",
" \n",
" 4 | \n",
" 13 | \n",
" 0.927859 | \n",
" 5.494094 | \n",
" 0.666667 | \n",
" 0.428811 | \n",
"
\n",
" \n",
" 5 | \n",
" 16 | \n",
" 0.911902 | \n",
" 5.397638 | \n",
" 0.600000 | \n",
" 0.451232 | \n",
"
\n",
" \n",
" 6 | \n",
" 65 | \n",
" 0.572806 | \n",
" 3.358268 | \n",
" 0.735849 | \n",
" 0.865173 | \n",
"
\n",
" \n",
" 7 | \n",
" 25 | \n",
" 0.857048 | \n",
" 5.072835 | \n",
" 0.622222 | \n",
" 0.592152 | \n",
"
\n",
" \n",
" 8 | \n",
" 28 | \n",
" 0.841755 | \n",
" 4.984252 | \n",
" 0.630435 | \n",
" 0.747555 | \n",
"
\n",
" \n",
" 9 | \n",
" 49 | \n",
" 0.700465 | \n",
" 4.141732 | \n",
" 0.680851 | \n",
" 0.883847 | \n",
"
\n",
" \n",
" 10 | \n",
" 77 | \n",
" 0.394947 | \n",
" 2.318898 | \n",
" 0.857143 | \n",
" 0.953180 | \n",
"
\n",
" \n",
" 11 | \n",
" 88 | \n",
" 0.200798 | \n",
" 1.175197 | \n",
" 0.875000 | \n",
" 0.983881 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Font Size \n",
"1 9 0.953125 5.643701 0.600000 0.233745\n",
"2 8 0.955785 5.657480 0.577778 0.318580\n",
"3 7 0.961436 5.692913 0.511111 0.398063\n",
"4 13 0.927859 5.494094 0.666667 0.428811\n",
"5 16 0.911902 5.397638 0.600000 0.451232\n",
"6 65 0.572806 3.358268 0.735849 0.865173\n",
"7 25 0.857048 5.072835 0.622222 0.592152\n",
"8 28 0.841755 4.984252 0.630435 0.747555\n",
"9 49 0.700465 4.141732 0.680851 0.883847\n",
"10 77 0.394947 2.318898 0.857143 0.953180\n",
"11 88 0.200798 1.175197 0.875000 0.983881"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: gpt4_turbo\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"Font Size\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 1,\n \"max\": 11,\n \"num_unique_values\": 11,\n \"samples\": [\n 6,\n 1,\n 10\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 31,\n \"min\": 6,\n \"max\": 96,\n \"num_unique_values\": 11,\n \"samples\": [\n 60,\n 6,\n 94\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3421238782483519,\n \"min\": 0.04920212765957447,\n \"max\": 0.9654255319148937,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.6449468085106383,\n 0.9654255319148937,\n 0.07912234042553191\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.028721720144799,\n \"min\": 0.2755905511811024,\n \"max\": 5.716535433070866,\n \"num_unique_values\": 11,\n \"samples\": [\n 3.765748031496063,\n 5.716535433070866,\n 0.44291338582677164\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.11537976396364674,\n \"min\": 0.5333333333333333,\n \"max\": 0.9148936170212766,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.7647058823529411,\n 0.5333333333333333,\n 0.86\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.25606447753978095,\n \"min\": 0.2987571711162542,\n \"max\": 0.9888192836918042,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.7837497701197942,\n 0.3718509279635078,\n 0.9888192836918042\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Font Size | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 6 | \n",
" 0.965426 | \n",
" 5.716535 | \n",
" 0.533333 | \n",
" 0.371851 | \n",
"
\n",
" \n",
" 2 | \n",
" 13 | \n",
" 0.927194 | \n",
" 5.490157 | \n",
" 0.666667 | \n",
" 0.298757 | \n",
"
\n",
" \n",
" 3 | \n",
" 26 | \n",
" 0.850731 | \n",
" 5.029528 | \n",
" 0.641509 | \n",
" 0.548370 | \n",
"
\n",
" \n",
" 4 | \n",
" 53 | \n",
" 0.672872 | \n",
" 3.944882 | \n",
" 0.692308 | \n",
" 0.581897 | \n",
"
\n",
" \n",
" 5 | \n",
" 56 | \n",
" 0.707114 | \n",
" 4.131890 | \n",
" 0.735849 | \n",
" 0.666032 | \n",
"
\n",
" \n",
" 6 | \n",
" 60 | \n",
" 0.644947 | \n",
" 3.765748 | \n",
" 0.764706 | \n",
" 0.783750 | \n",
"
\n",
" \n",
" 7 | \n",
" 71 | \n",
" 0.413231 | \n",
" 2.427165 | \n",
" 0.788462 | \n",
" 0.912369 | \n",
"
\n",
" \n",
" 8 | \n",
" 75 | \n",
" 0.420878 | \n",
" 2.456693 | \n",
" 0.792453 | \n",
" 0.945860 | \n",
"
\n",
" \n",
" 9 | \n",
" 92 | \n",
" 0.088098 | \n",
" 0.501969 | \n",
" 0.895833 | \n",
" 0.988406 | \n",
"
\n",
" \n",
" 10 | \n",
" 94 | \n",
" 0.079122 | \n",
" 0.442913 | \n",
" 0.860000 | \n",
" 0.988819 | \n",
"
\n",
" \n",
" 11 | \n",
" 96 | \n",
" 0.049202 | \n",
" 0.275591 | \n",
" 0.914894 | \n",
" 0.987096 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Font Size \n",
"1 6 0.965426 5.716535 0.533333 0.371851\n",
"2 13 0.927194 5.490157 0.666667 0.298757\n",
"3 26 0.850731 5.029528 0.641509 0.548370\n",
"4 53 0.672872 3.944882 0.692308 0.581897\n",
"5 56 0.707114 4.131890 0.735849 0.666032\n",
"6 60 0.644947 3.765748 0.764706 0.783750\n",
"7 71 0.413231 2.427165 0.788462 0.912369\n",
"8 75 0.420878 2.456693 0.792453 0.945860\n",
"9 92 0.088098 0.501969 0.895833 0.988406\n",
"10 94 0.079122 0.442913 0.860000 0.988819\n",
"11 96 0.049202 0.275591 0.914894 0.987096"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: gpt4o\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"Font Size\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 1,\n \"max\": 11,\n \"num_unique_values\": 11,\n \"samples\": [\n 6,\n 1,\n 10\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28,\n \"min\": 7,\n \"max\": 99,\n \"num_unique_values\": 10,\n \"samples\": [\n 98,\n 52,\n 80\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.35695772268159165,\n \"min\": 0.014960106382978724,\n \"max\": 0.9647606382978723,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.2509973404255319,\n 0.9647606382978723,\n 0.02759308510638298\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.1139951323900963,\n \"min\": 0.0531496062992126,\n \"max\": 5.71259842519685,\n \"num_unique_values\": 11,\n \"samples\": [\n 1.4625984251968505,\n 5.71259842519685,\n 0.14566929133858267\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1337860161791627,\n \"min\": 0.4666666666666667,\n \"max\": 0.9166666666666666,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.8235294117647058,\n 0.4666666666666667,\n 0.9166666666666666\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.25121656836400763,\n \"min\": 0.31970530939306635,\n \"max\": 0.9979545301643027,\n \"num_unique_values\": 11,\n \"samples\": [\n 0.956832943133306,\n 0.31970530939306635,\n 0.9978873363758954\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Font Size | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 7 | \n",
" 0.964761 | \n",
" 5.712598 | \n",
" 0.466667 | \n",
" 0.319705 | \n",
"
\n",
" \n",
" 2 | \n",
" 52 | \n",
" 0.763298 | \n",
" 4.507874 | \n",
" 0.722222 | \n",
" 0.487974 | \n",
"
\n",
" \n",
" 3 | \n",
" 51 | \n",
" 0.812168 | \n",
" 4.748031 | \n",
" 0.690909 | \n",
" 0.517332 | \n",
"
\n",
" \n",
" 4 | \n",
" 63 | \n",
" 0.647939 | \n",
" 3.812992 | \n",
" 0.725490 | \n",
" 0.850500 | \n",
"
\n",
" \n",
" 5 | \n",
" 71 | \n",
" 0.448138 | \n",
" 2.488189 | \n",
" 0.754717 | \n",
" 0.917159 | \n",
"
\n",
" \n",
" 6 | \n",
" 80 | \n",
" 0.250997 | \n",
" 1.462598 | \n",
" 0.823529 | \n",
" 0.956833 | \n",
"
\n",
" \n",
" 7 | \n",
" 87 | \n",
" 0.188497 | \n",
" 1.090551 | \n",
" 0.843137 | \n",
" 0.979915 | \n",
"
\n",
" \n",
" 8 | \n",
" 94 | \n",
" 0.088763 | \n",
" 0.494094 | \n",
" 0.877551 | \n",
" 0.991400 | \n",
"
\n",
" \n",
" 9 | \n",
" 98 | \n",
" 0.023936 | \n",
" 0.114173 | \n",
" 0.914894 | \n",
" 0.997767 | \n",
"
\n",
" \n",
" 10 | \n",
" 99 | \n",
" 0.027593 | \n",
" 0.145669 | \n",
" 0.916667 | \n",
" 0.997887 | \n",
"
\n",
" \n",
" 11 | \n",
" 99 | \n",
" 0.014960 | \n",
" 0.053150 | \n",
" 0.897959 | \n",
" 0.997955 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Font Size \n",
"1 7 0.964761 5.712598 0.466667 0.319705\n",
"2 52 0.763298 4.507874 0.722222 0.487974\n",
"3 51 0.812168 4.748031 0.690909 0.517332\n",
"4 63 0.647939 3.812992 0.725490 0.850500\n",
"5 71 0.448138 2.488189 0.754717 0.917159\n",
"6 80 0.250997 1.462598 0.823529 0.956833\n",
"7 87 0.188497 1.090551 0.843137 0.979915\n",
"8 94 0.088763 0.494094 0.877551 0.991400\n",
"9 98 0.023936 0.114173 0.914894 0.997767\n",
"10 99 0.027593 0.145669 0.916667 0.997887\n",
"11 99 0.014960 0.053150 0.897959 0.997955"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: vision\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"Font Size\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 1,\n \"max\": 11,\n \"num_unique_values\": 11,\n \"samples\": [\n 6,\n 1,\n 10\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28.25990485161304,\n \"min\": 11.0,\n \"max\": 100.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 73.0,\n 100.0,\n 95.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2836086492441448,\n \"min\": 0.03856382978723404,\n \"max\": 0.9368351063829787,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.0694813829787234,\n 0.316156914893617,\n 0.04055851063829787\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.686012047454625,\n \"min\": 0.20669291338582677,\n \"max\": 5.547244094488189,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.39173228346456695,\n 1.860236220472441,\n 0.21850393700787402\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.14553198126384706,\n \"min\": 0.5333333333333333,\n \"max\": 0.9777777777777777,\n \"num_unique_values\": 5,\n \"samples\": [\n 0.7068965517241379,\n 0.9148936170212766,\n 0.9555555555555556\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.11484568404271042,\n \"min\": 0.6345357249509544,\n \"max\": 0.9999070934173677,\n \"num_unique_values\": 7,\n \"samples\": [\n 0.6345357249509544,\n 0.9136700731018212,\n 0.9999070934173677\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Font Size | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 11.0 | \n",
" 0.936835 | \n",
" 5.547244 | \n",
" 0.533333 | \n",
" 0.634536 | \n",
"
\n",
" \n",
" 3 | \n",
" 73.0 | \n",
" 0.316157 | \n",
" 1.860236 | \n",
" 0.706897 | \n",
" 0.913670 | \n",
"
\n",
" \n",
" 4 | \n",
" 95.0 | \n",
" 0.081782 | \n",
" 0.464567 | \n",
" 0.955556 | \n",
" 0.982378 | \n",
"
\n",
" \n",
" 5 | \n",
" 99.0 | \n",
" 0.042221 | \n",
" 0.230315 | \n",
" 0.955556 | \n",
" 0.995717 | \n",
"
\n",
" \n",
" 6 | \n",
" 100.0 | \n",
" 0.042886 | \n",
" 0.234252 | \n",
" 0.955556 | \n",
" 0.998699 | \n",
"
\n",
" \n",
" 7 | \n",
" 100.0 | \n",
" 0.040559 | \n",
" 0.218504 | \n",
" 0.955556 | \n",
" 0.999907 | \n",
"
\n",
" \n",
" 8 | \n",
" 100.0 | \n",
" 0.045213 | \n",
" 0.248031 | \n",
" 0.955556 | \n",
" 0.999907 | \n",
"
\n",
" \n",
" 9 | \n",
" 100.0 | \n",
" 0.038564 | \n",
" 0.206693 | \n",
" 0.977778 | \n",
" 0.999907 | \n",
"
\n",
" \n",
" 10 | \n",
" 100.0 | \n",
" 0.069481 | \n",
" 0.391732 | \n",
" 0.914894 | \n",
" 0.999722 | \n",
"
\n",
" \n",
" 11 | \n",
" 100.0 | \n",
" 0.048870 | \n",
" 0.267717 | \n",
" 0.914894 | \n",
" 0.999722 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Font Size \n",
"1 NaN NaN NaN NaN NaN\n",
"2 11.0 0.936835 5.547244 0.533333 0.634536\n",
"3 73.0 0.316157 1.860236 0.706897 0.913670\n",
"4 95.0 0.081782 0.464567 0.955556 0.982378\n",
"5 99.0 0.042221 0.230315 0.955556 0.995717\n",
"6 100.0 0.042886 0.234252 0.955556 0.998699\n",
"7 100.0 0.040559 0.218504 0.955556 0.999907\n",
"8 100.0 0.045213 0.248031 0.955556 0.999907\n",
"9 100.0 0.038564 0.206693 0.977778 0.999907\n",
"10 100.0 0.069481 0.391732 0.914894 0.999722\n",
"11 100.0 0.048870 0.267717 0.914894 0.999722"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Model: tesseract\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"df\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"Font Size\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 1,\n \"max\": 11,\n \"num_unique_values\": 11,\n \"samples\": [\n 6,\n 1,\n 10\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 26,\n \"min\": 0,\n \"max\": 65,\n \"num_unique_values\": 7,\n \"samples\": [\n 0,\n 9,\n 60\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2380588349780737,\n \"min\": 0.37333776595744683,\n \"max\": 0.9996675531914894,\n \"num_unique_values\": 7,\n \"samples\": [\n 0.9996675531914894,\n 0.9544547872340425,\n 0.4328457446808511\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.4439344346996803,\n \"min\": 2.12007874015748,\n \"max\": 5.921259842519685,\n \"num_unique_values\": 7,\n \"samples\": [\n 5.921259842519685,\n 5.6515748031496065,\n 2.486220472440945\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3133414775340051,\n \"min\": 0.022222222222222223,\n \"max\": 0.7017543859649122,\n \"num_unique_values\": 7,\n \"samples\": [\n 0.022222222222222223,\n 0.48936170212765956,\n 0.6666666666666666\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3095614211412481,\n \"min\": 0.0,\n \"max\": 0.8153483295215893,\n \"num_unique_values\": 7,\n \"samples\": [\n 0.0,\n 0.058424057987394716,\n 0.7149728391186198\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe",
"variable_name": "df"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Font Size | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 0 | \n",
" 0.999668 | \n",
" 5.921260 | \n",
" 0.022222 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0.999668 | \n",
" 5.921260 | \n",
" 0.022222 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0.999668 | \n",
" 5.921260 | \n",
" 0.022222 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0.999668 | \n",
" 5.921260 | \n",
" 0.022222 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 5 | \n",
" 0 | \n",
" 0.999668 | \n",
" 5.921260 | \n",
" 0.022222 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 6 | \n",
" 9 | \n",
" 0.954455 | \n",
" 5.651575 | \n",
" 0.489362 | \n",
" 0.058424 | \n",
"
\n",
" \n",
" 7 | \n",
" 19 | \n",
" 0.896277 | \n",
" 5.291339 | \n",
" 0.571429 | \n",
" 0.169130 | \n",
"
\n",
" \n",
" 8 | \n",
" 30 | \n",
" 0.833112 | \n",
" 4.889764 | \n",
" 0.600000 | \n",
" 0.157488 | \n",
"
\n",
" \n",
" 9 | \n",
" 51 | \n",
" 0.638630 | \n",
" 3.732283 | \n",
" 0.650000 | \n",
" 0.510251 | \n",
"
\n",
" \n",
" 10 | \n",
" 60 | \n",
" 0.432846 | \n",
" 2.486220 | \n",
" 0.666667 | \n",
" 0.714973 | \n",
"
\n",
" \n",
" 11 | \n",
" 65 | \n",
" 0.373338 | \n",
" 2.120079 | \n",
" 0.701754 | \n",
" 0.815348 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER Jaccard Index Cosine Similarity\n",
"Font Size \n",
"1 0 0.999668 5.921260 0.022222 0.000000\n",
"2 0 0.999668 5.921260 0.022222 0.000000\n",
"3 0 0.999668 5.921260 0.022222 0.000000\n",
"4 0 0.999668 5.921260 0.022222 0.000000\n",
"5 0 0.999668 5.921260 0.022222 0.000000\n",
"6 9 0.954455 5.651575 0.489362 0.058424\n",
"7 19 0.896277 5.291339 0.571429 0.169130\n",
"8 30 0.833112 4.889764 0.600000 0.157488\n",
"9 51 0.638630 3.732283 0.650000 0.510251\n",
"10 60 0.432846 2.486220 0.666667 0.714973\n",
"11 65 0.373338 2.120079 0.701754 0.815348"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
}
],
"source": [
"num_docs = 9\n",
"evaluation_results = evaluate_ocr_models_for_different_fonts(\n",
"    num_docs, starting_font, increment_font\n",
")\n",
"\n",
"# Show each model's per-font-size metric table under a 'Model: <name>' heading.\n",
"for model in evaluation_results:\n",
"    df = evaluation_results[model]\n",
"    print(f\"Model: {model}\")\n",
"    display(df)\n",
"    print(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eOfF1FQszIqj"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def plot_evaluation_metric_for_all_models(models_dfs, evaluation_metric, n_cols=2):\n",
"    \"\"\"Draw one bar chart per OCR model for a single evaluation metric.\n",
"\n",
"    Args:\n",
"        models_dfs: Mapping of model name -> DataFrame. Each frame's index\n",
"            supplies the bar positions (x axis is labelled 'Font Size').\n",
"        evaluation_metric: Name of the DataFrame column to plot, e.g.\n",
"            'Cosine Similarity'.\n",
"        n_cols: Number of subplot columns. The row count is derived from\n",
"            the number of models, so any number of models fits the grid.\n",
"\n",
"    A model whose DataFrame has no such column gets a 'Metric Not\n",
"    Available' placeholder instead of bars. The figure is displayed with\n",
"    plt.show(); nothing is returned.\n",
"    \"\"\"\n",
"    models = list(models_dfs.keys())\n",
"\n",
"    # Size the grid from the model count instead of hard-coding it, so an\n",
"    # extra model cannot index past the last row. Keep at least one row\n",
"    # because plt.subplots() rejects an empty grid.\n",
"    n_rows = max(1, (len(models) + n_cols - 1) // n_cols)\n",
"\n",
"    # squeeze=False keeps `axs` two-dimensional even for a single row/column.\n",
"    fig, axs = plt.subplots(\n",
"        n_rows, n_cols, figsize=(10 * n_cols, 5 * n_rows), squeeze=False\n",
"    )\n",
"    fig.suptitle(f'{evaluation_metric} for Different OCR Models', fontsize=16)\n",
"\n",
"    axes = axs.ravel()  # row-major order: left-to-right, then top-to-bottom\n",
"\n",
"    for ax, model in zip(axes, models):\n",
"        df = models_dfs[model]\n",
"\n",
"        if evaluation_metric in df.columns:\n",
"            ax.bar(df.index, df[evaluation_metric], color=\"#d4377c\")\n",
"        else:\n",
"            ax.text(\n",
"                0.5, 0.5, 'Metric Not Available',\n",
"                horizontalalignment='center', verticalalignment='center',\n",
"                transform=ax.transAxes,\n",
"            )\n",
"\n",
"        # Title and axis labels are the same whether or not bars were drawn.\n",
"        ax.set_title(model)\n",
"        ax.set_xlabel('Font Size')\n",
"        ax.set_ylabel(evaluation_metric)\n",
"\n",
"    # Remove every grid slot that did not receive a model.\n",
"    for unused_ax in axes[len(models):]:\n",
"        fig.delaxes(unused_ax)\n",
"\n",
"    plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # leave headroom for the suptitle\n",
"    plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "Sis7oGDkz6RR",
"outputId": "fd6e8b1a-5e84-4bf5-ba0e-6b70e3a3900e"
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plot_evaluation_metric_for_all_models(evaluation_results, 'Cosine Similarity')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "D9GJEfL8wjRw"
},
"outputs": [],
"source": [
"evaluation_metrics = {\n",
" \"Fuzzy Score\": calculate_fuzzy_score,\n",
" \"CER\": calculate_cer,\n",
" \"WER\": calculate_wer,\n",
" \"BLEU\": calculate_bleu,\n",
" \"Jaccard Index\": calculate_jaccard_index,\n",
" \"Cosine Similarity\": calculate_cosine_similarity\n",
"}\n",
"\n",
"# plot_line_graphs(evaluation_results, evaluation_metrics)\n",
"# plot_heatmaps(evaluation_results, evaluation_metrics)\n",
"# plot_box_plots(evaluation_results, evaluation_metrics)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 653
},
"id": "YqLAlGsHo8HV",
"outputId": "62cb5c2a-dedf-44f0-9d2d-c89344b86808"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/nltk/translate/bleu_score.py:552: UserWarning: \n",
"The hypothesis contains 0 counts of 3-gram overlaps.\n",
"Therefore the BLEU score evaluates to 0, independently of\n",
"how many N-gram overlaps of lower order it contains.\n",
"Consider using lower n-gram order or use SmoothingFunction()\n",
" warnings.warn(_msg)\n",
"/usr/local/lib/python3.10/dist-packages/nltk/translate/bleu_score.py:552: UserWarning: \n",
"The hypothesis contains 0 counts of 4-gram overlaps.\n",
"Therefore the BLEU score evaluates to 0, independently of\n",
"how many N-gram overlaps of lower order it contains.\n",
"Consider using lower n-gram order or use SmoothingFunction()\n",
" warnings.warn(_msg)\n",
"/usr/local/lib/python3.10/dist-packages/nltk/translate/bleu_score.py:552: UserWarning: \n",
"The hypothesis contains 0 counts of 2-gram overlaps.\n",
"Therefore the BLEU score evaluates to 0, independently of\n",
"how many N-gram overlaps of lower order it contains.\n",
"Consider using lower n-gram order or use SmoothingFunction()\n",
" warnings.warn(_msg)\n"
]
},
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"summary": "{\n \"name\": \"evaluate_ocr_models(9)\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"Models\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8,\n \"samples\": [\n \"gemini_flash\",\n \"gpt4_turbo\",\n \"gemini_pro\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fuzzy Score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 16.30660349246027,\n \"min\": 51.8,\n \"max\": 99.6,\n \"num_unique_values\": 8,\n \"samples\": [\n 51.8,\n 96.2,\n 91.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1883000826417473,\n \"min\": 0.08145846504281821,\n \"max\": 0.6201274231036187,\n \"num_unique_values\": 8,\n \"samples\": [\n 0.6201274231036187,\n 0.09927711945144765,\n 0.20839952194852984\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WER\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.1332343712105184,\n \"min\": 0.3751224553809715,\n \"max\": 3.590325362542492,\n \"num_unique_values\": 8,\n \"samples\": [\n 3.590325362542492,\n 0.5523439578025001,\n 1.1120799847099623\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BLEU\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.21682159817829272,\n \"min\": 0.22758422399211156,\n \"max\": 0.8850498889601823,\n \"num_unique_values\": 8,\n \"samples\": [\n 0.22758422399211156,\n 0.8320234054525525,\n 0.7061959848707065\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Jaccard Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.059989973554716705,\n \"min\": 0.763340201961604,\n \"max\": 0.9305045210040584,\n \"num_unique_values\": 8,\n \"samples\": [\n 0.8238050857829903,\n 0.906389224992381,\n 0.9194949092126512\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 
\"Cosine Similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0884914513532198,\n \"min\": 0.7411261220383015,\n \"max\": 0.997394825369346,\n \"num_unique_values\": 8,\n \"samples\": [\n 0.8722908517458412,\n 0.9805010465062004,\n 0.9749292600430796\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
"type": "dataframe"
},
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fuzzy Score | \n",
" CER | \n",
" WER | \n",
" BLEU | \n",
" Jaccard Index | \n",
" Cosine Similarity | \n",
"
\n",
" \n",
" Models | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" gemini_pro | \n",
" 91.400000 | \n",
" 0.208400 | \n",
" 1.112080 | \n",
" 0.706196 | \n",
" 0.919495 | \n",
" 0.974929 | \n",
"
\n",
" \n",
" gemini_flash | \n",
" 51.800000 | \n",
" 0.620127 | \n",
" 3.590325 | \n",
" 0.227584 | \n",
" 0.823805 | \n",
" 0.872291 | \n",
"
\n",
" \n",
" opus | \n",
" 95.666667 | \n",
" 0.087719 | \n",
" 0.469858 | \n",
" 0.818859 | \n",
" 0.930505 | \n",
" 0.983913 | \n",
"
\n",
" \n",
" sonnet | \n",
" 73.933333 | \n",
" 0.370517 | \n",
" 2.215236 | \n",
" 0.543590 | \n",
" 0.763340 | \n",
" 0.741126 | \n",
"
\n",
" \n",
" haiku | \n",
" 87.666667 | \n",
" 0.209490 | \n",
" 1.194556 | \n",
" 0.679309 | \n",
" 0.886219 | \n",
" 0.951519 | \n",
"
\n",
" \n",
" gpt4_turbo | \n",
" 96.200000 | \n",
" 0.099277 | \n",
" 0.552344 | \n",
" 0.832023 | \n",
" 0.906389 | \n",
" 0.980501 | \n",
"
\n",
" \n",
" gpt4o | \n",
" 97.666667 | \n",
" 0.101787 | \n",
" 0.375122 | \n",
" 0.885050 | \n",
" 0.914903 | \n",
" 0.990599 | \n",
"
\n",
" \n",
" vision | \n",
" 99.600000 | \n",
" 0.081458 | \n",
" 0.414169 | \n",
" 0.514288 | \n",
" 0.930115 | \n",
" 0.997395 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"text/plain": [
" Fuzzy Score CER WER BLEU Jaccard Index \\\n",
"Models \n",
"gemini_pro 91.400000 0.208400 1.112080 0.706196 0.919495 \n",
"gemini_flash 51.800000 0.620127 3.590325 0.227584 0.823805 \n",
"opus 95.666667 0.087719 0.469858 0.818859 0.930505 \n",
"sonnet 73.933333 0.370517 2.215236 0.543590 0.763340 \n",
"haiku 87.666667 0.209490 1.194556 0.679309 0.886219 \n",
"gpt4_turbo 96.200000 0.099277 0.552344 0.832023 0.906389 \n",
"gpt4o 97.666667 0.101787 0.375122 0.885050 0.914903 \n",
"vision 99.600000 0.081458 0.414169 0.514288 0.930115 \n",
"\n",
" Cosine Similarity \n",
"Models \n",
"gemini_pro 0.974929 \n",
"gemini_flash 0.872291 \n",
"opus 0.983913 \n",
"sonnet 0.741126 \n",
"haiku 0.951519 \n",
"gpt4_turbo 0.980501 \n",
"gpt4o 0.990599 \n",
"vision 0.997395 "
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluate_ocr_models(9)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"id": "IG2BEKxmkFgV"
},
"outputs": [],
"source": [
"languages = [\"arabic\", \"bengali\", \"chinese\", \"cyrillic\", \"dutch\", \"english\", \"french\", \"german\", \"greek\", \"hebrew\", \"hindi\", \"japanese\", \"korean\", \"latin\", \"spanish\", \"thai\", \"urdu\", \"vietnamese\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cqvwcOYyw9H0",
"outputId": "773fcde2-8b58-495d-e394-384f36106d2b"
},
"outputs": [
{
"data": {
"text/plain": [
"18"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(languages)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CQlOQ5gG7orW"
},
"source": [
"## **Marker**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "g973ztuocjjG"
},
"outputs": [],
"source": [
"!pip install marker-pdf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "1EM1nFCQfSFy"
},
"outputs": [],
"source": [
"!pip install poetry"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kSOPEJ57d2fL",
"outputId": "fc5eb94a-3575-408c-e67c-6b639bb3e113"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cloning into 'marker'...\n",
"remote: Enumerating objects: 1177, done.\u001b[K\n",
"remote: Counting objects: 100% (530/530), done.\u001b[K\n",
"remote: Compressing objects: 100% (163/163), done.\u001b[K\n",
"remote: Total 1177 (delta 421), reused 431 (delta 360), pack-reused 647\u001b[K\n",
"Receiving objects: 100% (1177/1177), 1.41 MiB | 7.03 MiB/s, done.\n",
"Resolving deltas: 100% (774/774), done.\n"
]
}
],
"source": [
"!git clone https://github.com/VikParuchuri/marker.git"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SutZucw-fOn5"
},
"outputs": [],
"source": [
"%cd /content/marker\n",
"!poetry install"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "kc_mWIrVmgrV"
},
"outputs": [],
"source": [
"!marker_single /content/scan1.pdf /content --batch_multiplier 2 --max_pages 10 --langs English"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bUYM2Ovn7tW9"
},
"source": [
"# **Claude**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "d44IMxaC84bG"
},
"outputs": [],
"source": [
"!pip install anthropic\n",
"!pip install fitz\n",
"!pip install PyMuPDF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Bdakn4AIt2Cj"
},
"outputs": [],
"source": [
"import fitz # PyMuPDF\n",
"import anthropic\n",
"import base64\n",
"import httpx\n",
"from PIL import Image\n",
"import requests"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2ZYLwD_Bt4JB"
},
"outputs": [],
"source": [
"image_media_type = \"image/png\"\n",
"client = anthropic.Anthropic(\n",
" api_key = \"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\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "509VKaGm8z_L"
},
"outputs": [],
"source": [
"def convert_pdf_to_images(pdf_path):\n",
" images = []\n",
" with fitz.open(pdf_path) as doc:\n",
" for page_num in range(len(doc)):\n",
" page = doc.load_page(page_num)\n",
" pix = page.get_pixmap()\n",
" images.append(pix)\n",
" return images\n",
"\n",
"def encode_image_to_base64(image):\n",
" image_bytes = image.tobytes()\n",
" base64_encoded = base64.b64encode(image_bytes)\n",
" base64_string = base64_encoded.decode(\"utf-8\")\n",
" return base64_string"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SqPIotb99sLB"
},
"source": [
"**Claude - Opus**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eDM51V2Ql0av"
},
"outputs": [],
"source": [
"def extract_text_opus(pdf_path):\n",
" images = convert_pdf_to_images(pdf_path)\n",
" extracted_text = \"\"\n",
" for image in images:\n",
" base64_string = encode_image_to_base64(image)\n",
" message = client.messages.create(\n",
" model=\"claude-3-opus-20240229\",\n",
" max_tokens=1024,\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"source\": {\n",
" \"type\": \"base64\",\n",
" \"media_type\": image_media_type,\n",
" \"data\": base64_string,\n",
" },\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"Return the bounding boxes of all the tables present in this image.\"\n",
" }\n",
" ],\n",
" }\n",
" ],\n",
" )\n",
" extracted_text += message.content[0].text\n",
" return extracted_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SdGdnz_NvK7K"
},
"outputs": [],
"source": [
"num_docs = 11\n",
"for language in range(len(languages)):\n",
" extracted_text = extract_text_opus((\"/content/{}.pdf\").format(languages[language]))\n",
" output_file_path = (\"/content/opus_{}_output.txt\").format(languages[language])\n",
" with open(output_file_path, \"w\") as txt_file:\n",
" txt_file.write(extracted_text)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9OtYd0SG98BY"
},
"source": [
"**Claude - Sonnet**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "3319Um258rJk"
},
"outputs": [],
"source": [
"def extract_text_sonnet(pdf_path):\n",
" images = convert_pdf_to_images(pdf_path)\n",
" extracted_text = \"\"\n",
" for image in images:\n",
" base64_string = encode_image_to_base64(image)\n",
" message = client.messages.create(\n",
" model=\"claude-3-sonnet-20240229\",\n",
" max_tokens=1024,\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"source\": {\n",
" \"type\": \"base64\",\n",
" \"media_type\": image_media_type,\n",
" \"data\": base64_string,\n",
" },\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"Extract the actual text from this image and don't summarize.\"\n",
" }\n",
" ],\n",
" }\n",
" ],\n",
" )\n",
" extracted_text += message.content[0].text\n",
" return extracted_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ODh6BiUL9VcD"
},
"outputs": [],
"source": [
"for language in range(len(languages)):\n",
" extracted_text = extract_text_sonnet((\"/content/{}.pdf\").format(languages[language]))\n",
" output_file_path = (\"/content/sonnet_{}_output.txt\").format(languages[language])\n",
" with open(output_file_path, \"w\") as txt_file:\n",
" txt_file.write(extracted_text)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CThXxO7p-BMJ"
},
"source": [
"**Claude - Haiku**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "L541KQ9J9Lvy"
},
"outputs": [],
"source": [
"def extract_text_haiku(pdf_path):\n",
" images = convert_pdf_to_images(pdf_path)\n",
" extracted_text = \"\"\n",
" for image in images:\n",
" base64_string = encode_image_to_base64(image)\n",
" message = client.messages.create(\n",
" model=\"claude-3-haiku-20240307\",\n",
" max_tokens=1024,\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"source\": {\n",
" \"type\": \"base64\",\n",
" \"media_type\": image_media_type,\n",
" \"data\": base64_string,\n",
" },\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"Extract the text from this image.\"\n",
" }\n",
" ],\n",
" }\n",
" ],\n",
" )\n",
" extracted_text += message.content[0].text\n",
" return extracted_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "E-99_Lnj9cJD"
},
"outputs": [],
"source": [
"for language in range(len(languages)):\n",
" extracted_text = extract_text_haiku((\"/content/{}.pdf\").format(languages[language]))\n",
" output_file_path = (\"/content/haiku_{}_output.txt\").format(languages[language])\n",
" with open(output_file_path, \"w\") as txt_file:\n",
" txt_file.write(extracted_text)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gfKCNSmLyW5f"
},
"source": [
"# **Gemini**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "iY_ykg3GzNDH"
},
"outputs": [],
"source": [
"!pip install google-generativeai\n",
"!pip install fitz\n",
"!pip install PyMuPDF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2BP_rT0vAz1S"
},
"outputs": [],
"source": [
"import fitz # PyMuPDF\n",
"import google.generativeai as genai\n",
"import os\n",
"import pathlib\n",
"import PIL.Image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HTnd3kkkA2IP"
},
"outputs": [],
"source": [
"num_docs = 4\n",
"genai.configure(api_key= 'AIzaSyBwk94xRhPOIkvO0E3pYhXQ7Rrk5my5IyY')\n",
"gemini_pro_vision = genai.GenerativeModel('gemini-pro-vision')\n",
"genimi_gemini_flash = genai.GenerativeModel('gemini-1.5-flash-latest')\n",
"prompt = \"Extract the text from this image.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "kcZrMxbj-I2c"
},
"outputs": [],
"source": [
"def extract_text_gemini(model, pdf_path):\n",
" model = genai.GenerativeModel('gemini-1.5-flash-latest')\n",
" extracted_text = \"\"\n",
" with fitz.open(pdf_path) as doc:\n",
" for page_num in range(len(doc)):\n",
" page = doc.load_page(page_num)\n",
" pix = page.get_pixmap()\n",
" img = PIL.Image.frombytes(\"RGB\", [pix.width, pix.height], pix.samples)\n",
" response = model.generate_content(\n",
" [img, prompt], stream=False\n",
" )\n",
" response.resolve()\n",
" extracted_text += response.text\n",
" print(extracted_text)\n",
" return extracted_text"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5XR-W_cR3vBw"
},
"source": [
"**Gemini 1.5 Pro**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Ir2GgoHk17AK"
},
"outputs": [],
"source": [
"num_docs = 9\n",
"for language in range(len(languages)):\n",
" print(languages[language])\n",
" extracted_text = extract_text_gemini(gemini_pro_vision, (\"/content/{}.pdf\").format(languages[language]))\n",
" output_file_path = (\"/content/gemini_pro_{}_output.txt\").format(languages[language])\n",
" with open(output_file_path, \"w\") as txt_file:\n",
" txt_file.write(extracted_text)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "S1lGP9y23y2K"
},
"source": [
"**Gemini 1.5 Flash**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SpAq__743nE3"
},
"outputs": [],
"source": [
"num_docs = 11\n",
"for doc_id in range(11, num_docs+1):\n",
" extracted_text = extract_text_gemini(genimi_gemini_flash, (\"/content/scan-font{}.pdf\").format(doc_id))\n",
" output_file_path = (\"/content/gemini_flash_font_output_{}.txt\").format(doc_id)\n",
" with open(output_file_path, \"w\") as txt_file:\n",
" txt_file.write(extracted_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JIY6uTML6Hru"
},
"outputs": [],
"source": [
"import vertexai\n",
"\n",
"from vertexai.generative_models import GenerativeModel, Part\n",
"\n",
"# TODO(developer): Update and un-comment below line\n",
"# project_id = \"PROJECT_ID\"\n",
"\n",
"vertexai.init(project=940337059666, location=\"us-central1\")\n",
"\n",
"model = GenerativeModel(model_name=\"gemini-1.0-pro-vision-001\")\n",
"\n",
"response = model.generate_content(\n",
" [\n",
" Part.from_uri(\n",
" \"gs://cloud-samples-data/generative-ai/image/scones.jpg\",\n",
" mime_type=\"image/jpeg\",\n",
" ),\n",
" \"What is shown in this image?\",\n",
" ]\n",
")\n",
"\n",
"print(response.text)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "czybTmwiKSRW"
},
"source": [
"# **GPT 4**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "C3x8DAM0KaWr"
},
"outputs": [],
"source": [
"!pip install openai"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "02MxqPcgznWt"
},
"outputs": [],
"source": [
"import openai"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "f1Rfi-5czSG8"
},
"outputs": [],
"source": [
"openai.api_key = 'sk-proj-YOl2xepEsNppWm3xLshlT3BlbkFJL04qQgahGxFcFGEClnQK'\n",
"image_media_type = \"image/png\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "szlmMiN9ztl4"
},
"outputs": [],
"source": [
"def extract_text_gpt(model, pdf_path):\n",
" images = convert_pdf_to_images(pdf_path)\n",
" extracted_text = \"\"\n",
" headers = {\n",
" \"Content-Type\": \"application/json\",\n",
" \"Authorization\": f\"Bearer {openai.api_key}\"\n",
" }\n",
"\n",
" for image in images:\n",
" base64_string = encode_image_to_base64(image)\n",
" payload = {\n",
" \"model\": model,\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"Extract bounding boxes of all the tables present in this image. Return bounding boxes as liat of lists and don't provide any other text in the response.\"\n",
" },\n",
" {\n",
" \"type\": \"image_url\",\n",
" \"image_url\": {\n",
" \"url\": f\"data:image/jpeg;base64,{base64_string}\"\n",
" }\n",
" }\n",
" ]\n",
" }\n",
" ],\n",
" }\n",
"\n",
" response = requests.post(\"https://api.openai.com/v1/chat/completions\", headers=headers, json=payload)\n",
" response_json = response.json()\n",
"\n",
" if \"choices\" in response_json and len(response_json[\"choices\"]) > 0:\n",
" extracted_text += response_json[\"choices\"][0][\"message\"][\"content\"]\n",
"\n",
" return extracted_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"background_save": true,
"base_uri": "https://localhost:8080/",
"height": 52
},
"id": "24_FkL6_flsn",
"outputId": "7a4c2ca4-14bc-40f6-b47c-98f311e6808f"
},
"outputs": [
{
"data": {
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
},
"text/plain": [
"'In the image you provided, there isn\\'t a traditional visual \"table\" with borders, but there is a list that resembles a table structure, with items and corresponding page numbers. I\\'ll provide the bounding box for this section as it contains structured data which could be interpreted similar to a table layout.\\n\\nHere is the bounding box for the textual data that looks like a columnar listing (which may serve as \"table\" in loose terms) present in the image:\\n\\n- **Top-left corner**: (x: 25, y: 470)\\n- **Bottom-right corner**: (x: 415, y: 670)\\n\\nThis bounding box outlines the area where the list of engine components and their corresponding sections in a manual are presented.The image contains two tables with the following bounding boxes:\\n\\n1. The first table, titled \"DIMENSIONS\", has the bounding box:\\n - Top-left corner: (39, 100)\\n - Bottom-right corner: (397, 366)\\n\\n2. The second table, titled \"ENGINE SPECIFICATION\", is located:\\n - Top-left corner: (39, 386)\\n - Bottom-right corner: (397, 599)\\n\\nThese coordinates are approximate and measured in pixels from the top-left corner of the image which is assumed to be (0,0).The image contains a single large table. The bounding box for the table can be approximately described as covering most of the page, with a little margin on all sides. Here is a rough estimation of the bounding box coordinates based on the pixel dimensions given (assuming the total image size is the standard document size, typically at 437 pixels width and 613 pixels height):\\n\\n- **Left:** 30 pixels\\n- **Top:** 70 pixels\\n- **Right:** 407 pixels\\n- **Bottom:** 590 pixels\\n\\nThis encompasses the whole table from the headers \"Light Bulb\", \"Bulb type\", and \"Wattage\" down to the notes at the bottom of the table.The image contains one primary table positioned approximately at the top portion of the image. 
The bounding box for this table can be described as follows:\\n\\n- Left edge around column 35 pixels\\n- Top edge around row 140 pixels\\n- Right edge around column 403 pixels\\n- Bottom edge around row 385 pixels\\n\\nThese dimensions provide a general bounding box for the visible table in the image.The image contains a single table. Here are the bounding box coordinates for the table:\\n\\n- Top-left corner: (32, 140)\\n- Top-right corner: (408, 140)\\n- Bottom-left corner: (32, 374)\\n- Bottom-right corner: (408, 374)\\n\\nThese coordinates are in pixels and approximate, based on the visible boundaries of the table in the image.The image contains two tables. Here are their bounding boxes described in a format (x, y, width, height) based on a coordinate system with the top-left corner of the image as the origin (0, 0):\\n\\n1. The first table \"Temperature Range for SAE Viscosity Numbers\":\\n - x: 62\\n - y: 433\\n - width: 315\\n - height: 120\\n\\n2. The second table \"Petrol Engine Oil\":\\n - x: 92\\n - y: 580\\n - width: 255\\n - height: 45\\n\\nThese coordinates and dimensions are approximations that match the locations of tables as visible in the image provided.The image you provided doesn\\'t contain any tables. It features a dual column layout with text and images explaining how to locate the vehicle identification number (VIN) and vehicle certification label for a car. Each column includes an image of a car interior and a descriptive text underneath. There are no visible tables to extract bounding boxes from in this image.'"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extracted_text = extract_text_gpt(\"gpt-4-turbo\", \"/content/hyundai_exter-24-30.pdf\")\n",
"extracted_text"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "slcGEKRq1kwh"
},
"source": [
"**GPT 4 Turbo**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "24KZ_nZN0DyO"
},
"outputs": [],
"source": [
"for language in range(len(languages)):\n",
" pdf_path = f\"/content/{languages[language]}.pdf\"\n",
" extracted_text = extract_text_gpt(\"gpt-4-turbo\", pdf_path)\n",
" output_file_path = f\"/content/gpt4_turbo_{languages[language]}_output.txt\"\n",
" with open(output_file_path, \"w\") as txt_file:\n",
" txt_file.write(extracted_text)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-ZBQwUd61znH"
},
"source": [
"**GPT 4o**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "AmMdPEU51Wtn"
},
"outputs": [],
"source": [
"for language in range(len(languages)):\n",
" pdf_path = f\"/content/{languages[language]}.pdf\"\n",
" extracted_text = extract_text_gpt(\"gpt-4o\", pdf_path)\n",
" output_file_path = f\"/content/gpt4o_{languages[language]}_output.txt\"\n",
" with open(output_file_path, \"w\") as txt_file:\n",
" txt_file.write(extracted_text)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "MN8aesQO4rFx"
},
"source": [
"# **Google Vision**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "WZr-dP4N5zdl"
},
"outputs": [],
"source": [
"!pip install google-cloud-vision\n",
"!pip install pdf2image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dt7RZATJ4ILK",
"collapsed": true
},
"outputs": [],
"source": [
"!sudo apt-get update\n",
"!apt-get install poppler-utils"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "Vd78XcqKElbz"
},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"/content/ai-drive-test-vision-ocr.json\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "jePfJZnC3ZVf"
},
"outputs": [],
"source": [
"from pdf2image import convert_from_path\n",
"import base64\n",
"from io import BytesIO\n",
"from PIL import Image\n",
"from google.cloud import vision\n",
"\n",
"def pdf_to_images(pdf_path):\n",
" images = convert_from_path(pdf_path)\n",
" image_paths = []\n",
" for i, image in enumerate(images):\n",
" image_path = f\"/tmp/page_{i}.png\"\n",
" image.save(image_path, \"PNG\")\n",
" image_paths.append(image_path)\n",
" return image_paths"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "snf2koZ16oYU"
},
"outputs": [],
"source": [
"def extract_text_vision(path):\n",
" client = vision.ImageAnnotatorClient()\n",
"\n",
" with open(path, \"rb\") as image_file:\n",
" content = image_file.read()\n",
"\n",
" image = vision.Image(content=content)\n",
" response = client.document_text_detection(image=image)\n",
"\n",
" extracted_text = \"\"\n",
" for page in response.full_text_annotation.pages:\n",
" for block in page.blocks:\n",
" for paragraph in block.paragraphs:\n",
" for word in paragraph.words:\n",
" word_text = \"\".join([symbol.text for symbol in word.symbols])\n",
" extracted_text += word_text + \" \"\n",
" extracted_text += \"\\n\"\n",
"\n",
" return extracted_text"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "4JsTrKvHEfoy"
},
"outputs": [],
"source": [
"def detect_documents_vision(pdf_path):\n",
" pages = pdf_to_images(pdf_path)\n",
" extracted_text = \"\"\n",
" for pg in pages:\n",
" extracted_text += extract_text_vision(pg)\n",
" return extracted_text"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"id": "IgVJNXqf7n-D"
},
"outputs": [],
"source": [
"for doc_id in range(1, 2):\n",
" pdf_path = \"/content/japanese2.pdf\"\n",
" extracted_text = detect_documents_vision(pdf_path)\n",
" output_file_path = f\"/content/vision_japanese_output.txt\"\n",
" with open(output_file_path, \"w\") as txt_file:\n",
" txt_file.write(extracted_text)"
]
},
{
"cell_type": "markdown",
"source": [
"# **Florence**-2-large"
],
"metadata": {
"id": "tedrUKKhBH-6"
}
},
{
"cell_type": "code",
"source": [
"!pip install einops flash_attn timm"
],
"metadata": {
"id": "B4noKtMMBpaD"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from PIL import Image, ImageDraw\n",
"from IPython.display import display\n",
"\n",
"def draw_boxes(image_path, boxes):\n",
" image = Image.open(image_path).convert(\"RGB\")\n",
" draw = ImageDraw.Draw(image)\n",
"\n",
" for box in boxes:\n",
" draw.rectangle(box, outline=\"red\", width=3)\n",
" display(image)"
],
"metadata": {
"id": "14KPiTfyUpMi"
},
"execution_count": 22,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import fitz\n",
"import requests\n",
"from PIL import Image\n",
"from transformers import AutoProcessor, AutoModelForCausalLM"
],
"metadata": {
"id": "ve5DeQuwBPbg"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/Florence-2-large\", trust_remote_code=True)\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/Florence-2-large\", trust_remote_code=True)"
],
"metadata": {
"id": "MaXP9YRMBhXl"
},
"execution_count": 19,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model = AutoModelForCausalLM.from_pretrained(\"microsoft/Florence-2-base\", trust_remote_code=True)\n",
"processor = AutoProcessor.from_pretrained(\"microsoft/Florence-2-base\", trust_remote_code=True)"
],
"metadata": {
"id": "1rcc-e5AGn_n"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"prompt = \"Extract bounding boxes of all the tables present in this page\""
],
"metadata": {
"id": "xwgEmKdvBknf"
},
"execution_count": 16,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def extract_text_florence(pdf_path):\n",
" images = convert_from_path(pdf_path)\n",
" extracted_text = \"\"\n",
" # for image in images:\n",
" image = images[1]\n",
" inputs = processor(text=prompt, images=image, return_tensors=\"pt\")\n",
"\n",
" generated_ids = model.generate(\n",
" input_ids=inputs[\"input_ids\"],\n",
" pixel_values=inputs[\"pixel_values\"],\n",
" max_new_tokens=1024,\n",
" num_beams=3,\n",
" do_sample=False\n",
" )\n",
" generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]\n",
"\n",
" parsed_answer = processor.post_process_generation(generated_text, task=\"\", image_size=(image.width, image.height))\n",
"\n",
" print(parsed_answer)\n",
" return parsed_answer\n"
],
"metadata": {
"id": "bB-majwGBHAd"
},
"execution_count": 38,
"outputs": []
},
{
"cell_type": "code",
"source": [
"extracted_data = extract_text_florence(\"/content/table-data.pdf\")\n",
"bbox = extracted_data['']['bboxes']\n",
"bbox"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Mch5i9PfDpu1",
"outputId": "0be21a77-ea1b-4e8c-a332-47c19694f0eb"
},
"execution_count": 39,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'': {'bboxes': [[141.4739990234375, 219.3100128173828, 1173.7821044921875, 599.6900024414062], [0.6460000276565552, 0.9100000262260437, 1290.06201171875, 1817.27001953125]], 'labels': ['bounding boxes of all the tables present', 'this page']}}\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[[141.4739990234375, 219.3100128173828, 1173.7821044921875, 599.6900024414062],\n",
" [0.6460000276565552, 0.9100000262260437, 1290.06201171875, 1817.27001953125]]"
]
},
"metadata": {},
"execution_count": 39
}
]
},
{
"cell_type": "code",
"source": [
"draw_boxes(\"/content/table-2.png\", bbox)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "VLcTEUukUqZN",
"outputId": "2b61feaa-e134-4b77-967d-10104ed57ab2"
},
"execution_count": 40,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"import torch\n",
"from transformers import DetrImageProcessor, DetrForObjectDetection\n",
"from PIL import Image\n",
"import requests\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.patches as patches"
],
"metadata": {
"id": "g-anGRt3X6wu"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"processor = DetrImageProcessor.from_pretrained(\"microsoft/Florence-2-large\")\n",
"model = DetrForObjectDetection.from_pretrained(\"microsoft/Florence-2-large\")"
],
"metadata": {
"id": "jMgIhWFxX9IW"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"\n",
"\n",
"# Load the processor and model\n",
"\n",
"\n",
"# Load the image\n",
"images = convert_from_path(\"/content/table-data.pdf\")\n",
"image = images[1]\n",
"\n",
"# Preprocess the image\n",
"inputs = processor(images=image, return_tensors=\"pt\")\n",
"\n",
"# Perform object detection\n",
"outputs = model(**inputs)\n",
"\n",
"# Extract boxes and labels\n",
"target_sizes = torch.tensor([image.size[::-1]])\n",
"results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]\n",
"\n",
"# Filter boxes for tables (assuming label 74 is for 'table')\n",
"table_boxes = [box for box, score, label in zip(results[\"boxes\"], results[\"scores\"], results[\"labels\"]) if label == 74 and score > 0.9]\n",
"\n",
"# Draw bounding boxes on the image\n",
"fig, ax = plt.subplots(1, figsize=(16, 16))\n",
"ax.imshow(image)\n",
"\n",
"for box in table_boxes:\n",
" xmin, ymin, xmax, ymax = box\n",
" rect = patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, linewidth=2, edgecolor='r', facecolor='red')\n",
" ax.add_patch(rect)\n",
"\n",
"plt.show()\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "vy8IeDcWXrrC",
"outputId": "338de2cc-631f-4dcc-d7ef-7e38ce3a99c3"
},
"execution_count": 43,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"