import base64

import numpy as np


def draw_boxes(image_path, boxes):
    """Display the image at *image_path* with every box outlined in red.

    Args:
        image_path: Path of an image file that Pillow can open.
        boxes: Iterable of ``[x1, y1, x2, y2]`` rectangles.
    """
    # Imported lazily so the numeric helpers below remain importable in
    # environments that lack Pillow or IPython.
    from IPython.display import display
    from PIL import Image, ImageDraw

    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)
    for box in boxes:
        draw.rectangle(box, outline="red", width=3)
    display(image)


def convert_pdf_to_images(pdf_path):
    """Render each page of the PDF at *pdf_path* and return the pixmaps in page order."""
    # The notebook only imports ``fitz`` in a much later cell; importing here
    # keeps this helper usable regardless of cell execution order.
    import fitz

    with fitz.open(pdf_path) as doc:
        return [doc.load_page(page_num).get_pixmap() for page_num in range(len(doc))]


def encode_image_to_base64(image):
    """Return the UTF-8 base64 text of ``image.tobytes()``.

    Args:
        image: Any object exposing ``tobytes()``; the callers in this notebook
            pass the pixmaps produced by ``convert_pdf_to_images``.
    """
    return base64.b64encode(image.tobytes()).decode("utf-8")


def calculate_scaling_factors(extracted_bbox, ground_truth_bbox):
    """Return ``(scale_x, scale_y)`` mapping the extracted box size onto the ground-truth size.

    Each factor is ``ground_truth_extent / extracted_extent``. When the
    extracted box has zero width (or height) no ratio can be derived, so the
    factor for that axis falls back to ``1.0`` instead of raising
    ``ZeroDivisionError``.
    """
    extracted_width = extracted_bbox[2] - extracted_bbox[0]
    extracted_height = extracted_bbox[3] - extracted_bbox[1]
    ground_truth_width = ground_truth_bbox[2] - ground_truth_bbox[0]
    ground_truth_height = ground_truth_bbox[3] - ground_truth_bbox[1]

    scale_x = ground_truth_width / extracted_width if extracted_width else 1.0
    scale_y = ground_truth_height / extracted_height if extracted_height else 1.0
    # Kept from the original: the notebook relies on this trace to eyeball the factors.
    print(scale_x, scale_y)

    return scale_x, scale_y


def apply_scaling(bbox, scale_x, scale_y):
    """Multiply the x coordinates of *bbox* by *scale_x* and the y coordinates by *scale_y*."""
    return [
        bbox[0] * scale_x,
        bbox[1] * scale_y,
        bbox[2] * scale_x,
        bbox[3] * scale_y,
    ]


def scale_bounding_boxes(extracted_boxes, ground_truth_boxes):
    """Rescale every page of extracted boxes towards the ground-truth coordinate space.

    Pages are paired positionally with ``zip``, so surplus pages on either
    side are ignored. A page that is empty on either side yields ``[]``.

    NOTE(review): the factors are derived from the first extracted/ground-truth
    box pair of each page, i.e. from the labels the result is later scored
    against; treat IoU numbers computed on scaled boxes with that in mind.
    """
    scaled_boxes = []
    for extracted_page, ground_truth_page in zip(extracted_boxes, ground_truth_boxes):
        if not extracted_page or not ground_truth_page:
            scaled_boxes.append([])
            continue

        scale_x, scale_y = calculate_scaling_factors(extracted_page[0], ground_truth_page[0])
        scaled_boxes.append(
            [apply_scaling(bbox, scale_x, scale_y) for bbox in extracted_page]
        )

    return scaled_boxes


def calculate_iou(box1, box2):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Returns ``0`` when the union is not positive, which covers zero-area
    boxes and boxes whose corners are inverted (negative area).
    """
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    intersection_area = max(0, right - left) * max(0, bottom - top)
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union_area = box1_area + box2_area - intersection_area

    return intersection_area / union_area if union_area > 0 else 0


def match_bounding_boxes(gt_boxes, pred_boxes):
    """Greedily pair each ground-truth box with its best unused prediction.

    Ground-truth boxes are processed in order; each takes the not-yet-used
    prediction with the highest IoU. Predictions with fewer than four
    coordinates (e.g. ``[]`` from a badly parsed model reply) can never match
    and are skipped rather than raising ``IndexError``.

    Returns:
        One IoU per ground-truth box, ``0`` where nothing overlapped.
    """
    matched_ious = []
    used_predictions = set()
    for gt_box in gt_boxes:
        best_iou = 0
        best_pred_idx = -1
        for index, pred_box in enumerate(pred_boxes):
            if index in used_predictions or len(pred_box) < 4:
                continue
            iou = calculate_iou(gt_box, pred_box)
            if iou > best_iou:
                best_iou = iou
                best_pred_idx = index
        if best_pred_idx >= 0:
            used_predictions.add(best_pred_idx)
        # best_iou is still 0 when no prediction overlapped.
        matched_ious.append(best_iou)
    return matched_ious


def evaluate_models(ground_truth, predictions):
    """Return ``{model_name: mean IoU}`` over all ground-truth boxes.

    Args:
        ground_truth: List of pages, each a list of ground-truth boxes.
        predictions: Mapping of model name to a list of pages of predicted boxes.

    A model with no scored boxes gets ``0.0`` instead of the NaN (and
    RuntimeWarning) that ``np.mean([])`` would produce.
    """
    model_ious = {}
    for model, pred_pages in predictions.items():
        ious = []
        for gt_boxes, model_boxes in zip(ground_truth, pred_pages):
            ious.extend(match_bounding_boxes(gt_boxes, model_boxes))
        model_ious[model] = float(np.mean(ious)) if ious else 0.0
    return model_ious


def plot_iou(ground_truth, table_bounding_boxes):
    """Plot and print the average IoU per model, computed from the arguments.

    The previous version ignored both parameters and plotted a hard-coded
    global dictionary; the scores are now derived via ``evaluate_models``.

    Returns:
        The ``{model_name: mean IoU}`` mapping that was plotted.
    """
    import matplotlib.pyplot as plt

    model_ious = evaluate_models(ground_truth, table_bounding_boxes)

    plt.figure(figsize=(13, 6))
    plt.bar(list(model_ious.keys()), list(model_ious.values()), color="#568c64")
    plt.xlabel('Models')
    plt.ylabel('Average IoU')
    plt.title('Table Extraction - Model Comparison Based on IoU')
    plt.ylim(0, 1)
    plt.show()

    print(model_ious)
    return model_ious
"code", + "source": [ + "GroundTruth = [\n", + " [[91, 160, 828, 826]], # hardware 1\n", + " [[106, 162, 844, 426], [108, 539, 840, 770]], # hardware 2\n", + " [[553, 125, 1471, 480]], # hardware 3\n", + " [[11, 186, 673, 532], [404, 108, 673, 167]], # bank statement 1\n", + " [[63, 478, 936, 1127]], # bank statement 2\n", + " [[644, 269, 1275, 962]], # bank statement 3\n", + " [[45, 442, 560, 657]], # bank statement 4\n", + " [[81, 458, 458, 825], [489, 745, 748, 805]], # data sheet 3\n", + " [[63, 374, 592, 726]], # invoice 1\n", + " [[44, 340, 611, 456], [44, 502, 611, 702]], # invoice 2\n", + " [[60, 413, 776, 683]], # invoice 3\n", + " [[428, 144, 768, 549]], # real estate listings 2\n", + " [[50, 143, 767, 845]], # real estate listings 3\n", + " [[126, 72, 689, 308], [128, 376, 688, 584], [148, 821, 668, 895]], # research paper 1\n", + " [[23, 366, 791, 769]], # SEC 10k 1\n", + " [[22, 596, 793, 762], [24, 897, 793, 1013]], # SEC 144 1\n", + " [[22, 117, 793, 1022]], # SEC 144 2\n", + " [[94, 101, 592, 267]], # text book 2\n", + " [[223, 102, 721, 398]], # text book 3\n", + "]" + ], + "metadata": { + "id": "pgX0a1PycAdf" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ground_truth = [\n", + " [[91, 160, 828, 826]],\n", + " [[106, 162, 844, 426], [108, 539, 840, 770]],\n", + " [[553, 125, 1471, 480]]\n", + "]" + ], + "metadata": { + "id": "uDSBRnIUeWSK" + }, + "execution_count": 105, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "t = {'PyMuPDF': 0.59256987881234, 'GPT 4 Turbo': 0.26977835468383293, 'GPT 4o': 0.27406410283611266, 'Table Transformer': 0.6399495573127585, 'img2table': 0.49895505703101684, 'Gemini Pro': 0.15192114262846507, 'Gemini Flash': 0.17320909579531665}" + ], + "metadata": { + "id": "D7l_RhWMNj3p" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "plot_iou(GroundTruth, table_bounding_boxes)" + ], + "metadata": { + "colab": { + 
"base_uri": "https://localhost:8080/", + "height": 601 + }, + "id": "PdZDLLHLAQs8", + "outputId": "fbfb89a8-e308-4871-9a80-4bfd723909ce" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'PyMuPDF': 0.59256987881234, 'GPT 4 Turbo': 0.26977835468383293, 'GPT 4o': 0.27406410283611266, 'Table Transformer': 0.6399495573127585, 'img2table': 0.49895505703101684, 'Gemini Pro': 0.15192114262846507, 'Gemini Flash': 0.17320909579531665}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# **Table Transformer**" + ], + "metadata": { + "id": "JGm-OrWh-sQO" + } + }, + { + "cell_type": "code", + "source": [ + "number_of_table_images = 19\n", + "table_bounding_boxes = {}" + ], + "metadata": { + "id": "2dCcj7H3bFyO" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install timm" + ], + "metadata": { + "id": "mm9Rhpf18k_W" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "os.kill(os.getpid(), 9)" + ], + "metadata": { + "id": "J1HcrjZU9YR9" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from huggingface_hub import hf_hub_download\n", + "from transformers import AutoImageProcessor, TableTransformerForObjectDetection\n", + "import torch\n", + "from PIL import Image" + ], + "metadata": { + "id": "D7pVEZWc73fP" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "image_processor = AutoImageProcessor.from_pretrained(\"microsoft/table-transformer-detection\")\n", + "model = TableTransformerForObjectDetection.from_pretrained(\"microsoft/table-transformer-detection\")" + ], + "metadata": { + "id": "_sfOxQ0p8g0w" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "Ns7MiUY12W7o" + }, + "outputs": [], + "source": [ + "def get_bounding_box_table_transformer(image_path):\n", + " image = Image.open(image_path).convert(\"RGB\")\n", + "\n", + " inputs = 
image_processor(images=image, return_tensors=\"pt\")\n", + " outputs = model(**inputs)\n", + "\n", + " target_sizes = torch.tensor([image.size[::-1]])\n", + " results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]\n", + "\n", + " for score, label, box in zip(results[\"scores\"], results[\"labels\"], results[\"boxes\"]):\n", + " box = [round(i, 2) for i in box.tolist()]\n", + " print(\n", + " f\"Detected {model.config.id2label[label.item()]} with confidence \"\n", + " f\"{round(score.item(), 3)} at location {box}\"\n", + " )\n", + " # draw_boxes(image_path, results[\"boxes\"].tolist())\n", + " return results[\"boxes\"].tolist()" + ] + }, + { + "cell_type": "code", + "source": [ + "bounding_boxes = []\n", + "for image_id in range(1, number_of_table_images+1):\n", + " bounding_boxes.append(get_bounding_box_table_transformer(f\"/content/table-{image_id}.png\"))" + ], + "metadata": { + "id": "hLXfHKOfYs0-", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "outputId": "4ac1d316-c102-4ccf-c878-641a6fc91ed3" + }, + "execution_count": 68, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Detected table with confidence 1.0 at location [96.0, 166.24, 822.08, 820.72]\n", + "Detected table with confidence 0.999 at location [112.75, 166.88, 817.07, 423.66]\n", + "Detected table with confidence 0.998 at location [113.62, 551.71, 717.57, 769.02]\n", + "Detected table with confidence 0.991 at location [84.09, 124.33, 1449.32, 477.9]\n", + "Detected table with confidence 0.999 at location [13.07, 186.75, 660.28, 508.17]\n", + "Detected table with confidence 0.999 at location [96.56, 493.15, 922.1, 1075.55]\n", + "Detected table with confidence 0.998 at location [653.11, 278.32, 1266.22, 960.26]\n", + "Detected table with confidence 0.999 at location [46.12, 455.55, 553.68, 637.68]\n", + "Detected table with confidence 0.999 at location [87.42, 458.74, 435.34, 
820.38]\n", + "Detected table with confidence 0.971 at location [67.07, 167.88, 589.33, 669.51]\n", + "Detected table with confidence 0.985 at location [49.59, 347.63, 550.93, 448.3]\n", + "Detected table with confidence 0.997 at location [80.58, 431.38, 759.34, 581.56]\n", + "Detected table with confidence 0.999 at location [434.34, 174.71, 752.25, 535.47]\n", + "Detected table with confidence 1.0 at location [62.11, 143.34, 760.56, 828.18]\n", + "Detected table with confidence 0.999 at location [158.72, 824.56, 656.57, 893.26]\n", + "Detected table with confidence 0.999 at location [26.3, 376.54, 786.51, 762.81]\n", + "Detected table with confidence 0.917 at location [32.61, 895.06, 776.1, 1008.1]\n", + "Detected table with confidence 0.999 at location [34.37, 158.28, 779.47, 1011.17]\n", + "Detected table with confidence 0.956 at location [105.81, 111.87, 583.19, 250.39]\n", + "Detected table with confidence 1.0 at location [235.99, 125.88, 707.31, 377.27]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "table_bounding_boxes[\"Table Transformer\"] = bounding_boxes" + ], + "metadata": { + "id": "p5bnNc9Gq0Fh" + }, + "execution_count": 74, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **Gemini**" + ], + "metadata": { + "id": "k6J-oSrhMMji" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install google-generativeai" + ], + "metadata": { + "id": "RGW86jAkMS7_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import ast\n", + "import google.generativeai as genai\n", + "from PIL import Image, ImageDraw" + ], + "metadata": { + "id": "vUv_RtDwMWin" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "genai.configure(api_key= 'AIzaSyBwk94xRhPOIkvO0E3pYhXQ7Rrk5my5IyY')\n", + "gemini_pro_vision = genai.GenerativeModel('gemini-pro-vision')\n", + "genimi_gemini_flash = genai.GenerativeModel('gemini-1.5-flash-latest')\n", + "example_image 
def get_bounding_box_gemini(model, image_path, prompt_text=None):
    """Ask a Gemini model for the table bounding boxes in one image.

    Args:
        model: A generative model object exposing ``generate_content``.
        image_path: Path of the page image to send.
        prompt_text: Instruction sent alongside the image. When omitted, the
            notebook-level ``prompt`` is used, as before. Passing it
            explicitly avoids depending on whichever cell assigned ``prompt``
            last (the notebook reassigns it several times).

    Returns:
        The raw text of the model response; parsing is left to the caller.
    """
    if prompt_text is None:
        prompt_text = prompt

    # Close the file handle as soon as the RGB copy exists.
    with Image.open(image_path) as source:
        img = source.convert("RGB")

    response = model.generate_content(
        [img, prompt_text], stream=False
    )
    response.resolve()
    return response.text
446], [515, 500, 1014, 869]],[[46, 51, 794, 160], [46, 172, 794, 282], [46, 295, 794, 404], [46, 416, 794, 526], [46, 539, 794, 648], [46, 660, 794, 772], [46, 783, 794, 892], [851, 51, 1599, 160], [851, 172, 1599, 282], [851, 295, 1599, 404], [851, 416, 1599, 526], [851, 539, 1599, 648], [851, 660, 1599, 772], [851, 783, 1599, 892]], [[517, 76, 1014, 306], [517, 332, 1014, 561], [517, 588, 1014, 817], [517, 845, 1014, 1072]], [[408, 571, 776, 715], [408, 720, 776, 864], [408, 869, 776, 1013]], [[418.0, 594.0, 756.0, 812.0], [418.0, 100.0, 756.0, 318.0]], [[718, 132, 1039, 794]], [[514, 74, 744, 188]]]" + ], + "metadata": { + "id": "0v3XIhU2ncNl" + }, + "execution_count": 40, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "bounding_boxes = []\n", + "for image_id in range(11, number_of_table_images+1):\n", + " res = get_bounding_box_gemini(gemini_pro_vision, f\"/content/table-{image_id}.png\")\n", + " print(image_id, res)\n", + " bounding_boxes.append(ast.literal_eval(res))\n", + "bounding_boxes" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 992 + }, + "collapsed": true, + "id": "A4WO-IjX5dnH", + "outputId": "e18ffa6d-060f-426b-db6a-14a955661a09" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "11 [[503, 64, 749, 290], [503, 306, 749, 532]]\n", + "12 [[545, 72, 1013, 185], [545, 200, 1013, 313], [545, 330, 1013, 442], [545, 458, 1013, 570], [545, 598, 1013, 710], [545, 736, 1013, 848], [545, 868, 1013, 980]]\n", + "13 [[515, 77, 1014, 446], [515, 500, 1014, 869]]\n", + "14 [\n", + " [46, 51, 794, 160],\n", + " [46, 172, 794, 282],\n", + " [46, 295, 794, 404],\n", + " [46, 416, 794, 526],\n", + " [46, 539, 794, 648],\n", + " [46, 660, 794, 772],\n", + " [46, 783, 794, 892],\n", + " [851, 51, 1599, 160],\n", + " [851, 172, 1599, 282],\n", + " [851, 295, 1599, 404],\n", + " [851, 416, 1599, 526],\n", + " [851, 539, 1599, 648],\n", + " [851, 660, 1599, 
import re

# One coordinate: an integer with an optional decimal part (e.g. "418" or "418.0").
_NUMBER_PATTERN = re.compile(r"\d+(?:\.\d+)?")


def parse_bboxs_gemini_flash(input_string):
    """Parse a free-form model reply into a list of ``[x1, y1, x2, y2]`` boxes.

    All numbers in the reply are collected in reading order and grouped four
    at a time, independent of how the reply is split into lines. The earlier
    per-line parsing produced malformed boxes whenever the model pretty-printed
    the list over several lines (yielding ``[]`` / single-number entries) or
    put several boxes on one line (yielding an eight-number entry).

    Args:
        input_string: Raw text returned by the model.

    Returns:
        A list of four-integer lists. Decimal coordinates are truncated to
        ``int``; a trailing group with fewer than four numbers is discarded.
    """
    values = [int(float(token)) for token in _NUMBER_PATTERN.findall(input_string)]
    usable = len(values) - len(values) % 4
    return [values[start:start + 4] for start in range(0, usable, 4)]
616, 284, 965]],\n", + " [[280, 63, 371, 454], [410, 85, 910, 928]],\n", + " [[],\n", + " [],\n", + " [],\n", + " [104],\n", + " [336],\n", + " [235],\n", + " [662],\n", + " [],\n", + " [],\n", + " [255],\n", + " [345],\n", + " [903],\n", + " [662],\n", + " [],\n", + " [],\n", + " []],\n", + " [[608, 67, 892, 928], [342, 78, 400, 569]],\n", + " [[139, 608, 170, 879],\n", + " [417, 102, 797, 532],\n", + " [692, 576, 754, 891],\n", + " [573, 630, 658, 834]],\n", + " [[402, 91, 780, 911]],\n", + " [[354, 78, 500, 862], [526, 74, 761, 890]],\n", + " [[336, 77, 585, 922]],\n", + " [[138, 522, 541, 926]],\n", + " [[148, 69, 798, 932]],\n", + " [[291, 85, 355, 913],\n", + " [558, 85, 601, 912],\n", + " [781, 191, 855, 817],\n", + " [885, 139, 914, 904]],\n", + " [[357, 43, 724, 962]],\n", + " [[543, 38, 697, 923], [833, 39, 964, 942]],\n", + " [[217, 44, 949, 960]],\n", + " [[96, 126, 253, 725, 100, 740, 161, 891]],\n", + " [[116, 277, 367, 861]]]" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ] + }, + { + "cell_type": "code", + "source": [ + "scaled_boxes = scale_bounding_boxes(bounding_boxes, GroundTruth)" + ], + "metadata": { + "id": "SU59RW14owDJ", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "outputId": "489b9591-c156-465e-8cf2-bde94c64c2a2" + }, + "execution_count": 34, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1.0179558011049723 0.8671875\n", + "3.1271186440677967 0.3641379310344828\n", + "2.790273556231003 0.41087962962962965\n", + "1.0852459016393443 0.3634453781512605\n", + "9.593406593406593 1.659846547314578\n", + "1.8133802816901408 0.2497096399535424\n", + "12.161290322580646 1.3542435424354244\n", + "1.3994708994708995 0.4292682926829268\n", + "3.8835616438356166 0.14795918367346939\n", + "2.8755020080321283 0.31952662721893493\n", + "0.8436724565756824 1.0024752475247525\n", + "1.103076923076923 0.813441483198146\n", + "8.796875 0.28502415458937197\n", + 
"2.092643051771117 0.4385201305767138\n", + "5.0064935064935066 0.18757062146892656\n", + "1.0532786885245902 0.9879912663755459\n", + "3.171974522292994 0.27712854757929883\n", + "1.9840637450199203 0.5068493150684932\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "table_bounding_boxes[\"Gemini Flash\"] = bounding_boxes" + ], + "metadata": { + "id": "0obpPk6PrfSp" + }, + "execution_count": 37, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **PyMuPDF**" + ], + "metadata": { + "id": "FLUoK1eEDdz6" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install pymupdf" + ], + "metadata": { + "id": "vTbQnKbhDbGN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import io\n", + "import pymupdf\n", + "import fitz" + ], + "metadata": { + "id": "UASxsiDsEZOa" + }, + "execution_count": 76, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def extract_bounding_box_pymupdf(pdf_path):\n", + " bounding_boxes = []\n", + " pages = pymupdf.open(pdf_path)\n", + " for page_num in range(len(pages)):\n", + " page = pages[page_num]\n", + " tabs = page.find_tables()\n", + " page_tables = []\n", + " for table in range(len(tabs.tables)):\n", + " page_tables.append(list(tabs.tables[table].bbox))\n", + " bounding_boxes.append(page_tables)\n", + " return bounding_boxes" + ], + "metadata": { + "id": "4tPKkL9WDhny" + }, + "execution_count": 89, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "bounding_boxes = extract_bounding_box_pymupdf(\"/content/table-data.pdf\")\n", + "bounding_boxes" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "wAVMMzeKOUyK", + "outputId": "eebfca12-cb13-4c64-d529-5f5349e8e031" + }, + "execution_count": 92, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[[[18.487498389350044,\n", + " 19.124998092651367,\n", + " 593.9155578613281,\n", + " 
109.12499237060547]],\n", + " [],\n", + " []]" + ] + }, + "metadata": {}, + "execution_count": 92 + } + ] + }, + { + "cell_type": "code", + "source": [ + "scaled_boxes = scale_bounding_boxes(bounding_boxes, ground_truth)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cDRQ3HYOsGqZ", + "outputId": "e920c106-bf06-4d0a-abc0-3c4638dad60a" + }, + "execution_count": 38, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2.13103094886968 2.140958468255127\n", + "1.690586078570765 5.6974702138220685\n", + "2.6620540127239725 2.66533617646015\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "draw_boxes(\"/content/table-2.png\", scaled_boxes[1])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "collapsed": true, + "id": "06HAD3ejkAZP", + "outputId": "7320bc3a-fb52-41ce-f9c5-900d8cc83f62" + }, + "execution_count": 41, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "table_bounding_boxes[\"PyMuPDF\"] = scaled_boxes" + ], + "metadata": { + "id": "FI5bbFlmsFpi" + }, + "execution_count": 44, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **GPT 4**" + ], + "metadata": { + "id": "2AsVXS8rUAau" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install openai" + ], + "metadata": { + "id": "GJmOrHo2UDiR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import openai\n", + "import fitz\n", + "import base64\n", + "import requests" + ], + "metadata": { + "id": "iThqg5lpUFK0" + }, + "execution_count": 28, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "openai.api_key = 'sk-proj-YOl2xepEsNppWm3xLshlT3BlbkFJL04qQgahGxFcFGEClnQK'\n", + "image_media_type = \"image/png\"" + ], + "metadata": { + "id": "tJiWKY7rUG3D" + }, + 
"execution_count": 29, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def convert_pdf_to_images(pdf_path):\n", + " images = []\n", + " with fitz.open(pdf_path) as doc:\n", + " for page_num in range(len(doc)):\n", + " page = doc.load_page(page_num)\n", + " pix = page.get_pixmap()\n", + " images.append(pix)\n", + " return images\n", + "\n", + "def encode_image_to_base64(image):\n", + " image_bytes = image.tobytes()\n", + " base64_encoded = base64.b64encode(image_bytes)\n", + " base64_string = base64_encoded.decode(\"utf-8\")\n", + " return base64_string" + ], + "metadata": { + "id": "5EBsT_ZCQuxW" + }, + "execution_count": 30, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "example_image = Image.open(\"/content/bank_statement_1.png\").convert(\"RGB\")\n", + "example_bbox = [[404,108,673,167], [11,186,673,532]]\n", + "prompt = f\"Extract the bounding boxes of all the tables present in this image. Return the bounding boxes as list of lists. And don't provide any other text in the response.\"" + ], + "metadata": { + "id": "dU4cPU86md8R" + }, + "execution_count": 56, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def extract_bounding_box_gpt(model, pdf_path):\n", + " images = convert_pdf_to_images(pdf_path)\n", + " extracted_bbox = []\n", + " headers = {\n", + " \"Content-Type\": \"application/json\",\n", + " \"Authorization\": f\"Bearer {openai.api_key}\"\n", + " }\n", + "\n", + " for image in images:\n", + " base64_string = encode_image_to_base64(image)\n", + " payload = {\n", + " \"model\": model,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": prompt\n", + " },\n", + " {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": f\"data:image/jpeg;base64,{base64_string}\"\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " ],\n", + " }\n", + "\n", + " response = 
requests.post(\"https://api.openai.com/v1/chat/completions\", headers=headers, json=payload)\n", + " response_json = response.json()\n", + "\n", + " if \"choices\" in response_json and len(response_json[\"choices\"]) > 0:\n", + " extracted_bbox.append(ast.literal_eval(response_json[\"choices\"][0][\"message\"][\"content\"]))\n", + "\n", + " return extracted_bbox" + ], + "metadata": { + "id": "oB5gjXowUIUZ" + }, + "execution_count": 57, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "extracted_bbox = extract_bounding_box_gpt(\"gpt-4-turbo\", \"/content/table-data.pdf\")\n", + "extracted_bbox" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "rkSwN68WULHm", + "outputId": "104e279b-71c1-433e-8cfe-055c6ecc788e" + }, + "execution_count": 58, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[[[41, 75, 416, 556]],\n", + " [[42, 98, 422, 398], [42, 438, 422, 540]],\n", + " [[91, 92, 704, 293]],\n", + " [[40, 220, 490, 470]],\n", + " [[29, 224, 725, 350], [33, 402, 723, 1026]],\n", + " [[207, 148, 789, 719]],\n", + " [[34, 507, 412, 562], [22, 578, 426, 742]],\n", + " [[68, 199, 343, 236], [67, 248, 342, 366], [65, 374, 343, 517]],\n", + " [[41, 467, 287, 587]],\n", + " [[35, 415, 293, 493], [48, 538, 268, 577]],\n", + " [[35, 335, 385, 480]],\n", + " [[44, 85, 365, 167], [44, 542, 365, 760]],\n", + " [[26, 109, 383, 522]],\n", + " [[22, 284, 385, 858], [22, 902, 385, 1188], [22, 1581, 385, 1833]],\n", + " [[59, 181, 353, 253], [59, 299, 350, 372]],\n", + " [[49, 325, 476, 428], [48, 630, 475, 812], [48, 949, 475, 1032]],\n", + " [[111, 300, 391, 658]],\n", + " [[177, 283, 390, 120]],\n", + " [[108, 375, 309, 618]]]" + ] + }, + "metadata": {}, + "execution_count": 58 + } + ] + }, + { + "cell_type": "code", + "source": [ + "scaled_boxes = scale_bounding_boxes(extracted_bbox, GroundTruth)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + 
}, + "collapsed": true, + "id": "X7dG9YCksmM2", + "outputId": "3cdf3b28-b476-4a93-baff-15ab599392cf" + }, + "execution_count": 61, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1.9653333333333334 1.3846153846153846\n", + "1.9421052631578948 0.88\n", + "1.497553017944535 1.7661691542288558\n", + "1.471111111111111 1.384\n", + "1.2543103448275863 5.150793650793651\n", + "1.0841924398625429 1.2136602451838878\n", + "1.3624338624338623 3.909090909090909\n", + "1.3709090909090909 9.91891891891892\n", + "2.1504065040650406 2.933333333333333\n", + "2.197674418604651 1.4871794871794872\n", + "2.045714285714286 1.8620689655172413\n", + "1.0591900311526479 4.939024390243903\n", + "2.008403361344538 1.6997578692493946\n", + "1.5509641873278237 0.41114982578397213\n", + "2.6122448979591835 5.597222222222222\n", + "1.8056206088992974 1.6116504854368932\n", + "2.7535714285714286 2.5279329608938546\n", + "2.3380281690140845 -1.01840490797546\n", + "2.4776119402985075 1.2181069958847737\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "table_bounding_boxes[\"GPT 4 Turbo\"] = scaled_boxes" + ], + "metadata": { + "id": "47Rn-VKqqGEq" + }, + "execution_count": 46, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "extracted_bbox = extract_bounding_box_gpt(\"gpt-4o\", \"/content/table-data.pdf\")\n", + "extracted_bbox" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "dNOSNu7GRMtz", + "outputId": "7cd39e3e-74f0-492e-e8c0-00d779297129" + }, + "execution_count": 48, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[[[41, 118, 488, 403]],\n", + " [[100, 78, 468, 275], [100, 296, 468, 440]],\n", + " [[307, 119, 794, 274]],\n", + " [[32, 379, 321, 394], [31, 102, 504, 373]],\n", + " [[50, 273, 684, 823]],\n", + " [[84, 125, 702, 840]],\n", + " [[25, 157, 247, 340],\n", + " [281, 157, 465, 340],\n", + " [25, 346, 465, 451],\n", + " 
[22, 472, 465, 675]],\n", + " [[63, 320, 266, 439],\n", + " [63, 447, 265, 525],\n", + " [80, 76, 279, 175],\n", + " [55, 135, 320, 371]],\n", + " [[44, 258, 456, 298],\n", + " [44, 294, 456, 332],\n", + " [44, 331, 456, 369],\n", + " [44, 368, 456, 406],\n", + " [44, 405, 456, 445],\n", + " [44, 442, 456, 480],\n", + " [44, 480, 456, 518],\n", + " [44, 518, 456, 558]],\n", + " [[55, 236, 359, 361], [60, 376, 387, 525], [52, 533, 348, 554]],\n", + " [[52, 308, 518, 434], [52, 444, 243, 474]],\n", + " [[47, 49, 170, 135], [33, 150, 241, 405], [334, 138, 471, 410]],\n", + " [[67, 220, 993, 2001]],\n", + " [[79, 59, 673, 225], [79, 326, 674, 490], [81, 709, 674, 746]],\n", + " [[46, 289, 444, 553]],\n", + " [[50, 181, 563, 266], [50, 278, 729, 335], [50, 352, 729, 413]],\n", + " [[45, 77, 525, 106],\n", + " [25, 108, 545, 165],\n", + " [20, 181, 550, 236],\n", + " [20, 241, 550, 297],\n", + " [20, 301, 550, 356],\n", + " [20, 361, 550, 416],\n", + " [20, 421, 550, 476],\n", + " [20, 481, 550, 536],\n", + " [20, 541, 550, 596]],\n", + " [[59, 132, 501, 227], [132, 244, 303, 260]],\n", + " [[92, 79, 516, 277]]]" + ] + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, + { + "cell_type": "code", + "source": [ + "scaled_boxes = scale_bounding_boxes(extracted_bbox, GroundTruth)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "embI9wAys2ZH", + "outputId": "fd470ce9-056d-4a08-9fc9-d41017c30640" + }, + "execution_count": 51, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1.6487695749440716 2.336842105263158\n", + "2.005434782608696 1.3401015228426396\n", + "1.8850102669404518 2.2903225806451615\n", + "2.290657439446367 23.066666666666666\n", + "1.3769716088328077 1.18\n", + "1.0210355987055015 0.9692307692307692\n", + "2.31981981981982 1.174863387978142\n", + "1.8571428571428572 3.0840336134453783\n", + "1.2839805825242718 8.8\n", + "1.8651315789473684 0.928\n", + 
"1.536480686695279 2.142857142857143\n", + "2.7642276422764227 4.709302325581396\n", + "0.7742980561555075 0.39416058394160586\n", + "0.9478114478114478 1.4216867469879517\n", + "1.92964824120603 1.5265151515151516\n", + "1.5029239766081872 1.9529411764705882\n", + "1.60625 31.20689655172414\n", + "1.1266968325791855 1.7473684210526317\n", + "1.1745283018867925 1.494949494949495\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "table_bounding_boxes[\"GPT 4o\"] = scaled_boxes" + ], + "metadata": { + "id": "S7alFPQEqTEl" + }, + "execution_count": 52, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **img2table**" + ], + "metadata": { + "id": "2ZELpJaMsAkl" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install img2table" + ], + "metadata": { + "id": "ydQhLMEPr5wC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from img2table.document import Image" + ], + "metadata": { + "id": "Sc71P2lVsIeQ" + }, + "execution_count": 94, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def extract_bounding_box_img2table(image_path):\n", + "    \"\"\"Return the bounding boxes of all tables img2table extracts from one image.\n", + "\n", + "    Args:\n", + "        image_path (str): Path to the image file.\n", + "\n", + "    Returns:\n", + "        list of lists: One [x1, y1, x2, y2] box per extracted table.\n", + "    \"\"\"\n", + "    # Import under an alias inside the function: other cells of this notebook bind\n", + "    # the global name `Image` to PIL.Image, so relying on the global would make this\n", + "    # function depend on which import cell was executed last.\n", + "    from img2table.document import Image as Img2TableImage\n", + "\n", + "    img = Img2TableImage(src=image_path)\n", + "    extracted_tables = img.extract_tables()\n", + "    return [[table.bbox.x1, table.bbox.y1, table.bbox.x2, table.bbox.y2] for table in extracted_tables]" + ], + "metadata": { + "id": "pxsLJ0z9r-8-" + }, + "execution_count": 95, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "bounding_boxes = []\n", + "for image_id in range(1, number_of_table_images+1):\n", + " bounding_boxes.append(extract_bounding_box_img2table(f\"/content/table-{image_id}.png\"))\n", + "bounding_boxes" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "QpadNqqrtn-F", + "outputId": "b3945957-cfe8-4d05-d15f-a8663eb91feb" + }, + "execution_count": 96, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [
+ "[[[93, 164, 830, 825], [93, 164, 827, 189]],\n", + " [[106, 280, 844, 398], [106, 542, 840, 775], [107, 162, 840, 187]],\n", + " [[78, 511, 513, 655], [553, 128, 1470, 481], [885, 152, 968, 169]],\n", + " [],\n", + " [[65, 481, 934, 1123]],\n", + " [],\n", + " [],\n", + " [[82, 458, 458, 826], [490, 105, 748, 152], [490, 747, 748, 805]],\n", + " [],\n", + " [[46, 342, 609, 454], [46, 505, 608, 701]],\n", + " [[62, 415, 774, 681]],\n", + " [],\n", + " [[49, 194, 768, 844]],\n", + " [[128, 89, 687, 306], [130, 378, 686, 582], [150, 824, 666, 893]],\n", + " [],\n", + " [[26, 26, 791, 186]],\n", + " [],\n", + " [],\n", + " []]" + ] + }, + "metadata": {}, + "execution_count": 96 + } + ] + }, + { + "cell_type": "code", + "source": [ + "table_bounding_boxes[\"img2table\"] = bounding_boxes" + ], + "metadata": { + "id": "bh8i_5rSt96q" + }, + "execution_count": 102, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **ExtractTable**" + ], + "metadata": { + "id": "SFMoKtzdyYHU" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install -U ExtractTable" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jZuKNeDmw_ZC", + "outputId": "21044963-7572-45c1-ae22-71e4e5624add" + }, + "execution_count": 75, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting ExtractTable\n", + " Downloading ExtractTable-2.4.0-py3-none-any.whl (19 kB)\n", + "Requirement already satisfied: requests>=2.21 in /usr/local/lib/python3.10/dist-packages (from ExtractTable) (2.31.0)\n", + "Requirement already satisfied: pandas>=0.24 in /usr/local/lib/python3.10/dist-packages (from ExtractTable) (2.0.3)\n", + "Collecting PyPDF2>=1.26 (from ExtractTable)\n", + " Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement 
already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->ExtractTable) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->ExtractTable) (2023.4)\n", + "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->ExtractTable) (2024.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->ExtractTable) (1.25.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.21->ExtractTable) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.21->ExtractTable) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.21->ExtractTable) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.21->ExtractTable) (2024.6.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas>=0.24->ExtractTable) (1.16.0)\n", + "Installing collected packages: PyPDF2, ExtractTable\n", + "Successfully installed ExtractTable-2.4.0 PyPDF2-3.0.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "\n", + "from ExtractTable import ExtractTable\n", + "\n", + "# The API key is a secret: read it from the environment instead of committing it\n", + "# to the notebook. Set EXTRACTTABLE_API_KEY before running this cell.\n", + "et_sess = ExtractTable(api_key=os.environ[\"EXTRACTTABLE_API_KEY\"])\n", + "print(et_sess.check_usage())\n", + "# table_data = et_sess.process_file(filepath=\"/content/table-3.png\", output_format=\"df\")\n", + "table_data = et_sess.process_file(filepath=\"/content/tables1-3.pdf\", output_format=\"df\", pages=\"all\")" + ], + "metadata": { + "id": "mr2BNAmtxC3D" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "for table in
table_data:\n", + " display(table)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "rP0fJCVtxfnY", + "outputId": "1bb5d9f9-f5fb-494a-9d9a-9171a4f4158d" + }, + "execution_count": 144, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0 1 \\\n", + "0 \n", + "1 \n", + "2 Type A \n", + "3 \n", + "4 \n", + "5 Front Type B \n", + "6 \n", + "7 \n", + "8 Type C \n", + "9 \n", + "10 Side repeater \n", + "11 Tail light \n", + "12 Stop light \n", + "13 Turn signal \n", + "14 Rear Back up light \n", + "15 Reflex reflector \n", + "16 High mounted \n", + "17 License plate \n", + "18 Map lamp \n", + "19 Interior Room lamp \n", + "20 Luggage \n", + "\n", + " 2 3 \\\n", + "0 Light Bulb Bulb type \n", + "1 Headlight (high/low) H19LL \n", + "2 Position light W5W \n", + "3 Turn signal light PY21W \n", + "4 Headlight (high/low) H19LL \n", + "5 Position light/ Daytime Running Light (if equi... LED \n", + "6 Turn signal light PY21W \n", + "7 Headlight (high/low) 9005HL \n", + "8 Position light/ Daytime Running Light (if equi... LED \n", + "9 Turn signal light PY21W \n", + "10 light (outside mirror) * WY5W \n", + "11 LED \n", + "12 P21/5W \n", + "13 light PY21W \n", + "14 W16W \n", + "15 - \n", + "16 stop light* W5W \n", + "17 light W5W \n", + "18 FESTOON \n", + "19 FESTOON \n", + "20 compartment lamp * FESTOON \n", + "\n", + " 4 \n", + "0 Wattage \n", + "1 60/55 W \n", + "2 5 W \n", + "3 21 W \n", + "4 60/55 W \n", + "5 POS/DRL : 1.6/12.2 W \n", + "6 21 W \n", + "7 60 W \n", + "8 POS/DRL : 1.6/12.2 W \n", + "9 21W \n", + "10 5 W \n", + "11 2.5 W \n", + "12 21 W \n", + "13 21 W \n", + "14 16 W \n", + "15 - \n", + "16 5W * 04 EA \n", + "17 5W * 02 EA \n", + "18 8 W * 02EA \n", + "19 8 W \n", + "20 10 W " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234
0Light BulbBulb typeWattage
1Headlight (high/low)H19LL60/55 W
2Type APosition lightW5W5 W
3Turn signal lightPY21W21 W
4Headlight (high/low)H19LL60/55 W
5FrontType BPosition light/ Daytime Running Light (if equi...LEDPOS/DRL : 1.6/12.2 W
6Turn signal lightPY21W21 W
7Headlight (high/low)9005HL60 W
8Type CPosition light/ Daytime Running Light (if equi...LEDPOS/DRL : 1.6/12.2 W
9Turn signal lightPY21W21W
10Side repeaterlight (outside mirror) *WY5W5 W
11Tail lightLED2.5 W
12Stop lightP21/5W21 W
13Turn signallightPY21W21 W
14RearBack up lightW16W16 W
15Reflex reflector--
16High mountedstop light*W5W5W * 04 EA
17License platelightW5W5W * 02 EA
18Map lampFESTOON8 W * 02EA
19InteriorRoom lampFESTOON8 W
20Luggagecompartment lamp *FESTOON10 W
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "table", + "summary": "{\n \"name\": \"table\",\n \"rows\": 21,\n \"fields\": [\n {\n \"column\": \"0\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Front\",\n \"Interior\",\n \"\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"1\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"Reflex reflector\",\n \"License plate\",\n \"\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"2\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"stop light*\",\n \"Headlight (high/low)\",\n \"light (outside mirror) *\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"3\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 11,\n \"samples\": [\n \"9005HL\",\n \"Bulb type\",\n \"-\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"4\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 15,\n \"samples\": [\n \"-\",\n \"5W * 02 EA\",\n \"Wattage\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0 1 2\n", + "0 Item mm (in.)\n", + "1 Overall length 3,815 (150.2)\n", + "2 Overall width 1,710 (67.3)\n", + "3 Overall height 1,585 (62.4) / 1,618* (63.7*)\n", + "4 165/70 R14 1,487 (58.5)\n", + "5 Front tread 175/65 R15 1,475 (58.1)\n", + "6 165/70 R14 1,504 (59.2)\n", + "7 Rear tread 175/65 R15 1,492 (58.7)\n", + "8 Wheelbase 2,450 (96.5)" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
0Itemmm (in.)
1Overall length3,815 (150.2)
2Overall width1,710 (67.3)
3Overall height1,585 (62.4) / 1,618* (63.7*)
4165/70 R141,487 (58.5)
5Front tread175/65 R151,475 (58.1)
6165/70 R141,504 (59.2)
7Rear tread175/65 R151,492 (58.7)
8Wheelbase2,450 (96.5)
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "table", + "summary": "{\n \"name\": \"table\",\n \"rows\": 9,\n \"fields\": [\n {\n \"column\": \"0\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8,\n \"samples\": [\n \"Overall length\",\n \"Front tread\",\n \"Item\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"1\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"\",\n \"165/70 R14\",\n \"175/65 R15\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"2\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 9,\n \"samples\": [\n \"1,492 (58.7)\",\n \"3,815 (150.2)\",\n \"1,475 (58.1)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0 1\n", + "0 Petrol Engine\n", + "1 Item 1.2 MPI\n", + "2 Displacement CC. 1,197\n", + "3 Bore X Stroke mm 71 X 75.6\n", + "4 Firing order In-line\n", + "5 No. of cylinders 4" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
0Petrol Engine
1Item1.2 MPI
2Displacement CC.1,197
3Bore X Stroke mm71 X 75.6
4Firing orderIn-line
5No. of cylinders4
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "table", + "summary": "{\n \"name\": \"table\",\n \"rows\": 6,\n \"fields\": [\n {\n \"column\": \"0\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"\",\n \"Item\",\n \"No. of cylinders\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"1\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Petrol Engine\",\n \"1.2 MPI\",\n \"4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0 1 2 3 4 \\\n", + "0 Group Mass Group Age Group Fr Rear Out- board Lh \n", + "1 0 Up to 10 kg Up to 9 months U \n", + "2 0+ Up to 13 kg Up to 24 months X U \n", + "3 I 9 to 18 kg 9 months to 48 months X U \n", + "4 II 15 to 25 kg Approx. 3 to 7 Years X U \n", + "5 III 22 to 36 kg Approx. 6 to 12 Years X U \n", + "\n", + " 5 6 \n", + "0 Rear Out- board Rh Rear Center \n", + "1 U X \n", + "2 U X \n", + "3 U X \n", + "4 U X \n", + "5 U X " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456
0GroupMass GroupAge GroupFrRear Out- board LhRear Out- board RhRear Center
10Up to 10 kgUp to 9 monthsUUX
20+Up to 13 kgUp to 24 monthsXUUX
3I9 to 18 kg9 months to 48 monthsXUUX
4II15 to 25 kgApprox. 3 to 7 YearsXUUX
5III22 to 36 kgApprox. 6 to 12 YearsXUUX
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "table", + "summary": "{\n \"name\": \"table\",\n \"rows\": 6,\n \"fields\": [\n {\n \"column\": \"0\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Group\",\n \"0\",\n \"III\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"1\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Mass Group\",\n \"Up to 10 kg\",\n \"22 to 36 kg\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"2\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Age Group\",\n \"Up to 9 months\",\n \"Approx. 6 to 12 Years\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"3\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Fr\",\n \"\",\n \"X\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"4\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"U\",\n \"Rear Out- board Lh\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"5\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"U\",\n \"Rear Out- board Rh\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"6\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"X\",\n \"Rear Center\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# **Vision**" + ], + "metadata": { + "id": "Yd3vxVWNRC1W" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install google-cloud-vision\n", + "!pip install pdf2image" + 
], + "metadata": { + "id": "bSr0YgBva6BN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!sudo apt-get update\n", + "!apt-get install poppler-utils" + ], + "metadata": { + "id": "iXCgdvOYSgxL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import io\n", + "import os\n", + "from google.cloud import vision\n", + "from google.cloud.vision_v1 import types\n", + "import numpy as np" + ], + "metadata": { + "id": "Cz9B6uC7OWqB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"/content/ai-drive-test-vision-ocr.json\"" + ], + "metadata": { + "id": "WBOaOXdZOi3C" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from pdf2image import convert_from_path\n", + "import base64\n", + "from io import BytesIO\n", + "from PIL import Image\n", + "from google.cloud import vision\n", + "\n", + "def pdf_to_images(pdf_path):\n", + " images = convert_from_path(pdf_path)\n", + " image_paths = []\n", + " for i, image in enumerate(images):\n", + " image_path = f\"/tmp/page_{i}.png\"\n", + " image.save(image_path, \"PNG\")\n", + " image_paths.append(image_path)\n", + " return image_paths" + ], + "metadata": { + "id": "HcqDrIRyRk8x" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def get_table_bounding_boxes(image_path):\n", + " client = vision.ImageAnnotatorClient()\n", + "\n", + " # Load the image file into memory\n", + " with open(image_path, \"rb\") as image_file:\n", + " content = image_file.read()\n", + " image = vision.Image(content=content)\n", + "\n", + " # Perform text detection on the image file\n", + " response = client.document_text_detection(image=image)\n", + "\n", + " bounding_boxes = []\n", + " for page in response.full_text_annotation.pages:\n", + " for block in 
page.blocks:\n", + " print(block.block_type.name)\n", + " for paragraph in block.paragraphs:\n", + " for word in paragraph.words:\n", + " word_text = \"\".join([symbol.text for symbol in word.symbols])\n", + " print(word_text)\n", + " if block.block_type.name == \"TABLE\":\n", + " print(block.block_type.name)\n", + " vertices = [[vertex.x, vertex.y] for vertex in block.bounding_box.vertices]\n", + " bounding_boxes.append(vertices)\n", + "\n", + " if response.error.message:\n", + " raise Exception(f'{response.error.message}')\n", + "\n", + " return bounding_boxes" + ], + "metadata": { + "id": "gNdXWM-_avGJ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def detect_documents_vision(pages, start=25, stop=30):\n", + "    \"\"\"Run table detection on a window of page images.\n", + "\n", + "    Args:\n", + "        pages (list): Image paths, one per page.\n", + "        start (int): Index of the first page to process (default 25).\n", + "        stop (int): Index one past the last page to process (default 30).\n", + "\n", + "    Returns:\n", + "        list: The get_table_bounding_boxes() result of every processed page, in page order.\n", + "    \"\"\"\n", + "    # Slicing clamps to the available pages, so documents shorter than the window are safe.\n", + "    return [get_table_bounding_boxes(page) for page in pages[start:stop]]" + ], + "metadata": { + "id": "KDHL2uCWRdD_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "pages = pdf_to_images(\"/content/hyundai_exter.pdf\")" + ], + "metadata": { + "id": "JGs9uAkPVxT9" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "detect_documents_vision(pages)" + ], + "metadata": { + "id": "2BhmUheESAZy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "get_table_bounding_boxes(\"/content/table-3.png\")" + ], + "metadata": { + "id": "nFc1vAiwXIjf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install google-cloud-documentai" + ], + "metadata": { + "id": "XYoCjsKLchTh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from google.cloud import documentai_v1 as documentai\n", + "from google.cloud.documentai_v1 import types\n", + "import io" + ], + "metadata": { + "id": "My81wvIPckau" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type":
"code", + "source": [ + "def get_table_bounding_boxes(image_path, project_id, location, processor_id, mime_type=None):\n", + "    \"\"\"Detects tables in an image and returns the bounding boxes.\n", + "\n", + "    Args:\n", + "        image_path (str): The path to the image file.\n", + "        project_id (str): Google Cloud project ID.\n", + "        location (str): Google Cloud location.\n", + "        processor_id (str): Document AI processor ID.\n", + "        mime_type (str, optional): MIME type declared for the file. When omitted it is\n", + "            guessed from the file extension, falling back to 'image/jpeg'.\n", + "\n", + "    Returns:\n", + "        list of lists: Bounding boxes of tables, each bounding box is represented as a list of\n", + "            [x, y] vertices taken from the table layout's bounding polygon.\n", + "    \"\"\"\n", + "    import mimetypes\n", + "\n", + "    if mime_type is None:\n", + "        # The declared type should match the file (a .png is not 'image/jpeg'),\n", + "        # so derive it from the extension instead of hard-coding it.\n", + "        mime_type = mimetypes.guess_type(image_path)[0] or 'image/jpeg'\n", + "\n", + "    client = documentai.DocumentProcessorServiceClient()\n", + "\n", + "    # The full resource name of the processor\n", + "    name = f'projects/{project_id}/locations/{location}/processors/{processor_id}'\n", + "\n", + "    # Read the image file\n", + "    with io.open(image_path, 'rb') as image_file:\n", + "        image_content = image_file.read()\n", + "\n", + "    # Load the image content into a document\n", + "    raw_document = types.RawDocument(content=image_content, mime_type=mime_type)\n", + "\n", + "    # Configure the process request\n", + "    request = types.ProcessRequest(name=name, raw_document=raw_document)\n", + "\n", + "    # Process the document\n", + "    result = client.process_document(request=request)\n", + "\n", + "    document = result.document\n", + "\n", + "    # Extract bounding boxes for tables\n", + "    bounding_boxes = []\n", + "    for page in document.pages:\n", + "        for table in page.tables:\n", + "            vertices = [[vertex.x, vertex.y] for vertex in table.layout.bounding_poly.vertices]\n", + "            bounding_boxes.append(vertices)\n", + "\n", + "    return bounding_boxes" + ], + "metadata": { + "id": "GksVH-hRbP6B" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Example usage:\n", + "# Set these variables to your specific values\n", + "project_id = 'rock-fortress-423520-h1'\n", + "location = 'us'\n", + "processor_id = '69643c68165167c1'\n", + "\n", +
"print(get_table_bounding_boxes(\"/content/table-1.png\", project_id, location, processor_id))" + ], + "metadata": { + "id": "W6KIETuBcpfM" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **Florence**-2-large" + ], + "metadata": { + "id": "tedrUKKhBH-6" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install einops flash_attn timm" + ], + "metadata": { + "id": "B4noKtMMBpaD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import fitz\n", + "import requests\n", + "from PIL import Image\n", + "from transformers import AutoProcessor, AutoModelForCausalLM" + ], + "metadata": { + "id": "ve5DeQuwBPbg" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model = AutoModelForCausalLM.from_pretrained(\"microsoft/Florence-2-large\", trust_remote_code=True)\n", + "processor = AutoProcessor.from_pretrained(\"microsoft/Florence-2-large\", trust_remote_code=True)" + ], + "metadata": { + "id": "MaXP9YRMBhXl" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model = AutoModelForCausalLM.from_pretrained(\"microsoft/Florence-2-base\", trust_remote_code=True)\n", + "processor = AutoProcessor.from_pretrained(\"microsoft/Florence-2-base\", trust_remote_code=True)" + ], + "metadata": { + "id": "1rcc-e5AGn_n" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "prompt = \"Extract bounding boxes of all the tables present in this page\"" + ], + "metadata": { + "id": "xwgEmKdvBknf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def extract_text_florence(pdf_path):\n", + " images = convert_from_path(pdf_path)\n", + " extracted_text = \"\"\n", + " # for image in images:\n", + " image = images[0]\n", + " inputs = processor(text=prompt, images=image, return_tensors=\"pt\")\n", + "\n", + " 
generated_ids = model.generate(\n", + " input_ids=inputs[\"input_ids\"],\n", + " pixel_values=inputs[\"pixel_values\"],\n", + " max_new_tokens=1024,\n", + " num_beams=3,\n", + " do_sample=False\n", + " )\n", + " generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]\n", + "\n", + " parsed_answer = processor.post_process_generation(generated_text, task=\"\", image_size=(image.width, image.height))\n", + "\n", + " print(parsed_answer)\n" + ], + "metadata": { + "id": "bB-majwGBHAd" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "extract_text_florence(\"/content/table-data.pdf\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Mch5i9PfDpu1", + "outputId": "d31acc29-d91e-4e1f-9d5e-dc9b3730e0d8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'': {'bboxes': [[123.38600158691406, 222.95001220703125, 1156.986083984375, 1152.9700927734375], [0.6460000276565552, 0.9100000262260437, 1290.06201171875, 1817.27001953125]], 'labels': ['bounding boxes of all the tables present', 'this page']}}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# **Marker**" + ], + "metadata": { + "id": "cRHuvf0kKUr7" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install marker-pdf" + ], + "metadata": { + "id": "pIQGdq7eKXyf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install poetry" + ], + "metadata": { + "id": "tctaIQpmKlSZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!git clone https://github.com/VikParuchuri/marker.git" + ], + "metadata": { + "id": "oMXfhQVYLHFA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%cd /content/marker\n", + "!poetry install" + ], + "metadata": { + "id": "nAvKUsyULDym" + }, + "execution_count": 
null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!marker_single /content/hyundai_exter-25.pdf /content --max_pages 10 --langs English" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sgk_Wk8yL6U0", + "outputId": "f72a374c-7677-4bad-c86f-e90e8a83f9e6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2024-06-07 19:39:52.061416: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2024-06-07 19:39:52.061488: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2024-06-07 19:39:52.180601: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2024-06-07 19:39:52.398901: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-06-07 19:39:54.989202: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "Loaded detection model vikp/surya_det2 on device cpu with dtype torch.float32\n", + "Loaded detection model vikp/surya_layout2 on device cpu with dtype torch.float32\n", + "Loaded reading order model vikp/surya_order on device cpu with dtype torch.float32\n", + "Loaded recognition model vikp/surya_rec on device cpu with dtype torch.float32\n", + "Loaded texify model to cpu with torch.float32 dtype\n", + "Detecting bboxes: 100% 1/1 [00:32<00:00, 
32.02s/it]\n", + "Detecting bboxes: 100% 1/1 [00:32<00:00, 32.56s/it]\n", + "Finding reading order: 100% 1/1 [00:16<00:00, 16.46s/it]\n", + "Saved markdown to the /content/hyundai_exter-25 folder\n" + ] + } + ] + } + ] +} \ No newline at end of file