Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitignore +4 -0
- README.md +2 -8
- app.py +84 -0
- constants.py +2 -0
- extract_images/input_docs/uploaded_pdf.pdf +0 -0
- extract_images/services.py +147 -0
- extract_tables/input_docs/uploaded_pdf.pdf +0 -0
- extract_tables/services.py +191 -0
- ocr_notebooks/OCR_Benchmarking.ipynb +0 -0
- ocr_notebooks/image_extraction.ipynb +0 -0
- ocr_notebooks/table_extraction.ipynb +0 -0
- requirements.txt +16 -0
- test_extraction/input_docs/output_gemini_flash.png +0 -0
- test_extraction/input_docs/page_0.png +0 -0
- test_extraction/input_docs/page_1.png +0 -0
- test_extraction/input_docs/page_2.png +0 -0
- test_extraction/output_files/extracted_table.csv +48 -0
- test_extraction/output_files/output_gemini_pro.png +0 -0
- test_extraction/output_files/output_initial_gpt4_4o.png +0 -0
- test_extraction/output_files/output_initial_gpt4_turbo.png +0 -0
- test_extraction/output_files/output_scaled_gpt4_4o.png +0 -0
- test_extraction/output_files/output_scaled_gpt4_turbo.png +0 -0
- test_extraction/requirements_versioned.txt +1 -0
- test_extraction/sample.py +137 -0
- test_extraction/sample1.py +58 -0
- test_extraction/sample_11.py +141 -0
- utils.py +108 -0
- visual_assets/.gitkeep +0 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*cache*
|
2 |
+
.venv/
|
3 |
+
flagged/
|
4 |
+
__pycache__/
|
README.md
CHANGED
@@ -1,12 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: gray
|
5 |
-
colorTo: green
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.36.1
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Extract_PDF
|
3 |
+
app_file: app.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.36.1
|
|
|
|
|
6 |
---
|
|
|
|
app.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
from extract_images.services import (
|
4 |
+
extract_images_pymupdf,
|
5 |
+
extract_images_pdfplumber,
|
6 |
+
extract_images_gemini,
|
7 |
+
extract_images_gpt,
|
8 |
+
)
|
9 |
+
from extract_tables.services import (
|
10 |
+
extract_tables_pymupdf,
|
11 |
+
extract_tables_tab_transformer,
|
12 |
+
extract_tables_img2table,
|
13 |
+
extract_tables_gemini,
|
14 |
+
extract_tables_gpt,
|
15 |
+
)
|
16 |
+
from utils import clear_directory
|
17 |
+
|
18 |
+
|
19 |
+
def handle_model_selection(pdf_file, model_option):
|
20 |
+
if model_option == "PyMuPDF":
|
21 |
+
images = extract_images_pymupdf(pdf_file)
|
22 |
+
tables = extract_tables_pymupdf(pdf_file)
|
23 |
+
|
24 |
+
elif model_option == "PdfPlumber (Extracts Images only)":
|
25 |
+
images = extract_images_pdfplumber(pdf_file)
|
26 |
+
tables = None
|
27 |
+
|
28 |
+
elif model_option == "Table Transformer (Extracts Tables only)":
|
29 |
+
images = None
|
30 |
+
tables = extract_tables_tab_transformer(pdf_file)
|
31 |
+
|
32 |
+
elif model_option == "img2table (Extracts Tables only)":
|
33 |
+
images = None
|
34 |
+
tables = extract_tables_img2table(pdf_file)
|
35 |
+
|
36 |
+
elif model_option == "Gemini Pro":
|
37 |
+
images = extract_images_gemini("gemini-pro-vision", pdf_file)
|
38 |
+
tables = extract_tables_gemini("gemini-pro-vision", pdf_file)
|
39 |
+
|
40 |
+
elif model_option == "Gemini Flash":
|
41 |
+
images = extract_images_gemini("gemini-1.5-flash-latest", pdf_file)
|
42 |
+
tables = extract_tables_gemini("gemini-1.5-flash-latest", pdf_file)
|
43 |
+
|
44 |
+
elif model_option == "GPT 4 Turbo":
|
45 |
+
images = extract_images_gpt("gpt-4-turbo", pdf_file)
|
46 |
+
tables = extract_tables_gpt("gpt-4-turbo", pdf_file)
|
47 |
+
|
48 |
+
elif model_option == "GPT 4o":
|
49 |
+
images = extract_images_gpt("gpt-4o", pdf_file)
|
50 |
+
tables = extract_tables_gpt("gpt-4o", pdf_file)
|
51 |
+
|
52 |
+
clear_directory("extract_tables/table_outputs")
|
53 |
+
clear_directory("extract_images/image_outputs")
|
54 |
+
return images, tables
|
55 |
+
|
56 |
+
|
57 |
+
interface = gr.Interface(
|
58 |
+
fn=handle_model_selection,
|
59 |
+
inputs=[
|
60 |
+
gr.File(type="binary", label="Upload PDF"),
|
61 |
+
gr.Dropdown(
|
62 |
+
label="Select Model",
|
63 |
+
choices=[
|
64 |
+
"PdfPlumber (Extracts Images only)",
|
65 |
+
"Table Transformer (Extracts Tables only)",
|
66 |
+
"img2table (Extracts Tables only)",
|
67 |
+
"PyMuPDF",
|
68 |
+
"Gemini Pro",
|
69 |
+
"Gemini Flash",
|
70 |
+
"GPT 4 Turbo",
|
71 |
+
"GPT 4o",
|
72 |
+
],
|
73 |
+
value="PyMuPDF",
|
74 |
+
),
|
75 |
+
],
|
76 |
+
outputs=[
|
77 |
+
gr.Gallery(label="Extracted Images"),
|
78 |
+
gr.Gallery(label="Extracted Tables"),
|
79 |
+
],
|
80 |
+
title="PDF Image and Table Extractor",
|
81 |
+
description="Upload a PDF to extract images and tables. Choose the model for extraction.",
|
82 |
+
)
|
83 |
+
|
84 |
+
interface.launch(share=True)
|
constants.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
GEMINI_API_KEY = "AIzaSyBwk94xRhPOIkvO0E3pYhXQ7Rrk5my5IyY"
|
2 |
+
OPENAI_API_KEY = "sk-proj-YOl2xepEsNppWm3xLshlT3BlbkFJL04qQgahGxFcFGEClnQK"
|
extract_images/input_docs/uploaded_pdf.pdf
ADDED
Binary file (41.4 kB). View file
|
|
extract_images/services.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pymupdf
|
2 |
+
from io import BytesIO
|
3 |
+
from PIL import Image
|
4 |
+
import pdfplumber
|
5 |
+
import ast
|
6 |
+
import google.generativeai as genai
|
7 |
+
from PIL import Image, ImageDraw
|
8 |
+
import openai
|
9 |
+
import requests
|
10 |
+
|
11 |
+
from constants import GEMINI_API_KEY, OPENAI_API_KEY
|
12 |
+
from utils import (
|
13 |
+
draw_boxes,
|
14 |
+
pdf_to_images,
|
15 |
+
parse_bboxs_gemini_flash,
|
16 |
+
convert_pdf_to_images,
|
17 |
+
encode_image_to_base64,
|
18 |
+
)
|
19 |
+
|
20 |
+
|
21 |
+
def extract_images_pymupdf(pdf_file):
|
22 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
23 |
+
with open(pdf_path, "wb") as f:
|
24 |
+
f.write(pdf_file)
|
25 |
+
|
26 |
+
doc = pymupdf.open(pdf_path)
|
27 |
+
images = []
|
28 |
+
for page_idx, page in enumerate(doc):
|
29 |
+
for img_index, img in enumerate(doc.get_page_images(page_idx)):
|
30 |
+
xref = img[0]
|
31 |
+
base_image = doc.extract_image(xref)
|
32 |
+
image_bytes = base_image["image"]
|
33 |
+
image = Image.open(BytesIO(image_bytes))
|
34 |
+
images.append(image)
|
35 |
+
return images if images != [] else None
|
36 |
+
|
37 |
+
|
38 |
+
def extract_images_pdfplumber(pdf_file):
|
39 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
40 |
+
with open(pdf_path, "wb") as f:
|
41 |
+
f.write(pdf_file)
|
42 |
+
|
43 |
+
images = []
|
44 |
+
pdf_obj = pdfplumber.open(pdf_path)
|
45 |
+
for page_idx, page in enumerate(pdf_obj.pages):
|
46 |
+
page_bbox = []
|
47 |
+
for image_idx, image in enumerate(page.images):
|
48 |
+
page_height = page.height
|
49 |
+
image_bbox = (
|
50 |
+
image["x0"],
|
51 |
+
page_height - image["y1"],
|
52 |
+
image["x1"],
|
53 |
+
page_height - image["y0"],
|
54 |
+
)
|
55 |
+
page_bbox.append(image_bbox)
|
56 |
+
cropped_page = page.crop(image_bbox)
|
57 |
+
image_obj = cropped_page.to_image(resolution=400)
|
58 |
+
image_path = (
|
59 |
+
f"extract_images/image_outputs/image-{page_idx}-{image_idx}.png"
|
60 |
+
)
|
61 |
+
image_obj.save(image_path)
|
62 |
+
image = Image.open(image_path)
|
63 |
+
images.append(image)
|
64 |
+
return images if images != [] else None
|
65 |
+
|
66 |
+
|
67 |
+
def extract_images_gemini(model, pdf_file):
|
68 |
+
genai.configure(api_key=GEMINI_API_KEY)
|
69 |
+
gemini_model = genai.GenerativeModel(model)
|
70 |
+
prompt = f"Extract the bounding boxes of all the images present in this page. Return the bounding boxes as list of lists. Do not include anyother text or symbols in the output"
|
71 |
+
|
72 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
73 |
+
with open(pdf_path, "wb") as f:
|
74 |
+
f.write(pdf_file)
|
75 |
+
|
76 |
+
images = []
|
77 |
+
pdf_images = pdf_to_images(pdf_path)
|
78 |
+
for page in pdf_images:
|
79 |
+
img = Image.open(page).convert("RGB")
|
80 |
+
response = gemini_model.generate_content([img, prompt], stream=False)
|
81 |
+
response.resolve()
|
82 |
+
print(response.text)
|
83 |
+
|
84 |
+
if model == "gemini-pro-vision":
|
85 |
+
page_bbox = ast.literal_eval(response.text)
|
86 |
+
elif model == "gemini-1.5-flash-latest":
|
87 |
+
page_bbox = parse_bboxs_gemini_flash(response.text)
|
88 |
+
|
89 |
+
image = draw_boxes(page, page_bbox)
|
90 |
+
images.append(image)
|
91 |
+
return images
|
92 |
+
|
93 |
+
|
94 |
+
def extract_images_gpt(model, pdf_file):
|
95 |
+
openai.api_key = OPENAI_API_KEY
|
96 |
+
image_media_type = "image/png"
|
97 |
+
|
98 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
99 |
+
with open(pdf_path, "wb") as f:
|
100 |
+
f.write(pdf_file)
|
101 |
+
|
102 |
+
images = convert_pdf_to_images(pdf_path)
|
103 |
+
image_paths = pdf_to_images(pdf_path)
|
104 |
+
|
105 |
+
headers = {
|
106 |
+
"Content-Type": "application/json",
|
107 |
+
"Authorization": f"Bearer {openai.api_key}",
|
108 |
+
}
|
109 |
+
|
110 |
+
extracted_images = []
|
111 |
+
for page_idx, image in enumerate(images):
|
112 |
+
base64_string = encode_image_to_base64(image)
|
113 |
+
payload = {
|
114 |
+
"model": model,
|
115 |
+
"messages": [
|
116 |
+
{
|
117 |
+
"role": "user",
|
118 |
+
"content": [
|
119 |
+
{
|
120 |
+
"type": "text",
|
121 |
+
"text": "Extract bounding boxes of all the images present in this page. Return bounding boxes as liat of lists and don't provide any other text in the response.",
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"type": "image_url",
|
125 |
+
"image_url": {
|
126 |
+
"url": f"data:image/jpeg;base64,{base64_string}"
|
127 |
+
},
|
128 |
+
},
|
129 |
+
],
|
130 |
+
}
|
131 |
+
],
|
132 |
+
}
|
133 |
+
|
134 |
+
response = requests.post(
|
135 |
+
"https://api.openai.com/v1/chat/completions", headers=headers, json=payload
|
136 |
+
)
|
137 |
+
response_json = response.json()
|
138 |
+
|
139 |
+
if "choices" in response_json and len(response_json["choices"]) > 0:
|
140 |
+
extracted_images.append(
|
141 |
+
draw_boxes(
|
142 |
+
image_paths[page_idx],
|
143 |
+
ast.literal_eval(response_json["choices"][0]["message"]["content"]),
|
144 |
+
)
|
145 |
+
)
|
146 |
+
|
147 |
+
return extracted_images
|
extract_tables/input_docs/uploaded_pdf.pdf
ADDED
Binary file (41.4 kB). View file
|
|
extract_tables/services.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pymupdf
|
2 |
+
import fitz
|
3 |
+
import io
|
4 |
+
from io import BytesIO
|
5 |
+
from PIL import Image
|
6 |
+
from transformers import AutoImageProcessor, TableTransformerForObjectDetection
|
7 |
+
import torch
|
8 |
+
import ast
|
9 |
+
import google.generativeai as genai
|
10 |
+
import openai
|
11 |
+
import requests
|
12 |
+
|
13 |
+
|
14 |
+
from constants import GEMINI_API_KEY, OPENAI_API_KEY
|
15 |
+
from utils import (
|
16 |
+
draw_boxes,
|
17 |
+
pdf_to_images,
|
18 |
+
parse_bboxs_gemini_flash,
|
19 |
+
convert_pdf_to_images,
|
20 |
+
encode_image_to_base64,
|
21 |
+
)
|
22 |
+
|
23 |
+
|
24 |
+
def get_bounding_box_pymupdf(pdf_path):
|
25 |
+
bounding_boxes = []
|
26 |
+
pages = pymupdf.open(pdf_path)
|
27 |
+
for page_num in range(len(pages)):
|
28 |
+
page = pages[page_num]
|
29 |
+
tabs = page.find_tables()
|
30 |
+
page_tables = []
|
31 |
+
for table in range(len(tabs.tables)):
|
32 |
+
page_tables.append(list(tabs.tables[table].bbox))
|
33 |
+
bounding_boxes.append(page_tables)
|
34 |
+
return bounding_boxes
|
35 |
+
|
36 |
+
|
37 |
+
def extract_tables_pymupdf(pdf_file):
|
38 |
+
pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
|
39 |
+
with open(pdf_path, "wb") as f:
|
40 |
+
f.write(pdf_file)
|
41 |
+
|
42 |
+
bounding_boxes = get_bounding_box_pymupdf(pdf_path)
|
43 |
+
pages = fitz.open(pdf_path)
|
44 |
+
|
45 |
+
tables = []
|
46 |
+
for page_num, page_tables in enumerate(bounding_boxes, start=1):
|
47 |
+
page = pages[page_num - 1]
|
48 |
+
for table_num, bbox in enumerate(page_tables, start=1):
|
49 |
+
clip = page.get_pixmap(clip=bbox, alpha=False)
|
50 |
+
img = Image.frombytes("RGB", [clip.width, clip.height], clip.samples)
|
51 |
+
img_bytes = io.BytesIO()
|
52 |
+
img.save(img_bytes, format="PNG")
|
53 |
+
img_bytes = img_bytes.getvalue()
|
54 |
+
image = Image.open(BytesIO(img_bytes))
|
55 |
+
tables.append(image)
|
56 |
+
return tables
|
57 |
+
|
58 |
+
|
59 |
+
def extract_tables_tab_transformer(pdf_file):
|
60 |
+
image_processor = AutoImageProcessor.from_pretrained(
|
61 |
+
"microsoft/table-transformer-detection"
|
62 |
+
)
|
63 |
+
model = TableTransformerForObjectDetection.from_pretrained(
|
64 |
+
"microsoft/table-transformer-detection"
|
65 |
+
)
|
66 |
+
|
67 |
+
pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
|
68 |
+
with open(pdf_path, "wb") as f:
|
69 |
+
f.write(pdf_file)
|
70 |
+
|
71 |
+
tables = []
|
72 |
+
pdf_images = pdf_to_images(pdf_path)
|
73 |
+
for page in pdf_images:
|
74 |
+
image = Image.open(page).convert("RGB")
|
75 |
+
|
76 |
+
inputs = image_processor(images=image, return_tensors="pt")
|
77 |
+
outputs = model(**inputs)
|
78 |
+
|
79 |
+
target_sizes = torch.tensor([image.size[::-1]])
|
80 |
+
results = image_processor.post_process_object_detection(
|
81 |
+
outputs, threshold=0.9, target_sizes=target_sizes
|
82 |
+
)[0]
|
83 |
+
|
84 |
+
image = draw_boxes(page, results["boxes"].tolist())
|
85 |
+
tables.append(image)
|
86 |
+
return tables
|
87 |
+
|
88 |
+
|
89 |
+
def extract_tables_img2table(pdf_file):
|
90 |
+
from img2table.document import Image
|
91 |
+
|
92 |
+
pdf_path = "extract_tables/input_docs/uploaded_pdf.pdf"
|
93 |
+
with open(pdf_path, "wb") as f:
|
94 |
+
f.write(pdf_file)
|
95 |
+
|
96 |
+
tables = []
|
97 |
+
pdf_images = pdf_to_images(pdf_path)
|
98 |
+
for image_path in pdf_images:
|
99 |
+
img = Image(src=image_path)
|
100 |
+
extracted_tables = img.extract_tables()
|
101 |
+
bbox_values = [
|
102 |
+
[table.bbox.x1, table.bbox.y1, table.bbox.x2, table.bbox.y2]
|
103 |
+
for table in extracted_tables
|
104 |
+
]
|
105 |
+
|
106 |
+
image = draw_boxes(image_path, bbox_values)
|
107 |
+
tables.append(image)
|
108 |
+
return tables
|
109 |
+
|
110 |
+
|
111 |
+
def extract_tables_gemini(model, pdf_file):
|
112 |
+
genai.configure(api_key=GEMINI_API_KEY)
|
113 |
+
gemini_model = genai.GenerativeModel(model)
|
114 |
+
prompt = f"Extract the bounding boxes of all the tables present in this image. Return the bounding boxes as list of lists. Do not include anyother text or symbols in the output"
|
115 |
+
|
116 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
117 |
+
with open(pdf_path, "wb") as f:
|
118 |
+
f.write(pdf_file)
|
119 |
+
|
120 |
+
tables = []
|
121 |
+
pdf_images = pdf_to_images(pdf_path)
|
122 |
+
for page in pdf_images:
|
123 |
+
img = Image.open(page).convert("RGB")
|
124 |
+
response = gemini_model.generate_content([img, prompt], stream=False)
|
125 |
+
response.resolve()
|
126 |
+
print(response.text)
|
127 |
+
|
128 |
+
if model == "gemini-pro-vision":
|
129 |
+
page_bbox = ast.literal_eval(response.text)
|
130 |
+
elif model == "gemini-1.5-flash-latest":
|
131 |
+
page_bbox = parse_bboxs_gemini_flash(response.text)
|
132 |
+
|
133 |
+
image = draw_boxes(page, page_bbox)
|
134 |
+
tables.append(image)
|
135 |
+
return tables
|
136 |
+
|
137 |
+
|
138 |
+
def extract_tables_gpt(model, pdf_file):
|
139 |
+
openai.api_key = OPENAI_API_KEY
|
140 |
+
image_media_type = "image/png"
|
141 |
+
|
142 |
+
pdf_path = "extract_images/input_docs/uploaded_pdf.pdf"
|
143 |
+
with open(pdf_path, "wb") as f:
|
144 |
+
f.write(pdf_file)
|
145 |
+
|
146 |
+
images = convert_pdf_to_images(pdf_path)
|
147 |
+
image_paths = pdf_to_images(pdf_path)
|
148 |
+
|
149 |
+
headers = {
|
150 |
+
"Content-Type": "application/json",
|
151 |
+
"Authorization": f"Bearer {openai.api_key}",
|
152 |
+
}
|
153 |
+
|
154 |
+
extracted_tables = []
|
155 |
+
for page_idx, image in enumerate(images):
|
156 |
+
base64_string = encode_image_to_base64(image)
|
157 |
+
payload = {
|
158 |
+
"model": model,
|
159 |
+
"messages": [
|
160 |
+
{
|
161 |
+
"role": "user",
|
162 |
+
"content": [
|
163 |
+
{
|
164 |
+
"type": "text",
|
165 |
+
"text": "Extract bounding boxes of all the tables present in this page. Return bounding boxes as liat of lists and don't provide any other text in the response.",
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"type": "image_url",
|
169 |
+
"image_url": {
|
170 |
+
"url": f"data:image/jpeg;base64,{base64_string}"
|
171 |
+
},
|
172 |
+
},
|
173 |
+
],
|
174 |
+
}
|
175 |
+
],
|
176 |
+
}
|
177 |
+
|
178 |
+
response = requests.post(
|
179 |
+
"https://api.openai.com/v1/chat/completions", headers=headers, json=payload
|
180 |
+
)
|
181 |
+
response_json = response.json()
|
182 |
+
|
183 |
+
if "choices" in response_json and len(response_json["choices"]) > 0:
|
184 |
+
extracted_tables.append(
|
185 |
+
draw_boxes(
|
186 |
+
image_paths[page_idx],
|
187 |
+
ast.literal_eval(response_json["choices"][0]["message"]["content"]),
|
188 |
+
)
|
189 |
+
)
|
190 |
+
|
191 |
+
return extracted_tables
|
ocr_notebooks/OCR_Benchmarking.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ocr_notebooks/image_extraction.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ocr_notebooks/table_extraction.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
black
|
2 |
+
numpy
|
3 |
+
pytest
|
4 |
+
gradio
|
5 |
+
pdf2image
|
6 |
+
tensorflow
|
7 |
+
pytesseract
|
8 |
+
opencv-python
|
9 |
+
python-resize-image
|
10 |
+
google-generativeai
|
11 |
+
openai
|
12 |
+
pdfplumber
|
13 |
+
pymupdf
|
14 |
+
timm
|
15 |
+
transformers
|
16 |
+
img2table
|
test_extraction/input_docs/output_gemini_flash.png
ADDED
test_extraction/input_docs/page_0.png
ADDED
test_extraction/input_docs/page_1.png
ADDED
test_extraction/input_docs/page_2.png
ADDED
test_extraction/output_files/extracted_table.csv
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2-13,,,,
|
2 |
+
02,,,,
|
3 |
+
7,5(63(&,),&$7,21$1'
|
4 |
+
35(6685(/$%(/,,,,
|
5 |
+
OAI3019023,,,,
|
6 |
+
OAI3019023,,,,
|
7 |
+
The tires supplied on your new,,,,
|
8 |
+
vehicle are chosen to provide the best,,,,
|
9 |
+
performance for normal driving.,,,,
|
10 |
+
The tire label located on the driver’s,,,,
|
11 |
+
side center pillar gives the tire pressures,,,,
|
12 |
+
recommended for your vehicle.,,,,
|
13 |
+
(1*,1(180%(5,,,
|
14 |
+
OAH2088004,,,,
|
15 |
+
OAH2088004,,,,
|
16 |
+
The engine number is stamped on the,,,,
|
17 |
+
engine block as shown in the drawing.,,,,
|
18 |
+
10,,,,
|
19 |
+
SRS warning indicator in the instrument,,,,
|
20 |
+
cluster illuminates continuously, it means,,,
|
21 |
+
that there is malfunction in the system. Re,,,,
|
22 |
+
move the CRS from front passenger seat,,,,
|
23 |
+
and contact your TATA MOTORS autho,,,,
|
24 |
+
rised service center.,,,,
|
25 |
+
NOTE,,,,
|
26 |
+
The above image’s are for reference,,,,
|
27 |
+
purpose only.,,,,
|
28 |
+
SAFETY,,,,
|
29 |
+
2-6,,,,
|
30 |
+
Vehicle Information,,,,
|
31 |
+
1. Engine coolant reservoir ..............................................................................................................9-23,,,,
|
32 |
+
2. Engine oil filler cap .....................................................................................................................9-20,,,,
|
33 |
+
3. Brake/clutch* fluid reservoir,,,,
|
34 |
+
......................................................................................................9-26,,,,
|
35 |
+
4. Air cleaner ...................................................................................................................................9-30,,,,
|
36 |
+
5. Fuse box ......................................................................................................................................9-54,,,,
|
37 |
+
6. Battery .........................................................................................................................................9-37,,,,
|
38 |
+
7. Windshield washer fluid reservoir ..............................................................................................9-28,,,,
|
39 |
+
8. Radiator cap,,,,
|
40 |
+
.................................................................................................................................9-24,,,,
|
41 |
+
9. Engine oil dipstick .......................................................................................................................9-20,,,,
|
42 |
+
(1*,1(&203$570(1729(59,(:,,
|
43 |
+
The actual engine room in the vehicle may differ from the illustration.,,,,
|
44 |
+
OAI3089001,,,,
|
45 |
+
OAI3089001,,,,
|
46 |
+
,,,,
|
47 |
+
Petrol Engine (Kappa 1.2 MPI),,,,
|
48 |
+
Petrol Engine (Kappa 1.2 MPI),,,,
|
test_extraction/output_files/output_gemini_pro.png
ADDED
test_extraction/output_files/output_initial_gpt4_4o.png
ADDED
test_extraction/output_files/output_initial_gpt4_turbo.png
ADDED
test_extraction/output_files/output_scaled_gpt4_4o.png
ADDED
test_extraction/output_files/output_scaled_gpt4_turbo.png
ADDED
test_extraction/requirements_versioned.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
��
|
test_extraction/sample.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import gradio as gr
|
2 |
+
# import fitz # PyMuPDF
|
3 |
+
# from PIL import Image
|
4 |
+
# from io import BytesIO
|
5 |
+
# import pandas as pd
|
6 |
+
# import os
|
7 |
+
|
8 |
+
|
9 |
+
# def extract_images_and_tables(pdf_file):
|
10 |
+
|
11 |
+
# pdf_path = "temp.pdf"
|
12 |
+
# with open(pdf_path, "wb") as f:
|
13 |
+
# f.write(pdf_file)
|
14 |
+
|
15 |
+
|
16 |
+
# pdf_document = fitz.open(pdf_path)
|
17 |
+
|
18 |
+
|
19 |
+
# images = []
|
20 |
+
# for page_index in range(len(pdf_document)):
|
21 |
+
# for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
|
22 |
+
# xref = img[0]
|
23 |
+
# base_image = pdf_document.extract_image(xref)
|
24 |
+
# image_bytes = base_image["image"]
|
25 |
+
# image = Image.open(BytesIO(image_bytes))
|
26 |
+
# images.append(image)
|
27 |
+
|
28 |
+
|
29 |
+
# tables = []
|
30 |
+
# for page_num in range(len(pdf_document)):
|
31 |
+
# page = pdf_document.load_page(page_num)
|
32 |
+
# text = page.get_text("text")
|
33 |
+
|
34 |
+
# lines = [line.strip() for line in text.split("\n") if line.strip()]
|
35 |
+
|
36 |
+
# if any("," in line for line in lines):
|
37 |
+
|
38 |
+
# rows = [line.split(",") for line in lines]
|
39 |
+
|
40 |
+
# tables.extend(rows)
|
41 |
+
|
42 |
+
|
43 |
+
# table_content = ""
|
44 |
+
# if tables:
|
45 |
+
# max_columns = max(len(row) for row in tables)
|
46 |
+
# tables = [row + [""] * (max_columns - len(row)) for row in tables]
|
47 |
+
# df = pd.DataFrame(tables[1:], columns=tables[0])
|
48 |
+
# table_content = df.to_csv(index=False)
|
49 |
+
|
50 |
+
|
51 |
+
# pdf_document.close()
|
52 |
+
|
53 |
+
# # Remove the temporary PDF file
|
54 |
+
# os.remove(pdf_path)
|
55 |
+
|
56 |
+
# return images, table_content
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
# interface = gr.Interface(
|
61 |
+
# fn=extract_images_and_tables,
|
62 |
+
# inputs=gr.File(type="binary"),
|
63 |
+
# outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")],
|
64 |
+
# title="PDF Image and Table Extractor",
|
65 |
+
# description="Upload a PDF to extract images and tables."
|
66 |
+
# )
|
67 |
+
|
68 |
+
|
69 |
+
# interface.launch(share=True)
|
70 |
+
import gradio as gr
|
71 |
+
import fitz # PyMuPDF
|
72 |
+
from PIL import Image
|
73 |
+
from io import BytesIO
|
74 |
+
import pandas as pd
|
75 |
+
import os
|
76 |
+
|
77 |
+
|
78 |
+
def extract_images_and_tables(pdf_file, model_option):
|
79 |
+
pdf_path = "temp.pdf"
|
80 |
+
with open(pdf_path, "wb") as f:
|
81 |
+
f.write(pdf_file)
|
82 |
+
|
83 |
+
pdf_document = fitz.open(pdf_path)
|
84 |
+
|
85 |
+
images = []
|
86 |
+
for page_index in range(len(pdf_document)):
|
87 |
+
for img_index, img in enumerate(pdf_document.get_page_images(page_index)):
|
88 |
+
xref = img[0]
|
89 |
+
base_image = pdf_document.extract_image(xref)
|
90 |
+
image_bytes = base_image["image"]
|
91 |
+
image = Image.open(BytesIO(image_bytes))
|
92 |
+
images.append(image)
|
93 |
+
|
94 |
+
tables = []
|
95 |
+
for page_num in range(len(pdf_document)):
|
96 |
+
page = pdf_document.load_page(page_num)
|
97 |
+
text = page.get_text("text")
|
98 |
+
|
99 |
+
lines = [line.strip() for line in text.split("\n") if line.strip()]
|
100 |
+
|
101 |
+
if any("," in line for line in lines):
|
102 |
+
|
103 |
+
rows = [line.split(",") for line in lines]
|
104 |
+
|
105 |
+
tables.extend(rows)
|
106 |
+
|
107 |
+
table_content = ""
|
108 |
+
if tables:
|
109 |
+
max_columns = max(len(row) for row in tables)
|
110 |
+
tables = [row + [""] * (max_columns - len(row)) for row in tables]
|
111 |
+
df = pd.DataFrame(tables[1:], columns=tables[0])
|
112 |
+
table_content = df.to_csv(index=False)
|
113 |
+
|
114 |
+
pdf_document.close()
|
115 |
+
|
116 |
+
os.remove(pdf_path)
|
117 |
+
|
118 |
+
return images, table_content
|
119 |
+
|
120 |
+
|
121 |
+
def handle_model_selection(pdf_file, model_option):
|
122 |
+
|
123 |
+
return extract_images_and_tables(pdf_file, model_option)
|
124 |
+
|
125 |
+
|
126 |
+
interface = gr.Interface(
|
127 |
+
fn=handle_model_selection,
|
128 |
+
inputs=[
|
129 |
+
gr.File(type="binary", label="Upload PDF"),
|
130 |
+
gr.Dropdown(label="Select Model", choices=["Model 1", "Model 2", "Model 3"], value="Model 1")
|
131 |
+
],
|
132 |
+
outputs=[gr.Gallery(label="Extracted Images"), gr.Textbox(label="Extracted Tables")],
|
133 |
+
title="PDF Image and Table Extractor",
|
134 |
+
description="Upload a PDF to extract images and tables. Choose the model for extraction."
|
135 |
+
)
|
136 |
+
|
137 |
+
interface.launch(share=True)
|
test_extraction/sample1.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from google.cloud import vision
|
3 |
+
from PIL import Image
|
4 |
+
import pandas as pd
|
5 |
+
import os
|
6 |
+
from io import BytesIO
|
7 |
+
|
8 |
+
|
9 |
+
def extract_tables_with_google_vision(image_file):
|
10 |
+
# Initialize Google Cloud Vision client
|
11 |
+
client = vision.ImageAnnotatorClient()
|
12 |
+
|
13 |
+
# Read the image file
|
14 |
+
with BytesIO(image_file) as image_stream:
|
15 |
+
image = Image.open(image_stream)
|
16 |
+
# Convert image to bytes
|
17 |
+
img_bytes = image_stream.getvalue()
|
18 |
+
|
19 |
+
# Perform text detection on the image
|
20 |
+
image = vision.Image(content=img_bytes)
|
21 |
+
response = client.text_detection(image=image)
|
22 |
+
texts = response.text_annotations
|
23 |
+
|
24 |
+
# Extract text lines
|
25 |
+
lines = [text.description for text in texts]
|
26 |
+
|
27 |
+
# Check if lines resemble a table (e.g., have commas)
|
28 |
+
tables = []
|
29 |
+
is_table = False
|
30 |
+
table_rows = []
|
31 |
+
for line in lines:
|
32 |
+
if "," in line: # Assuming comma-separated values indicate a table
|
33 |
+
is_table = True
|
34 |
+
table_rows.append([cell.strip() for cell in line.split(",")])
|
35 |
+
else:
|
36 |
+
if is_table:
|
37 |
+
tables.extend(table_rows)
|
38 |
+
is_table = False
|
39 |
+
table_rows = []
|
40 |
+
|
41 |
+
table_content = ""
|
42 |
+
if tables:
|
43 |
+
df = pd.DataFrame(tables[1:], columns=tables[0])
|
44 |
+
table_content = df.to_csv(index=False)
|
45 |
+
|
46 |
+
return table_content
|
47 |
+
|
48 |
+
|
49 |
+
interface = gr.Interface(
|
50 |
+
fn=extract_tables_with_google_vision,
|
51 |
+
inputs=gr.Image(type="pil", label="Upload a PDF page image"),
|
52 |
+
outputs=gr.Textbox(label="Extracted Tables"),
|
53 |
+
title="PDF Table Extractor with Google Cloud Vision",
|
54 |
+
description="Upload an image of a PDF page to extract tables.",
|
55 |
+
allow_flagging=False
|
56 |
+
)
|
57 |
+
|
58 |
+
interface.launch()
|
test_extraction/sample_11.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import fitz # PyMuPDF
|
3 |
+
from PIL import Image, ImageDraw
|
4 |
+
from io import BytesIO
|
5 |
+
import pandas as pd
|
6 |
+
import os
|
7 |
+
import numpy as np
|
8 |
+
import google.generativeai as genai
|
9 |
+
import openai
|
10 |
+
import base64
|
11 |
+
import requests
|
12 |
+
import tempfile
|
13 |
+
import ast
|
14 |
+
|
15 |
+
genai.configure(api_key="AIzaSyBwk94xRhPOIkvO0E3pYhXQ7Rrk5my5IyY")
|
16 |
+
openai.api_key = "sk-proj-YOl2xepEsNppWm3xLshlT3BlbkFJL04qQgahGxFcFGEClnQK"
|
17 |
+
|
18 |
+
import gradio as gr
|
19 |
+
import fitz # PyMuPDF
|
20 |
+
from PIL import Image
|
21 |
+
from io import BytesIO
|
22 |
+
import pandas as pd
|
23 |
+
import numpy as np
|
24 |
+
import tempfile
|
25 |
+
|
26 |
+
|
27 |
+
# Define the model extraction functions
|
28 |
+
def extract_bounding_box_pymupdf(pdf_content):
    """Return per-page bounding boxes for every embedded image in a PDF.

    Args:
        pdf_content: Raw PDF bytes.

    Returns:
        A list with one entry per page; each entry is a list of
        [x0, y0, x1, y1] rectangles for the images found on that page.
    """
    bounding_boxes = []
    # Context manager guarantees the document is closed even if a page
    # raises mid-loop (the original only closed on the success path) and
    # drops the unused image_list/image_index locals.
    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_file:
        for page in pdf_file:
            page_bbox = []
            for img in page.get_images(full=True):
                # img[7] is the image's name, which get_image_bbox accepts.
                rect = page.get_image_bbox(img[7])
                page_bbox.append(list(rect))
            bounding_boxes.append(page_bbox)
    return bounding_boxes
|
45 |
+
|
46 |
+
|
47 |
+
def extract_bounding_boxes_gemini(api_key, images):
    """Stub for Gemini-based bounding-box extraction.

    Until the real API integration lands, yields one dummy
    (0, 0, 100, 100) box list per input image; *api_key* is unused.
    """
    dummy_page = [(0, 0, 100, 100)]
    return [dummy_page for _ in images]
|
51 |
+
|
52 |
+
|
53 |
+
def extract_bounding_box_gpt(api_key, pdf_content):
    """Stub for GPT-4-based bounding-box extraction.

    Until the real API integration lands, yields one dummy
    (0, 0, 100, 100) box list per PDF page; *api_key* is unused.
    """
    # The original opened the document just to count pages and never closed
    # it; a context manager releases the handle deterministically.
    with fitz.open(stream=pdf_content, filetype="pdf") as doc:
        page_count = len(doc)
    return [[(0, 0, 100, 100)]] * page_count
|
59 |
+
|
60 |
+
|
61 |
+
def _read_pdf_bytes(pdf_file):
    """Normalize the Gradio input (filepath str or raw bytes) to PDF bytes."""
    if isinstance(pdf_file, str):
        # File path (Gradio's filepath mode, tests, local execution).
        with open(pdf_file, "rb") as f:
            return f.read()
    if isinstance(pdf_file, bytes):
        return pdf_file
    raise TypeError("Unsupported input type for pdf_file.")


def _extract_embedded_images(pdf_document):
    """Collect every embedded image in the document as a PIL Image."""
    images = []
    for page_index in range(len(pdf_document)):
        for img in pdf_document.get_page_images(page_index):
            xref = img[0]  # cross-reference number of the image object
            base_image = pdf_document.extract_image(xref)
            images.append(Image.open(BytesIO(base_image["image"])))
    return images


def _extract_tables_csv(pdf_document):
    """Heuristically pull comma-separated rows out of page text as CSV.

    Returns "" when no page contains a comma.
    """
    tables = []
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text = page.get_text("text")
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        # Crude heuristic: if any line on the page has a comma, treat every
        # line of that page as a table row.
        if any("," in line for line in lines):
            tables.extend(line.split(",") for line in lines)

    if not tables:
        return ""
    # Pad ragged rows so pandas accepts them; the first row is the header.
    max_columns = max(len(row) for row in tables)
    tables = [row + [""] * (max_columns - len(row)) for row in tables]
    df = pd.DataFrame(tables[1:], columns=tables[0])
    return df.to_csv(index=False)


def extract_images_and_tables(pdf_file, model_option):
    """Extract embedded images, a CSV table guess, and bounding boxes from a PDF.

    Args:
        pdf_file: Path to a PDF (str) or raw PDF bytes.
        model_option: "PyMuPDF", "Gemini", or "GPT-4" — selects the
            bounding-box backend; anything else yields no boxes.

    Returns:
        (images, table_content, bounding_boxes) — list of PIL Images,
        CSV text (possibly ""), and per-page bounding boxes.

    Raises:
        TypeError: if *pdf_file* is neither str nor bytes.
    """
    pdf_bytes = _read_pdf_bytes(pdf_file)
    # ``with`` closes the document even if extraction raises; the original
    # leaked the handle on any exception before its explicit close().
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        images = _extract_embedded_images(pdf_document)
        table_content = _extract_tables_csv(pdf_document)

    if model_option == "PyMuPDF":
        bounding_boxes = extract_bounding_box_pymupdf(pdf_bytes)
    elif model_option == "Gemini":
        bounding_boxes = extract_bounding_boxes_gemini(
            "your_gemini_api_key_here", images
        )
    elif model_option == "GPT-4":
        bounding_boxes = extract_bounding_box_gpt("your_gpt4_api_key_here", pdf_bytes)
    else:
        bounding_boxes = []

    return images, table_content, bounding_boxes
|
115 |
+
|
116 |
+
|
117 |
+
def handle_model_selection(pdf_file, model_option):
    """Gradio callback: delegate to the pipeline chosen in the dropdown."""
    images, table_content, bounding_boxes = extract_images_and_tables(
        pdf_file, model_option
    )
    return images, table_content, bounding_boxes
|
119 |
+
|
120 |
+
|
121 |
+
# Gradio front-end: a PDF upload plus a model selector, displaying the
# extracted images, the table CSV text, and the bounding boxes.
_pdf_input = gr.File(type="filepath", label="Upload PDF")
_model_dropdown = gr.Dropdown(
    label="Select Model",
    choices=["PyMuPDF", "Gemini", "GPT-4"],
    value="PyMuPDF",
)

interface = gr.Interface(
    fn=handle_model_selection,
    inputs=[_pdf_input, _model_dropdown],
    outputs=[
        gr.Gallery(label="Extracted Images"),
        gr.Textbox(label="Extracted Tables"),
        gr.JSON(label="Extracted Bounding Boxes"),
    ],
    title="PDF Image and Table Extractor",
    description="Upload a PDF to extract images and tables. Choose the model for extraction.",
)

interface.launch(share=True)
|
utils.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from PIL import Image, ImageDraw
|
3 |
+
from pdf2image import convert_from_path
|
4 |
+
import os
|
5 |
+
import shutil
|
6 |
+
import re
|
7 |
+
import fitz
|
8 |
+
import base64
|
9 |
+
|
10 |
+
|
11 |
+
def draw_boxes(image_path, boxes):
    """Open the image at *image_path* and outline each box in *boxes* in red.

    Returns the annotated PIL image (the file on disk is not modified).
    """
    annotated = Image.open(image_path)
    pen = ImageDraw.Draw(annotated)
    for bbox in boxes:
        pen.rectangle(bbox, outline="red", width=2)
    return annotated
|
19 |
+
|
20 |
+
|
21 |
+
def pdf_to_images(pdf_path, output_dir="extract_tables/table_outputs"):
    """Render each page of *pdf_path* to a PNG file on disk.

    Args:
        pdf_path: Path of the PDF to render.
        output_dir: Directory to write the page images into (created if
            missing). Defaults to the historical hard-coded location, so
            existing callers are unaffected.

    Returns:
        List of written image file paths, in page order.
    """
    # Creating the directory up front avoids a FileNotFoundError on save —
    # the original crashed when the output folder did not already exist.
    os.makedirs(output_dir, exist_ok=True)
    images = convert_from_path(pdf_path)

    image_paths = []
    for idx, image in enumerate(images):
        image_file_path = f"{output_dir}/pdf-image-{idx + 1}.png"
        image.save(image_file_path, format="PNG")
        image_paths.append(image_file_path)

    return image_paths
|
31 |
+
|
32 |
+
|
33 |
+
def parse_bboxs_gemini_flash(input_string):
    """Parse Gemini Flash text output into integer bounding boxes.

    Each non-empty line of *input_string* contributes one box made of all
    the unsigned integers found on that line (a digit-free line yields []).
    """
    digit_pattern = re.compile(r"\d+")
    boxes = []
    for line in input_string.strip().split("\n"):
        if not line:
            continue
        boxes.append([int(token) for token in digit_pattern.findall(line)])
    return boxes
|
37 |
+
|
38 |
+
|
39 |
+
def convert_pdf_to_images(pdf_path):
    """Rasterize every page of the PDF at *pdf_path*.

    Returns one fitz Pixmap per page, in page order; the document is
    closed before returning.
    """
    pixmaps = []
    with fitz.open(pdf_path) as doc:
        for page_number in range(len(doc)):
            pixmaps.append(doc.load_page(page_number).get_pixmap())
    return pixmaps
|
47 |
+
|
48 |
+
|
49 |
+
def encode_image_to_base64(image):
    """Base64-encode an image object's byte buffer as a UTF-8 string.

    NOTE(review): this relies on ``image.tobytes()``. For a fitz Pixmap
    that call defaults to PNG-encoded bytes, but for a PIL Image it yields
    raw pixel data, which is not a valid image file — confirm which type
    callers actually pass before sending the result to an API.
    """
    raw = image.tobytes()
    return base64.b64encode(raw).decode("utf-8")
|
54 |
+
|
55 |
+
|
56 |
+
def calculate_scaling_factors(groundtruth_boxes, extracted_boxes):
    """Compute the mean x/y scale factors from ground-truth to extracted boxes.

    Args:
        groundtruth_boxes: Sequence of (xmin, ymin, xmax, ymax) boxes.
        extracted_boxes: Sequence of (xmin, ymin, xmax, ymax) boxes, paired
            one-to-one with *groundtruth_boxes*.

    Returns:
        (x_scale, y_scale): the mean of the extracted/ground-truth width and
        height ratios across all pairs.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    # A real exception instead of ``assert``: asserts are stripped under
    # ``python -O`` and must not be used for input validation.
    if len(groundtruth_boxes) != len(extracted_boxes):
        raise ValueError("Mismatch in the number of bounding boxes.")

    x_factors = []
    y_factors = []

    for gt_box, ext_box in zip(groundtruth_boxes, extracted_boxes):
        gt_xmin, gt_ymin, gt_xmax, gt_ymax = gt_box
        ext_xmin, ext_ymin, ext_xmax, ext_ymax = ext_box

        x_factors.append((ext_xmax - ext_xmin) / (gt_xmax - gt_xmin))
        y_factors.append((ext_ymax - ext_ymin) / (gt_ymax - gt_ymin))

    x_scale = np.mean(x_factors)
    y_scale = np.mean(y_factors)

    return x_scale, y_scale
|
80 |
+
|
81 |
+
|
82 |
+
def scale_bounding_boxes(extracted_boxes, scaling_factors):
    """Divide every box coordinate by the per-axis scaling factors.

    *extracted_boxes* is a list of pages, each a list of
    [xmin, ymin, xmax, ymax] boxes; *scaling_factors* is (x_scale, y_scale).
    Returns a new nested list; the input is left untouched.
    """
    x_scale = scaling_factors[0]
    y_scale = scaling_factors[1]
    return [
        [
            [box[0] / x_scale, box[1] / y_scale, box[2] / x_scale, box[3] / y_scale]
            for box in page_boxes
        ]
        for page_boxes in extracted_boxes
    ]
|
96 |
+
|
97 |
+
|
98 |
+
def clear_directory(directory_path):
    """Delete every entry inside *directory_path*, keeping the directory itself.

    A missing directory is a no-op. Individual deletion failures are printed
    and skipped so one bad entry does not abort the sweep.
    """
    if not os.path.exists(directory_path):
        return
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        try:
            # Real directories are removed recursively; files and symlinks
            # (including symlinks to directories) are unlinked. Other entry
            # types (e.g. fifos) are deliberately left alone, as before.
            if os.path.isdir(file_path) and not os.path.islink(file_path):
                shutil.rmtree(file_path)
            elif os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")
|
visual_assets/.gitkeep
ADDED
File without changes
|