Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import os | |
import fitz | |
import PyPDF2 | |
def extract_text_and_figures(pdf_path):
    """
    Extract text and embedded images from a PDF file, then delete the file.

    Text is collected twice: first page by page with PyMuPDF (fitz), then
    page by page with PyPDF2. Both sets are appended to the same list, so
    the returned text list holds two entries per page (all fitz results
    first, followed by all PyPDF2 results).

    Side effect: after extraction the file at ``pdf_path`` is removed from
    disk. If it is already gone or cannot be removed, a message is printed
    and the extracted data is still returned.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        tuple: A tuple containing two lists:
            * A list of extracted text blocks.
            * A list of extracted figures (PNG-encoded bytes).
    """
    texts = []
    figures = []

    # Open the PDF with PyMuPDF (fitz) for text and image extraction. The
    # context manager closes the document so the file handle is released
    # before the file is deleted below.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            texts.append(page.get_text("text"))  # Plain-text extraction

            # Process images referenced by the page
            for img in page.get_images():
                xref = img[0]  # Image XREF
                pix = fitz.Pixmap(doc, xref)

                # PNG output needs a grayscale or RGB pixmap. Subtract the
                # alpha channel before counting components: a CMYK pixmap
                # without alpha has n == 4, which a plain "n < 5" check
                # would not recognise as CMYK.
                if pix.n - pix.alpha >= 4:  # CMYK: convert to RGB first
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                figures.append(pix.tobytes("png"))

    # Extract additional text using PyPDF2 (in case fitz didn't get everything)
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for pdf_page in pdf_reader.pages:
            texts.append(pdf_page.extract_text())

    # The input file is treated as disposable: remove it once processed.
    try:
        os.remove(pdf_path)
    except FileNotFoundError:
        print(f"File '{pdf_path}' not found.")
    except PermissionError:
        print(f"Unable to remove '{pdf_path}'. Check permissions.")

    return texts, figures