shimer56 committed on
Commit
63f6421
1 Parent(s): 1e9056d

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -2
  2. utils.py +20 -6
requirements.txt CHANGED
@@ -13,5 +13,4 @@ pdfplumber
13
  pymupdf
14
  timm
15
  transformers
16
- img2table
17
- poppler-utils
 
13
  pymupdf
14
  timm
15
  transformers
16
+ img2table
 
utils.py CHANGED
@@ -18,14 +18,28 @@ def draw_boxes(image_path, boxes):
18
  return image
19
 
20
 
21
- def pdf_to_images(pdf_path):
22
- images = convert_from_path(pdf_path)
 
 
 
 
 
23
 
24
  image_paths = []
25
- for idx, image in enumerate(images):
26
- image_file_path = f"extract_tables/table_outputs/pdf-image-{idx + 1}.png"
27
- image.save(image_file_path, format="PNG")
28
- image_paths.append(image_file_path)
 
 
 
 
 
 
 
 
 
29
 
30
  return image_paths
31
 
 
18
  return image
19
 
20
 
21
+ def pdf_to_images(
22
+ pdf_path, output_dir="extract_tables/table_outputs", output_format="png"
23
+ ):
24
+ if not os.path.exists(output_dir):
25
+ os.makedirs(output_dir)
26
+
27
+ pdf_document = fitz.open(pdf_path)
28
 
29
  image_paths = []
30
+ for page_num in range(len(pdf_document)):
31
+ page = pdf_document.load_page(page_num)
32
+ pix = page.get_pixmap(dpi=300)
33
+
34
+ image_file_path = os.path.join(
35
+ output_dir, f"pdf-image-{page_num + 1}.{output_format}"
36
+ )
37
+
38
+ try:
39
+ pix.save(image_file_path)
40
+ image_paths.append(image_file_path)
41
+ except Exception as e:
42
+ print(f"Error saving image {image_file_path}: {e}")
43
 
44
  return image_paths
45