Commit b310dda by bachpc
Parent: 6353415

Add extract to excel

Files changed (2):
  1. app.py +110 -32
  2. requirements.txt +1 -0
app.py CHANGED
@@ -4,24 +4,32 @@ import cv2
 import numpy as np
 import pandas as pd
 import torch
+import os
 import io
 # import sys
 # import json
 from collections import OrderedDict, defaultdict
 import xml.etree.ElementTree as ET
+from tempfile import TemporaryDirectory
+import xlsxwriter
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 
 from paddleocr import PaddleOCR
-import pytesseract
-from pytesseract import Output
+# import pytesseract
+# from pytesseract import Output
 
 import postprocess
 
 
 ocr_instance = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=True)
+
+torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
 detection_model = torch.hub.load('ultralytics/yolov5', 'custom', 'weights/detection_wts.pt', force_reload=True)
+
+torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
 structure_model = torch.hub.load('ultralytics/yolov5', 'custom', 'weights/structure_wts.pt', force_reload=True)
+
 imgsz = 640
 
 detection_class_names = ['table', 'table rotated']
@@ -285,36 +293,36 @@ def visualize_cells(pil_img, cells):
     return PIL.Image.open(img_buf)
 
 
-def pytess(cell_pil_img):
-    return ' '.join(pytesseract.image_to_data(cell_pil_img, output_type=Output.DICT, config='-c tessedit_char_blacklist=œ˜â€œï¬â™Ã©œ¢!|”?«“¥ --tessdata-dir tessdata --oem 3 --psm 6')['text']).strip()
+# def pytess(cell_pil_img):
+#     return ' '.join(pytesseract.image_to_data(cell_pil_img, output_type=Output.DICT, config='-c tessedit_char_blacklist=œ˜â€œï¬â™Ã©œ¢!|”?«“¥ --tessdata-dir tessdata --oem 3 --psm 6')['text']).strip()
 
 
-def resize(pil_img, size=1800):
-    length_x, width_y = pil_img.size
-    factor = max(1, size / length_x)
-    size = int(factor * length_x), int(factor * width_y)
-    pil_img = pil_img.resize(size, PIL.Image.ANTIALIAS)
-    return pil_img, factor
+# def resize(pil_img, size=1800):
+#     length_x, width_y = pil_img.size
+#     factor = max(1, size / length_x)
+#     size = int(factor * length_x), int(factor * width_y)
+#     pil_img = pil_img.resize(size, PIL.Image.ANTIALIAS)
+#     return pil_img, factor
 
 
-def image_smoothening(img):
-    ret1, th1 = cv2.threshold(img, 180, 255, cv2.THRESH_BINARY)
-    ret2, th2 = cv2.threshold(th1, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-    blur = cv2.GaussianBlur(th2, (1, 1), 0)
-    ret3, th3 = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-    return th3
+# def image_smoothening(img):
+#     ret1, th1 = cv2.threshold(img, 180, 255, cv2.THRESH_BINARY)
+#     ret2, th2 = cv2.threshold(th1, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+#     blur = cv2.GaussianBlur(th2, (1, 1), 0)
+#     ret3, th3 = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+#     return th3
 
 
-def remove_noise_and_smooth(pil_img):
-    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
-    filtered = cv2.adaptiveThreshold(img.astype(np.uint8), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 41, 3)
-    kernel = np.ones((1, 1), np.uint8)
-    opening = cv2.morphologyEx(filtered, cv2.MORPH_OPEN, kernel)
-    closing = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel)
-    img = image_smoothening(img)
-    or_image = cv2.bitwise_or(img, closing)
-    pil_img = PIL.Image.fromarray(or_image)
-    return pil_img
+# def remove_noise_and_smooth(pil_img):
+#     img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
+#     filtered = cv2.adaptiveThreshold(img.astype(np.uint8), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 41, 3)
+#     kernel = np.ones((1, 1), np.uint8)
+#     opening = cv2.morphologyEx(filtered, cv2.MORPH_OPEN, kernel)
+#     closing = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel)
+#     img = image_smoothening(img)
+#     or_image = cv2.bitwise_or(img, closing)
+#     pil_img = PIL.Image.fromarray(or_image)
+#     return pil_img
 
 
 # def extract_text_from_cells(pil_img, cells):
@@ -438,6 +446,53 @@ def cells_to_html(cells):
 # return html_code
 
 
+def cells_to_excel(cells, file_path):
+
+    def int2xlsx(i):
+        if i < 26:
+            return chr(i + 65)
+        return f'{chr(i // 26 + 64)}{chr(i % 26 + 65)}'
+
+    cells = sorted(cells, key=lambda k: min(k['column_nums']))
+    cells = sorted(cells, key=lambda k: min(k['row_nums']))
+
+    workbook = xlsxwriter.Workbook(file_path)
+
+    cell_format = workbook.add_format(
+        {
+            'align': 'center',
+            'valign': 'vcenter',
+        }
+    )
+
+    worksheet = workbook.add_worksheet(name='Table')
+
+    table_start_index = 0
+
+    for cell in cells:
+        start_row = min(cell['row_nums'])
+        end_row = max(cell['row_nums'])
+        start_col = min(cell['column_nums'])
+        end_col = max(cell['column_nums'])
+        if start_row == end_row and start_col == end_col:
+            worksheet.write(
+                table_start_index + start_row,
+                start_col,
+                cell['cell_text'],
+                cell_format,
+            )
+        else:
+            if start_col == end_col and start_row == end_row:
+                excel_index = f'{int2xlsx(table_start_index + start_col)}{table_start_index + start_row + 1}'
+            else:
+                excel_index = f'{int2xlsx(table_start_index + start_col)}{table_start_index + start_row + 1}:{int2xlsx(table_start_index + end_col)}{table_start_index + end_row + 1}'
+            worksheet.merge_range(
+                excel_index, cell['cell_text'], cell_format
+            )
+
+    workbook.close()
+
+
 def main():
 
     st.set_page_config(layout='wide')
@@ -453,7 +508,7 @@ def main():
 
     else:
        tabs = st.tabs(
-            ['Table Detection', 'Table Structure Recognition']
+            ['Table Detection', 'Table Structure Recognition', 'Extracted Table(s)']
        )
 
        print(filename)
@@ -462,24 +517,29 @@ def main():
        detection_result = table_detection(pil_img)
        crop_images, vis_det_img = crop_image(pil_img, detection_result)
 
+        all_cells = []
+
        with tabs[0]:
+            st.header('Table Detection')
            st.image(vis_det_img)
 
        with tabs[1]:
-            str_cols = st.columns((len(crop_images), ) * 5)
+            st.header('Table Structure Recognition')
+
+            str_cols = st.columns((len(crop_images), ) * 4)
            str_cols[0].subheader('Table image')
            str_cols[1].subheader('OCR result')
            str_cols[2].subheader('Structure result')
            str_cols[3].subheader('Cells result')
-            str_cols[4].subheader('CSV result')
 
            for i, img in enumerate(crop_images):
                ocr_result = ocr(img)
                structure_result = table_structure(img)
                table_structures, cells, confidence_score = convert_stucture(ocr_result, img, structure_result)
                cells = extract_text_from_cells(cells)
+                all_cells.append(cells)
                html_result = cells_to_html(cells)
-                df, csv_result = cells_to_csv(cells)
+                #df, csv_result = cells_to_csv(cells)
                #print(df)
 
                vis_ocr_img = visualize_ocr(img, ocr_result)
@@ -490,12 +550,30 @@ def main():
                str_cols[1].image(vis_ocr_img)
                str_cols[2].image(vis_str_img)
                str_cols[3].image(vis_cells_img)
-                #str_cols[4].dataframe(df)
-                str_cols[4].download_button('Download table', csv_result, f'table-{i}.csv', 'text/csv', key=f'download-csv-{i}')
 
                st.write('\n')
                st.markdown(html_result, unsafe_allow_html=True)
 
+        with tabs[2]:
+            st.header('Extracted Table(s)')
+            for idx, col in enumerate(st.columns(len(all_cells))):
+                with col:
+                    if len(all_cells) > 1:
+                        st.header(f'Table {idx + 1}')
+
+                    with TemporaryDirectory() as temp_dir_path:
+                        df = None
+                        xlsx_path = os.path.join(temp_dir_path, f'debug_{idx}.xlsx')
+                        cells_to_excel(all_cells[idx], xlsx_path)
+                        with open(xlsx_path, 'rb') as ref:
+                            df = pd.read_excel(ref)
+                            st.dataframe(df)
+                            st.download_button(
+                                'Download Excel File',
+                                ref,
+                                file_name=f'output_{idx}.xlsx',
+                            )
+
 
 if __name__ == '__main__':
     main()
requirements.txt CHANGED
@@ -76,3 +76,4 @@ setuptools>=65.5.1 # Snyk vulnerability fix
76
  # Other
77
  pytesseract==0.3.10
78
  # beautifulsoup4==4.11.1
 
 
76
  # Other
77
  pytesseract==0.3.10
78
  # beautifulsoup4==4.11.1
79
+ xlsxwriter
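Likewise, a minimal sketch of the display-and-download flow used in the new "Extracted Table(s)" tab, not the app's exact code: write the workbook into a temporary directory, read it back with pandas for `st.dataframe`, and hand the raw bytes to `st.download_button`. It assumes only streamlit, pandas (with an Excel reader engine such as openpyxl, which this diff does not add), and xlsxwriter; the stand-in workbook contents and file names are hypothetical.

```python
import os
from tempfile import TemporaryDirectory

import pandas as pd
import streamlit as st
import xlsxwriter

with TemporaryDirectory() as temp_dir_path:
    xlsx_path = os.path.join(temp_dir_path, 'table_0.xlsx')  # hypothetical name

    # Stand-in for cells_to_excel(): a one-cell workbook keeps the sketch self-contained.
    workbook = xlsxwriter.Workbook(xlsx_path)
    workbook.add_worksheet(name='Table').write(0, 0, 'example')
    workbook.close()

    # Preview the extracted table in the app.
    df = pd.read_excel(xlsx_path)
    st.dataframe(df)

    # Read the bytes up front so the download outlives the temporary directory.
    with open(xlsx_path, 'rb') as ref:
        st.download_button('Download Excel File', ref.read(), file_name='output_0.xlsx')
```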