Spaces:

not-lain
/

utils

Running

not-lain committed 10 days ago

Commit

0bce450

•

1 Parent(s): c982cf8

extract from docx

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 from pdf2image import convert_from_path
 import pdfplumber
 def convert_pdf_to_image(file):
     images = convert_from_path(file)
@@ -10,11 +11,19 @@ def extract_text_from_pdf(file):
     text = ""
     with pdfplumber.open(file) as pdf:
         for page in pdf.pages:
-            text += page.extract_text()
     return text
 pdf_to_img = gr.Interface(convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img")
 pdf_to_text = gr.Interface(extract_text_from_pdf, gr.File(), gr.Textbox(placeholder="Extracted text will appear here"), api_name="pdf_to_text")
-demo = gr.TabbedInterface([pdf_to_img, pdf_to_text], ["PDF to Image", "Extract Text"])
 demo.launch(debug=True)

 import gradio as gr
 from pdf2image import convert_from_path
 import pdfplumber
+from docx import Document
 def convert_pdf_to_image(file):
     images = convert_from_path(file)
     text = ""
     with pdfplumber.open(file) as pdf:
         for page in pdf.pages:
+            text += page.extract_text() + "\n"
+    return text
+def extract_text_from_docx(file):
+    text = ""
+    doc = Document(file.name)
+    for paragraph in doc.paragraphs:
+        text += paragraph.text + "\n"
     return text
 pdf_to_img = gr.Interface(convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img")
 pdf_to_text = gr.Interface(extract_text_from_pdf, gr.File(), gr.Textbox(placeholder="Extracted text will appear here"), api_name="pdf_to_text")
+docx_to_text = gr.Interface(extract_text_from_docx, gr.File(), gr.Textbox(placeholder="Extracted text from DOCX will appear here"), api_name="docx_to_text")
+demo = gr.TabbedInterface([pdf_to_img, pdf_to_text, docx_to_text], ["PDF to Image", "Extract PDF Text", "Extract DOCX Text"])
 demo.launch(debug=True)

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 pdf2image
 gradio
-pdfplumber

 pdf2image
 gradio
+pdfplumber
+python-docx