not-lain committed on
Commit
0bce450
1 Parent(s): c982cf8

extract from docx

Browse files
Files changed (2) hide show
  1. app.py +11 -2
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  from pdf2image import convert_from_path
3
  import pdfplumber
 
4
 
5
  def convert_pdf_to_image(file):
6
  images = convert_from_path(file)
@@ -10,11 +11,19 @@ def extract_text_from_pdf(file):
10
  text = ""
11
  with pdfplumber.open(file) as pdf:
12
  for page in pdf.pages:
13
- text += page.extract_text()
 
 
 
 
 
 
 
14
  return text
15
 
16
  pdf_to_img = gr.Interface(convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img")
17
  pdf_to_text = gr.Interface(extract_text_from_pdf, gr.File(), gr.Textbox(placeholder="Extracted text will appear here"), api_name="pdf_to_text")
 
18
 
19
- demo = gr.TabbedInterface([pdf_to_img, pdf_to_text], ["PDF to Image", "Extract Text"])
20
  demo.launch(debug=True)
 
1
  import gradio as gr
2
  from pdf2image import convert_from_path
3
  import pdfplumber
4
+ from docx import Document
5
 
6
  def convert_pdf_to_image(file):
7
  images = convert_from_path(file)
 
11
  text = ""
12
  with pdfplumber.open(file) as pdf:
13
  for page in pdf.pages:
14
+ text += page.extract_text() + "\n"
15
+ return text
16
+
17
+ def extract_text_from_docx(file):
18
+ text = ""
19
+ doc = Document(file.name)
20
+ for paragraph in doc.paragraphs:
21
+ text += paragraph.text + "\n"
22
  return text
23
 
24
  pdf_to_img = gr.Interface(convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img")
25
  pdf_to_text = gr.Interface(extract_text_from_pdf, gr.File(), gr.Textbox(placeholder="Extracted text will appear here"), api_name="pdf_to_text")
26
+ docx_to_text = gr.Interface(extract_text_from_docx, gr.File(), gr.Textbox(placeholder="Extracted text from DOCX will appear here"), api_name="docx_to_text")
27
 
28
+ demo = gr.TabbedInterface([pdf_to_img, pdf_to_text, docx_to_text], ["PDF to Image", "Extract PDF Text", "Extract DOCX Text"])
29
  demo.launch(debug=True)
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  pdf2image
2
  gradio
3
- pdfplumber
 
 
1
  pdf2image
2
  gradio
3
+ pdfplumber
4
+ python-docx