extract from docx
Browse files
- app.py +11 -2
- requirements.txt +2 -1
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
from pdf2image import convert_from_path
|
3 |
import pdfplumber
|
|
|
4 |
|
5 |
def convert_pdf_to_image(file):
|
6 |
images = convert_from_path(file)
|
@@ -10,11 +11,19 @@ def extract_text_from_pdf(file):
|
|
10 |
text = ""
|
11 |
with pdfplumber.open(file) as pdf:
|
12 |
for page in pdf.pages:
|
13 |
-
text += page.extract_text()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
return text
|
15 |
|
16 |
pdf_to_img = gr.Interface(convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img")
|
17 |
pdf_to_text = gr.Interface(extract_text_from_pdf, gr.File(), gr.Textbox(placeholder="Extracted text will appear here"), api_name="pdf_to_text")
|
|
|
18 |
|
19 |
-
demo = gr.TabbedInterface([pdf_to_img, pdf_to_text], ["PDF to Image", "Extract Text"])
|
20 |
demo.launch(debug=True)
|
|
|
1 |
import gradio as gr
|
2 |
from pdf2image import convert_from_path
|
3 |
import pdfplumber
|
4 |
+
from docx import Document
|
5 |
|
6 |
def convert_pdf_to_image(file):
|
7 |
images = convert_from_path(file)
|
|
|
11 |
text = ""
|
12 |
with pdfplumber.open(file) as pdf:
|
13 |
for page in pdf.pages:
|
14 |
+
text += page.extract_text() + "\n"
|
15 |
+
return text
|
16 |
+
|
17 |
+
def extract_text_from_docx(file):
    """Return the text of every paragraph in a DOCX upload, one per line.

    Args:
        file: The uploaded document. May be a filesystem path, an object
            exposing its path as ``.name`` (for example a temp-file
            wrapper), or ``None`` when nothing was uploaded.

    Returns:
        Each paragraph's text followed by a newline, concatenated in
        document order. An empty string when ``file`` is ``None``.
    """
    if file is None:
        # Nothing uploaded: return empty output instead of failing on
        # attribute access.
        return ""

    # NOTE(review): the upload may arrive as a plain path string or as a
    # wrapper carrying the path in ``.name`` (believed to depend on the
    # Gradio version/configuration - confirm). Accept both.
    source = getattr(file, "name", file)
    doc = Document(source)

    # Keep a trailing newline after every paragraph, as before.
    return "".join(f"{paragraph.text}\n" for paragraph in doc.paragraphs)
|
23 |
|
24 |
# Build one Interface per tool.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)
pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)
docx_to_text = gr.Interface(
    fn=extract_text_from_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text from DOCX will appear here"),
    api_name="docx_to_text",
)

# Tab titles are listed in the same order as the interfaces they label.
demo = gr.TabbedInterface(
    [pdf_to_img, pdf_to_text, docx_to_text],
    ["PDF to Image", "Extract PDF Text", "Extract DOCX Text"],
)
demo.launch(debug=True)
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
pdf2image
|
2 |
gradio
|
3 |
-
pdfplumber
|
|
|
|
1 |
pdf2image
|
2 |
gradio
|
3 |
+
pdfplumber
|
4 |
+
python-docx
|