not-lain committed on
Commit
5f1077a
1 Parent(s): 0bce450

docx and doc

Browse files
Files changed (2) hide show
  1. app.py +59 -8
  2. packages.txt +2 -1
app.py CHANGED
@@ -1,12 +1,16 @@
1
  import gradio as gr
2
  from pdf2image import convert_from_path
3
  import pdfplumber
4
- from docx import Document
 
 
 
5
 
6
  def convert_pdf_to_image(file):
7
  images = convert_from_path(file)
8
  return images
9
 
 
10
  def extract_text_from_pdf(file):
11
  text = ""
12
  with pdfplumber.open(file) as pdf:
@@ -14,16 +18,63 @@ def extract_text_from_pdf(file):
14
  text += page.extract_text() + "\n"
15
  return text
16
 
 
17
  def extract_text_from_docx(file):
18
  text = ""
19
- doc = Document(file.name)
20
  for paragraph in doc.paragraphs:
21
- text += paragraph.text + "\n"
22
  return text
23
 
24
- pdf_to_img = gr.Interface(convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img")
25
- pdf_to_text = gr.Interface(extract_text_from_pdf, gr.File(), gr.Textbox(placeholder="Extracted text will appear here"), api_name="pdf_to_text")
26
- docx_to_text = gr.Interface(extract_text_from_docx, gr.File(), gr.Textbox(placeholder="Extracted text from DOCX will appear here"), api_name="docx_to_text")
27
 
28
- demo = gr.TabbedInterface([pdf_to_img, pdf_to_text, docx_to_text], ["PDF to Image", "Extract PDF Text", "Extract DOCX Text"])
29
- demo.launch(debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from pdf2image import convert_from_path
3
  import pdfplumber
4
+ from docx import Document
5
+ import subprocess
6
+ import os
7
+
8
 
9
  def convert_pdf_to_image(file):
10
  images = convert_from_path(file)
11
  return images
12
 
13
+
14
  def extract_text_from_pdf(file):
15
  text = ""
16
  with pdfplumber.open(file) as pdf:
 
18
  text += page.extract_text() + "\n"
19
  return text
20
 
21
+
22
  def extract_text_from_docx(file):
23
  text = ""
24
+ doc = Document(file.name)
25
  for paragraph in doc.paragraphs:
26
+ text += paragraph.text + "\n"
27
  return text
28
 
 
 
 
29
 
30
def convert_doc_to_text(doc_path):
    """Convert a legacy ``.doc`` file to plain text using ``unoconv``.

    Runs ``unoconv --format txt <doc_path>`` and reads back the ``.txt`` file
    that is expected to appear next to the input (same path, extension
    swapped). The intermediate text file is always removed afterwards.

    Args:
        doc_path: Filesystem path of the document to convert.

    Returns:
        The extracted text with any leading byte-order mark stripped, or an
        empty string if ``unoconv`` exits with a non-zero status.

    Raises:
        FileNotFoundError: If ``unoconv`` is not installed or it did not
            produce the expected output file.
    """
    try:
        subprocess.run(
            ["unoconv", "--format", "txt", doc_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error converting {doc_path} to text: {e}")
        return ""

    # Swap only the final extension. A plain str.replace(".doc", ".txt") would
    # also rewrite ".doc" occurring elsewhere in the path (for example in a
    # directory called "my.docs"), pointing at a file that does not exist.
    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"
    try:
        # NOTE(review): the output is assumed to be UTF-8 (the BOM stripping
        # below suggests so) - confirm against the deployed unoconv version.
        with open(txt_file_path, "r", encoding="utf-8") as f:
            text = f.read()
    finally:
        # Clean up the intermediate file even when reading it fails.
        if os.path.exists(txt_file_path):
            os.remove(txt_file_path)
    return text.lstrip("\ufeff")
47
+
48
+
49
def extract_text_from_doc_or_docx(file):
    """Route an uploaded Word document to the extractor for its extension.

    Names ending in ``.docx`` go to ``extract_text_from_docx`` (which receives
    the upload object), names ending in ``.doc`` go to ``convert_doc_to_text``
    (which receives the path). Any other name yields a fixed message telling
    the user which file types are accepted. Matching is case-sensitive.
    """
    filename = file.name
    if filename.endswith(".docx"):
        return extract_text_from_docx(file)
    if filename.endswith(".doc"):
        return convert_doc_to_text(filename)
    return "Unsupported file type. Please upload a .doc or .docx file."
56
+
57
+
58
# One Interface per tool; each takes a single uploaded file as its input.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_or_docx_to_text = gr.Interface(
    fn=extract_text_from_doc_or_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(
        placeholder="Extracted text from DOC or DOCX will appear here"
    ),
    api_name="doc_or_docx_to_text",
)

# Combine the three tools into one tabbed app; tab titles follow list order.
demo = gr.TabbedInterface(
    interface_list=[pdf_to_img, pdf_to_text, doc_or_docx_to_text],
    tab_names=["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text"],
)

demo.launch(debug=True)
packages.txt CHANGED
@@ -1 +1,2 @@
1
- poppler-utils
 
 
1
+ poppler-utils
2
+ unoconv