Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
|
|
6 |
import os
|
7 |
from threading import Thread
|
8 |
|
9 |
-
|
10 |
import docx
|
11 |
from pptx import Presentation
|
12 |
|
@@ -56,11 +56,11 @@ def extract_text(path):
|
|
56 |
return open(path, 'r').read()
|
57 |
|
58 |
def extract_pdf(path):
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
return
|
64 |
|
65 |
def extract_docx(path):
|
66 |
doc = docx.Document(path)
|
@@ -68,6 +68,7 @@ def extract_docx(path):
|
|
68 |
for paragraph in doc.paragraphs:
|
69 |
data.append(paragraph.text)
|
70 |
content = '\n\n'.join(data)
|
|
|
71 |
|
72 |
def extract_pptx(path):
|
73 |
prs = Presentation(path)
|
@@ -91,8 +92,8 @@ def mode_load(path):
|
|
91 |
else:
|
92 |
content = extract_text(path)
|
93 |
choice = "doc"
|
94 |
-
print(content)
|
95 |
-
return choice, content
|
96 |
elif file_type in ["png", "jpg", "jpeg", "bmp", "tiff", "webp"]:
|
97 |
content = Image.open(path).convert('RGB')
|
98 |
choice = "image"
|
|
|
6 |
import os
|
7 |
from threading import Thread
|
8 |
|
9 |
+
import fitz
|
10 |
import docx
|
11 |
from pptx import Presentation
|
12 |
|
|
|
56 |
return open(path, 'r').read()
|
57 |
|
58 |
def extract_pdf(path):
|
59 |
+
doc = fitz.open(path)
|
60 |
+
text = ""
|
61 |
+
for page in doc:
|
62 |
+
text += page.get_text()
|
63 |
+
return text
|
64 |
|
65 |
def extract_docx(path):
|
66 |
doc = docx.Document(path)
|
|
|
68 |
for paragraph in doc.paragraphs:
|
69 |
data.append(paragraph.text)
|
70 |
content = '\n\n'.join(data)
|
71 |
+
return content
|
72 |
|
73 |
def extract_pptx(path):
|
74 |
prs = Presentation(path)
|
|
|
92 |
else:
|
93 |
content = extract_text(path)
|
94 |
choice = "doc"
|
95 |
+
print(content[:100])
|
96 |
+
return choice, content[:5000]
|
97 |
elif file_type in ["png", "jpg", "jpeg", "bmp", "tiff", "webp"]:
|
98 |
content = Image.open(path).convert('RGB')
|
99 |
choice = "image"
|