kkk
Browse files- app.py +15 -21
- requirements.txt +1 -2
app.py
CHANGED
@@ -29,29 +29,23 @@ docs = st.sidebar.file_uploader("Upload documents", accept_multiple_files=True,
|
|
29 |
query = st.text_input("Enter search query")
|
30 |
click = st.button("Search")
|
31 |
|
|
|
|
|
|
|
32 |
def extract_text(doc):
|
33 |
-
|
34 |
-
|
35 |
-
fp.write(doc.read())
|
36 |
-
|
37 |
-
if doc.type == 'text/plain':
|
38 |
-
fp.seek(0)
|
39 |
-
return fp.read().decode("utf-8")
|
40 |
-
|
41 |
-
# Rest of logic
|
42 |
-
if doc.name.endswith(".pdf"):
|
43 |
-
fp.seek(0)
|
44 |
-
with pdfplumber.open(fp) as pdf:
|
45 |
-
pages = [page.extract_text() for page in pdf.pages]
|
46 |
-
return "\n".join(pages)
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
55 |
|
56 |
if click and query:
|
57 |
doc_contents = []
|
|
|
29 |
query = st.text_input("Enter search query")
|
30 |
click = st.button("Search")
|
31 |
|
32 |
+
import pdfplumber
|
33 |
+
import docx2txt
|
34 |
+
|
35 |
def extract_text(doc):
    """Return the text content of an uploaded document.

    Args:
        doc: An uploaded file object exposing ``type`` (MIME type),
            ``name`` (file name) and ``read()``. Presumably the object
            returned by ``st.file_uploader`` -- confirm against the caller.

    Returns:
        The extracted text as ``str`` for plain-text, PDF and DOCX uploads,
        or ``None`` when the document kind is not supported.
    """
    if doc.type == 'text/plain':
        return doc.read().decode('utf-8')

    # Compare extensions case-insensitively so "REPORT.PDF" is handled too.
    filename = doc.name.lower()

    if filename.endswith('.pdf'):
        with pdfplumber.open(doc) as pdf:
            # A page without a text layer may yield None in some pdfplumber
            # versions; substitute '' so the join below cannot raise.
            pages = [page.extract_text() or '' for page in pdf.pages]
        return '\n'.join(pages)

    if filename.endswith('.docx'):
        # docx2txt opens its argument as a zip archive, which requires a
        # path or a seekable file-like object -- raw bytes from read()
        # would fail, so hand over the file object itself.
        return docx2txt.process(doc)

    return None
|
49 |
|
50 |
if click and query:
|
51 |
doc_contents = []
|
requirements.txt
CHANGED
@@ -2,5 +2,4 @@ torch
|
|
2 |
transformers
|
3 |
textract
|
4 |
docx2txt
|
5 |
-
pdfplumber
|
6 |
-
tempfile
|
|
|
2 |
transformers
|
3 |
textract
|
4 |
docx2txt
|
5 |
+
pdfplumber
|
|