notabaka committed on
Commit
60eae40
1 Parent(s): 79ecc72
Files changed (1) hide show
  1. app.py +19 -17
app.py CHANGED
@@ -28,6 +28,25 @@ docs = st.sidebar.file_uploader("Upload documents", accept_multiple_files=True,
28
  query = st.text_input("Enter search query")
29
  click = st.button("Search")
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  if click and query:
32
  doc_contents = []
33
 
@@ -46,22 +65,5 @@ if click and query:
46
  for doc, score in ranked_docs:
47
  st.write(f"{doc.name} (score: {score:.2f})")
48
 
49
- def extract_text(doc):
50
- if doc.type == 'text/plain':
51
- return doc.getvalue().decode("utf-8")
52
-
53
- if doc.type == "application/pdf":
54
- with pdfplumber.open(doc) as pdf:
55
- pages = [page.extract_text() for page in pdf.pages]
56
- return "\n".join(pages)
57
-
58
- if doc.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
59
- return docx2txt.process(doc)
60
-
61
- if doc.name.endswith(".xlsx"):
62
- text = textract.process(doc)
63
- return text.decode("utf-8")
64
 
65
- return None
66
-
67
 
 
28
  query = st.text_input("Enter search query")
29
  click = st.button("Search")
30
 
31
+ def extract_text(doc):
32
+ if doc.type == 'text/plain':
33
+ return doc.getvalue().decode("utf-8")
34
+
35
+ if doc.type == "application/pdf":
36
+ with pdfplumber.open(doc) as pdf:
37
+ pages = [page.extract_text() for page in pdf.pages]
38
+ return "\n".join(pages)
39
+
40
+ if doc.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
41
+ return docx2txt.process(doc)
42
+
43
+ if doc.name.endswith(".xlsx"):
44
+ text = textract.process(doc)
45
+ return text.decode("utf-8")
46
+
47
+ return None
48
+
49
+
50
  if click and query:
51
  doc_contents = []
52
 
 
65
  for doc, score in ranked_docs:
66
  st.write(f"{doc.name} (score: {score:.2f})")
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
 
 
69