notabaka committed on
Commit
0ee4a85
1 Parent(s): 2524123
Files changed (2) hide show
  1. app.py +15 -21
  2. requirements.txt +1 -2
app.py CHANGED
@@ -29,29 +29,23 @@ docs = st.sidebar.file_uploader("Upload documents", accept_multiple_files=True,
29
  query = st.text_input("Enter search query")
30
  click = st.button("Search")
31
 
 
 
 
32
  def extract_text(doc):
33
- # Write temp file
34
- with tempfile.TemporaryFile() as fp:
35
- fp.write(doc.read())
36
-
37
- if doc.type == 'text/plain':
38
- fp.seek(0)
39
- return fp.read().decode("utf-8")
40
-
41
- # Rest of logic
42
- if doc.name.endswith(".pdf"):
43
- fp.seek(0)
44
- with pdfplumber.open(fp) as pdf:
45
- pages = [page.extract_text() for page in pdf.pages]
46
- return "\n".join(pages)
47
 
48
- if doc.name.endswith(".docx"):
49
- fp.seek(0)
50
- return docx2txt.process(fp)
51
-
52
- # other cases
53
-
54
- return None
 
 
 
55
 
56
  if click and query:
57
  doc_contents = []
 
29
  query = st.text_input("Enter search query")
30
  click = st.button("Search")
31
 
32
+ import pdfplumber
33
+ import docx2txt
34
+
35
  def extract_text(doc):
36
+ if doc.type == 'text/plain':
37
+ return doc.read().decode('utf-8')
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ if doc.name.endswith('.pdf'):
40
+ with pdfplumber.open(doc) as pdf:
41
+ pages = [page.extract_text() for page in pdf.pages]
42
+ return '\n'.join(pages)
43
+
44
+ if doc.name.endswith('.docx'):
45
+ raw_text = doc.read()
46
+ return docx2txt.process(raw_text)
47
+
48
+ return None
49
 
50
  if click and query:
51
  doc_contents = []
requirements.txt CHANGED
@@ -2,5 +2,4 @@ torch
2
  transformers
3
  textract
4
  docx2txt
5
- pdfplumber
6
- tempfile
 
2
  transformers
3
  textract
4
  docx2txt
5
+ pdfplumber