notabaka committed on
Commit
f52a963
1 Parent(s): faa2e50
Files changed (1) hide show
  1. app.py +16 -6
app.py CHANGED
@@ -26,24 +26,34 @@ st.title("Text Similarity Model")
26
 
27
  task = 'Given a web search query, retrieve relevant passages that answer the query'
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  docs = st.sidebar.file_uploader("Upload documents", accept_multiple_files=True, type=['txt','pdf','xlsx','docx'])
30
  query = st.text_input("Enter search query")
31
  click = st.button("Search")
32
 
33
- import pdfplumber
34
- import docx2txt
35
 
36
  def extract_text(doc):
37
  if doc.type == 'text/plain':
38
  return doc.read().decode('utf-8')
39
 
40
  if doc.name.endswith(".pdf"):
41
- raw = doc.read()
42
 
43
- # Handle null bytes
44
- raw = raw.replace(b'\x00', b'')
45
 
46
- with pdfplumber.open(raw) as pdf:
47
  pages = [page.extract_text() for page in pdf.pages]
48
 
49
  return "\n".join(pages)
 
26
 
27
  task = 'Given a web search query, retrieve relevant passages that answer the query'
28
 
29
+
30
# Directory (relative to the working directory) where uploaded files are
# written so that path-based readers can open them.
UPLOAD_DIR = "uploads"

# Create the directory if needed. exist_ok=True makes this a no-op when it is
# already there and avoids the check-then-create race of
# `if not os.path.exists(...): os.mkdir(...)`, where two concurrent script runs
# could both pass the check and one would then fail with FileExistsError.
os.makedirs(UPLOAD_DIR, exist_ok=True)
34
+
35
def save_upload(uploaded_file):
    """Write an uploaded file into UPLOAD_DIR and return the path on disk.

    Args:
        uploaded_file: Object exposing a ``name`` attribute and a
            ``getbuffer()`` method returning the file's bytes.

    Returns:
        The path of the written file, i.e. UPLOAD_DIR joined with the
        final component of ``uploaded_file.name``.

    Raises:
        ValueError: If the name contains no usable file component.
    """
    # The name is client-supplied: keep only its last path component so that
    # values like "../../x" or "/etc/x" cannot escape UPLOAD_DIR. Backslashes
    # are normalized first because basename() ignores them on POSIX.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload file name: {uploaded_file.name!r}")

    # Make sure the target directory exists at write time.
    os.makedirs(UPLOAD_DIR, exist_ok=True)

    filepath = os.path.join(UPLOAD_DIR, safe_name)
    with open(filepath, "wb") as f:
        f.write(uploaded_file.getbuffer())

    return filepath
41
+
42
  docs = st.sidebar.file_uploader("Upload documents", accept_multiple_files=True, type=['txt','pdf','xlsx','docx'])
43
  query = st.text_input("Enter search query")
44
  click = st.button("Search")
45
 
46
+
 
47
 
48
  def extract_text(doc):
49
  if doc.type == 'text/plain':
50
  return doc.read().decode('utf-8')
51
 
52
  if doc.name.endswith(".pdf"):
53
+ docPath = save_upload(doc)
54
 
 
 
55
 
56
+ with pdfplumber.open(docPath) as pdf:
57
  pages = [page.extract_text() for page in pdf.pages]
58
 
59
  return "\n".join(pages)