notabaka committed on
Commit
ff9f02f
1 Parent(s): 6ed8967
Files changed (2) hide show
  1. app.py +3 -21
  2. requirements.txt +1 -3
app.py CHANGED
@@ -2,12 +2,8 @@ import streamlit as st
2
  import torch
3
  import torch.nn.functional as F
4
  from torch import Tensor
5
- from transformers import AutoTokenizer, AutoModel
6
- import tempfile
7
  import textract
8
- import docx2txt
9
- import pdfplumber
10
- import io
11
  import os
12
 
13
  def last_token_pool(last_hidden_states: Tensor,
@@ -46,23 +42,9 @@ click = st.button("Search")
46
 
47
 
48
 
49
- def extract_text(doc):
50
- if doc.type == 'text/plain':
51
- return doc.read().decode('utf-8')
52
-
53
- if doc.name.endswith(".pdf"):
54
- docPath = save_upload(doc)
55
-
56
-
57
- with pdfplumber.open(docPath) as pdf:
58
- pages = [page.extract_text() for page in pdf.pages]
59
 
60
- return "\n".join(pages)
61
-
62
-
63
- if doc.name.endswith('.docx'):
64
- raw_text = doc.read()
65
- return docx2txt.process(raw_text)
66
 
67
  return None
68
 
 
2
  import torch
3
  import torch.nn.functional as F
4
  from torch import Tensor
5
+
 
6
  import textract
 
 
 
7
  import os
8
 
9
  def last_token_pool(last_hidden_states: Tensor,
 
42
 
43
 
44
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ def extract_text(doc):
47
+ return textract.process(doc).decode('utf-8')
 
 
 
 
48
 
49
  return None
50
 
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
  torch
2
  transformers
3
- textract
4
- docx2txt
5
- pdfplumber
 
1
  torch
2
  transformers
3
+ textract