ugmSorcero committed on
Commit
4107940
1 Parent(s): d36f6ee

Adds file support (txt, pdf, csv)

Browse files
interface/components.py CHANGED
@@ -1,5 +1,5 @@
1
  import streamlit as st
2
- from interface.utils import get_pipelines, extract_text_from_url
3
  from interface.draw_pipelines import get_pipeline_graph
4
 
5
 
@@ -84,3 +84,27 @@ def component_article_url(container):
84
  {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
85
  ]
86
  return corpus
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file
3
  from interface.draw_pipelines import get_pipeline_graph
4
 
5
 
 
84
  {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
85
  ]
86
  return corpus
87
+
88
+
89
def component_file_input(container):
    """Draw the file-upload widget and return the text of the uploaded files.

    Inside ``container`` an "Enter Files" expander is rendered holding a chain
    of file uploaders: every time an uploader holds a file whose text could be
    extracted, a separator and one more uploader are drawn so that another
    file can be added.

    Args:
        container: Streamlit container, used as a context manager, in which
            the widget is drawn.

    Returns:
        A list of ``{"text": <extracted text>, "id": <index>}`` dicts, one per
        successfully extracted file, with ids counted from 0 in upload order.
    """
    with container:
        texts = []
        # Widget keys start at 1 and advance once per accepted file so every
        # uploader drawn during this run gets a distinct key.
        uploader_key = 1
        with st.expander("Enter Files"):
            while True:
                file = st.file_uploader(
                    "Upload a .txt, .pdf, .csv file", key=uploader_key
                )
                if file is None:
                    # The newest uploader is still empty: nothing more to read.
                    break
                extracted_text = extract_text_from_file(file)
                if extracted_text is None:
                    # Unsupported file type: stop offering further uploaders.
                    break
                texts.append(extracted_text)
                uploader_key += 1
                st.markdown("---")
        return [{"text": text, "id": doc_id} for doc_id, text in enumerate(texts)]
interface/pages.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  from streamlit_option_menu import option_menu
3
  from core.search_index import index, search
4
  from interface.components import (
 
5
  component_show_pipeline,
6
  component_show_search_result,
7
  component_text_input,
@@ -59,6 +60,7 @@ def page_index(container):
59
  input_funcs = {
60
  "Raw Text": (component_text_input, "card-text"),
61
  "URL": (component_article_url, "card-link"),
 
62
  }
63
  selected_input = option_menu(
64
  "Input Text",
 
2
  from streamlit_option_menu import option_menu
3
  from core.search_index import index, search
4
  from interface.components import (
5
+ component_file_input,
6
  component_show_pipeline,
7
  component_show_search_result,
8
  component_text_input,
 
60
  input_funcs = {
61
  "Raw Text": (component_text_input, "card-text"),
62
  "URL": (component_article_url, "card-link"),
63
+ "File": (component_file_input, "card-file"),
64
  }
65
  selected_input = option_menu(
66
  "Input Text",
interface/utils.py CHANGED
@@ -1,7 +1,10 @@
 
1
  import core.pipelines as pipelines_functions
2
  from inspect import getmembers, isfunction
3
  from newspaper import Article
 
4
  import streamlit as st
 
5
 
6
 
7
  def get_pipelines():
@@ -21,3 +24,60 @@ def extract_text_from_url(url: str):
21
  article.parse()
22
 
23
  return article.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import StringIO
2
  import core.pipelines as pipelines_functions
3
  from inspect import getmembers, isfunction
4
  from newspaper import Article
5
+ from PyPDF2 import PdfFileReader
6
  import streamlit as st
7
+ import pandas as pd
8
 
9
 
10
  def get_pipelines():
 
24
  article.parse()
25
 
26
  return article.text
27
+
28
+
29
def extract_text_from_file(file):
    """Extract plain text from an uploaded .txt, .pdf or .csv file.

    The parser is chosen from ``file.type``, which is compared against the
    MIME types ``text/plain``, ``application/pdf`` and ``text/csv``.

    Args:
        file: Uploaded file object. It must expose a ``type`` attribute and,
            for text files, ``getvalue()`` returning bytes; for PDF and CSV
            it is handed directly to the respective reader.
            NOTE(review): presumably a Streamlit ``UploadedFile`` as returned
            by ``st.file_uploader`` - confirm against the caller.

    Returns:
        The extracted text as a string. For CSV input every string value is
        prefixed with a single space, so non-empty output starts with a space.
        Returns ``None``, after showing a Streamlit warning, when the file
        type is not supported.
    """
    # read text file
    if file.type == "text/plain":
        # Bytes that are not valid UTF-8 become U+FFFD instead of raising, so
        # a file saved in another encoding does not abort the whole upload.
        return file.getvalue().decode("utf-8", errors="replace")

    # read pdf file
    if file.type == "application/pdf":
        # NOTE(review): PdfFileReader/numPages/getPage/extractText are the
        # camelCase PyPDF2 names - re-check them if the pinned PyPDF2 version
        # is ever bumped.
        pdf_reader = PdfFileReader(file)
        page_texts = []
        for page_number in range(pdf_reader.numPages):
            try:
                page_texts.append(pdf_reader.getPage(page_number).extractText())
            except Exception:
                # Best effort: a page that cannot be parsed is skipped so the
                # remaining pages are still returned.
                continue
        return "".join(page_texts)

    # read csv file
    if file.type == "text/csv":
        frame = pd.read_csv(file)
        # get columns of type string
        string_columns = frame.select_dtypes(include=["object"]).columns
        # get data from columns and join it together
        pieces = []
        for row in frame[string_columns].values.tolist():
            for cell in row:
                # remove NaNs (a missing value stringifies to "nan")
                if str(cell) == "nan":
                    continue
                if isinstance(cell, str):
                    pieces.append(cell)
                elif isinstance(cell, list):
                    try:
                        pieces.append(" ".join(cell))
                    except TypeError:
                        # List holds non-string items: skip this cell.
                        continue
                # Any other value type is ignored.
        # Keep the established output format: one leading space per value.
        return "".join(f" {piece}" for piece in pieces)

    st.warning(f"File type {file.type} not supported")
    return None
83
+
requirements.txt CHANGED
@@ -3,4 +3,5 @@ streamlit_option_menu==0.3.2
3
  farm-haystack==1.8.0
4
  black==22.8.0
5
  plotly==5.10.0
6
- newspaper3k==0.2.8
 
 
3
  farm-haystack==1.8.0
4
  black==22.8.0
5
  plotly==5.10.0
6
+ newspaper3k==0.2.8
7
+ PyPDF2==2.10.7