ugaray96 committed
Commit
d36f6ee
2 Parent(s): 3900908 1b47089

Merge pull request #2 from ugm2/feature/add_url_indexing

interface/components.py CHANGED
@@ -1,5 +1,5 @@
 import streamlit as st
-from interface.utils import get_pipelines
+from interface.utils import get_pipelines, extract_text_from_url
 from interface.draw_pipelines import get_pipeline_graph
 
 
@@ -64,3 +64,23 @@ def component_text_input(container):
         {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(texts)
     ]
     return corpus
+
+
+def component_article_url(container):
+    """Draw the Article URL widget"""
+    with container:
+        urls = []
+        doc_id = 1
+        with st.expander("Enter URLs"):
+            while True:
+                url = st.text_input(f"URL {doc_id}", key=doc_id)
+                if url != "":
+                    urls.append({"text": extract_text_from_url(url)})
+                    doc_id += 1
+                    st.markdown("---")
+                else:
+                    break
+    corpus = [
+        {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
+    ]
+    return corpus
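Both component_text_input and the new component_article_url return the corpus in the same shape, so the indexing page can consume either one without special-casing. A minimal sketch of that structure follows; the text values are placeholders for illustration, not part of the change:

    # Illustrative only: shape of the corpus returned by either input component.
    # Each document carries its extracted text plus a positional id from enumerate().
    example_corpus = [
        {"text": "Body of the first article...", "id": 0},
        {"text": "Body of the second article...", "id": 1},
    ]

The widget leans on Streamlit's rerun model: each non-empty st.text_input makes the loop render one more blank URL field, and the first empty field breaks the loop.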
interface/pages.py CHANGED
@@ -5,6 +5,7 @@ from interface.components import (
     component_show_pipeline,
     component_show_search_result,
     component_text_input,
+    component_article_url,
 )
 
 
@@ -57,6 +58,7 @@ def page_index(container):
 
     input_funcs = {
         "Raw Text": (component_text_input, "card-text"),
+        "URL": (component_article_url, "card-link"),
     }
     selected_input = option_menu(
         "Input Text",
interface/utils.py CHANGED
@@ -1,5 +1,7 @@
 import core.pipelines as pipelines_functions
 from inspect import getmembers, isfunction
+from newspaper import Article
+import streamlit as st
 
 
 def get_pipelines():
@@ -10,3 +12,12 @@ def get_pipelines():
         " ".join([n.capitalize() for n in name.split("_")]) for name in pipeline_names
     ]
     return pipeline_names, pipeline_funcs
+
+
+@st.experimental_memo
+def extract_text_from_url(url: str):
+    article = Article(url)
+    article.download()
+    article.parse()
+
+    return article.text
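extract_text_from_url wraps newspaper3k's standard download/parse flow, and @st.experimental_memo (the caching decorator available in Streamlit 1.12) memoizes the result per URL so app reruns do not re-fetch the same article. A standalone usage sketch; the URL below is a placeholder, not taken from the repository:

    from newspaper import Article

    def extract_text_from_url(url: str) -> str:
        # Fetch the page and let newspaper3k strip navigation and boilerplate,
        # keeping only the article body text.
        article = Article(url)
        article.download()
        article.parse()
        return article.text

    # Placeholder URL for illustration; any reachable article page would do.
    print(extract_text_from_url("https://example.com/some-article")[:200])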
requirements.txt CHANGED
@@ -1,5 +1,6 @@
-streamlit
-streamlit_option_menu
-farm-haystack
-black
-plotly
+streamlit==1.12.2
+streamlit_option_menu==0.3.2
+farm-haystack==1.8.0
+black==22.8.0
+plotly==5.10.0
+newspaper3k==0.2.8