Keane Moraes committed on
Commit 232a10d
1 Parent(s): 4268ace

extracting keywords from texts

Files changed (4)
  1. .gitignore +2 -0
  2. app.py +17 -7
  3. generation.py +19 -2
  4. utils.py +29 -18
.gitignore ADDED
@@ -0,0 +1,2 @@
+ /__pycache__*
+ recursive-exclude * *.py[co]
app.py CHANGED
@@ -1,14 +1,24 @@
  import streamlit as st
- from .generation import Insights
+ from generation import Insights
+

  import time

  st.title("Drop the first document")
- file1 = st.file_uploader("Upload a file", type=["md", "txt"])
+ file1 = st.file_uploader("Upload a file", type=["md", "txt"], key="first")
  st.title("Drop the second document")
- file2 = st.file_uploader("Upload a file", type=["md", "txt"])
+ file2 = st.file_uploader("Upload a file", type=["md", "txt"], key="second")

- st.title("Contents of the first file")
- st.write(file1.read())
- st.title("Contents of the second file")
- st.write(file2.read())
+ if file1 is not None and file2 is not None:
+     st.title("Contents of the first file")
+     st.title("Contents of the second file")
+
+     st.title("Generating insights")
+     with st.spinner('Generating insights...'):
+         insight1 = Insights(file1.read().decode("utf-8"))
+         insight2 = Insights(file2.read().decode("utf-8"))
+         st.write(insight1.text)
+         st.write(insight2.text)
+         st.write(insight1.generate_topics())
+         st.write(insight2.generate_topics())
+     st.success('Done!')
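
The reworked app only starts generation once both uploads are present, and it decodes before constructing Insights because Streamlit's UploadedFile.read() returns bytes. A minimal sketch of that same step in isolation (the key="demo" widget and variable names are illustrative, not part of the commit):

import streamlit as st
from generation import Insights

uploaded = st.file_uploader("Upload a file", type=["md", "txt"], key="demo")
if uploaded is not None:
    raw_bytes = uploaded.read()       # UploadedFile.read() returns bytes
    text = raw_bytes.decode("utf-8")  # decode before handing text to Insights
    st.write(Insights(text).generate_topics())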
generation.py CHANGED
@@ -1,8 +1,25 @@
  import openai
+ from utils import *
+ import mdforest

  class Insights:

-     def __init__(self) -> None:
-         pass
+     EMBEDDING_MAX_TOKENS = 8191
+
+     def __init__(self, text:str) -> None:
+         self.corpus = preprocess(mdforest.clean_markdown(text))
+         self.text = create_nest_sentences(self.corpus, self.EMBEDDING_MAX_TOKENS)
+         self.keywords = []
+         self.model = load_keyword_model()
+
+     def generate_topics(self) -> list:
+         print("We are here for generating topics")
+         for sentence in self.text:
+             self.keywords = self.keywords + generate_keywords(self.model, sentence)
+         return self.keywords
+
+
+
+


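
For orientation, a hedged usage sketch of the new Insights API outside the Streamlit app; notes.md is a hypothetical input file, and it assumes mdforest, keybert, transformers, and the NLTK stopwords corpus are already installed and downloaded (the @st.cache_data helpers in utils still execute without a running Streamlit session, just without caching):

from generation import Insights

with open("notes.md", encoding="utf-8") as f:  # hypothetical input file
    insight = Insights(f.read())

print(insight.text)               # nested sentence chunks, each kept under EMBEDDING_MAX_TOKENS
print(insight.generate_topics())  # keyword phrases pooled from every chunk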
utils.py CHANGED
@@ -1,14 +1,24 @@
  import streamlit as st
  from keybert import KeyBERT
+ from nltk.corpus import stopwords
  from transformers import AutoTokenizer
  import re

+ @st.cache_data
+ def load_autotoken():
+     autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
+     return autotok
+
+ @st.cache_data
+ def load_keyword_model():
+     kw_model = KeyBERT()
+     return kw_model

- def create_nest_sentences(document:str, token_max_length = 1024):
+ def create_nest_sentences(document:str, token_max_length = 8191):
      nested = []
      sent = []
      length = 0
-     tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
+     tokenizer = load_autotoken()

      for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
          tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
@@ -25,19 +35,20 @@ def create_nest_sentences(document:str, token_max_length = 1024):
          nested.append(sent)
      return nested

- @st.cache_data
- def load_keyword_model():
-     kw_model = KeyBERT()
-     return kw_model
-
-
- def keyword_gen(kw_model, sequence:str):
-     keywords = kw_model.extract_keywords(
-         sequence,
-         keyphrase_ngram_range=(1, 2),
-         stop_words='english',
-         use_mmr=True,
-         diversity=0.5,
-         top_n=10
-     )
-     return keywords
+ def preprocess(text) -> str:
+     stop_words = set(stopwords.words("english"))
+     text = text.lower()
+     text = ''.join([c for c in text if c not in ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')')])
+     words = text.split()
+     words = [w for w in words if not w in stop_words]
+     return " ".join(words)
+
+ def generate_keywords(kw_model, document: str) -> list:
+     atomic_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 1), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
+     complex_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 2), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
+     final_topics = []
+     for extraction in atomic_extractions:
+         final_topics.append(extraction[0])
+     for extraction in complex_extractions:
+         final_topics.append(extraction[0])
+     return final_topics
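
Since the diff alone doesn't show the return shape, here is a small hedged example of the new generate_keywords helper; the sample paragraph is arbitrary, and it assumes keybert (plus the streamlit/nltk/transformers imports at the top of utils.py) is installed. KeyBERT's extract_keywords yields (phrase, score) tuples, and the helper flattens the unigram and bigram passes into one plain list of phrases:

from keybert import KeyBERT
from utils import generate_keywords

kw_model = KeyBERT()
doc = ("Supervised learning is the machine learning task of learning a function that "
       "maps an input to an output based on example input-output pairs. It infers a "
       "function from labeled training data consisting of a set of training examples.")

# extract_keywords returns pairs such as ("machine learning", 0.62); generate_keywords
# keeps only the phrases: up to ten unigrams followed by up to ten (1,2)-gram keyphrases.
print(generate_keywords(kw_model, doc))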