Keane Moraes committed
Commit 28e14c5 • 1 Parent(s): 232a10d

changes for topic modelling and embeddings

Files changed (4):
  1. app.py +7 -7
  2. requirements.txt +5 -0
  3. generation.py → topics.py +20 -5
  4. utils.py +13 -1
app.py CHANGED
@@ -1,6 +1,5 @@
  import streamlit as st
- from generation import Insights
-
+ from topics import Insights
 
  import time
 
@@ -10,15 +9,16 @@ st.title("Drop the second document")
  file2 = st.file_uploader("Upload a file", type=["md", "txt"], key="second")
 
  if file1 is not None and file2 is not None:
-     st.title("Contents of the first file")
-     st.title("Contents of the second file")
-
+
      st.title("Generating insights")
      with st.spinner('Generating insights...'):
          insight1 = Insights(file1.read().decode("utf-8"))
          insight2 = Insights(file2.read().decode("utf-8"))
-         st.write(insight1.text)
-         st.write(insight2.text)
          st.write(insight1.generate_topics())
          st.write(insight2.generate_topics())
+         st.write(insight1.text)
+         st.write(insight2.text)
+         embed1 = insight1.generate_embeddings()
+         embed2 = insight2.generate_embeddings()
+
      st.success('Done!')
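For reference, here is a minimal command-line sketch of the same pipeline without the Streamlit UI. The file names are hypothetical, and it assumes the `Insights` class from `topics.py` below:

```python
# Minimal non-Streamlit sketch of the app.py flow. File names are hypothetical.
from topics import Insights

# Text-mode reads return str directly, so no .decode("utf-8") is needed here.
with open("doc1.md", encoding="utf-8") as f1, open("doc2.md", encoding="utf-8") as f2:
    insight1 = Insights(f1.read())
    insight2 = Insights(f2.read())

print(insight1.generate_topics())          # KeyBERT keywords per sentence nest
print(insight2.generate_topics())
embed1 = insight1.generate_embeddings()    # sentence vectors via spaCy
embed2 = insight2.generate_embeddings()
```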
requirements.txt CHANGED
@@ -1,4 +1,9 @@
  keybert==0.7.0
+ mdforest==1.5.0
+ nltk==3.8.1
  openai==0.27.2
+ pandas==1.5.3
+ sentence_transformers==2.2.2
+ spacy==3.5.2
  streamlit==1.21.0
  transformers==4.27.2
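After `pip install -r requirements.txt`, a quick import check confirms the five new pins resolve; this is only a sketch, with the version comments mirroring the pins above:

```python
# Sanity check that the newly pinned dependencies import cleanly.
import mdforest               # 1.5.0 - markdown cleanup in Insights.__init__
import nltk                   # 3.8.1 - source of the stopwords import in utils.py
import pandas                 # 1.5.3 - imported by topics.py
import sentence_transformers  # 2.2.2 - provides the all-MiniLM-L6-v2 embedder
import spacy                  # 3.5.2 - sentence segmentation and vectors
print("topic-modelling/embedding dependencies import OK")
```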
generation.py → topics.py RENAMED
@@ -1,23 +1,38 @@
  import openai
  from utils import *
  import mdforest
+ import pandas as pd
+ import spacy
 
  class Insights:
 
-     EMBEDDING_MAX_TOKENS = 8191
+     EMBEDDING_MAX_TOKENS = 1023
 
      def __init__(self, text:str) -> None:
-         self.corpus = preprocess(mdforest.clean_markdown(text))
-         self.text = create_nest_sentences(self.corpus, self.EMBEDDING_MAX_TOKENS)
+         cleaned_text = mdforest.clean_markdown(text)
          self.keywords = []
+
+         self.corpus = preprocess(cleaned_text)
+         self.text = create_nest_sentences(self.corpus, self.EMBEDDING_MAX_TOKENS)
          self.model = load_keyword_model()
-
+         self.embedder = load_embedder()
+
+
      def generate_topics(self) -> list:
-         print("We are here for generating topics")
          for sentence in self.text:
              self.keywords = self.keywords + generate_keywords(self.model, sentence)
          return self.keywords
 
+     def generate_embeddings(self) -> list:
+         # generate embeddings for all the sentences
+         nlp = spacy.load("en_core_web_sm")
+         final_embeddings = []
+         for text in self.text:
+             print(text[0])
+             doc = nlp(text[0])
+             sentence_embeddings = [sent.vector for sent in doc.sents]
+             final_embeddings += sentence_embeddings
+         return final_embeddings
utils.py CHANGED
@@ -3,6 +3,13 @@ from keybert import KeyBERT
  from nltk.corpus import stopwords
  from transformers import AutoTokenizer
  import re
+ import spacy
+ from sentence_transformers import SentenceTransformer
+
+ # @st.cache_data
+ # def load_nlp():
+ #     nlp =
+
 
  @st.cache_data
  def load_autotoken():
@@ -14,7 +21,12 @@ def load_keyword_model():
      kw_model = KeyBERT()
      return kw_model
 
- def create_nest_sentences(document:str, token_max_length = 8191):
+ @st.cache_data
+ def load_embedder():
+     embedder = SentenceTransformer('all-MiniLM-L6-v2')
+     return embedder
+
+ def create_nest_sentences(document:str, token_max_length = 1023):
      nested = []
      sent = []
      length = 0
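The diff cuts off after the first three lines of `create_nest_sentences`; a helper with this signature usually buckets sentences under a token budget. A sketch under that assumption follows; the real body is not shown here, and the checkpoint name is a placeholder since the model behind `load_autotoken()` does not appear in this diff:

```python
# Sketch of the usual token-bucketing pattern; treat the body as an assumption.
import nltk
from transformers import AutoTokenizer

nltk.download("punkt", quiet=True)                 # sentence-tokenizer data
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint

def create_nest_sentences(document: str, token_max_length=1023):
    nested, sent, length = [], [], 0
    for sentence in nltk.sent_tokenize(document):
        n_tokens = len(tokenizer(sentence)["input_ids"])
        if length + n_tokens > token_max_length and sent:
            nested.append(sent)        # flush the batch before the budget overflows
            sent, length = [], 0
        sent.append(sentence)
        length += n_tokens
    if sent:
        nested.append(sent)            # keep the final partial batch
    return nested
```

Under this reading, the drop from 8191 to 1023 simply tightens each nest to fit a ~1024-token context window.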