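"""Streamlit helpers for comparing two documents: KeyBERT keyword extraction,
token-aware sentence chunking, sentence-embedding clustering with KMeans, and
per-cluster insight generation via the OpenAI Completions API."""
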
import streamlit as st
from keybert import KeyBERT
import nltk
from nltk.corpus import stopwords

from transformers import AutoTokenizer
import os, re, json
import openai
import en_core_web_sm
from sklearn.cluster import KMeans
import numpy as np
from sentence_transformers import SentenceTransformer

MODEL = 'all-MiniLM-L6-v2'  # SentenceTransformer model used for embeddings and KeyBERT
nltk.download('stopwords')

@st.cache_resource  # cache_resource, not cache_data: model/tokenizer objects should not go through the data cache's pickling
def load_autotoken():
	autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
	return autotok

@st.cache_resource
def load_keyword_model():
	sentence_model = load_model()
	kw_model = KeyBERT(model=sentence_model)
	return kw_model

@st.cache_resource
def load_model():
	embedder = SentenceTransformer(MODEL)
	return embedder 

@st.cache_resource
def load_nlp():
	nlp = en_core_web_sm.load()
	return nlp

def create_nest_sentences(document:str, token_max_length = 1023):
	"""Split a document into nested lists of sentences, each list staying under token_max_length tokens."""
	nested = []
	sent = []
	length = 0
	tokenizer = load_autotoken()

	# Heuristic sentence split: break on sentence-ending punctuation followed by a capital letter.
	for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", '.')):
		tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)["input_ids"]  # Hugging Face tokenizer ids
		length += len(tokens_in_sentence)

		if length < token_max_length:
			sent.append(sentence)
		else:
			if sent:
				nested.append(sent)
			sent = [sentence]
			length = len(tokens_in_sentence)  # restart the running count with the sentence that overflowed

	if sent:
		nested.append(sent)
	return nested
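# Illustrative example (hypothetical input, assuming both sentences fit in one chunk):
#   create_nest_sentences("First point. Second point.")
#   -> [["First point.", "Second point."]]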

def preprocess(text) -> str:
	stop_words = set(stopwords.words("english"))
	text = text.lower()
	# text = ''.join([c for c in text if c not in ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')')])
	words = text.split()
	words = [w for w in words if w not in stop_words]
	return " ".join(words)

def generate_keywords(kw_model, document: str) -> list:
	# Single-word keywords and two-word keyphrases, both selected with Max Sum Distance for diversity.
	atomic_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 1), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
	complex_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 2), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
	final_topics = []
	for extraction in atomic_extractions:
		final_topics.append(extraction[0])  # keep just the phrase, drop its score
	for extraction in complex_extractions:
		final_topics.append(extraction[0])
	return final_topics


def cluster_based_on_topics(nlp, embedder, text1:str, text2:str, num_clusters=3):
	"""Embed every sentence from both texts and group them into num_clusters topic clusters."""
	# Preprocess and tokenize the texts
	doc1 = nlp(preprocess(text1))
	doc2 = nlp(preprocess(text2))
	
	# Extract sentences from the texts  
	sentences1 = [sent.text for sent in doc1.sents]
	sentences2 = [sent.text for sent in doc2.sents]
	all_sentences = sentences1 + sentences2
				
	# Generate sentence embeddings for each sentence
	sentence_embeddings1 = embedder.encode(sentences1)
	sentence_embeddings2 = embedder.encode(sentences2)
	all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)

	# Normalize the embeddings to unit length
	# all_embeddings = all_embeddings /  np.linalg.norm(all_embeddings, axis=1, keepdims=True)

	# Cluster the combined sentence embeddings with KMeans
	clustering_model = KMeans(n_clusters=num_clusters)
	clustering_model.fit(all_embeddings)
	cluster_assignment = clustering_model.labels_

	clustered_sentences = {}
	for sentence_id, cluster_id in enumerate(cluster_assignment):
		if cluster_id not in clustered_sentences:
			clustered_sentences[cluster_id] = []
		clustered_sentences[cluster_id].append(all_sentences[sentence_id])

	return clustered_sentences


def generate_insights(topics:dict, name1:str, name2:str, text1:str, text2:str, clusters) -> list:
	"""Build one prompt per sentence cluster and ask the OpenAI API for insights."""
	openai.api_key = os.getenv("OPENAI_API_KEY")

	with open("insights.prompt", "r") as f:
		PROMPT = f.read()
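	# The template is expected to expose the placeholders {{name1}}, {{name2}},
	# {{topic1}}, {{topic2}}, {{complex1}}, {{complex2}} and {{sentences}},
	# which are substituted below.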
	
	# print(topics)
 
	PROMPT = PROMPT.replace("{{name1}}", name1)
	PROMPT = PROMPT.replace("{{name2}}", name2)
 
	PROMPT = PROMPT.replace("{{topic1}}", ",".join(topics['insight1'][0]))
	PROMPT = PROMPT.replace("{{topic2}}", ",".join(topics['insight2'][0]))
	
	PROMPT = PROMPT.replace("{{complex1}}", ",".join(topics['insight1'][1]))
	PROMPT = PROMPT.replace("{{complex2}}", ",".join(topics['insight2'][1]))
	
	final_insights = []
	
	for cluster_id, sentences in clusters.items():  
		
		# print(cluster_id, " ", sentences)
		final_sentences = "\n".join(sentences)[:4000]
		final_prompt = PROMPT.replace("{{sentences}}", final_sentences)
	
		# with open(f"prompter/insights_{cluster_id}.prompt", "w") as f:
		# 	f.write(final_prompt)
     
		# Generate insights for each cluster
		response = openai.Completion.create(
			model="text-davinci-003",
			prompt=final_prompt,
			max_tokens=200,
			temperature=0.7,
			top_p=1,
			frequency_penalty=0.0,
			presence_penalty=0.0,
		)

		# Parse the model's completion as JSON (the prompt template is expected to request JSON output)
		text = response['choices'][0]['text']
		insight = json.loads(text)

		final_insights.append(insight)
	return final_insights
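

# ---------------------------------------------------------------------------
# Minimal wiring sketch (illustrative only, not the original app's UI): shows
# how the helpers above are intended to compose inside a Streamlit page. The
# widget labels, the shape of `topics`, and num_clusters are assumptions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
	st.title("Document comparison")

	name1 = st.text_input("Name of document 1", value="Document A")
	name2 = st.text_input("Name of document 2", value="Document B")
	text1 = st.text_area("Text of document 1")
	text2 = st.text_area("Text of document 2")

	if st.button("Analyze") and text1 and text2:
		nlp = load_nlp()
		embedder = load_model()
		kw_model = load_keyword_model()

		# generate_keywords returns 10 single-word keywords followed by 10 two-word
		# keyphrases, so split the list to match how generate_insights indexes topics.
		keywords1 = generate_keywords(kw_model, text1)
		keywords2 = generate_keywords(kw_model, text2)
		topics = {
			"insight1": (keywords1[:10], keywords1[10:]),
			"insight2": (keywords2[:10], keywords2[10:]),
		}

		clusters = cluster_based_on_topics(nlp, embedder, text1, text2, num_clusters=3)
		insights = generate_insights(topics, name1, name2, text1, text2, clusters)
		st.json(insights)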