import streamlit as st
from keybert import KeyBERT
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer
import os, re, json
import openai
import spacy
import en_core_web_sm
from sklearn.cluster import KMeans, AgglomerativeClustering
import numpy as np
from sentence_transformers import SentenceTransformer
MODEL = 'all-MiniLM-L6-v2'
nltk.download('stopwords')
@st.cache_resource  # cache_resource: these loaders return model objects, not serializable data
def load_autotoken():
    """Tokenizer used only to count tokens when chunking documents."""
    autotok = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
    return autotok

@st.cache_resource
def load_keyword_model():
    """KeyBERT keyword extractor built on top of the cached sentence embedder."""
    sentence_model = load_model()
    kw_model = KeyBERT(model=sentence_model)
    return kw_model

@st.cache_resource
def load_model():
    """Sentence-transformer embedder shared by KeyBERT and the clustering step."""
    embedder = SentenceTransformer(MODEL)
    return embedder

@st.cache_resource
def load_nlp():
    """spaCy pipeline used for sentence segmentation."""
    nlp = en_core_web_sm.load()
    return nlp
def create_nest_sentences(document: str, token_max_length: int = 1023) -> list:
    """Split a document into groups of sentences that each fit under the token budget."""
    nested = []
    sent = []
    length = 0
    tokenizer = load_autotoken()
    # Naive sentence splitter: break on sentence-ending punctuation followed by a capital letter.
    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", '.')):
        # Hugging Face transformer tokenizer; input_ids gives the token count for this sentence.
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)["input_ids"]
        length += len(tokens_in_sentence)
        if length < token_max_length:
            sent.append(sentence)
        else:
            nested.append(sent)
            sent = [sentence]
            # Start the new chunk with this sentence's tokens already counted.
            length = len(tokens_in_sentence)
    if sent:
        nested.append(sent)
    return nested
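
# Illustrative sketch, not part of the original app: shows how the chunks returned by
# create_nest_sentences could be rejoined into strings that respect the token budget.
# The sample text and the helper name are hypothetical.
def _demo_nest_sentences():
    sample_doc = "Streamlit renders the page. KeyBERT pulls out keywords. Long inputs are chunked by token count."
    for chunk in create_nest_sentences(sample_doc, token_max_length=1023):
        print(" ".join(chunk))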
def preprocess(text) -> str:
    stop_words = set(stopwords.words("english"))
    text = text.lower()
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)
def generate_keywords(kw_model, document: str) -> list:
    """Extract single-word and two-word keyphrases with KeyBERT and return them as one list."""
    atomic_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 1), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
    complex_extractions = kw_model.extract_keywords(document, keyphrase_ngram_range=(1, 2), stop_words=None, use_maxsum=True, nr_candidates=20, top_n=10)
    final_topics = []
    for extraction in atomic_extractions + complex_extractions:
        final_topics.append(extraction[0])  # keep the keyphrase, drop its score
    return final_topics
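
# Assumption, not in the original file: generate_insights below expects a dict shaped like
# {'insight1': (single_word_keywords, multi_word_keywords), 'insight2': (...)}. This helper is a
# hypothetical sketch of how that dict could be assembled from generate_keywords output; the
# slice at index 10 mirrors the top_n=10 setting used above.
def _build_topics_dict(kw_model, text1: str, text2: str) -> dict:
    topics1 = generate_keywords(kw_model, text1)
    topics2 = generate_keywords(kw_model, text2)
    return {
        "insight1": (topics1[:10], topics1[10:]),
        "insight2": (topics2[:10], topics2[10:]),
    }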
def cluster_based_on_topics(nlp, embedder, text1: str, text2: str, num_clusters=3):
    """Cluster the sentences of both texts into num_clusters groups of related content."""
    # Preprocess and sentence-segment the texts
    doc1 = nlp(preprocess(text1))
    doc2 = nlp(preprocess(text2))
    sentences1 = [sent.text for sent in doc1.sents]
    sentences2 = [sent.text for sent in doc2.sents]
    all_sentences = sentences1 + sentences2
    # Generate sentence embeddings for each sentence
    sentence_embeddings1 = embedder.encode(sentences1)
    sentence_embeddings2 = embedder.encode(sentences2)
    all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
    # Optionally normalize the embeddings to unit length before clustering (disabled):
    # all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
    # Cluster the embeddings with k-means (AgglomerativeClustering is an imported alternative)
    clustering_model = KMeans(n_clusters=num_clusters)
    clustering_model.fit(all_embeddings)
    cluster_assignment = clustering_model.labels_
    # Group the original sentences by their assigned cluster id
    clustered_sentences = {}
    for sentence_id, cluster_id in enumerate(cluster_assignment):
        clustered_sentences.setdefault(cluster_id, []).append(all_sentences[sentence_id])
    return clustered_sentences
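
# Illustrative usage with hypothetical inputs: the returned dict maps a cluster id to the
# sentences (from both texts) assigned to that cluster, which generate_insights iterates over.
def _demo_clustering():
    nlp = load_nlp()
    embedder = load_model()
    clusters = cluster_based_on_topics(
        nlp, embedder,
        "Apples grow on trees. Apple pie is a dessert.",
        "Oranges are citrus fruit. Orange juice is popular.",
        num_clusters=2,
    )
    for cluster_id, sents in clusters.items():
        print(cluster_id, sents)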
def generate_insights(topics: dict, name1: str, name2: str, text1: str, text2: str, clusters) -> list:
    """Fill the prompt template for each sentence cluster and collect the model's JSON answers."""
    openai.api_key = os.getenv("OPENAI_API_KEY")
    with open("insights.prompt", "r") as f:
        PROMPT = f.read()
    PROMPT = PROMPT.replace("{{name1}}", name1)
    PROMPT = PROMPT.replace("{{name2}}", name2)
    PROMPT = PROMPT.replace("{{topic1}}", ",".join(topics['insight1'][0]))
    PROMPT = PROMPT.replace("{{topic2}}", ",".join(topics['insight2'][0]))
    PROMPT = PROMPT.replace("{{complex1}}", ",".join(topics['insight1'][1]))
    PROMPT = PROMPT.replace("{{complex2}}", ",".join(topics['insight2'][1]))
    final_insights = []
    for cluster_id, sentences in clusters.items():
        # Cap the cluster text so the filled prompt stays within the model's context window
        final_sentences = "\n".join(sentences)[:4000]
        final_prompt = PROMPT.replace("{{sentences}}", final_sentences)
        # Generate insights for each cluster
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=final_prompt,
            max_tokens=200,
            temperature=0.7,
            top_p=1,
            frequency_penalty=0.0,
            presence_penalty=0.0,
        )
        text = response['choices'][0]['text']
        final_insights.append(json.loads(text))  # the prompt is expected to return JSON
    return final_insights
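
# Illustrative end-to-end sketch (assumption: the original Space wires these functions together
# in its Streamlit UI elsewhere; the helper below and its inputs are hypothetical).
def _demo_pipeline(name1: str, text1: str, name2: str, text2: str) -> list:
    kw_model = load_keyword_model()
    nlp = load_nlp()
    embedder = load_model()
    topics = _build_topics_dict(kw_model, text1, text2)  # hypothetical helper defined above
    clusters = cluster_based_on_topics(nlp, embedder, text1, text2)
    return generate_insights(topics, name1, name2, text1, text2, clusters)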