themeetjani committed
Commit • 1ef3d70
Parent(s): 5dc77e3
Upload 3 files

Files changed:
- pages/cg.py +50 -0
- pages/sp.py +15 -0
- pages/tc.py +154 -0
pages/cg.py
ADDED
@@ -0,0 +1,50 @@
import os

import streamlit as st
from streamlit import session_state
from openai import OpenAI

# OpenAI client; reads the key from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def gpt4_generate(schema, query):
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": "You are a code generator assistant. Your task is to generate code or a query in any language based on the given instructions.\nIf it's SQL, also accept a query instruction.\n\n<<REMEMBER>> Give only the code or query. Don't provide any extra information.\n\n<<OUTPUT>>"
            },
            {
                "role": "user",
                "content": f"Schema/ Detail: {schema}"
            },
            {
                "role": "user",
                "content": f"Query/instruction: {query}"
            }
        ],
        temperature=0.7,
        max_tokens=701,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].message.content

st.write("# Auto Code Generation! 👋")

# Keep the last result across Streamlit reruns
if 'score' not in session_state:
    session_state['score'] = ""

text1 = st.text_area(label="Please write the schema or a detailed explanation below",
                     placeholder="What does the text say?")
text2 = st.text_area(label="Please write the query or code instructions below",
                     placeholder="What does the text say?")

def generate(text1, text2):
    session_state['score'] = gpt4_generate(text1, text2)

st.text_area("Result", value=session_state['score'])

st.button("Generate", on_click=generate, args=[text1, text2])
pages/sp.py
ADDED
@@ -0,0 +1,15 @@
import streamlit as st
import streamlit.components.v1 as components

# Embed the hosted Gradio speech app via the gradio.js web component
components.html(
    """
    <script
        type="module"
        src="https://gradio.s3-us-west-2.amazonaws.com/3.39.0/gradio.js"
    ></script>

    <gradio-app src="https://themeetjani-speech2.hf.space"></gradio-app>
    """,
    height=1600,
)
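
components.html renders this markup inside a sandboxed iframe of the given fixed height, and gradio.js then loads the Space into the <gradio-app> element. If loading the Space page directly is acceptable, a simpler sketch using Streamlit's built-in iframe helper would be:

import streamlit.components.v1 as components

# Embed the hosted Space page itself rather than the <gradio-app> component
components.iframe("https://themeetjani-speech2.hf.space", height=1600)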
pages/tc.py
ADDED
@@ -0,0 +1,154 @@
import re
import string

import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from openai import OpenAI

nltk.download("stopwords")
nltk.download("punkt")

# Text preprocessing function
def clean_text_1(text):
    stop_words = set(stopwords.words("english"))

    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in stop_words])

    text = remove_stopwords(text)
    text = str(text).lower()                     # Lowercase words
    text = re.sub(r"\[(.*?)\]", " ", text)       # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)             # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", " ", text)          # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    return text

# Hugging Face model used for sentence embeddings
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
client = OpenAI()  # reads OPENAI_API_KEY from the environment

def get_embedding(text):
    # clean_text_1(text) could be applied here first if preprocessing is wanted
    return model.encode(text)

# Streamlit UI configuration
st.set_page_config(
    page_title="text_clustering.py",
    page_icon="👋",
)

# Upload file
uploaded_file = st.file_uploader("Choose a file")
if uploaded_file:
    # Read data from file and drop rows without text
    df = pd.read_csv(uploaded_file)
    df = df[df['text'].notna()].reset_index(drop=True)

    # Get embeddings
    df['embedding'] = df['text'].apply(get_embedding)
    matrix = np.vstack(df['embedding'].values)

    # Distance threshold slider
    distance_threshold = st.slider("Select Distance Threshold", min_value=0.1, max_value=2.0, value=1.1, step=0.1)

    # Perform clustering; the number of clusters follows from the threshold
    agg_clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, linkage='ward')
    cluster_labels = agg_clustering.fit_predict(matrix)
    df['Cluster'] = cluster_labels

    # Visualize clusters with t-SNE
    tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
    vis_dims2 = tsne.fit_transform(matrix)

    x = [x for x, y in vis_dims2]
    y = [y for x, y in vis_dims2]

    unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)

    # Create a colormap with one color per cluster
    colormap = plt.cm.get_cmap("viridis", len(unique_clusters))

    fig, ax = plt.subplots()
    for category, (color, size) in enumerate(zip(colormap.colors, cluster_counts)):
        xs = np.array(x)[cluster_labels == category]
        ys = np.array(y)[cluster_labels == category]
        ax.scatter(xs, ys, color=color, alpha=0.3, label=f'Cluster {category} (Size: {size})')

        # Mark each cluster's centroid with an "x"
        ax.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)

    ax.set_title("Clusters visualized in 2D using t-SNE")
    ax.legend()

    # Display the plot in Streamlit
    st.pyplot(fig)
    st.text_area("Number of Cluster Labels", value=str(len(unique_clusters)))

    # Ask GPT-4 to name a theme for each cluster from a sample of its texts
    rev_per_cluster = 3
    n_clusters = len(unique_clusters)

    for i in range(n_clusters):
        cluster_texts = (
            df[df.Cluster == i]
            .text.str.replace("Title: ", "")
            .str.replace("\n\nContent: ", ": ")
        )
        # Sample at most rev_per_cluster texts; small clusters may hold fewer
        reviews = "\n".join(
            cluster_texts.sample(min(rev_per_cluster, len(cluster_texts)), random_state=42).values
        )

        messages = [
            {"role": "user", "content": f'What do the following have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:'}
        ]

        response = client.chat.completions.create(
            model="gpt-4",
            messages=messages,
            temperature=0,
            max_tokens=64,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        st.text_area(f"Cluster {i} Theme", value=response.choices[0].message.content.replace("\n", ""))
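
The distance threshold is left to the user via the slider, and the file's original import list pulled in silhouette_score without using it. A sketch of how that metric could pick a default Ward threshold automatically; the helper name choose_threshold and the candidate grid are assumptions, not part of the commit:

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def choose_threshold(matrix, candidates=np.arange(0.5, 2.01, 0.1)):
    """Return the candidate distance threshold with the best silhouette score."""
    best_t, best_score = None, -1.0
    for t in candidates:
        labels = AgglomerativeClustering(
            n_clusters=None, distance_threshold=t, linkage="ward"
        ).fit_predict(matrix)
        n = len(set(labels))
        # Silhouette is defined only for 2 <= n_clusters < n_samples
        if n < 2 or n >= len(matrix):
            continue
        score = silhouette_score(matrix, labels)
        if score > best_score:
            best_t, best_score = t, score
    return best_t

The returned value could seed the slider's default in place of the hard-coded 1.1.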