Spaces:

Alcime
/

visualize_dataset

Sleeping

App Files Files Community

Alcime commited on Sep 25

Commit

fce64e9

•

1 Parent(s): 6d7b047

Create app.py

Browse files

Files changed (1) hide show

app.py +81 -0

app.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import streamlit as st
+import os
+from datasets import load_dataset
+from bunkatopics import Bunka
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms import HuggingFaceHub
+# Streamlit app
+st.title("Bunka Map 🗺️")
+# Input parameters
+dataset_id = st.text_input("Dataset ID", "bunkalab/medium-sample-technology")
+language = st.text_input("Language", "english")
+text_field = st.text_input("Text Field", "title")
+embedder_model = st.text_input("Embedder Model", "sentence-transformers/distiluse-base-multilingual-cased-v2")
+sample_size = st.number_input("Sample Size", min_value=100, max_value=10000, value=1000)
+n_clusters = st.number_input("Number of Clusters", min_value=5, max_value=50, value=15)
+llm_model = st.text_input("LLM Model", "mistralai/Mistral-7B-Instruct-v0.1")
+# Hugging Face API token input
+hf_token = st.text_input("Hugging Face API Token", type="password")
+if st.button("Generate Bunka Map"):
+    # Load dataset and sample
+    @st.cache_data
+    def load_data(dataset_id, text_field, sample_size):
+        dataset = load_dataset(dataset_id, streaming=True)
+        docs_sample = []
+        for i, example in enumerate(dataset["train"]):
+            if i >= sample_size:
+                break
+            docs_sample.append(example[text_field])
+        return docs_sample
+    docs_sample = load_data(dataset_id, text_field, sample_size)
+    # Initialize embedding model and Bunka
+    embedding_model = HuggingFaceEmbeddings(model_name=embedder_model)
+    bunka = Bunka(embedding_model=embedding_model, language=language)
+    # Fit Bunka to the text data
+    bunka.fit(docs_sample)
+    # Generate topics
+    df_topics = bunka.get_topics(n_clusters=n_clusters, name_length=5, min_count_terms=2)
+    # Visualize topics
+    st.plotly_chart(bunka.visualize_topics(width=800, height=800, colorscale='Portland', density=True, label_size_ratio=60, convex_hull=True))
+    # Clean labels using LLM
+    if hf_token:
+        os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
+        llm = HuggingFaceHub(repo_id=llm_model, huggingfacehub_api_token=hf_token)
+        bunka.get_clean_topic_name(llm=llm, language=language)
+        st.plotly_chart(bunka.visualize_topics(width=800, height=800, colorscale='Portland', density=True, label_size_ratio=60, convex_hull=True))
+    else:
+        st.warning("Please provide a Hugging Face API token to clean labels using LLM.")
+    # Manual topic cleaning
+    st.subheader("Manually Clean Topics")
+    cleaned_topics = {}
+    for topic, keywords in bunka.topics_.items():
+        cleaned_topic = st.text_input(f"Topic {topic}", ", ".join(keywords))
+        cleaned_topics[topic] = cleaned_topic.split(", ")
+    if st.button("Update Topics"):
+        bunka.topics_ = cleaned_topics
+        st.plotly_chart(bunka.visualize_topics(width=800, height=800, colorscale='Portland', density=True, label_size_ratio=60, convex_hull=True))
+    # Remove unwanted topics
+    st.subheader("Remove Unwanted Topics")
+    topics_to_remove = st.multiselect("Select topics to remove", list(bunka.topics_.keys()))
+    if st.button("Remove Topics"):
+        bunka.clean_data_by_topics(topics_to_remove)
+        st.plotly_chart(bunka.visualize_topics(width=800, height=800, colorscale='Portland', density=True, label_size_ratio=60, convex_hull=True))
+    # Save dataset
+    if st.button("Save Cleaned Dataset"):
+        name = dataset_id.replace('/', '_') + '_cleaned.csv'
+        bunka.df_cleaned_.to_csv(name)
+        st.success(f"Dataset saved as {name}")