Spaces:

asoria
/

dataset-insight-explorer

Sleeping

App Files Files Community

asoria HF staff commited on Jun 14

Commit

7afe0ab

•

1 Parent(s): ed28c62

Upload 3 files

Browse files

Files changed (3) hide show

README.md +5 -5
app.py +206 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,12 +1,12 @@
 ---
-title: Dataset Insight Explorer
-emoji: 🐠
-colorFrom: red
-colorTo: blue
 sdk: gradio
 sdk_version: 4.36.1
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Dataset Insights Explorer
+emoji: 💻
+colorFrom: gray
+colorTo: pink
 sdk: gradio
 sdk_version: 4.36.1
 app_file: app.py
 pinned: false
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,206 @@

+"""
+TODOS:
+- Improve prompts
+- Improve model usage (Quantization?)
+- Improve error handling
+- Add more tests
+- Improve response in a friendly way
+"""
+import gradio as gr
+from gradio_huggingfacehub_search import HuggingfaceHubSearch
+import duckdb
+import pandas as pd
+import requests
+from outlines import prompt
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import spaces
+import json
+import torch
+import logging
+BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
+logger = logging.getLogger(__name__)
+"""
+Methods for generating potential questions and SQL queries
+"""
+device = "cuda"
+gemma_model_id = "google/gemma-2b-it"
+gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_model_id)
+gemma_model = AutoModelForCausalLM.from_pretrained(
+    gemma_model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16
+)
+@spaces.GPU
+def generate_potential_questions_with_gemma(prompt):
+    input_ids = gemma_tokenizer(prompt, return_tensors="pt").to(device)
+    outputs = gemma_model.generate(**input_ids, max_new_tokens=1024)
+    return gemma_tokenizer.decode(outputs[0], skip_special_tokens=True)
+@prompt
+def prompt_for_questions(dataset, schema, first_rows):
+    """
+    You are a data analyst tasked with exploring a dataset named {{ dataset }}.
+    Below is the dataset schema in SQL format along with a sample of 3 rows:
+    {{ schema }}
+    Sample rows:
+    {% for example in first_rows %}
+    {{ example}}
+    {% endfor %}
+    Your goal is to generate a list of 5 potential questions that a user might want
+    to ask about this dataset. Consider the information contained in the provided
+    columns and rows, and try to think of meaningful questions that could
+    provide insights or useful information. For each question, provide the SQL query
+    that would extract the relevant information from the dataset.
+    Ouput JSON format:
+    {
+        "questions": [
+            {"question": [Insert question here]", "sql_query": "[Insert SQL query here]"},
+            {"question": [Insert question here]", "sql_query": "[Insert SQL query here]"},
+            {"question": [Insert question here]", "sql_query": "[Insert SQL query here]"},
+            {"question": [Insert question here]", "sql_query": "[Insert SQL query here]"},
+            {"question": [Insert question here]", "sql_query": "[Insert SQL query here]"},
+        ]
+    }
+    Please ensure that each SQL query retrieves relevant information from the dataset to answer the corresponding question accurately.
+    Return only the JSON object, do not add extra information.
+    """
+"""
+Methods for generating and SQL based on a user request
+"""
+mother_duckdb_model_id = "motherduckdb/DuckDB-NSQL-7B-v0.1"
+mother_duck_tokenizer = AutoTokenizer.from_pretrained(mother_duckdb_model_id)
+mother_duck_model = AutoModelForCausalLM.from_pretrained(
+    mother_duckdb_model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16
+)
+@spaces.GPU
+def generate_sql_with_mother_duck(prompt):
+    input_ids = mother_duck_tokenizer(prompt, return_tensors="pt").to(device).input_ids
+    generated_ids = mother_duck_model.generate(input_ids, max_length=1024)
+    return mother_duck_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+@prompt
+def prompt_for_sql(ddl_create, query_input):
+    """
+    ### Instruction:
+    Your task is to generate valid duckdb SQL to answer the following question.
+    ### Input:
+    Here is the database schema that the SQL query will run on:
+    {{ ddl_create }}
+    ### Question:
+    {{ query_input }}
+    ### Response (use duckdb shorthand if possible):
+    """
+"""
+Datasets Viewer Methods
+https://huggingface.co/docs/datasets-server/index
+"""
+def get_first_parquet(dataset: str):
+    resp = requests.get(f"{BASE_DATASETS_SERVER_URL}/parquet?dataset={dataset}")
+    return resp.json()["parquet_files"][0]
+def get_dataset_schema(parquet_url: str):
+    con = duckdb.connect()
+    con.execute(f"CREATE TABLE data as SELECT * FROM '{parquet_url}' LIMIT 1;")
+    result = con.sql("SELECT sql FROM duckdb_tables() where table_name ='data';").df()
+    ddl_create = result.iloc[0,0]
+    con.close()
+    return ddl_create
+def get_first_rows_as_df(dataset: str, config: str, split: str, limit:int):
+    resp = requests.get(f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}")
+    rows = resp.json()["rows"]
+    rows = [row['row'] for row in rows]
+    return pd.DataFrame.from_dict(rows).sample(frac = 1).head(limit)
+"""
+Main logic, to get the recommended queries
+"""
+def get_recommended_queries(dataset: str):
+    ddl_create, prompt = "", ""
+    try:
+        first_split = get_first_parquet(dataset)
+        df_first_rows = get_first_rows_as_df(dataset, first_split["config"], first_split["split"], 3)
+        first_parquet_url = first_split["url"]
+        logger.info(f"First parquet URL: {first_parquet_url}")
+        ddl_create = get_dataset_schema(first_parquet_url)
+        prompt = prompt_for_questions(dataset, ddl_create, df_first_rows.to_dict('records'))
+        txt_questions = generate_potential_questions_with_gemma(prompt).split("``json")[1].replace('\n', ' ').strip()[:-4]
+        data = json.loads(txt_questions)
+        questions = data["questions"]
+        potential_questions = []
+        for question in questions:
+            try:
+                sql = question["sql_query"].replace("FROM data", f"FROM '{first_parquet_url}'")
+                result = duckdb.sql(sql).df()
+                potential_questions.append({"question": question["question"], "result": result, "sql_query": sql})
+                continue
+            except Exception as err:
+                logger.error(f"Error in running SQL query: {question['sql_query']} {err}")
+                mother_duck_prompt = prompt_for_sql(ddl_create, question["question"])
+                sql = generate_sql_with_mother_duck(mother_duck_prompt).split("### Response (use duckdb shorthand if possible):")[-1].strip()
+                sql = sql.replace("FROM data", f"FROM '{first_parquet_url}'")
+                try:
+                    result = duckdb.sql(sql).df()
+                    potential_questions.append({"question": question["question"], "result": result, "sql_query": sql})
+                except:
+                    pass
+        df_result = pd.DataFrame(potential_questions)
+    except Exception as err:
+        logger.error(f"Error in getting recommended queries: {err}")
+        return {
+            gr_txt_ddl: ddl_create,
+            gr_txt_prompt: prompt,
+            gr_df_result: pd.DataFrame([{"error": f"❌ {err=}"}])
+        }
+    return {
+        gr_txt_ddl: ddl_create,
+        gr_txt_prompt: prompt,
+        gr_df_result: df_result
+    }
+def preview_dataset(dataset: str):
+    try:
+        first_split = get_first_parquet(dataset)
+        df = get_first_rows_as_df(dataset, first_split["config"], first_split["split"], 4)
+    except Exception as err:
+        df = pd.DataFrame([{"Unable to preview dataset": f"❌ {err=}"}])
+    return {
+        gr_df_first_rows: df
+    }
+with gr.Blocks() as demo:
+    gr.Markdown("# 💫 Dataset Insights Explorer 💫")
+    gr_dataset_name = HuggingfaceHubSearch(
+            label="Hub Dataset ID",
+            placeholder="Search for dataset id on Huggingface",
+            search_type="dataset",
+            value="jamescalam/world-cities-geo",
+        )
+    gr_preview_btn = gr.Button("Preview Dataset")
+    gr_df_first_rows = gr.DataFrame(datatype="markdown")
+    gr_recommend_btn = gr.Button("Show Insights")
+    gr_df_result = gr.DataFrame(datatype="markdown")
+    with gr.Accordion("Open for details", open=False):
+        gr_txt_ddl = gr.Textbox(label="Dataset as CREATE DDL", interactive= False)
+        gr_txt_prompt = gr.Textbox(label="Generated prompt to get recommended questions", interactive= False)
+    gr_preview_btn.click(preview_dataset, inputs=[gr_dataset_name], outputs=[gr_df_first_rows])
+    gr_recommend_btn.click(get_recommended_queries, inputs=[gr_dataset_name], outputs=[gr_txt_ddl, gr_txt_prompt, gr_df_result])
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio_huggingfacehub_search==0.0.7
+duckdb
+pandas
+outlines
+transformers
+accelerate