Spaces:
Running
Running
rename + parse json
Browse files
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
colorFrom: green
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.6.0
|
8 |
app_file: app.py
|
|
|
1 |
---
|
2 |
+
title: DuckDB Spreadsheets
|
3 |
+
emoji: π₯π
|
4 |
colorFrom: green
|
5 |
+
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.6.0
|
8 |
app_file: app.py
|
app.py
CHANGED
@@ -2,14 +2,17 @@ from functools import partial, lru_cache
|
|
2 |
|
3 |
import duckdb
|
4 |
import gradio as gr
|
|
|
5 |
import pandas as pd
|
6 |
import pyarrow as pa
|
|
|
7 |
import requests
|
8 |
from huggingface_hub import HfApi
|
9 |
|
10 |
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
|
11 |
EMPTY_TABLE = pa.Table.from_pylist([{str(i): "" for i in range(4)}] * 10)
|
12 |
EMPTY_DF: pd.DataFrame = EMPTY_TABLE.to_pandas()
|
|
|
13 |
MAX_NUM_COLUMNS = 20
|
14 |
NUM_TRENDING_DATASETS = 10
|
15 |
NUM_USER_DATASETS = 10
|
@@ -102,6 +105,14 @@ def get_prepared_functions_from_table(table: pa.Table) -> dict[str, list[str]]:
|
|
102 |
prepared_functions[field.name] = [prepare_function(numeric_func, ["x"], field.name) for numeric_func in numeric_functions_df.Name]
|
103 |
elif pa.types.is_string(field.type):
|
104 |
prepared_functions[field.name] = [prepare_function(text_func, ["string"], field.name) for text_func in text_functions_df.Name]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
elif pa.types.is_date(field.type):
|
106 |
prepared_functions[field.name] = [prepare_function(date_func, ["startdate", "date"], field.name) for date_func in date_functions_df.Name]
|
107 |
elif pa.types.is_list(field.type):
|
@@ -122,7 +133,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
122 |
dataset_subset_split_textbox = gr.Textbox(visible=False)
|
123 |
input_table_state = gr.State()
|
124 |
run_button = gr.Button(visible=False, elem_id="run_button")
|
125 |
-
gr.Markdown("#
|
126 |
with gr.Group():
|
127 |
with gr.Row():
|
128 |
dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
|
@@ -133,7 +144,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
133 |
transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True, elem_classes="transform_dropdown") for column_name in EMPTY_DF.columns]
|
134 |
transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False, elem_classes="transform_dropdown") for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
|
135 |
dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
|
136 |
-
with gr.Accordion("Show SQL command", open=False, elem_classes="transparent-accordion"):
|
137 |
code_markdown = gr.Markdown()
|
138 |
|
139 |
def show_subset_dropdown(dataset: str):
|
@@ -153,7 +164,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
153 |
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
|
154 |
pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
|
155 |
if dataset and subset and split and pattern:
|
156 |
-
table = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT
|
157 |
else:
|
158 |
table = EMPTY_TABLE
|
159 |
prepared_functions = get_prepared_functions_from_table(table)
|
@@ -181,7 +192,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
181 |
code_markdown: (
|
182 |
"```sql\n"
|
183 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
184 |
-
+ f"FROM 'hf://datasets/{dataset}/{pattern}';"
|
185 |
+ "\n```"
|
186 |
) if pattern else "",
|
187 |
}
|
@@ -213,7 +224,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
213 |
code_markdown: (
|
214 |
"```sql\n"
|
215 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
216 |
-
+ f"FROM 'hf://datasets/{dataset}/{pattern}';"
|
217 |
+ "\n```"
|
218 |
) if pattern else "",
|
219 |
}
|
@@ -234,7 +245,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
234 |
code_markdown: (
|
235 |
"```sql\n"
|
236 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
237 |
-
+ f"FROM 'hf://datasets/{dataset}/{pattern}';"
|
238 |
+ "\n```"
|
239 |
) if pattern else "",
|
240 |
}
|
@@ -252,7 +263,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
252 |
code_markdown: (
|
253 |
"```sql\n"
|
254 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
255 |
-
+ f"FROM 'hf://datasets/{dataset}/{pattern}';"
|
256 |
+ "\n```"
|
257 |
) if pattern else "",
|
258 |
}
|
@@ -268,7 +279,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
268 |
code_markdown: (
|
269 |
"```sql\n"
|
270 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
271 |
-
+ f"FROM 'hf://datasets/{dataset}/{pattern}';"
|
272 |
+ "\n```"
|
273 |
) if pattern else "",
|
274 |
}
|
|
|
2 |
|
3 |
import duckdb
|
4 |
import gradio as gr
|
5 |
+
import json
|
6 |
import pandas as pd
|
7 |
import pyarrow as pa
|
8 |
+
import pyarrow.compute as pc
|
9 |
import requests
|
10 |
from huggingface_hub import HfApi
|
11 |
|
12 |
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
|
13 |
EMPTY_TABLE = pa.Table.from_pylist([{str(i): "" for i in range(4)}] * 10)
|
14 |
EMPTY_DF: pd.DataFrame = EMPTY_TABLE.to_pandas()
|
15 |
+
NUM_ROWS = 10
|
16 |
MAX_NUM_COLUMNS = 20
|
17 |
NUM_TRENDING_DATASETS = 10
|
18 |
NUM_USER_DATASETS = 10
|
|
|
105 |
prepared_functions[field.name] = [prepare_function(numeric_func, ["x"], field.name) for numeric_func in numeric_functions_df.Name]
|
106 |
elif pa.types.is_string(field.type):
|
107 |
prepared_functions[field.name] = [prepare_function(text_func, ["string"], field.name) for text_func in text_functions_df.Name]
|
108 |
+
# try parsing json
|
109 |
+
if pc.all(pc.starts_with(table[field.name], "{")).as_py() or pc.all(pc.starts_with(table[field.name], "[")).as_py():
|
110 |
+
try:
|
111 |
+
json_parsed_table = pa.Table.from_pylist([{field.name: json.loads(row)} for row in table[field.name].to_pylist()])
|
112 |
+
parsed_type = str(duckdb.from_arrow(json_parsed_table).dtypes[0])
|
113 |
+
prepared_functions[field.name] = [f"CAST({field.name} as {parsed_type})"] + prepared_functions[field.name]
|
114 |
+
except Exception:
|
115 |
+
pass
|
116 |
elif pa.types.is_date(field.type):
|
117 |
prepared_functions[field.name] = [prepare_function(date_func, ["startdate", "date"], field.name) for date_func in date_functions_df.Name]
|
118 |
elif pa.types.is_list(field.type):
|
|
|
133 |
dataset_subset_split_textbox = gr.Textbox(visible=False)
|
134 |
input_table_state = gr.State()
|
135 |
run_button = gr.Button(visible=False, elem_id="run_button")
|
136 |
+
gr.Markdown("# DuckDB Spreadsheets\n\nEdit any dataset on Hugging Face (full list [here](https://huggingface.co/datasets)) using DuckDB functions (documentation [here](https://duckdb.org/docs/sql/functions/overview))")
|
137 |
with gr.Group():
|
138 |
with gr.Row():
|
139 |
dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
|
|
|
144 |
transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True, elem_classes="transform_dropdown") for column_name in EMPTY_DF.columns]
|
145 |
transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False, elem_classes="transform_dropdown") for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
|
146 |
dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
|
147 |
+
with gr.Accordion("Show DuckDB SQL command", open=False, elem_classes="transparent-accordion"):
|
148 |
code_markdown = gr.Markdown()
|
149 |
|
150 |
def show_subset_dropdown(dataset: str):
|
|
|
164 |
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
|
165 |
pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
|
166 |
if dataset and subset and split and pattern:
|
167 |
+
table = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS}").arrow()
|
168 |
else:
|
169 |
table = EMPTY_TABLE
|
170 |
prepared_functions = get_prepared_functions_from_table(table)
|
|
|
192 |
code_markdown: (
|
193 |
"```sql\n"
|
194 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
195 |
+
+ f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
|
196 |
+ "\n```"
|
197 |
) if pattern else "",
|
198 |
}
|
|
|
224 |
code_markdown: (
|
225 |
"```sql\n"
|
226 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
227 |
+
+ f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
|
228 |
+ "\n```"
|
229 |
) if pattern else "",
|
230 |
}
|
|
|
245 |
code_markdown: (
|
246 |
"```sql\n"
|
247 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
248 |
+
+ f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
|
249 |
+ "\n```"
|
250 |
) if pattern else "",
|
251 |
}
|
|
|
263 |
code_markdown: (
|
264 |
"```sql\n"
|
265 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
266 |
+
+ f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
|
267 |
+ "\n```"
|
268 |
) if pattern else "",
|
269 |
}
|
|
|
279 |
code_markdown: (
|
280 |
"```sql\n"
|
281 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
282 |
+
+ f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
|
283 |
+ "\n```"
|
284 |
) if pattern else "",
|
285 |
}
|