lhoestq HF staff committed on
Commit
b6190b3
•
1 Parent(s): 4c86203

rename + parse json

Browse files
Files changed (2) hide show
  1. README.md +3 -3
  2. app.py +19 -8
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Dataset Spreadsheets
3
- emoji: 🚀
4
  colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.6.0
8
  app_file: app.py
 
1
  ---
2
+ title: DuckDB Spreadsheets
3
+ emoji: 🐥📝
4
  colorFrom: green
5
+ colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.6.0
8
  app_file: app.py
app.py CHANGED
@@ -2,14 +2,17 @@ from functools import partial, lru_cache
2
 
3
  import duckdb
4
  import gradio as gr
 
5
  import pandas as pd
6
  import pyarrow as pa
 
7
  import requests
8
  from huggingface_hub import HfApi
9
 
10
  READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
11
  EMPTY_TABLE = pa.Table.from_pylist([{str(i): "" for i in range(4)}] * 10)
12
  EMPTY_DF: pd.DataFrame = EMPTY_TABLE.to_pandas()
 
13
  MAX_NUM_COLUMNS = 20
14
  NUM_TRENDING_DATASETS = 10
15
  NUM_USER_DATASETS = 10
@@ -102,6 +105,14 @@ def get_prepared_functions_from_table(table: pa.Table) -> dict[str, list[str]]:
102
  prepared_functions[field.name] = [prepare_function(numeric_func, ["x"], field.name) for numeric_func in numeric_functions_df.Name]
103
  elif pa.types.is_string(field.type):
104
  prepared_functions[field.name] = [prepare_function(text_func, ["string"], field.name) for text_func in text_functions_df.Name]
 
 
 
 
 
 
 
 
105
  elif pa.types.is_date(field.type):
106
  prepared_functions[field.name] = [prepare_function(date_func, ["startdate", "date"], field.name) for date_func in date_functions_df.Name]
107
  elif pa.types.is_list(field.type):
@@ -122,7 +133,7 @@ with gr.Blocks(css=css, js=js) as demo:
122
  dataset_subset_split_textbox = gr.Textbox(visible=False)
123
  input_table_state = gr.State()
124
  run_button = gr.Button(visible=False, elem_id="run_button")
125
- gr.Markdown("# Dataset Spreadsheets\n\nEdit any dataset on Hugging Face (full list [here](https://huggingface.co/datasets)) using DuckDB functions (documentation [here](https://duckdb.org/docs/sql/functions/overview))")
126
  with gr.Group():
127
  with gr.Row():
128
  dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
@@ -133,7 +144,7 @@ with gr.Blocks(css=css, js=js) as demo:
133
  transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True, elem_classes="transform_dropdown") for column_name in EMPTY_DF.columns]
134
  transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False, elem_classes="transform_dropdown") for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
135
  dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
136
- with gr.Accordion("Show SQL command", open=False, elem_classes="transparent-accordion"):
137
  code_markdown = gr.Markdown()
138
 
139
  def show_subset_dropdown(dataset: str):
@@ -153,7 +164,7 @@ with gr.Blocks(css=css, js=js) as demo:
153
  def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
154
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
155
  if dataset and subset and split and pattern:
156
- table = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT 10").arrow()
157
  else:
158
  table = EMPTY_TABLE
159
  prepared_functions = get_prepared_functions_from_table(table)
@@ -181,7 +192,7 @@ with gr.Blocks(css=css, js=js) as demo:
181
  code_markdown: (
182
  "```sql\n"
183
  + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
184
- + f"FROM 'hf://datasets/{dataset}/{pattern}';"
185
  + "\n```"
186
  ) if pattern else "",
187
  }
@@ -213,7 +224,7 @@ with gr.Blocks(css=css, js=js) as demo:
213
  code_markdown: (
214
  "```sql\n"
215
  + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
216
- + f"FROM 'hf://datasets/{dataset}/{pattern}';"
217
  + "\n```"
218
  ) if pattern else "",
219
  }
@@ -234,7 +245,7 @@ with gr.Blocks(css=css, js=js) as demo:
234
  code_markdown: (
235
  "```sql\n"
236
  + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
237
- + f"FROM 'hf://datasets/{dataset}/{pattern}';"
238
  + "\n```"
239
  ) if pattern else "",
240
  }
@@ -252,7 +263,7 @@ with gr.Blocks(css=css, js=js) as demo:
252
  code_markdown: (
253
  "```sql\n"
254
  + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
255
- + f"FROM 'hf://datasets/{dataset}/{pattern}';"
256
  + "\n```"
257
  ) if pattern else "",
258
  }
@@ -268,7 +279,7 @@ with gr.Blocks(css=css, js=js) as demo:
268
  code_markdown: (
269
  "```sql\n"
270
  + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
271
- + f"FROM 'hf://datasets/{dataset}/{pattern}';"
272
  + "\n```"
273
  ) if pattern else "",
274
  }
 
2
 
3
  import duckdb
4
  import gradio as gr
5
+ import json
6
  import pandas as pd
7
  import pyarrow as pa
8
+ import pyarrow.compute as pc
9
  import requests
10
  from huggingface_hub import HfApi
11
 
12
  READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
13
  EMPTY_TABLE = pa.Table.from_pylist([{str(i): "" for i in range(4)}] * 10)
14
  EMPTY_DF: pd.DataFrame = EMPTY_TABLE.to_pandas()
15
+ NUM_ROWS = 10
16
  MAX_NUM_COLUMNS = 20
17
  NUM_TRENDING_DATASETS = 10
18
  NUM_USER_DATASETS = 10
 
105
  prepared_functions[field.name] = [prepare_function(numeric_func, ["x"], field.name) for numeric_func in numeric_functions_df.Name]
106
  elif pa.types.is_string(field.type):
107
  prepared_functions[field.name] = [prepare_function(text_func, ["string"], field.name) for text_func in text_functions_df.Name]
108
+ # try parsing json
109
+ if pc.all(pc.starts_with(table[field.name], "{")).as_py() or pc.all(pc.starts_with(table[field.name], "[")).as_py():
110
+ try:
111
+ json_parsed_table = pa.Table.from_pylist([{field.name: json.loads(row)} for row in table[field.name].to_pylist()])
112
+ parsed_type = str(duckdb.from_arrow(json_parsed_table).dtypes[0])
113
+ prepared_functions[field.name] = [f"CAST({field.name} as {parsed_type})"] + prepared_functions[field.name]
114
+ except Exception:
115
+ pass
116
  elif pa.types.is_date(field.type):
117
  prepared_functions[field.name] = [prepare_function(date_func, ["startdate", "date"], field.name) for date_func in date_functions_df.Name]
118
  elif pa.types.is_list(field.type):
 
133
  dataset_subset_split_textbox = gr.Textbox(visible=False)
134
  input_table_state = gr.State()
135
  run_button = gr.Button(visible=False, elem_id="run_button")
136
+ gr.Markdown("# DuckDB Spreadsheets\n\nEdit any dataset on Hugging Face (full list [here](https://huggingface.co/datasets)) using DuckDB functions (documentation [here](https://duckdb.org/docs/sql/functions/overview))")
137
  with gr.Group():
138
  with gr.Row():
139
  dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
 
144
  transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True, elem_classes="transform_dropdown") for column_name in EMPTY_DF.columns]
145
  transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False, elem_classes="transform_dropdown") for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
146
  dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
147
+ with gr.Accordion("Show DuckDB SQL command", open=False, elem_classes="transparent-accordion"):
148
  code_markdown = gr.Markdown()
149
 
150
  def show_subset_dropdown(dataset: str):
 
164
  def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
165
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
166
  if dataset and subset and split and pattern:
167
+ table = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS}").arrow()
168
  else:
169
  table = EMPTY_TABLE
170
  prepared_functions = get_prepared_functions_from_table(table)
 
192
  code_markdown: (
193
  "```sql\n"
194
  + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
195
+ + f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
196
  + "\n```"
197
  ) if pattern else "",
198
  }
 
224
  code_markdown: (
225
  "```sql\n"
226
  + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
227
+ + f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
228
  + "\n```"
229
  ) if pattern else "",
230
  }
 
245
  code_markdown: (
246
  "```sql\n"
247
  + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
248
+ + f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
249
  + "\n```"
250
  ) if pattern else "",
251
  }
 
263
  code_markdown: (
264
  "```sql\n"
265
  + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
266
+ + f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
267
  + "\n```"
268
  ) if pattern else "",
269
  }
 
279
  code_markdown: (
280
  "```sql\n"
281
  + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
282
+ + f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
283
  + "\n```"
284
  ) if pattern else "",
285
  }