Spaces:
Running
Running
v0
Browse files- app.py +92 -20
- requirements.txt +1 -0
- text_functions.tsv +82 -0
app.py
CHANGED
@@ -1,9 +1,14 @@
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import requests
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
-
|
6 |
-
|
|
|
7 |
css = """
|
8 |
@media (prefers-color-scheme: dark) {
|
9 |
.transparent-dropdown, .transparent-dropdown .container .wrap {
|
@@ -15,23 +20,63 @@ css = """
|
|
15 |
background: var(--bg);
|
16 |
}
|
17 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
29 |
loading_codes_json = gr.JSON(visible=False)
|
30 |
dataset_subset_split_textbox = gr.Textbox(visible=False)
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
@demo.load(outputs=dataset_dropdown)
|
34 |
-
def
|
35 |
api = HfApi(token=oauth_token.token if oauth_token else None)
|
36 |
datasets = list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"]))
|
37 |
if oauth_token and (user := api.whoami().get("user")):
|
@@ -40,14 +85,14 @@ with gr.Blocks(css=css) as demo:
|
|
40 |
return {dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset)}
|
41 |
|
42 |
@dataset_dropdown.change(inputs=dataset_dropdown, outputs=loading_codes_json)
|
43 |
-
def
|
44 |
-
if "/" not in dataset.strip().strip("/"):
|
45 |
return []
|
46 |
-
resp =
|
47 |
-
return ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"]
|
48 |
|
49 |
@loading_codes_json.change(inputs=loading_codes_json, outputs=[subset_dropdown, split_dropdown])
|
50 |
-
def
|
51 |
subsets = [loading_code["config_name"] for loading_code in loading_codes]
|
52 |
subset = (subsets or [""])[0]
|
53 |
splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
|
@@ -55,11 +100,38 @@ with gr.Blocks(css=css) as demo:
|
|
55 |
return gr.Dropdown(subsets, value=subset, visible=len(subsets) > 1), gr.Dropdown(splits, value=split, visible=len(splits) > 1)
|
56 |
|
57 |
@subset_dropdown.change(inputs=[loading_codes_json, subset_dropdown], outputs=split_dropdown)
|
58 |
-
def
|
59 |
splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
|
60 |
split = (splits or [""])[0]
|
61 |
return gr.Dropdown(splits, value=split, visible=len(splits) > 1)
|
62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
|
65 |
if __name__ == "__main__":
|
|
|
1 |
+
from functools import partial, lru_cache
|
2 |
+
|
3 |
+
import duckdb
|
4 |
import gradio as gr
|
5 |
+
import pandas as pd
|
6 |
import requests
|
7 |
from huggingface_hub import HfApi
|
8 |
|
9 |
+
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
|
10 |
+
EMPTY_DF = pd.DataFrame([{str(i): "" for i in range(4)}] * 10)
|
11 |
+
MAX_NUM_COLUMNS = 20
|
12 |
css = """
|
13 |
@media (prefers-color-scheme: dark) {
|
14 |
.transparent-dropdown, .transparent-dropdown .container .wrap {
|
|
|
20 |
background: var(--bg);
|
21 |
}
|
22 |
}
|
23 |
+
input {
|
24 |
+
-webkit-user-select: none;
|
25 |
+
-moz-user-select: none;
|
26 |
+
-ms-user-select: none;
|
27 |
+
user-select: none;
|
28 |
+
}
|
29 |
+
.cell-menu-button {
|
30 |
+
z-index: -1;
|
31 |
+
}
|
32 |
+
thead {
|
33 |
+
display: none;
|
34 |
+
}
|
35 |
"""
|
36 |
+
js = """
|
37 |
+
function setDataFrameReadonly() {
|
38 |
+
MutationObserver = window.MutationObserver || window.WebKitMutationObserver;
|
39 |
+
var observer = new MutationObserver(function(mutations, observer) {
|
40 |
+
// fired when a mutation occurs
|
41 |
+
document.querySelectorAll('.readonly-dataframe div .table-wrap button svelte-virtual-table-viewport table tbody tr td .cell-wrap input').forEach(i => i.setAttribute("readonly", "true"));
|
42 |
+
});
|
43 |
+
// define what element should be observed by the observer
|
44 |
+
// and what types of mutations trigger the callback
|
45 |
+
observer.observe(document, {
|
46 |
+
subtree: true,
|
47 |
+
childList: true
|
48 |
+
});
|
49 |
+
|
50 |
+
}
|
51 |
+
"""
|
52 |
+
text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
|
53 |
|
54 |
+
def prepare_function(func: str, placeholder: str, column_name: str) -> str:
    """Substitute a column name for the first placeholder of a SQL snippet.

    ``func`` is a function/operator template such as ``"lower(string)"`` or
    ``"string || string"``. The first stand-alone occurrence of
    ``placeholder`` is replaced by ``column_name``.

    For call syntax (``name(args)``) only the text after the first ``(`` is
    searched, so a function name is never rewritten. The placeholder is
    matched as a whole token, so identifiers that merely contain it (for
    example ``search_string`` when the placeholder is ``string``) are left
    untouched.

    Args:
        func: The template text.
        placeholder: The argument name to replace.
        column_name: The text inserted in place of the placeholder.

    Returns:
        The template with at most one placeholder replaced. It is returned
        unchanged when no stand-alone placeholder is present.
    """
    import re  # local import keeps this block self-contained

    # Lookarounds instead of \b so the match is also correct when the
    # placeholder itself starts or ends with a non-word character.
    token = re.compile(rf"(?<!\w){re.escape(placeholder)}(?!\w)")

    def substitute(text: str) -> str:
        # A callable replacement keeps backslashes in column_name literal.
        return token.sub(lambda _match: column_name, text, count=1)

    head, paren, arguments = func.partition("(")
    if paren:
        return head + paren + substitute(arguments)
    return substitute(func)
|
62 |
+
|
63 |
+
with gr.Blocks(css=css, js=js) as demo:
|
64 |
loading_codes_json = gr.JSON(visible=False)
|
65 |
dataset_subset_split_textbox = gr.Textbox(visible=False)
|
66 |
+
input_dataframe = gr.DataFrame(visible=False)
|
67 |
+
with gr.Group():
|
68 |
+
with gr.Row():
|
69 |
+
dataset_dropdown = gr.Dropdown(label="Open Dataset", allow_custom_value=True, scale=10)
|
70 |
+
subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
|
71 |
+
split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
|
72 |
+
gr.LoginButton()
|
73 |
+
with gr.Row():
|
74 |
+
transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in EMPTY_DF.columns]
|
75 |
+
transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
|
76 |
+
dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
|
77 |
|
78 |
@demo.load(outputs=dataset_dropdown)
|
79 |
+
def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
|
80 |
api = HfApi(token=oauth_token.token if oauth_token else None)
|
81 |
datasets = list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"]))
|
82 |
if oauth_token and (user := api.whoami().get("user")):
|
|
|
85 |
return {dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset)}
|
86 |
|
87 |
@dataset_dropdown.change(inputs=dataset_dropdown, outputs=loading_codes_json)
def _fetch_read_parquet_loading(dataset: str):
    """Return the parquet loading codes advertised for ``dataset``.

    Queries the datasets-server ``compatible-libraries`` endpoint and returns
    the ``loading_codes`` of the first library whose ``function`` is listed
    in ``READ_PARQUET_FUNCTIONS``.

    Args:
        dataset: Value of the dataset dropdown, expected as ``namespace/name``.

    Returns:
        A list of loading-code dicts, or ``[]`` when the value is empty, has
        no namespace, or no matching library is reported.
    """
    # Skip the network round-trip for empty values and bare names.
    if not dataset or "/" not in dataset.strip().strip("/"):
        return []
    # Pass the dataset through ``params`` so free-form dropdown input is
    # URL-encoded instead of being spliced into the query string.
    resp = requests.get(
        "https://datasets-server.huggingface.co/compatible-libraries",
        params={"dataset": dataset},
        timeout=3,
    ).json()
    loading_codes = next(
        (
            lib["loading_codes"]
            for lib in resp.get("libraries", [])
            if lib["function"] in READ_PARQUET_FUNCTIONS
        ),
        None,
    )
    return loading_codes or []
|
93 |
|
94 |
@loading_codes_json.change(inputs=loading_codes_json, outputs=[subset_dropdown, split_dropdown])
|
95 |
+
def _show_subset_dropdown(loading_codes: list[dict]):
|
96 |
subsets = [loading_code["config_name"] for loading_code in loading_codes]
|
97 |
subset = (subsets or [""])[0]
|
98 |
splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
|
|
|
100 |
return gr.Dropdown(subsets, value=subset, visible=len(subsets) > 1), gr.Dropdown(splits, value=split, visible=len(splits) > 1)
|
101 |
|
102 |
@subset_dropdown.change(inputs=[loading_codes_json, subset_dropdown], outputs=split_dropdown)
def _show_split_dropdown(loading_codes: list[dict], subset: str):
    """Rebuild the split dropdown for the selected subset.

    Args:
        loading_codes: Loading-code dicts, each with a ``config_name`` and an
            ``arguments["splits"]`` mapping.
        subset: The subset (config name) currently selected.

    Returns:
        A dropdown listing the subset's splits with the first one selected.
        It is visible only when there is more than one split.
    """
    per_subset_splits = [
        list(entry["arguments"]["splits"])
        for entry in loading_codes
        if entry["config_name"] == subset
    ]
    splits = per_subset_splits[0] if per_subset_splits else []
    split = splits[0] if splits else ""
    return gr.Dropdown(splits, value=split, visible=len(splits) > 1)
|
107 |
+
|
108 |
+
@lru_cache(maxsize=3)
def _load_preview(dataset: str, pattern: str) -> pd.DataFrame:
    """Load the first 10 rows of ``hf://datasets/<dataset>/<pattern>``.

    Cached on its two string arguments. Callers must treat the returned
    frame as read-only because it is shared between calls.
    """
    # Double single quotes so the path stays inside the SQL string literal.
    path = f"hf://datasets/{dataset}/{pattern}".replace("'", "''")
    return duckdb.sql(f"SELECT * FROM '{path}' LIMIT 10").df()


@split_dropdown.change(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=input_dataframe)
def _set_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> gr.DataFrame:
    """Load a 10-row preview of the selected dataset split.

    The cache lives on ``_load_preview`` rather than on this handler:
    ``loading_codes`` arrives as a list, which is unhashable, so wrapping the
    handler itself in ``lru_cache`` raises ``TypeError`` on every call.

    Args:
        dataset: Selected dataset id.
        subset: Selected subset (config name).
        split: Selected split name.
        loading_codes: Loading-code dicts, each with a ``config_name`` and an
            ``arguments["splits"]`` mapping of split name to file pattern.

    Returns:
        A dataframe component holding the preview, or the ``EMPTY_DF``
        placeholder when the selection is incomplete or has no file pattern.
    """
    # ``.get`` tolerates a split left over from a previously selected subset.
    pattern = next(
        (
            loading_code["arguments"]["splits"].get(split)
            for loading_code in loading_codes or []
            if loading_code["config_name"] == subset
        ),
        None,
    )
    if dataset and subset and split and pattern:
        df = _load_preview(dataset, pattern)
    else:
        df = EMPTY_DF
    return gr.DataFrame(df, column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))
|
117 |
+
|
118 |
+
@input_dataframe.change(inputs=input_dataframe, outputs=transform_dropdowns)
def _set_transforms(input_df: pd.DataFrame):
    """Rebuild the per-column transform dropdowns for a new input frame.

    Each column gets a visible dropdown. Its choices are the bare column name
    plus every text function from ``text_functions_df`` that takes a
    ``string`` argument, with that argument replaced by the column name.
    Hidden placeholder dropdowns pad the result.

    Args:
        input_df: The frame whose columns drive the dropdowns.

    Returns:
        Exactly ``MAX_NUM_COLUMNS`` dropdown updates. Columns beyond
        ``MAX_NUM_COLUMNS`` get no dropdown, because returning more updates
        than there are registered output components is an error.
    """
    # The template filter does not depend on the column; compute it once.
    templates = [text_func for text_func in text_functions_df.Name if "string" in text_func]
    column_names = list(input_df.columns)[:MAX_NUM_COLUMNS]
    new_transform_dropdowns = [
        gr.Dropdown(
            choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in templates],
            value=column_name,
            container=False,
            interactive=True,
            allow_custom_value=True,
            visible=True,
        )
        for column_name in column_names
    ]
    new_transform_dropdowns += [
        gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False)
        for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))
    ]
    return new_transform_dropdowns
|
123 |
+
|
124 |
+
def _set_dataframe(input_df: pd.DataFrame, *transforms: str, column_index: int):
    """Apply the selected column transforms to ``input_df`` with DuckDB.

    Args:
        input_df: The source frame. The name matters: the SQL text refers to
            ``input_df`` and relies on DuckDB resolving it to this local
            variable.
        *transforms: One SQL expression per dropdown; empty or ``None``
            entries (hidden dropdowns) are skipped.
        column_index: Index of the dropdown that triggered the event. It is
            bound by the caller and not used here.

    Returns:
        A pandas DataFrame with one column per non-empty transform.

    Raises:
        gr.Error: If DuckDB rejects or fails to execute the query.
    """
    query = f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_df;"
    print(query)
    try:
        # Materialise inside the ``try``: the relation is lazy, so execution
        # errors would otherwise surface outside this handler.
        return duckdb.sql(query).df()
    except Exception as e:
        raise gr.Error(f"{type(e).__name__}: {e}") from e
|
131 |
+
|
132 |
+
for column_index, transform_dropdown in enumerate(transform_dropdowns):
|
133 |
+
transform_dropdown.change(partial(_set_dataframe, column_index=column_index), inputs=[input_dataframe] + transform_dropdowns, outputs=dataframe)
|
134 |
+
|
135 |
|
136 |
|
137 |
if __name__ == "__main__":
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
duckdb
|
text_functions.tsv
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Name Description
|
2 |
+
string ^@ search_string Return true if string begins with search_string.
|
3 |
+
string || string Concatenate two strings. Any NULL input results in NULL. See also concat(string, ...).
|
4 |
+
string[index] Extract a single character using a (1-based) index.
|
5 |
+
string[begin:end] Extract a string using slice conventions, see slicing.
|
6 |
+
string LIKE target Returns true if the string matches the like specifier (see Pattern Matching).
|
7 |
+
string SIMILAR TO regex Returns true if the string matches the regex; identical to regexp_full_match (see Pattern Matching).
|
8 |
+
array_extract(list, index) Extract a single character using a (1-based) index.
|
9 |
+
array_slice(list, begin, end) Extract a string using slice conventions. Negative values are accepted.
|
10 |
+
ascii(string) Returns an integer that represents the Unicode code point of the first character of the string.
|
11 |
+
bar(x, min, max[, width]) Draw a band whose width is proportional to (x - min) and equal to width characters when x = max. width defaults to 80.
|
12 |
+
bit_length(string) Number of bits in a string.
|
13 |
+
chr(x)	Returns the character corresponding to the given ASCII code value or Unicode code point.
|
14 |
+
concat_ws(separator, string, ...) Concatenate many strings, separated by separator. NULL inputs are skipped.
|
15 |
+
concat(string, ...) Concatenate many strings. NULL inputs are skipped. See also string || string.
|
16 |
+
contains(string, search_string) Return true if search_string is found within string.
|
17 |
+
ends_with(string, search_string) Return true if string ends with search_string.
|
18 |
+
format_bytes(bytes) Converts bytes to a human-readable representation using units based on powers of 2 (KiB, MiB, GiB, etc.).
|
19 |
+
format(format, parameters, ...) Formats a string using the fmt syntax.
|
20 |
+
from_base64(string) Convert a base64 encoded string to a character string.
|
21 |
+
greatest(x1, x2, ...) Selects the largest value using lexicographical ordering. Note that lowercase characters are considered “larger” than uppercase characters and collations are not supported.
|
22 |
+
hash(value) Returns a UBIGINT with the hash of the value.
|
23 |
+
ilike_escape(string, like_specifier, escape_character) Returns true if the string matches the like_specifier (see Pattern Matching) using case-insensitive matching. escape_character is used to search for wildcard characters in the string.
|
24 |
+
instr(string, search_string) Return location of first occurrence of search_string in string, counting from 1. Returns 0 if no match found.
|
25 |
+
least(x1, x2, ...) Selects the smallest value using lexicographical ordering. Note that uppercase characters are considered “smaller” than lowercase characters, and collations are not supported.
|
26 |
+
left_grapheme(string, count) Extract the left-most grapheme clusters.
|
27 |
+
left(string, count) Extract the left-most count characters.
|
28 |
+
length_grapheme(string) Number of grapheme clusters in string.
|
29 |
+
length(string) Number of characters in string.
|
30 |
+
like_escape(string, like_specifier, escape_character) Returns true if the string matches the like_specifier (see Pattern Matching) using case-sensitive matching. escape_character is used to search for wildcard characters in the string.
|
31 |
+
lower(string) Convert string to lower case.
|
32 |
+
lpad(string, count, character) Pads the string with the character from the left until it has count characters.
|
33 |
+
ltrim(string, characters) Removes any occurrences of any of the characters from the left side of the string.
|
34 |
+
ltrim(string) Removes any spaces from the left side of the string.
|
35 |
+
md5(string) Returns the MD5 hash of the string as a VARCHAR.
|
36 |
+
md5_number(string) Returns the MD5 hash of the string as a HUGEINT.
|
37 |
+
md5_number_lower(string) Returns the lower 64-bit segment of the MD5 hash of the string as a BIGINT.
|
38 |
+
md5_number_higher(string) Returns the higher 64-bit segment of the MD5 hash of the string as a BIGINT.
|
39 |
+
nfc_normalize(string) Convert string to Unicode NFC normalized string. Useful for comparisons and ordering if text data is mixed between NFC normalized and not.
|
40 |
+
not_ilike_escape(string, like_specifier, escape_character)	Returns false if the string matches the like_specifier (see Pattern Matching) using case-insensitive matching. escape_character is used to search for wildcard characters in the string.
|
41 |
+
not_like_escape(string, like_specifier, escape_character)	Returns false if the string matches the like_specifier (see Pattern Matching) using case-sensitive matching. escape_character is used to search for wildcard characters in the string.
|
42 |
+
ord(string) Return ASCII character code of the leftmost character in a string.
|
43 |
+
parse_dirname(path, separator) Returns the top-level directory name from the given path. separator options: system, both_slash (default), forward_slash, backslash.
|
44 |
+
parse_dirpath(path, separator) Returns the head of the path (the pathname until the last slash) similarly to Python's os.path.dirname function. separator options: system, both_slash (default), forward_slash, backslash.
|
45 |
+
parse_filename(path, trim_extension, separator) Returns the last component of the path similarly to Python's os.path.basename function. If trim_extension is true, the file extension will be removed (defaults to false). separator options: system, both_slash (default), forward_slash, backslash.
|
46 |
+
parse_path(path, separator) Returns a list of the components (directories and filename) in the path similarly to Python's pathlib.parts function. separator options: system, both_slash (default), forward_slash, backslash.
|
47 |
+
position(search_string IN string) Return location of first occurrence of search_string in string, counting from 1. Returns 0 if no match found.
|
48 |
+
printf(format, parameters...) Formats a string using printf syntax.
|
49 |
+
read_text(source) Returns the content from source (a filename, a list of filenames, or a glob pattern) as a VARCHAR. The file content is first validated to be valid UTF-8. If read_text attempts to read a file with invalid UTF-8 an error is thrown suggesting to use read_blob instead. See the read_text guide for more details.
|
50 |
+
regexp_escape(string) Escapes special patterns to turn string into a regular expression similarly to Python's re.escape function.
|
51 |
+
regexp_extract(string, pattern[, group = 0]) If string contains the regexp pattern, returns the capturing group specified by optional parameter group (see Pattern Matching).
|
52 |
+
regexp_extract(string, pattern, name_list) If string contains the regexp pattern, returns the capturing groups as a struct with corresponding names from name_list (see Pattern Matching).
|
53 |
+
regexp_extract_all(string, regex[, group = 0]) Split the string along the regex and extract all occurrences of group.
|
54 |
+
regexp_full_match(string, regex) Returns true if the entire string matches the regex (see Pattern Matching).
|
55 |
+
regexp_matches(string, pattern) Returns true if string contains the regexp pattern, false otherwise (see Pattern Matching).
|
56 |
+
regexp_replace(string, pattern, replacement) If string contains the regexp pattern, replaces the matching part with replacement (see Pattern Matching).
|
57 |
+
regexp_split_to_array(string, regex) Splits the string along the regex.
|
58 |
+
regexp_split_to_table(string, regex) Splits the string along the regex and returns a row for each part.
|
59 |
+
repeat(string, count) Repeats the string count number of times.
|
60 |
+
replace(string, source, target) Replaces any occurrences of the source with target in string.
|
61 |
+
reverse(string) Reverses the string.
|
62 |
+
right_grapheme(string, count) Extract the right-most count grapheme clusters.
|
63 |
+
right(string, count) Extract the right-most count characters.
|
64 |
+
rpad(string, count, character) Pads the string with the character from the right until it has count characters.
|
65 |
+
rtrim(string, characters) Removes any occurrences of any of the characters from the right side of the string.
|
66 |
+
rtrim(string) Removes any spaces from the right side of the string.
|
67 |
+
sha256(value) Returns a VARCHAR with the SHA-256 hash of the value.
|
68 |
+
split_part(string, separator, index) Split the string along the separator and return the data at the (1-based) index of the list. If the index is outside the bounds of the list, return an empty string (to match PostgreSQL's behavior).
|
69 |
+
starts_with(string, search_string) Return true if string begins with search_string.
|
70 |
+
str_split_regex(string, regex) Splits the string along the regex.
|
71 |
+
string_split_regex(string, regex) Splits the string along the regex.
|
72 |
+
string_split(string, separator) Splits the string along the separator.
|
73 |
+
strip_accents(string) Strips accents from string.
|
74 |
+
strlen(string) Number of bytes in string.
|
75 |
+
strpos(string, search_string) Return location of first occurrence of search_string in string, counting from 1. Returns 0 if no match found.
|
76 |
+
substring(string, start, length) Extract substring of length characters starting from character start. Note that a start value of 1 refers to the first character of the string.
|
77 |
+
substring_grapheme(string, start, length) Extract substring of length grapheme clusters starting from character start. Note that a start value of 1 refers to the first character of the string.
|
78 |
+
to_base64(blob) Convert a blob to a base64 encoded string.
|
79 |
+
trim(string, characters) Removes any occurrences of any of the characters from either side of the string.
|
80 |
+
trim(string) Removes any spaces from either side of the string.
|
81 |
+
unicode(string) Returns the Unicode code of the first character of the string.
|
82 |
+
upper(string) Convert string to upper case.
|