"""Gradio app: search Hugging Face datasets, preview rows, and export a CSV.

The user picks one or more datasets from a checkbox list, sees the first
three rows of each, and can build a combined table with N rows per dataset
that is also written to ``output.csv`` for download.
"""
import json
import os
import random
import sys
import time

import duckdb
import gradio as gr
import pandas as pd
from datasets import load_dataset
from huggingface_hub import HfApi

custom_css = """ * { animation: gow 3s 1 forwards; } @keyframes gow { from { transform: scale(0.1); } to { transform: scale(1.0); } } """
head_js = """ """

api = HfApi()
# Initial choices for the checkbox group, queried once at start-up.
datasets = api.list_datasets(
    filter="task_categories:text-generation",
    language="en",
    gated=False,
    limit=100,
)
outf = './output.csv'
lst = []

# Runs once in the browser on page load: paints every element with one random
# RGBA colour, then walks up from the "moish" button and paints its ancestors.
# Fixes relative to the previous script: the ancestor loop referenced an
# undefined identifier (`bkd`) and climbed via parentNode up to `document`,
# which has no `.style`; it now reuses `colr`, climbs via parentElement and
# tolerates a missing button.
_ON_LOAD_JS = (
    "() =>{var colr = 'rgba('+Math.floor(Math.random() * 256)+','"
    "+Math.floor(Math.random() * 256)+','+Math.floor(Math.random() * 256)+','"
    "+(Math.random() * 1)+')'; "
    "document.querySelectorAll('*').forEach(item =>{ item.style.backgroundColor=colr; }); "
    "var tin = document.getElementById('moish'); var parents=[]; "
    "function getAllParentNodes(element) {"
    "while (element && element.parentElement) {"
    "element = element.parentElement; element.style.background = colr; "
    "parents.push(element); }; }; "
    "getAllParentNodes(tin);}"
)


def looky(value):
    """Search the Hub for *value* and return a refreshed checkbox group.

    Args:
        value: Free-text query taken from the search box.

    Returns:
        A ``gr.CheckboxGroup`` whose choices are the ids of up to 100
        matching English, non-gated datasets.
    """
    matches = api.list_datasets(
        search=f"{value}", language="en", gated=False, limit=100
    )
    return gr.CheckboxGroup([d.id for d in matches], label="Select Datasets")


def preview(selected):
    """Return a dataframe holding the first three rows of each selected dataset.

    Args:
        selected: Dataset ids ticked in the checkbox group.

    Returns:
        A ``gr.Dataframe`` built from the streamed sample rows.
    """
    rows = []
    for dataset_id in selected or []:
        # Streaming avoids downloading a whole dataset just to show 3 rows.
        sample = load_dataset(dataset_id, split='train', streaming=True).take(3)
        rows.extend(sample)
    fd = pd.DataFrame(rows)
    return gr.Dataframe(headers=["Dataset", "Sample"], value=fd)


def build_dataset(selected_datasets, num_samples):
    """Combine rows from the selected datasets and export them as CSV.

    Args:
        selected_datasets: Dataset ids to pull rows from.
        num_samples: Rows to take from each dataset. The value comes from a
            number input, so it may be a float or ``None``; it is coerced to
            a non-negative integer.

    Returns:
        A ``(rows, csv_path)`` tuple: the combined rows as a list of tuples
        and the path of the written CSV. When nothing was selected or no
        rows were fetched, ``([], None)`` is returned and no file is written.
    """
    outf = './output.csv'
    sample_count = max(0, int(num_samples or 0))

    combined_data = []
    for dataset in selected_datasets or []:
        data = load_dataset(dataset, split='train', streaming=True).take(sample_count)
        combined_data.extend(data)

    # Nothing to export; skip handing DuckDB a column-less frame.
    if not combined_data:
        return [], None

    # NOTE: DuckDB resolves `df` in the SQL below by looking up this local
    # variable by name, so the name must stay `df`.
    df = pd.DataFrame(combined_data)
    con = duckdb.connect(database=':memory:')
    try:
        con.execute("CREATE TABLE dataset AS SELECT * FROM df")
        result = con.execute("SELECT * FROM dataset").fetchall()
        con.execute("COPY (SELECT * FROM dataset) TO 'output.csv' (HEADER, DELIMITER ',');")
    finally:
        # Release the in-memory database even if a query fails.
        con.close()
    return result, outf


# NOTE(review): the exact nesting of components inside the Accordion/Row could
# not be recovered from the whitespace-collapsed original; confirm the layout.
with gr.Blocks(head=head_js, css=custom_css) as iface:
    frst_sample = gr.Dataframe(value=None, label="View 3 Samples per selected dataset")
    srchbx = gr.Textbox(
        label="Search datasets",
        placeholder=(
            "Search Datasets on the Hub. Type query..hit Enter.. \n"
            "this will update the dataset list below.."
        ),
    )
    with gr.Accordion("Multi-Select Datasets", open=False):
        with gr.Row():
            dataset_selector = gr.CheckboxGroup(
                [d.id for d in datasets], label="Multi-Select Datasets"
            )
    num_samples_input = gr.Number(value=10, label="Number of Samples to retrieve per Dataset")
    build_button = gr.Button("Build Dataset", elem_id="moish")
    out_way = gr.File()
    output_display = gr.Dataframe(headers=["Dataset", "Sample"])

    build_button.click(
        fn=build_dataset,
        inputs=[dataset_selector, num_samples_input],
        outputs=[output_display, out_way],
    )
    dataset_selector.change(preview, dataset_selector, frst_sample)
    # Search on Enter, as the placeholder tells the user, instead of issuing
    # a Hub query on every keystroke.
    srchbx.submit(looky, srchbx, dataset_selector)
    iface.load(None, None, None, js=_ON_LOAD_JS)

iface.launch(debug=True)