"""Gradio app that generates a small JSON Lines dataset and streams it to the UI.

The user supplies a file name; each generated chunk is appended to the
running file content and the UI is refreshed with the parsed DataFrame, the
raw content, and a status line.
"""

import io
import time

import gradio as gr
import pandas as pd
import spaces

from generate import stream_file

_JSONL_SUFFIX = ".jsonl"


def _state_message(index: int, size: int, start_time: float) -> str:
    """Return the status line to show after the sample at 0-based *index*.

    Args:
        index: 0-based position of the sample that was just produced.
        size: Total number of samples expected.
        start_time: ``time.time()`` value captured when generation started.
    """
    done = index + 1
    if done == size:
        return f"✅ Done generating {size} samples in {time.time() - start_time:.2f}s"
    # Report the number of samples already produced (1-based); showing the raw
    # 0-based index displayed "[0/N]" while the first sample was already visible.
    return f"⚙️ Generating... [{done}/{size}]"


def _as_json_markdown(content: str) -> str:
    """Wrap *content* in a fenced ``json`` block for the Markdown component."""
    return "```json\n" + content + "\n```"


# NOTE(review): `spaces.GPU(duration=120)` presumably requests a GPU slot for
# up to 120 seconds per call -- confirm against the `spaces` documentation.
@spaces.GPU(duration=120)
def stream_output(filename: str):
    """Stream generated samples for *filename* to the UI.

    A trailing ``.jsonl`` is stripped from *filename* before it is handed to
    ``stream_file``.

    Args:
        filename: Dataset file name, with or without the ``.jsonl`` suffix.

    Yields:
        ``(dataframe, file_content_markdown, state_message)`` after every
        chunk received from ``stream_file``.
    """
    if filename.endswith(_JSONL_SUFFIX):
        filename = filename[: -len(_JSONL_SUFFIX)]
    content = ""
    size = 3
    start_time = time.time()
    chunks = stream_file(
        filename=filename,
        prompt="",
        columns=[],
        seed=42,
        size=size,
    )
    # Assumes each chunk is one complete JSON Lines record: the accumulated
    # text is re-parsed after every chunk and the "done" check counts chunks
    # -- TODO confirm against `generate.stream_file`.
    for i, chunk in enumerate(chunks):
        content += chunk
        df = pd.read_json(io.StringIO(content), lines=True)
        yield df, _as_json_markdown(content), _state_message(i, size, start_time)


def test(filename: str):
    """Offline stand-in for ``stream_output`` that fabricates ten tiny rows.

    Args:
        filename: Dataset file name; must end with ``.jsonl``.

    Yields:
        ``(dataframe, file_content_markdown, state_message)`` tuples, in the
        same order as ``stream_output``. When *filename* is invalid a single
        tuple carrying the error in the state-message slot is yielded.
    """
    if not filename.endswith(_JSONL_SUFFIX):
        # Keep the (dataframe, file content, state message) order used by every
        # other yield; the error text previously landed in the dataframe slot.
        yield None, "", "❌ 404: File name must end with .jsonl"
        return

    content = ""
    size = 10
    start_time = time.time()
    for i in range(size):
        content += f'{{"i": {i}, "filename": "{filename}"}}\n'
        df = pd.read_json(io.StringIO(content), lines=True)
        yield df, _as_json_markdown(content), _state_message(i, size, start_time)
        # Short pause between rows so the updates arrive one at a time.
        time.sleep(0.1)


title = "LLM DataGen"
description = "Generate and stream synthetic dataset files in JSON Lines format"
examples = [
    "movies_data.jsonl",
    "common_first_names.jsonl",
    "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
    "dungeon_and_dragon_characters.jsonl",
]

with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    filename_comp = gr.Textbox(examples[0], placeholder=examples[0])
    gr.Examples(examples, filename_comp)
    generate_button = gr.Button("Generate dataset")
    state_msg_comp = gr.Markdown("🔥 Ready to generate")
    with gr.Tab("Dataset"):
        dataframe_comp = gr.DataFrame()
    with gr.Tab("File content"):
        with gr.Blocks(fill_height=True):
            with gr.Row():
                file_content_comp = gr.Markdown()

    # The outputs order must match the tuples yielded by `stream_output`.
    generate_button.click(
        stream_output,
        filename_comp,
        [dataframe_comp, file_content_comp, state_msg_comp],
    )


demo.launch()