File size: 2,396 Bytes
4f83ec0
 
 
 
 
 
 
6447366
 
 
4f83ec0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6447366
4f83ec0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import io
import json
import time

import gradio as gr
import pandas as pd
import spaces

from generate import stream_file


@spaces.GPU(duration=120)
def stream_output(filename: str):
    """Stream a generated JSON Lines dataset for ``filename`` to the UI.

    A trailing ``.jsonl`` extension is stripped before the name is passed to
    ``stream_file``. After every chunk the accumulated text is re-parsed, so
    each yielded table reflects everything received so far.

    Args:
        filename: Dataset file name, with or without the ``.jsonl`` suffix.

    Yields:
        ``(df, file_markdown, state_msg)`` tuples: the parsed
        ``pandas.DataFrame``, the raw text wrapped in a fenced ``json``
        Markdown block, and a progress/completion status message.
    """
    suffix = ".jsonl"
    if filename.endswith(suffix):
        filename = filename[:-len(suffix)]
    content = ""
    size = 3
    start_time = time.time()
    for i, chunk in enumerate(stream_file(
        filename=filename,
        prompt="",
        columns=[],
        seed=42,
        size=size,
    )):
        content += chunk
        # NOTE(review): this assumes every chunk ends on a complete JSON line
        # and that one chunk corresponds to one sample -- confirm against
        # generate.stream_file.
        df = pd.read_json(io.StringIO(content), lines=True)
        # `i` is zero-based; report how many chunks have actually arrived.
        received = i + 1
        state_msg = (
            f"✅ Done generating {size} samples in {time.time() - start_time:.2f}s"
            if received == size else
            f"⚙️ Generating... [{received}/{size}]"
        )
        yield df, "```json\n" + content + "\n```", state_msg

def test(filename: str):
    """Yield a fake, slowly growing dataset without calling the generator.

    Produces the same ``(df, file_markdown, state_msg)`` tuples as the real
    streaming function so it can be wired to the same output components.

    Args:
        filename: Requested file name; must end with ``.jsonl``.

    Yields:
        ``(df, file_markdown, state_msg)`` tuples. For an invalid file name a
        single ``(None, "", error_message)`` tuple is yielded instead.
    """
    if not filename.endswith(".jsonl"):
        # Keep the (dataframe, file content, status) slot order so the error
        # text is delivered as the status message, not as the dataframe.
        yield None, "", "❌ 404: File name must end with .jsonl"
        return

    content = ""
    size = 10
    start_time = time.time()
    for i in range(size):
        # json.dumps escapes quotes/backslashes in the user-supplied name so
        # every line stays valid JSON.
        content += json.dumps({"i": i, "filename": filename}, ensure_ascii=False) + "\n"
        df = pd.read_json(io.StringIO(content), lines=True)
        # `i` is zero-based; report how many rows exist so far.
        produced = i + 1
        state_msg = (
            f"✅ Done generating {size} samples in {time.time() - start_time:.2f}s"
            if produced == size else
            f"⚙️ Generating... [{produced}/{size}]"
        )
        yield df, "```json\n" + content + "\n```", state_msg
        # Simulate generation latency between rows (not after the last one).
        if produced < size:
            time.sleep(0.1)
# Heading text; rendered as a level-1 Markdown title at the top of the page.
title = "LLM DataGen"
# Short summary rendered as Markdown right below the heading.
description = "Generate and stream synthetic dataset files in JSON Lines format"
# Sample file names: the first one pre-fills the textbox (value and
# placeholder) and the whole list is offered through gr.Examples.
examples = [
    "movies_data.jsonl",
    "common_first_names.jsonl",
    "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
    "dungeon_and_dragon_characters.jsonl"
]

# Build the UI. Components appear on the page in the order they are created.
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    # File-name input, pre-filled with the first example.
    filename_comp = gr.Textbox(examples[0], placeholder=examples[0])
    # Example file names that target the textbox above.
    gr.Examples(examples, filename_comp)
    generate_button = gr.Button("Generate dataset")
    # Status line; replaced by the state message yielded during generation.
    state_msg_comp = gr.Markdown("🔥 Ready to generate")
    # Tab 1: the generated rows as a table.
    with gr.Tab("Dataset"):
        dataframe_comp = gr.DataFrame()
    # Tab 2: the raw file content rendered as Markdown.
    with gr.Tab("File content"):
        # NOTE(review): nested Blocks with fill_height=True is presumably
        # meant to stretch the content area -- confirm it has an effect here.
        with gr.Blocks(fill_height=True):
            with gr.Row():
                file_content_comp = gr.Markdown()

    # The outputs list order must match the tuples yielded by stream_output:
    # (dataframe, file content markdown, state message).
    generate_button.click(stream_output, filename_comp, [dataframe_comp, file_content_comp, state_msg_comp])


# Start the app; this runs at import time as well as when executed as a script.
demo.launch()