import io
from pathlib import Path
from urllib.parse import urlparse, parse_qs

import gradio as gr
import pandas as pd
import spaces

from generate import model_id, stream_jsonl_file, batch_size

MAX_SIZE = 20 * batch_size  # hard cap on rows per request; duplicate the Space to remove it
DEFAULT_SEED = 42
DEFAULT_SIZE = 5 * batch_size  # default number of rows when no "size" parameter is given

@spaces.GPU(duration=120)  # ZeroGPU: allocate a GPU for up to 120 seconds per call
def stream_output(query: str, continue_content: str = ""):
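    """Parse the requested file name and stream generated rows batch by batch.

    `query` is a file name that may carry URL-style parameters, e.g.
    "common_first_names.jsonl?columns=first_name,popularity&size=10".
    `continue_content` holds previously generated JSON Lines when resuming.
    """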
    query = Path(query).name
    parsed_filename = urlparse(query)
    filename = parsed_filename.path
    params = parse_qs(parsed_filename.query)
    prompt = params["prompt"][0] if "prompt" in params else ""
    columns = [column.strip() for column in params["columns"][0].split(",") if column.strip()] if "columns" in params else []
    size = int(params["size"][0]) if "size" in params else DEFAULT_SIZE
    seed = int(params["seed"][0]) if "seed" in params else DEFAULT_SEED
    if size > MAX_SIZE:
        raise gr.Error(f"Maximum size is {MAX_SIZE}. Duplicate this Space to remove this limit.")
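    # Start from any previously generated content so new rows are appended to it.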
    content = continue_content
    df = pd.read_json(io.StringIO(content), lines=True, convert_dates=False)
    continue_content_size = len(df)
    state_msg = f"⚙️ Generating... [{continue_content_size + 1}/{continue_content_size + size}]"
    if list(df.columns):
        # Continuing an existing file: reuse its columns.
        columns = list(df.columns)
    else:
        # Nothing generated yet: show an empty placeholder table.
        df = pd.DataFrame({"1": [], "2": [], "3": []})
    yield df, "```json\n" + content + "\n```", gr.Button(state_msg), gr.Button("Generate one more batch", interactive=False), gr.DownloadButton("⬇️ Download", interactive=False)
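    # Stream each generated chunk, updating the table, the raw file view and the buttons.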
    for i, chunk in enumerate(stream_jsonl_file(
        filename=filename,
        prompt=prompt,
        columns=columns,
        seed=seed + (continue_content_size // size),
        size=size,
    )):
        content += chunk
        df = pd.read_json(io.StringIO(content), lines=True, convert_dates=False)
        state_msg = f"⚙️ Generating... [{continue_content_size + i + 1}/{continue_content_size + size}]"
        yield df, "```json\n" + content + "\n```", gr.Button(state_msg), gr.Button("Generate one more batch", interactive=False), gr.DownloadButton("⬇️ Download", interactive=False)
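    # Write the full content to disk so it can be downloaded or extended with another batch.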
    with open(query, "w", encoding="utf-8") as f:
        f.write(content)
    yield df, "```json\n" + content + "\n```", gr.Button("Generate dataset"), gr.Button("Generate one more batch", visible=True, interactive=True), gr.DownloadButton("⬇️ Download", value=query, visible=True, interactive=True)


def stream_more_output(query: str):
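    """Reload the previously generated file and continue generating from it."""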
    query = Path(query).name
    with open(query, "r", encoding="utf-8") as f:
        continue_content = f.read()
    yield from stream_output(query=query, continue_content=continue_content)


title = "LLM DataGen"
description = (
    f"Generate and stream synthetic dataset files in `{{JSON Lines}}` format (currently using [{model_id}](https://huggingface.co/{model_id}))\n\n"
    "Disclaimer: LLM data generation is an area of active research with known problems such as biased generation and incorrect information."
)
# Example file names; URL-style parameters can set the prompt, columns, size and seed.
examples = [
    "movies_data.jsonl",
    "dungeon_and_dragon_characters.jsonl",
    "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
    "common_first_names.jsonl?columns=first_name,popularity&size=10",
]

# Gradio UI: file name input, generated table, raw file content, and the action buttons.
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    filename_comp = gr.Textbox(examples[0], placeholder=examples[0], label="File name to generate")
    generate_button = gr.Button("Generate dataset")
    with gr.Tab("Dataset"):
        dataframe_comp = gr.DataFrame()
    with gr.Tab("File content"):
        file_content_comp = gr.Markdown()
    with gr.Row():
        generate_more_button = gr.Button("Generate one more batch", visible=False, interactive=False, scale=3)
        download_button = gr.DownloadButton("⬇️ Download", visible=False, interactive=False, scale=1)
    outputs = [dataframe_comp, file_content_comp, generate_button, generate_more_button, download_button]
    examples = gr.Examples(examples, filename_comp, outputs, fn=stream_output, run_on_click=True)
    generate_button.click(stream_output, filename_comp, outputs)
    generate_more_button.click(stream_more_output, filename_comp, outputs)


demo.launch()