"""Gradio app for browsing dataset samples and flagging bad ones.

Samples are streamed from per-dataset JSON-lines files under ``data/``.
Flagged samples are uploaded as one-record JSON-lines reports to a Hugging
Face dataset repository.
"""
import os
import tempfile
import uuid
from datetime import datetime
from pprint import pprint

import gradio as gr
import jsonlines
from huggingface_hub import HfApi

datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]

# Dataset repository that receives the flagged-sample reports.
REPORT_REPO_ID = "HuggingFaceGECLM/data_feedback"
REASON_PLACEHOLDER = "Provide the reason for flagging if you think the sample is bad."
NO_SAMPLE_MESSAGE = "No samples available for the selected dataset."


def line_generator(dataset):
    """Yield the records of ``dataset``'s example file one at a time.

    Args:
        dataset: Name of the dataset; expected to be one of ``datasets``.

    Yields:
        Each record read from ``data/<dataset>_examples_with_stats.json``.
        Nothing is yielded for a name that is not listed in ``datasets``.
    """
    if dataset not in datasets:
        return
    # Every dataset follows the same file-naming pattern. This also fixes
    # "cc_filtered_text", which used to read the reddit_threaded file.
    path = f"data/{dataset}_examples_with_stats.json"
    with jsonlines.open(path, "r") as reader:
        yield from reader


# One shared generator per dataset; generators are lazy, so a file is only
# opened when its first sample is requested.
line_generators = {dataset: line_generator(dataset) for dataset in datasets}


def send_report(sample, dataset, reason, annotator):
    """Upload a report about a flagged sample to the feedback repository.

    Args:
        sample: Record of the flagged sample; must contain a ``"text"`` key.
            All remaining keys are sent as metadata. The dict is not modified.
        dataset: Name of the dataset the sample was drawn from.
        reason: Free-text reason given for flagging.
        annotator: Name entered by the annotator (may be empty).

    Raises:
        KeyError: If ``sample`` has no ``"text"`` key.
    """
    text = sample["text"]
    # Work on a copy so that a failed upload leaves the caller's sample intact.
    metadata = {key: value for key, value in sample.items() if key != "text"}

    # Prefer the record id, fall back to its title, otherwise leave it empty.
    if "id" in sample:
        sample_id = sample["id"]
    elif "title" in sample:
        sample_id = sample["title"]
    else:
        sample_id = ""

    # Build the payload once so the logged and uploaded reports are identical.
    report = {
        "dataset": dataset,
        "docid": sample_id,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "annotator": annotator,
        "timestamp": str(datetime.now()),
    }

    print("submitting")
    pprint(report)

    token = os.environ.get("geclm_token")
    # Never log the token itself; only whether it is configured.
    print("geclm_token configured:", token is not None)

    # Use a private temporary directory so concurrent reports cannot
    # overwrite each other's file before it is uploaded.
    with tempfile.TemporaryDirectory() as tmp_dir:
        report_path = os.path.join(tmp_dir, "report.jsonl")
        with jsonlines.open(report_path, "w") as writer:
            writer.write(report)

        api = HfApi()
        api.upload_file(
            path_or_fileobj=report_path,
            path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
            repo_id=REPORT_REPO_ID,
            repo_type="dataset",
            token=token,
        )


def _next_sample(dataset):
    """Return the next record for ``dataset``, or ``None`` if there is none."""
    generator = line_generators.get(dataset)
    if generator is None:
        return None
    return next(generator, None)


def _format_sample(sample):
    """Return the sample text wrapped in newlines for the Markdown widget."""
    return "\n" + sample["text"] + "\n"


if __name__ == "__main__":
    demo = gr.Blocks()

    with demo:
        current_sample_state = gr.State(dict())

        with gr.Row():
            annotator = gr.Textbox(
                lines=1,
                max_lines=1,
                placeholder="Type your name here if you'd like it to be recorded.",
                label="Annotator",
            )
        with gr.Row():
            dataset = gr.Dropdown(
                choices=datasets,
                value="Pick a dataset below",
                label="Dataset",
            )
        with gr.Row():
            reason_txt = gr.Textbox(
                label="Flagging reason",
                placeholder=REASON_PLACEHOLDER,
                visible=False,
            )
        with gr.Row():
            bad_btn = gr.Button("Bad", visible=False)
            good_btn = gr.Button("Next", visible=False)
        with gr.Row():
            text = gr.Markdown(visible=False)

        def next_line(dataset):
            """Show the next sample of ``dataset`` and reveal the controls."""
            sample = _next_sample(dataset)
            if sample is None:
                # Nothing left to show: keep "Next" but hide the flagging UI.
                return [
                    gr.update(value=NO_SAMPLE_MESSAGE, visible=True),
                    {},
                    gr.update(visible=False),
                    gr.update(visible=True),
                    gr.update(visible=False),
                ]
            return [
                gr.update(value=_format_sample(sample), visible=True),
                sample,
                gr.update(visible=True),
                gr.update(visible=True),
                gr.update(visible=True),
            ]

        def bad_line(current_sample, dataset, reason, annotator):
            """Report the current sample as bad, then advance to the next one."""
            # An empty state means there is no sample on screen to report.
            if current_sample and "text" in current_sample:
                send_report(current_sample, dataset, reason, annotator)

            cleared_reason = gr.update(value="", placeholder=REASON_PLACEHOLDER)
            sample = _next_sample(dataset)
            if sample is None:
                return [NO_SAMPLE_MESSAGE, cleared_reason, {}]
            return [_format_sample(sample), cleared_reason, sample]

        good_btn.click(
            next_line,
            inputs=dataset,
            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
        )
        dataset.change(
            next_line,
            inputs=dataset,
            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
        )
        bad_btn.click(
            bad_line,
            inputs=[current_sample_state, dataset, reason_txt, annotator],
            outputs=[text, reason_txt, current_sample_state],
        )

    demo.launch(enable_queue=False, debug=True)