import json
import math
import os
import uuid
from functools import partial

import jsonlines
import streamlit as st
import streamlit.components.v1 as components
from huggingface_hub import HfApi

BAD_EXAMPLES_PATH = "bad_examples"
DATA_PATH = "data"


def report_result_dataset(dataset, docid, text, metadata, reason, annotator):
    """Write one feedback record to ``report.jsonl`` and upload it to the Hub.

    The record is uploaded to the ``HuggingFaceGECLM/data_feedback`` dataset
    repository under a unique ``report-<uuid4>.jsonl`` name, authenticated
    with the token read from the ``geclm_token`` environment variable.

    Args:
        dataset: Name of the dataset the flagged document belongs to.
        docid: Identifier of the flagged document.
        text: Text of the flagged document.
        metadata: Remaining fields of the document (the caller in this file
            passes them as a string).
        reason: Why the document was flagged.
        annotator: Who flagged the document.
    """
    with jsonlines.open("report.jsonl", "w") as f:
        f.write(
            {
                "dataset": dataset,
                "docid": docid,
                "text": text,
                "metadata": metadata,
                "reason": reason,
                "annotator": annotator,
            }
        )
    api = HfApi()
    api.upload_file(
        path_or_fileobj="report.jsonl",
        path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
        repo_id="HuggingFaceGECLM/data_feedback",
        repo_type="dataset",
        token=os.environ.get("geclm_token"),
    )


def load_jsonl(file_path):
    """Return the list of JSON objects stored one per line in *file_path*.

    The file is decoded as UTF-8 regardless of the platform default, and
    blank lines (for example a trailing newline at the end of the file) are
    skipped instead of raising ``json.JSONDecodeError``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


if "idx" not in st.session_state:
    st.session_state.idx = 0


def get_next_item():
    """Advance the session's example index by one."""
    st.session_state.idx += 1


def save_flag_and_get_next_item(sample, issue):
    """Record *sample* as a bad example, report it, and move to the next one.

    The sample (with an added ``"issue"`` field) is appended to the local
    bad-examples file of the currently selected dataset, then reported via
    ``report_result_dataset``. The caller's ``sample`` dict is not modified.

    Args:
        sample: The example dict; must contain a ``"text"`` key.
        issue: Description of the problem. ``None`` or an empty string is
            stored as the literal string ``"None"``.
    """
    if issue is None or issue == "":
        issue = "None"

    # Work on a copy so the caller's dict is left untouched.
    flagged = dict(sample)
    flagged["issue"] = issue
    with open(
        f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a", encoding="utf-8"
    ) as f:
        f.write(json.dumps(flagged) + "\n")

    text = flagged["text"]
    # Everything except the text and the issue is reported as metadata.
    metadata = {
        key: value for key, value in flagged.items() if key not in ("text", "issue")
    }
    # Prefer an explicit "id"; fall back to "title", then to an empty string.
    if "id" in metadata:
        sample_id = metadata["id"]
    else:
        sample_id = metadata.get("title", "")

    report_result_dataset(dataset, sample_id, text, str(metadata), issue, "")
    get_next_item()


datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox("Dataset", datasets)
data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")

# Create the bad-examples file (and its directory) if it does not exist yet,
# so the download button below always has a file to open.
os.makedirs(BAD_EXAMPLES_PATH, exist_ok=True)
with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
    pass

# Deleting "idx" makes the next run re-initialise it to 0.
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__("idx"))
# Offer the flagged examples collected so far for the selected dataset.
with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "r+") as f:
    st.sidebar.download_button(
        "Download bad example JSON file", f, file_name=f"{dataset}_bad_examples.jsonl"
    )

# Re-opening the file in "w" mode truncates it.
st.sidebar.button(
    "Clear bad examples file",
    on_click=lambda: open(
        f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "w"
    ).close(),
)

# Session-state key of the free-text issue field inside the form.
ISSUE_INPUT_KEY = "bad_form_issue"


def _flag_current_sample(sample):
    """Flag *sample* using the issue text submitted with the form.

    The text is read from session state inside the callback because values
    passed through ``args`` are bound when the button is rendered, i.e.
    before the user typed anything.
    """
    save_flag_and_get_next_item(sample, st.session_state.get(ISSUE_INPUT_KEY, ""))


if st.session_state.idx >= len(data):
    # Past the last example (or the dataset is empty): nothing to annotate.
    st.info("No more examples in this dataset. Use 'Reset Index' to start over.")
else:
    with st.form(key="bad_form", clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)

        st.text_input(
            "What's wrong with this example? (leave blank if example is fine)",
            key=ISSUE_INPUT_KEY,
        )
        st.form_submit_button(
            "GOOD",
            on_click=get_next_item,
        )
        st.form_submit_button(
            "BAD",
            on_click=_flag_current_sample,
            args=(sample,),
        )