ola13's picture
hello world
0112a25
raw
history blame
3.99 kB
import json
import math
from functools import partial
import streamlit as st
import streamlit.components.v1 as components
from gforms import Form
# Directory holding the per-dataset "<dataset>_bad_examples.jsonl" files
# that flagged samples are appended to.
BAD_EXAMPLES_PATH = "bad_examples"
# Directory holding the per-dataset "<dataset>_examples_with_stats.json"
# files that the samples under review are loaded from.
DATA_PATH = "data"
# Maximum number of characters of a sample's text sent in one form
# submission; longer texts are reported in several parts.
MAX_DOC_LENGTH = 30000
def form_callback(
    element,
    page_index,
    element_index,
    dataset,
    docid,
    text,
    metadata,
    reason,
    person,
    part,
):
    """Return the value to enter for one form element, chosen by its name.

    Args:
        element: Form element being filled; only its ``name`` is read.
        page_index: Unused here.
        element_index: Unused here.
        dataset: Value for the "Dataset" element.
        docid: Value for the "Datapoint ID" element.
        text: Value for the "Text" element.
        metadata: Value for the "Metadata" element.
        reason: Value for the "Flagging Reason" element.
        person: Value for the "Flagging Person" element.
        part: Value for the "Part" element.

    Returns:
        The value matching ``element.name``, or ``None`` when the name is
        not one of the elements listed above.
    """
    answers_by_name = {
        "Dataset": dataset,
        "Datapoint ID": docid,
        "Text": text,
        "Metadata": metadata,
        "Flagging Reason": reason,
        "Flagging Person": person,
        "Part": part,
    }
    return answers_by_name.get(element.name)
def report_result(dataset, docid, text, metadata, reason, person, part):
    """Submit one flagged record to the Google Form at ``FORM_URL``.

    The arguments are bound onto ``form_callback``, which hands each of
    them to the form element with the matching name.
    """
    FORM_URL = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"

    # Pre-bind the answers so the callback only needs the element arguments
    # supplied by ``Form.fill``.
    answer_for_element = partial(
        form_callback,
        dataset=dataset,
        docid=docid,
        text=text,
        metadata=metadata,
        reason=reason,
        person=person,
        part=part,
    )

    form = Form()
    form.load(FORM_URL)
    form.fill(answer_for_element)
    form.submit()
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Whitespace-only lines (e.g. a stray newline at the end of the
        # file) carry no record and would make json.loads raise.
        return [json.loads(line) for line in f if line.strip()]
# Create the position of the example under review only when the session
# does not have one yet, so an existing position is kept.
if "idx" not in st.session_state:
    st.session_state["idx"] = 0
def get_next_item():
    """Advance the session's example index by one."""
    st.session_state["idx"] = st.session_state["idx"] + 1
def save_flag_and_get_next_item(sample, issue):
    """Record a flagged sample locally and in the form, then advance.

    The sample, with the issue attached, is appended as one JSON line to
    the selected dataset's bad-examples file. Its text is then reported
    through ``report_result`` in parts of at most ``MAX_DOC_LENGTH``
    characters, each with the remaining fields as metadata and an empty
    "person" value. Finally the session index moves to the next example.

    Args:
        sample: Dict describing the example; must contain a "text" entry.
            It is not modified.
        issue: Description of the problem. ``None`` or an empty string is
            recorded as the string "None".

    Raises:
        KeyError: If ``sample`` has no "text" entry.
    """
    if issue is None or issue == "":
        issue = "None"

    # Build new dicts instead of editing `sample` in place, so the caller's
    # record keeps its "text" and does not gain or lose an "issue" entry.
    flagged_record = {**sample, "issue": issue}
    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(flagged_record) + "\n")

    text = sample["text"]
    metadata = {
        key: value
        for key, value in sample.items()
        if key not in ("text", "issue")
    }

    # Prefer the explicit id, fall back to the title, else leave it empty.
    if "id" in sample:
        sample_id = sample["id"]
    else:
        sample_id = sample.get("title", "")

    # At least one part is always sent, so an empty text is still reported
    # (as part "0").
    num_parts = max(1, math.ceil(len(text) / MAX_DOC_LENGTH))
    for part_index in range(num_parts):
        start = part_index * MAX_DOC_LENGTH
        report_result(
            dataset,
            sample_id,
            text[start : start + MAX_DOC_LENGTH],
            str(metadata),
            issue,
            "",
            str(part_index),
        )

    get_next_item()
# Names of the datasets offered for review; each one is loaded from
# "<name>_examples_with_stats.json" under DATA_PATH.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox("Dataset", datasets)
examples_file = f"{DATA_PATH}/{dataset}_examples_with_stats.json"
data = load_jsonl(examples_file)

# File collecting the flagged examples of the selected dataset.
bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"

# Append mode creates the file if it does not exist and leaves any
# existing content untouched.
open(bad_examples_file, "a").close()


def _reset_index():
    """Drop the stored example index from the session state."""
    del st.session_state["idx"]


def _clear_bad_examples():
    """Truncate the flagged-examples file of the selected dataset."""
    open(bad_examples_file, "w").close()


st.sidebar.button("Reset Index", on_click=_reset_index)

with open(bad_examples_file, "r+") as bad_examples_handle:
    st.sidebar.download_button(
        "Download bad example JSON file",
        bad_examples_handle,
        file_name=f"{dataset}_bad_examples.jsonl",
    )

st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)
# Session-state key of the issue text input, so its submitted value can be
# read inside the submit callback.
ISSUE_INPUT_KEY = "issue_text"


def _flag_displayed_sample(sample):
    """Flag ``sample`` using the issue text submitted with the form.

    The text is read from session state here, at callback time. A value
    passed through ``args`` would be the one captured when the form was
    rendered, i.e. before the user typed anything.
    """
    save_flag_and_get_next_item(sample, st.session_state[ISSUE_INPUT_KEY])


# The index can move past the last example (after reviewing everything, or
# after switching to a dataset with fewer examples); stop before indexing.
if st.session_state.idx >= len(data):
    st.info("No more examples in this dataset. Use 'Reset Index' to start over.")
    st.stop()

with st.form(key="bad_form", clear_on_submit=True):
    sample = data[st.session_state.idx]
    text = sample["text"]
    st.text_area(f"text id: {st.session_state.idx}", text, height=500)
    issue = st.text_input(
        "What's wrong with this example? (leave blank if example is fine)",
        key=ISSUE_INPUT_KEY,
    )
    # GOOD only advances to the next example.
    good = st.form_submit_button(
        "GOOD",
        on_click=get_next_item,
    )
    # BAD records the example together with the submitted issue text, then
    # advances.
    bad = st.form_submit_button(
        "BAD",
        on_click=_flag_displayed_sample,
        args=(sample,),
    )