Spaces:

HuggingFaceGECLM
/

random_dataset_exploration

Sleeping

App Files Files Community

random_dataset_exploration / app.py

ola13

hello world

0112a25 over 1 year ago

raw

history blame

3.99 kB

	import json
	import math
	from functools import partial

	import streamlit as st
	import streamlit.components.v1 as components
	from gforms import Form

	# Directory holding the per-dataset "<dataset>_bad_examples.jsonl" files of flagged samples.
	BAD_EXAMPLES_PATH = "bad_examples"
	# Directory holding the per-dataset "<dataset>_examples_with_stats.json" input files.
	DATA_PATH = "data"
	# Texts longer than this many characters are reported to the form in several parts.
	MAX_DOC_LENGTH = 30000


	def form_callback(
	    element,
	    page_index,
	    element_index,
	    dataset,
	    docid,
	    text,
	    metadata,
	    reason,
	    person,
	    part,
	):
	    """Return the answer for one form element, chosen by the element's name.

	    ``page_index`` and ``element_index`` are accepted but not used. The
	    remaining arguments are the candidate answers; the one whose form field
	    label equals ``element.name`` is returned. Elements with any other name
	    get ``None``.
	    """
	    answers_by_field = {
	        "Dataset": dataset,
	        "Datapoint ID": docid,
	        "Text": text,
	        "Metadata": metadata,
	        "Flagging Reason": reason,
	        "Flagging Person": person,
	        "Part": part,
	    }
	    return answers_by_field.get(element.name)


	def report_result(dataset, docid, text, metadata, reason, person, part):
	    """Fill the reporting Google Form with one flagged datapoint and submit it.

	    Each argument is handed to ``form_callback``, which maps it onto the
	    form field of the matching name.
	    """
	    form_url = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"
	    # Pre-bind the answers so the form only has to supply the element info.
	    answer_for_element = partial(
	        form_callback,
	        dataset=dataset,
	        docid=docid,
	        text=text,
	        metadata=metadata,
	        reason=reason,
	        person=person,
	        part=part,
	    )
	    google_form = Form()
	    google_form.load(form_url)
	    google_form.fill(answer_for_element)
	    google_form.submit()


	def load_jsonl(file_path):
	    """Load a JSON Lines file and return its records as a list.

	    The file is read as UTF-8 regardless of the platform default encoding.
	    Lines that are empty or contain only whitespace (for example a trailing
	    blank line) are skipped instead of raising a decode error.

	    Args:
	        file_path: Path of the file to read; one JSON document per line.

	    Returns:
	        A list with one decoded object per non-blank line, in file order.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    with open(file_path, "r", encoding="utf-8") as f:
	        return [json.loads(line) for line in f if line.strip()]


	# A fresh session has no position yet: start at the first example.
	if "idx" not in st.session_state:
	    st.session_state["idx"] = 0


	def get_next_item():
	    """Advance the example index stored in the session state by one."""
	    st.session_state["idx"] = st.session_state["idx"] + 1


	def save_flag_and_get_next_item(sample, issue):
	    """Record a sample as bad, report it to the form, and move to the next one.

	    The sample plus its issue is appended as one JSON line to the
	    bad-examples file of the currently selected dataset (the module-level
	    ``dataset``). The sample is then sent through ``report_result``: its text
	    is sent in consecutive parts of at most ``MAX_DOC_LENGTH`` characters,
	    numbered from 0, and the remaining fields are sent as a stringified dict.
	    Finally the session index is advanced.

	    Args:
	        sample: Dict describing the datapoint; must contain a "text" key.
	            It is not modified.
	        issue: Free-text description of the problem. ``None`` or an empty
	            string is recorded as "None".

	    Raises:
	        KeyError: If ``sample`` has no "text" key.
	    """
	    if not issue:
	        issue = "None"

	    # Work on copies so the caller's dict is left untouched.
	    flagged = {**sample, "issue": issue}
	    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
	        f.write(json.dumps(flagged) + "\n")

	    text = sample["text"]
	    metadata = str({k: v for k, v in sample.items() if k not in ("text", "issue")})

	    # Prefer the "id" field, fall back to "title", else leave the id empty.
	    if "id" in sample:
	        sample_id = sample["id"]
	    else:
	        sample_id = sample.get("title", "")

	    # Always send at least one part so that short (even empty) texts are
	    # still reported, as part "0".
	    num_parts = max(1, math.ceil(len(text) / MAX_DOC_LENGTH))
	    for part in range(num_parts):
	        start = part * MAX_DOC_LENGTH
	        report_result(
	            dataset,
	            sample_id,
	            text[start : start + MAX_DOC_LENGTH],
	            metadata,
	            issue,
	            "",
	            str(part),
	        )

	    get_next_item()


	datasets = [
	    "gutenberg_raw",
	    "stackexchange2",
	    "bigcode_python_code",
	    "bigcode_python_github_issues",
	    "bigcode_python_jupyter_scripts_dedup_filtered",
	    "books3",
	    "c4",
	    "s2orc_raw",
	    "reddit_threaded",
	    "cc_filtered_text",
	]
	dataset = st.sidebar.selectbox("Dataset", datasets)
	data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")

	bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"

	# Create the bad-examples file if it does not exist yet.
	open(bad_examples_file, "a").close()


	def reset_index():
	    """Drop the stored index; the next run starts again at the first example."""
	    del st.session_state["idx"]


	def clear_bad_examples():
	    """Truncate the bad-examples file of the selected dataset."""
	    open(bad_examples_file, "w").close()


	st.sidebar.button("Reset Index", on_click=reset_index)

	with open(bad_examples_file, "r+") as bad_examples:
	    st.sidebar.download_button(
	        "Download bad example JSON file",
	        bad_examples,
	        file_name=f"{dataset}_bad_examples.jsonl",
	    )

	st.sidebar.button("Clear bad examples file", on_click=clear_bad_examples)

	def _flag_current_sample(sample):
	    """BAD-button callback: flag ``sample`` with the issue text just submitted.

	    The text is read from the session state when the callback runs. A value
	    passed through ``args`` would be the one captured when the button was
	    rendered, i.e. before the user typed anything.
	    """
	    save_flag_and_get_next_item(sample, st.session_state.get("issue_text", ""))


	# The index persists across reruns and dataset switches, so it can point past
	# the last example; show a notice instead of failing on the lookup.
	if st.session_state.idx >= len(data):
	    st.info(
	        'No more examples in this dataset. Use "Reset Index" in the sidebar '
	        "to start over."
	    )
	else:
	    with st.form(key="bad_form", clear_on_submit=True):
	        sample = data[st.session_state.idx]
	        text = sample["text"]
	        st.text_area(f"text id: {st.session_state.idx}", text, height=500)

	        # The key makes the submitted value reachable from the callback.
	        st.text_input(
	            "What's wrong with this example? (leave blank if example is fine)",
	            key="issue_text",
	        )

	        st.form_submit_button(
	            "GOOD",
	            on_click=get_next_item,
	        )
	        st.form_submit_button(
	            "BAD",
	            on_click=_flag_current_sample,
	            args=(sample,),
	        )