Spaces:

HuggingFaceM4
/

IDEFICS_Data_Measurement_Tool

Runtime error

App Files Files Community

IDEFICS_Data_Measurement_Tool / widgets /duplicates.py

Ezi

Upload 312 files

46df0b6 over 1 year ago

raw

history blame contribute delete

2.3 kB

	import gradio as gr

	from widgets.widget_base import Widget
	from data_measurements.dataset_statistics import DatasetStatisticsCacheClass as dmt_cls
	import utils
	import utils.dataset_utils as ds_utils

	# Module-level logging handle for this file, built by the project's
	# utils.prepare_logging helper from this module's path.
	# NOTE(review): `logs` is not referenced anywhere in the code visible here;
	# presumably the call is kept for its logging-setup side effects -- confirm.
	logs = utils.prepare_logging(__file__)


	class Duplicates(Widget):
	    """Widget tab that lists text instances occurring more than once.

	    The widget owns three Gradio components, all created with
	    ``render=False`` so that they are only placed on the page when
	    :meth:`render` is called:

	    * ``duplicates_intro`` -- static Markdown explaining the widget.
	    * ``duplicates_text``  -- Markdown summary line, filled in by :meth:`update`.
	    * ``duplicates_df``    -- DataFrame of duplicated instances and counts.
	    """

	    def __init__(self):
	        """Create the (not yet rendered) components of the widget."""
	        # Plain string literal: the text has no placeholders, so no f-prefix.
	        intro_markdown = """
	Use this widget to identify text strings that appear more than once.

	A model's training and testing may be negatively affected by unwarranted duplicates ([Lee et al., 2021](https://arxiv.org/abs/2107.06499))

	------

	### Here is the list of all the duplicated items and their counts in the dataset.
	"""
	        self.duplicates_intro = gr.Markdown(render=False, value=intro_markdown)
	        self.duplicates_df = gr.DataFrame(render=False)
	        self.duplicates_text = gr.Markdown(render=False)

	    def render(self):
	        """Render the components inside a "Duplicates" tab.

	        Order on the page: introduction, summary text, then the table.
	        """
	        with gr.TabItem("Duplicates"):
	            self.duplicates_intro.render()
	            self.duplicates_text.render()
	            self.duplicates_df.render()

	    def update(self, dstats: dmt_cls):
	        """Compute the component updates for the given dataset statistics.

	        Args:
	            dstats: Statistics object. This method reads its
	                ``duplicates_results``, ``dups_dict`` and ``dups_frac``
	                attributes.

	        Returns:
	            A dict mapping ``self.duplicates_df`` and ``self.duplicates_text``
	            to the update each component should receive.
	        """
	        # ``gr.update`` is used instead of the per-component ``.update()``
	        # classmethods: the generic helper is available in both Gradio 3.x
	        # and 4.x, whereas ``gr.DataFrame.update`` / ``gr.Markdown.update``
	        # were removed in Gradio 4.
	        # NOTE(review): the guard checks ``duplicates_results`` while the
	        # populated branch reads ``dups_dict`` / ``dups_frac`` -- presumably
	        # these are set together by the statistics class; confirm.
	        if not dstats.duplicates_results:
	            # Nothing to list: hide the table and show a short message.
	            return {
	                self.duplicates_df: gr.update(visible=False),
	                self.duplicates_text: gr.update(
	                    visible=True,
	                    value="There are no duplicates in this dataset! 🥳",
	                ),
	            }

	        counts_df = ds_utils.counter_dict_to_df(dstats.dups_dict, key_as_column=True)
	        counts_df.columns = ["instance", "count"]
	        # Nice to have the counts show up first, because the instances
	        # can be quite long (and run off the page)
	        dupes_df = counts_df[["count", "instance"]]

	        dups_fraction = round(dstats.dups_frac, 4)
	        summary = f"The fraction of data that is duplicate is {dups_fraction}"
	        return {
	            self.duplicates_df: gr.update(visible=True, value=dupes_df),
	            self.duplicates_text: gr.update(value=summary, visible=True),
	        }

	    @property
	    def output_components(self):
	        """Components whose values are produced by :meth:`update`."""
	        return [
	            self.duplicates_text,
	            self.duplicates_df,
	        ]

	    def add_events(self, state: gr.State):
	        """No event handlers are registered for this widget."""