Spaces:

hkunlp
/

Binder

Runtime error

Binder / datasets /missing_squall.py

Timothyxxx

Init

f6f97d8 about 2 years ago

6.29 kB

	# coding=utf-8
	# Copyright 2021 The HuggingFace Datasets Authors, The Google AI Language Team Authors and the current dataset script contributor.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""The WikiTableQuestions dataset is for the task of question answering on semi-structured HTML tables"""

	import json
	import os
	import datasets
	from utils.wtq.utils import _load_table_w_page as _load_table

	# BibTeX entry for the paper that introduced WikiTableQuestions
	# (Pasupat & Liang, ACL 2015); exposed through the dataset metadata.
	_CITATION = """\
	@inproceedings{pasupat-liang-2015-compositional,
	title = "Compositional Semantic Parsing on Semi-Structured Tables",
	author = "Pasupat, Panupong and
	Liang, Percy",
	booktitle = "Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
	month = jul,
	year = "2015",
	address = "Beijing, China",
	publisher = "Association for Computational Linguistics",
	url = "https://aclanthology.org/P15-1142",
	doi = "10.3115/v1/P15-1142",
	pages = "1470--1480",
	}
	"""

	# Human-readable description of the dataset (the abstract of the paper cited above).
	_DESCRIPTION = """\
	Two important aspects of semantic parsing for question answering are the breadth of the knowledge source and the depth of
	logical compositionality. While existing work trades off one aspect for another, this paper simultaneously makes progress
	on both fronts through a new task: answering complex questions on semi-structured tables using question-answer pairs as
	supervision. The central challenge arises from two compounding factors: the broader domain results in an open-ended set
	of relations, and the deeper compositionality results in a combinatorial explosion in the space of logical forms. We
	propose a logical-form driven parsing algorithm guided by strong typing constraints and show that it obtains significant
	improvements over natural baselines. For evaluation, we created a new dataset of 22,033 complex questions on Wikipedia
	tables, which is made publicly available.
	"""

	# Project page of the WikiTableQuestions dataset.
	_HOMEPAGE = "https://ppasupat.github.io/WikiTableQuestions/"

	# License string reported in the dataset metadata.
	_LICENSE = "CC-BY-SA-4.0 License"

	# Zip archive of the WikiTableQuestions GitHub repository (master branch).
	_URL = "https://github.com/ppasupat/WikiTableQuestions/archive/refs/heads/master.zip"
	# Zip archive of the SQUALL GitHub repository (main branch).
	_SQUALL_URL = "https://github.com/tzshi/squall/archive/refs/heads/main.zip"


	class WikiTableQuestion(datasets.GeneratorBasedBuilder):
	    """WikiTableQuestions examples that have no entry in the SQUALL annotations.

	    The builder downloads the WikiTableQuestions and SQUALL repositories and
	    yields, for each split, only the questions whose id does not appear as an
	    ``"nt"`` value in SQUALL's ``data/squall.json``.
	    """

	    def _info(self):
	        """Return the dataset metadata together with the per-example schema.

	        Each example holds the question id, the question text, the table id,
	        the table itself (page title, header, rows) and the list of gold
	        answer strings.
	        """
	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=datasets.Features(
	                {
	                    "id": datasets.Value("string"),
	                    "question": datasets.Value("string"),
	                    "table_id": datasets.Value("string"),
	                    "table": {
	                        "page_title": datasets.Value("string"),
	                        "header": datasets.features.Sequence(datasets.Value("string")),
	                        "rows": datasets.features.Sequence(
	                            datasets.features.Sequence(datasets.Value("string"))
	                        ),
	                    },
	                    "answer_text": datasets.features.Sequence(datasets.Value("string")),
	                }
	            ),
	            supervised_keys=None,
	            homepage=_HOMEPAGE,
	            license=_LICENSE,
	            citation=_CITATION,
	        )

	    def _split_generators(self, dl_manager):
	        """Download both archives and describe the train/validation/test splits.

	        Args:
	            dl_manager: download manager supplied by ``datasets``; only its
	                ``download_and_extract`` method is used here.

	        Returns:
	            A list of three ``datasets.SplitGenerator`` objects whose
	            ``gen_kwargs`` match the parameters of ``_generate_examples``.
	        """
	        # Each GitHub archive unpacks into a single "<repo>-<branch>" folder.
	        data_dir = os.path.join(dl_manager.download_and_extract(_URL), 'WikiTableQuestions-master')
	        squall_dir = os.path.join(dl_manager.download_and_extract(_SQUALL_URL), 'squall-main')
	        # The same SQUALL file is used to filter every split.
	        squall_path = os.path.join(squall_dir, "data/squall.json")

	        split_files = (
	            (datasets.Split.TRAIN, "data/random-split-1-train.tsv"),
	            (datasets.Split.VALIDATION, "data/random-split-1-dev.tsv"),
	            (datasets.Split.TEST, "data/pristine-unseen-tables.tsv"),
	        )
	        return [
	            datasets.SplitGenerator(
	                name=split_name,
	                gen_kwargs={
	                    "filepath": os.path.join(data_dir, tsv_file),
	                    "data_dir": data_dir,
	                    "squall_path": squall_path,
	                },
	            )
	            for split_name, tsv_file in split_files
	        ]

	    def _generate_examples(self, filepath, data_dir, squall_path):
	        """Yield the examples of one split that are missing from SQUALL.

	        Args:
	            filepath: path of the split's TSV file; its first line is a header
	                and every other line holds four tab-separated fields
	                (id, question, table id, gold answer string).
	            data_dir: root of the extracted WikiTableQuestions repository,
	                against which table ids are resolved.
	            squall_path: path of SQUALL's ``squall.json``, a JSON list of
	                objects that each carry an ``"nt"`` id.

	        Yields:
	            ``(idx, example)`` pairs where ``idx`` is the 0-based line number
	            in the TSV file (so keys are unique but not contiguous) and
	            ``example`` follows the schema declared in ``_info``.

	        Raises:
	            ValueError: if a data line does not have exactly four fields.
	            KeyError: if a SQUALL item has no ``"nt"`` key.
	        """
	        # A set gives O(1) membership tests; the id is looked up once per line.
	        with open(squall_path, encoding="utf-8") as f:
	            squall_ids = {squall_item["nt"] for squall_item in json.load(f)}

	        # data_id, question, table_id, gold_result_str
	        with open(filepath, encoding="utf-8") as f:
	            for idx, line in enumerate(f):
	                # skip the header
	                if idx == 0:
	                    continue
	                data_id, question, table_id, gold_result_str = line.strip("\n").split("\t")
	                # Only questions that SQUALL did not annotate are kept.
	                if data_id in squall_ids:
	                    continue
	                # WikiTableQuestions separates multiple answer values with a
	                # bare "|". Escape sequences inside values are left untouched.
	                gold_result = gold_result_str.split("|")
	                # convert the .csv postfix to .tsv, for easier read-in
	                table_path = os.path.join(data_dir, table_id.replace('.csv', '.tsv'))
	                yield idx, {
	                    "id": data_id,
	                    "question": question,
	                    "table_id": table_id,
	                    # NOTE(review): _load_table is assumed to return the
	                    # page_title/header/rows mapping declared in _info;
	                    # confirm against utils.wtq.utils.
	                    "table": _load_table(table_path),
	                    "answer_text": gold_result,
	                }