Binder / generation /prompt.py
Timothyxxx
Init
f6f97d8
raw
history blame
20.6 kB
"""
Build NSQL generation prompt.
Two main parts:
1) PromptBuilder makes prompt for calling codex to generate NSQL(Binder-SQL).
2) OpenAIQAPromptBuilder makes prompt for calling codex to generate QA answers.
"""
import random
from typing import Dict, Tuple
import pandas as pd
import copy
from utils.errors import DuplicateColumnsError
from utils.mmqa.image_stuff import get_caption_map
from retrieval.retrieve_pool import QAItem
from utils.normalizer import prepare_df_for_neuraldb_from_table
def _create_table_prompt(df: pd.DataFrame, title: str):
    """
    Return the CREATE TABLE clause as prompt.

    Every column of ``df`` becomes one ``<tab><name> <type>`` line. The type
    is ``int`` for integer dtypes, ``real`` for floating-point dtypes,
    ``datetime`` for datetime64 dtypes (any unit, with or without timezone)
    and ``text`` for everything else.

    :param df: table whose columns are described.
    :param title: table name written after ``CREATE TABLE``.
    :return: the clause as a string ending with a newline.
    :raises DuplicateColumnsError: when ``df[header]`` has no ``dtype``
        attribute, which is what happens when a column label is duplicated
        and the lookup yields a DataFrame instead of a Series.
    """
    column_lines = []
    for header in df.columns:
        try:
            # Keep the attribute access explicit: the pandas type predicates
            # below would not raise for a DataFrame, which would hide
            # duplicated column labels.
            dtype = df[header].dtype
        except AttributeError as e:
            raise DuplicateColumnsError(e) from e

        # String comparison against 'datetime64' never matches a concrete
        # 'datetime64[ns]' dtype, so the dtype predicates are used instead.
        if pd.api.types.is_integer_dtype(dtype):
            column_type = 'int'
        elif pd.api.types.is_float_dtype(dtype):
            column_type = 'real'
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            column_type = 'datetime'
        else:
            column_type = 'text'
        column_lines.append('\t{} {}'.format(header, column_type))

    if not column_lines:
        # A table without columns collapses to an empty column list.
        return "CREATE TABLE {}()\n".format(title)
    return "CREATE TABLE {}(\n{})\n".format(title, ',\n'.join(column_lines))
class PromptBuilder(object):
    """
    Build the prompts used to ask the model for NeuralSQL / SQL / answers.

    The layout of the table part of every prompt is selected by
    ``args.prompt_style``; the supported values are the ones tested in
    ``build_one_shot_prompt`` and ``build_generate_prompt``.
    """

    # Styles that show a few example rows inside few-shot demonstrations but
    # the whole table for the sample that is actually generated.
    _FULL_TABLE_STYLES = (
        "create_table_select_3_full_table",
        "create_table_select_3_full_table_w_gold_passage_image",
        "create_table_select_3_full_table_w_all_passage_image",
    )

    # (generate_type, task instruction, prefix of the line the model completes)
    _GENERATE_TARGETS = (
        (('answer',),
         """\n-- Answer the question based on the given table below.\n\n""",
         'A: '),
        (('nsql',),
         """\n-- Parse the question into NeuralSQL based on the given table below.\n\n""",
         'NeuralSQL: '),
        (('sql',),
         """\n-- Parse the question into SQL based on the given table below.\n\n""",
         'SQL: '),
        (('npython',),
         """\n-- Parse the question into NeuralPython based on the given table below.\n\n""",
         'NeuralPython: '),
        (('python',),
         """\n-- Parse the question into Python based on the given table below.\n\n""",
         'Python: '),
    )

    def __init__(self, args):
        """
        Store the arguments and seed the global ``random`` module.

        :param args: object providing ``prompt_style`` and ``seed``.
        """
        self.args = args
        self.prompt_style = args.prompt_style
        # Seeds the module-level RNG used by the ``_pick_*`` helpers.
        random.seed(args.seed)

    def _select_x_prompt(self, df: pd.DataFrame, num_rows: int,
                         few_shot_demonstration=True):
        """
        Return the first X rows table contents as prompt.

        The result is a ``/* ... */`` comment holding a style-dependent
        preamble, the tab-separated column names and the first ``num_rows``
        rows, one tab-separated row per line.

        :param df: table to render.
        :param num_rows: number of leading rows to include.
        :param few_shot_demonstration: for the "select_3_full_table" family of
            styles, ``True`` announces example rows and ``False`` announces
            the full table; ignored by the other styles.
        :raises ValueError: if the prompt style (or a non-boolean
            ``few_shot_demonstration`` for the full-table family) is not
            supported.
        """
        style = self.prompt_style
        example_rows_intro = '/*\n{} example rows:\nSELECT * FROM w LIMIT {};\n'.format(num_rows, num_rows)
        all_rows_intro = '/*\nAll rows of the table:\nSELECT * FROM w;\n'

        if style == 'create_table_select_full_table':
            string = all_rows_intro
        elif style == 'create_table_select_3':
            string = example_rows_intro
        elif style == 'create_table_select_3_hidden':
            string = '/*\n{} example rows:\n'.format(num_rows)
        elif few_shot_demonstration is True and style in self._FULL_TABLE_STYLES:
            string = example_rows_intro
        elif few_shot_demonstration is False and style in self._FULL_TABLE_STYLES:
            string = all_rows_intro
        else:
            raise ValueError(f"Select x prompt style {self.prompt_style} is not supported.")

        lines = ['\t'.join(str(header) for header in df.columns)]
        # iterrows() is kept on purpose so cell values are rendered exactly
        # as before (it yields each row as a Series).
        for _, row in df.iloc[:num_rows].iterrows():
            lines.append('\t'.join(str(row[header]) for header in df.columns))
        return string + ''.join(line + '\n' for line in lines) + '*/\n'

    @staticmethod
    def _flat_context_prompt(label, items, text_key, only_title):
        """
        Render ``items`` on a single ``"<label>: a;; b;; c"`` line.

        Each entry is the item's title, followed by ``" (<text>)"`` taken from
        ``item[text_key]`` unless ``only_title`` is set.
        """
        entries = [
            item['title'] if only_title else item['title'] + " ({})".format(item[text_key])
            for item in items
        ]
        if not entries:
            return label + ":\n"
        # join() puts the separator only between entries, so nothing has to be
        # stripped afterwards (stripping could eat trailing ';' or spaces that
        # belong to the last entry).
        return "{}: {}\n".format(label, ';; '.join(entries))

    def _context_table_prompt(self, table_name, items, text_key, only_title):
        """
        Render ``items`` as a one-row table: titles are the column names and
        ``item[text_key]`` are the cells. Returns "" when there are no items.
        """
        if not items:
            return ""
        header = [item['title'] for item in items]
        row = [item[text_key] for item in items]
        context_table = prepare_df_for_neuraldb_from_table({"header": header, "rows": [row]})
        prompt = _create_table_prompt(context_table, table_name)
        if not only_title:
            prompt += self._select_x_prompt(
                df=context_table,
                num_rows=context_table.shape[0]
            )
        return prompt

    def _passage_prompt(self, passages, only_title, db_style_prompt=True):
        """
        Return the passage prompt.

        :param passages: list of dicts with at least ``title`` and ``text``.
        :param only_title: if true, leave out the passage texts.
        :param db_style_prompt: render as a "Passages" table (default) or as a
            single ``Passages: ...`` line.
        """
        if not db_style_prompt:
            return self._flat_context_prompt("Passages", passages, 'text', only_title)
        return self._context_table_prompt("Passages", passages, 'text', only_title)

    def _image_prompt(self, images, only_title, db_style_prompt=True):
        """
        Return the image prompt.

        :param images: list of dicts with at least ``title`` and ``caption``.
        :param only_title: if true, leave out the image captions.
        :param db_style_prompt: render as an "Images" table (default) or as a
            single ``Images: ...`` line.
        """
        if not db_style_prompt:
            return self._flat_context_prompt("Images", images, 'caption', only_title)
        return self._context_table_prompt("Images", images, 'caption', only_title)

    def _pick_target_columns(self, df, strategy):
        """
        Pick the controllable target columns for generation.

        :raises NotImplementedError: for the 'traverse' strategy.
        :raises ValueError: for any other unknown strategy.
        """
        if strategy == 'random':
            return random.choice(list(df.columns) + ['*'])
        if strategy == 'traverse':
            raise NotImplementedError
        raise ValueError(f"Target column strategy {strategy} is not supported.")

    def _pick_operators(self, df, strategy):
        """
        Pick the controllable operators for generation.

        :raises NotImplementedError: for the 'traverse' strategy.
        :raises ValueError: for any other unknown strategy.
        """
        candidate_operators = ['none', 'count', 'max', 'min', 'sum']
        if strategy == 'random':
            return random.choice(candidate_operators)
        if strategy == 'traverse':
            raise NotImplementedError
        raise ValueError(f"Operator strategy {strategy} is not supported.")

    def _pick_nested_levels(self, df, strategy):
        """
        Pick the controllable(maybe) nested levels for generation.

        :raises NotImplementedError: for the 'random' and 'traverse' strategies.
        :raises ValueError: for any other unknown strategy.
        """
        if strategy == 'fixed':
            return 2
        if strategy in ('random', 'traverse'):
            raise NotImplementedError
        raise ValueError(f"Nested level strategy {strategy} is not supported.")

    @staticmethod
    def _passage_record(passages, idx):
        """Return passage ``idx`` of the column-oriented ``passages`` dict as one dict."""
        return {
            'id': passages['id'][idx],
            'title': passages['title'][idx],
            'url': passages['url'][idx],
            'text': passages['text'][idx]
        }

    @staticmethod
    def _image_record(images, idx, caption_map, caption_id):
        """Return image ``idx`` of the column-oriented ``images`` dict, with its caption."""
        return {
            "id": images['id'][idx],
            "title": images['title'][idx],
            "url": images['url'][idx],
            "path": images['path'][idx],
            "pic": images['pic'][idx],
            "caption": caption_map[caption_id]
        }

    def _all_context(self, passages, images):
        """Return (passage records, image records) for every passage and image."""
        caption_map = get_caption_map()
        all_passages = [
            self._passage_record(passages, idx)
            for idx, _ in enumerate(passages['id'])
        ]
        all_images = [
            self._image_record(images, idx, caption_map, image_id)
            for idx, image_id in enumerate(images['id'])
        ]
        return all_passages, all_images

    def _gold_context(self, passages, images, supporting_context):
        """
        Return (passage records, image records) restricted to the documents
        listed in ``supporting_context`` ('doc_id' / 'doc_part' lists).
        """
        caption_map = get_caption_map()
        gold_passages, gold_images = [], []
        for doc_id, doc_part in zip(supporting_context['doc_id'], supporting_context['doc_part']):
            if doc_part == 'text':
                gold_passages.append(
                    self._passage_record(passages, passages['id'].index(doc_id)))
            elif doc_part == 'image':
                gold_images.append(
                    self._image_record(images, images['id'].index(doc_id), caption_map, doc_id))
        return gold_passages, gold_images

    def build_one_shot_prompt(
            self,
            prompt_type: Tuple,
            table: pd.DataFrame,
            question: str,
            answer_text: str,
            nsql: str,
            passages: Dict = None,
            images: Dict = None,
            title: str = None,
            only_title: bool = False,
            **kwargs
    ):
        """
        Build one-shot prompt with table-question-nsql.

        :param prompt_type: ('question', 'nsql'), ('question', 'sql') or
            ('question', 'answer').
        :param table: the demonstration table.
        :param question: the demonstration question.
        :param answer_text: answers for the ('question', 'answer') type; a
            list of strings is joined with ', ', a single string is used as is.
        :param nsql: the program shown for the 'nsql' and 'sql' types.
        :param passages: column-oriented dict ('id', 'title', 'url', 'text');
            required by the "..._w_all_passage_image" style.
        :param images: column-oriented dict ('id', 'title', 'url', 'path',
            'pic'); required by the "..._w_all_passage_image" style.
        :param title: table name used in the CREATE TABLE clause.
        :param only_title: forwarded to the passage / image prompts.
        :raises ValueError: on an unsupported prompt style or prompt type.
        """
        style = self.prompt_style
        one_shot_prompt = ""
        if style == 'create_table_select_full_table':
            one_shot_prompt += _create_table_prompt(table, title)
            one_shot_prompt += self._select_x_prompt(
                df=table,
                num_rows=table.shape[0]
            )
        elif style in ['create_table_select_3_full_table', 'create_table_select_3']:
            one_shot_prompt += _create_table_prompt(table, title)
            one_shot_prompt += self._select_x_prompt(
                df=table,
                num_rows=3,
            )
        elif style == 'create_table':
            one_shot_prompt += _create_table_prompt(table, title)
        elif style == 'no_table':
            # No table input, to test Codex QA with only internal knowledge
            pass
        elif style == 'create_table_select_3_full_table_w_all_passage_image':
            assert passages is not None and images is not None, \
                "passages and images are required for prompt style {}".format(style)
            one_shot_prompt += _create_table_prompt(table, title)
            one_shot_prompt += self._select_x_prompt(
                df=table,
                num_rows=3,
            )
            all_passages, all_images = self._all_context(passages, images)
            one_shot_prompt += self._passage_prompt(
                passages=all_passages,
                only_title=only_title
            )
            one_shot_prompt += self._image_prompt(
                images=all_images,
                only_title=only_title
            )
        else:
            raise ValueError('{} is not supported.'.format(self.prompt_style))

        # question and nsql pairs
        if prompt_type == ('question', 'nsql'):
            one_shot_prompt += 'Q: {}\n'.format(question)
            one_shot_prompt += 'NeuralSQL: {}\n'.format(nsql)
        elif prompt_type == ('question', 'sql'):
            one_shot_prompt += 'Q: {}\n'.format(question)
            one_shot_prompt += 'SQL: {}\n'.format(nsql)
        elif prompt_type == ('question', 'answer'):
            # Joining a plain string would split it into single characters.
            if isinstance(answer_text, str):
                answer = answer_text
            else:
                answer = ', '.join(answer_text)
            one_shot_prompt += 'Q: {}\n'.format(question)
            one_shot_prompt += 'A: {}\n'.format(answer)
        else:
            raise ValueError(f'Prompt type {prompt_type} is not supported.')
        return one_shot_prompt

    def build_generate_prompt(
            self,
            generate_type: Tuple,
            table: pd.DataFrame,
            question: str = None,
            passages: Dict = None,
            images: Dict = None,
            title: str = None,
            only_title: bool = False,
            supporting_context: Dict = None,
            **kwargs
    ):
        """
        Build the prompt of the generation sample.

        :param generate_type: ('answer',), ('nsql',), ('sql',), ('npython',)
            or ('python',).
        :param table: the table of the sample.
        :param question: the question of the sample.
        :param passages: column-oriented dict ('id', 'title', 'url', 'text');
            required by the passage/image styles.
        :param images: column-oriented dict ('id', 'title', 'url', 'path',
            'pic'); required by the passage/image styles.
        :param title: table name used in the CREATE TABLE clause.
        :param only_title: forwarded to the passage / image prompts.
        :param supporting_context: dict with parallel 'doc_id' / 'doc_part'
            lists; required by the "..._w_gold_passage_image" style.
        :raises ValueError: on an unsupported generate type or prompt style.
        """
        # Reject an unsupported generate type before doing any work; the
        # prompt could never be returned for it anyway.
        target = next(
            (entry for entry in self._GENERATE_TARGETS if entry[0] == generate_type),
            None
        )
        if target is None:
            raise ValueError(f'Generate type {generate_type} is not supported.')
        _, instruction, answer_prefix = target

        style = self.prompt_style
        # task instruction
        generate_prompt = instruction

        # table prompt
        if style in ['create_table_select_full_table', 'create_table_select_3_full_table']:
            generate_prompt += _create_table_prompt(table, title)
            generate_prompt += self._select_x_prompt(
                df=table,
                num_rows=table.shape[0],
                few_shot_demonstration=False
            )
        elif style == 'create_table_select_3':
            generate_prompt += _create_table_prompt(table, title)
            generate_prompt += self._select_x_prompt(
                df=table,
                num_rows=3,
                few_shot_demonstration=False
            )
        elif style == 'create_table':
            generate_prompt += _create_table_prompt(table, title)
        elif style == 'no_table':
            # No table input, to test Codex QA with only internal knowledge
            pass
        elif style == 'create_table_select_3_full_table_w_all_passage_image':
            assert passages is not None and images is not None, \
                "passages and images are required for prompt style {}".format(style)
            generate_prompt += _create_table_prompt(table, title)
            generate_prompt += self._select_x_prompt(
                df=table,
                num_rows=table.shape[0],
                few_shot_demonstration=False
            )
            all_passages, all_images = self._all_context(passages, images)
            generate_prompt += self._passage_prompt(
                passages=all_passages,
                only_title=only_title
            )
            generate_prompt += self._image_prompt(
                images=all_images,
                only_title=only_title
            )
        elif style == 'create_table_select_3_full_table_w_gold_passage_image':
            assert passages is not None and images is not None and supporting_context is not None, \
                "passages, images and supporting_context are required for prompt style {}".format(style)
            generate_prompt += _create_table_prompt(table, title)
            generate_prompt += self._select_x_prompt(
                df=table,
                num_rows=table.shape[0],
                few_shot_demonstration=False
            )
            gold_passages, gold_images = self._gold_context(passages, images, supporting_context)
            generate_prompt += self._passage_prompt(
                passages=gold_passages,
                only_title=only_title
            )
            generate_prompt += self._image_prompt(
                images=gold_images,
                only_title=only_title
            )
        else:
            raise ValueError('{} is not supported.'.format(self.prompt_style))

        # determine the target to generate
        generate_prompt += 'Q: {}\n'.format(question)
        generate_prompt += answer_prefix
        return generate_prompt
class OpenAIQAPromptBuilder(object):
    """
    Build the prompts used to ask the model QA questions about a table given
    as a ``{"header": [...], "rows": [[...], ...]}`` dict.
    """

    @staticmethod
    def _table_lines(table, drop_row_id=True):
        """
        Return ``(header_line, row_lines)`` with tab-separated cells.

        When ``drop_row_id`` is true and the first header is "row_id", that
        column is left out of the header and of every row. ``table`` is not
        modified.
        """
        header = table['header']
        rows = table['rows']
        # ``header and ...`` keeps a table without columns from raising.
        if drop_row_id and header and header[0] == "row_id":
            header = header[1:]
            rows = [row[1:] for row in rows]
        header_line = "\t".join(str(name) for name in header)
        row_lines = ["\t".join(str(cell) for cell in row) for row in rows]
        return header_line, row_lines

    @staticmethod
    def table2codex_prompt(table, table_title=None, drop_row_id=True, ):
        """
        Render ``table`` as an optional ``Table: <title>`` line followed by a
        ``/* ... */`` block holding the header line and one line per row.

        :param table: dict with 'header' (list) and 'rows' (list of lists).
        :param table_title: title line is emitted only when this is truthy.
        :param drop_row_id: drop a leading "row_id" column.
        :return: the prompt string (no trailing newline).
        """
        header_line, row_lines = OpenAIQAPromptBuilder._table_lines(table, drop_row_id)
        prompt_str = 'Table: {}\n'.format(table_title) if table_title else ''
        prompt_str += "/*\n"
        prompt_str += header_line + "\n"
        prompt_str += '\n'.join(row_lines) + "\n"
        prompt_str += "*/"
        return prompt_str

    @staticmethod
    def build_one_shot_prompt(
            item: "QAItem",
            answer_split_token: str = ';',
            verbose: bool = False,
            prompting_method='new_db',
            # NOTE(review): this default looks like mis-decoded text (possibly
            # an emoji); it is kept byte-identical here - confirm against the
            # upstream file before relying on it.
            db_mapping_token="πŸ˜…"
    ) -> str:
        """
        Build one-shot QA prompt.

        ``item.qa_question`` has the form ``"<type>@<question>"`` where the
        type is "map" (one answer per table row) or "ans" (one answer for the
        whole table).

        :param item: provides ``qa_question``, ``qa_answer``, ``table`` and
            ``title``.
        :param answer_split_token: separator used to join the answers.
        :param verbose: unused.
        :param prompting_method: 'basic' lists the answers on an ``A:`` line;
            'new_db' (for "map" questions) repeats the table with the answer
            appended to every row.
        :param db_mapping_token: separator between a row and its answer in the
            'new_db' table.
        :raises ValueError: if the QA type is neither "map" nor "ans".
        """
        assert prompting_method in ['basic', 'new_db']
        # Split on the first '@' only: the question itself may contain '@'.
        qa_type, qa_question = item.qa_question.split('@', 1)
        db_prompt = OpenAIQAPromptBuilder.table2codex_prompt(item.table, item.title)
        prompt = "Give a database as shown below:\n{}\n\n".format(db_prompt)
        basic = prompting_method == 'basic'

        if qa_type == "map":
            prompt += "Q: Answer question \"{}\" row by row.".format(qa_question)
            assert answer_split_token is not None
            if basic:
                prompt += " The answer should be a list split by '{}' and have {} items in total.".format(
                    answer_split_token, len(item.table['rows']))
                prompt += "\nA: {}\n\n".format(str(answer_split_token).join(item.qa_answer))
            else:
                # Build the lines from the table itself instead of re-parsing
                # the rendered prompt, which loses the header when there is
                # no title line and breaks on cells containing newlines.
                header_line, row_lines = OpenAIQAPromptBuilder._table_lines(item.table)
                assert len(row_lines) == len(
                    item.qa_answer), "answer items and table rows must be in the same number, check annotations"
                db_prompt_lines_with_answer = ["/*", header_line]
                db_prompt_lines_with_answer.extend(
                    "{}{}{}".format(row_line, db_mapping_token, qa_answer_item)
                    for row_line, qa_answer_item in zip(row_lines, item.qa_answer)
                )
                db_prompt_lines_with_answer.append("*/")
                prompt += "\n{}\n".format("\n".join(db_prompt_lines_with_answer))
        elif qa_type == "ans":
            prompt += "Q: Answer question \"{}\" for the table.".format(qa_question)
            prompt += " "
            answer = str(answer_split_token).join(item.qa_answer)
            # The two methods differ only in the number of trailing newlines.
            prompt += "\nA: {}\n\n".format(answer) if basic else "\nA: {}\n".format(answer)
        else:
            raise ValueError("The QA type is not supported!")
        return prompt