File size: 3,216 Bytes
526964f 2d4c49d 9e6a5eb 526964f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import copy
def preprocess(example):
    """Linearize the table of ``example`` into one tagged string.

    The page title, section title and every cell of the table are written
    out in order, each wrapped in marker tags. Highlighted cells use
    ``<highlighted_cell>`` markers, all other cells use ``<c>``. Each cell
    is followed by the values of its heuristic column headers and then its
    heuristic row headers.

    Args:
        example: Mapping that is read for the keys ``'table'`` (a list of
            rows, each a list of cell dicts with ``'value'``,
            ``'is_header'`` and ``'column_span'``), ``'highlighted_cells'``
            (``[row_index, col_index]`` pairs; lists or tuples are both
            accepted, ``None`` means no highlighted cells),
            ``'table_page_title'`` and ``'table_section_title'`` (skipped
            when falsy).

    Returns:
        The same ``example`` object, with the linearized string stored
        under the ``'linearized_table'`` key.
    """

    def _add_adjusted_col_offsets(table):
        """Add adjusted column offsets to take into account multi-column cells."""
        adjusted_table = []
        for row in table:
            real_col_index = 0
            adjusted_row = []
            for cell in row:
                # Annotate a copy so the caller's table is left untouched.
                adjusted_cell = copy.deepcopy(cell)
                adjusted_cell["adjusted_col_start"] = real_col_index
                real_col_index += adjusted_cell["column_span"]
                # The end offset is exclusive: start + column_span.
                adjusted_cell["adjusted_col_end"] = real_col_index
                adjusted_row.append(adjusted_cell)
            adjusted_table.append(adjusted_row)
        return adjusted_table

    def _get_heuristic_row_headers(adjusted_table, row_index, col_index):
        """Return the header cells located before ``col_index`` in the row."""
        return [
            cell
            for cell in adjusted_table[row_index][:col_index]
            if cell["is_header"]
        ]

    def _get_heuristic_col_headers(adjusted_table, row_index, col_index):
        """Return header cells of earlier rows overlapping the cell's columns."""
        target = adjusted_table[row_index][col_index]
        target_start = target["adjusted_col_start"]
        target_end = target["adjusted_col_end"]
        col_headers = []
        for row in adjusted_table[:row_index]:
            for cell in row:
                # Half-open intervals [start, end) overlap iff each one
                # starts before the other one ends.
                overlaps = (cell["adjusted_col_start"] < target_end and
                            cell["adjusted_col_end"] > target_start)
                if overlaps and cell["is_header"]:
                    col_headers.append(cell)
        return col_headers

    table = example['table']
    cell_indices = example["highlighted_cells"]

    # Build the lookup once: set membership is O(1) per cell, and converting
    # to tuples makes list-style and tuple-style index pairs equivalent.
    highlighted = set()
    if cell_indices is not None:
        highlighted = {tuple(index) for index in cell_indices}

    # Collect the fragments and join once instead of growing a string.
    parts = []
    if example['table_page_title']:
        parts.append(
            "<page_title> " + example['table_page_title'] + " </page_title> ")
    if example['table_section_title']:
        parts.append(
            "<section_title> " + example['table_section_title']
            + " </section_title> ")
    parts.append("<table> ")

    adjusted_table = _add_adjusted_col_offsets(table)
    for r_index, row in enumerate(table):
        parts.append("<row> ")
        for c_index, cell in enumerate(row):
            row_headers = _get_heuristic_row_headers(
                adjusted_table, r_index, c_index)
            col_headers = _get_heuristic_col_headers(
                adjusted_table, r_index, c_index)

            # Distinguish between highlighted and non-highlighted cells.
            if (r_index, c_index) in highlighted:
                start_cell_marker = "<highlighted_cell> "
                end_cell_marker = "</highlighted_cell> "
            else:
                start_cell_marker = "<c> "
                end_cell_marker = "</c> "

            # The value of the cell.
            parts.append(start_cell_marker + cell["value"] + " ")
            # All the column headers associated with this cell.
            for col_header in col_headers:
                parts.append(
                    "<col_header> " + col_header["value"] + " </col_header> ")
            # All the row headers associated with this cell.
            for row_header in row_headers:
                parts.append(
                    "<row_header> " + row_header["value"] + " </row_header> ")
            parts.append(end_cell_marker)
        parts.append("</row> ")
    parts.append("</table>")

    example['linearized_table'] = '<s>' + "".join(parts) + '\n' + '\n'
    return example
|