bglearning committed on
Commit
f9dd31c
·
1 Parent(s): 71b2a17

Refactor table_html creation

Browse files
Files changed (1) hide show
  1. tapas_visualizer.py +42 -34
tapas_visualizer.py CHANGED
@@ -1,5 +1,5 @@
1
  import os
2
- from typing import Any, List
3
 
4
  from collections import defaultdict
5
 
@@ -102,6 +102,31 @@ class TapasVisualizer:
102
 
103
  return spans
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  def __call__(self, table: pd.DataFrame) -> Any:
107
  tokenized = self.tokenizer(table)
@@ -113,39 +138,22 @@ class TapasVisualizer:
113
  # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation' not required
114
  segment_id, col_id, row_id, *_ = tokenized['token_type_ids'][id_ind]
115
  token_text = self.tokenizer._convert_id_to_token(input_id)
116
- cell_tokens[(row_id, col_id)].append(token_text)
117
-
118
- cumulative_cnt = 0
119
- header_row_html = ""
120
- header_row_token_cnt = 0
121
- for col_id, col in enumerate(table.columns, start=1):
122
- cur_cell_tokens = cell_tokens[0, col_id]
123
- span_htmls = self.text_to_html(col, cur_cell_tokens)
124
- cell_html = "".join(span_htmls)
125
- header_row_html += f"<th>{cell_html}</th>"
126
- header_row_token_cnt += len(cur_cell_tokens)
127
- cumulative_cnt += header_row_token_cnt
128
- cnt_str = f'{header_row_token_cnt} | {cumulative_cnt}'
129
- header_row_html += f'<th style="border: none;">{self.style_span(cnt_str, ["non-token", "count"])}</th>'
130
- header_row_html = f'<tr>{header_row_html}</tr>'
131
-
132
- table_vals = table.values
133
-
134
- table_html = header_row_html
135
-
136
- for row_id, row in enumerate(table_vals, start=1):
137
- row_html = ""
138
- row_token_cnt = 0
139
- for col_id, cell in enumerate(row, start=1):
140
- cur_cell_tokens = cell_tokens[(row_id, col_id)]
141
- span_htmls = self.text_to_html(cell, cur_cell_tokens)
142
- cell_html = "".join(span_htmls)
143
- row_html += f"<td>{cell_html}</td>"
144
- row_token_cnt += len(cur_cell_tokens)
145
- cumulative_cnt += row_token_cnt
146
- cnt_str = f'{row_token_cnt} | {cumulative_cnt}'
147
- row_html += f'<td style="border: none;">{self.style_span(cnt_str, ["non-token", "count"])}</td>'
148
- table_html += f'<tr>{row_html}</tr>'
149
 
150
  table_html = f'<table>{table_html}</table>'
151
  return HTMLBody(table_html)
 
1
  import os
2
+ from typing import Any, List, Dict
3
 
4
  from collections import defaultdict
5
 
 
102
 
103
  return spans
104
 
105
def cells_to_html(self,
                  cell_vals: List[List[str]],
                  cell_tokens: Dict,
                  row_id_start: int = 0,
                  cell_element: str = "td",
                  cumulative_cnt: int = 0,
                  table_html: str = "") -> "tuple[str, int]":
    """Append one ``<tr>`` per row of ``cell_vals`` to ``table_html``.

    Every cell is rendered through ``self.text_to_html`` and wrapped in
    ``<cell_element>``.  Two extra borderless, right-aligned ``<td>``
    cells are appended to each row: the number of tokens in that row and
    the running token total.

    Args:
        cell_vals: Rows of cell values.  Any iterable of iterables works;
            the row index starts at ``row_id_start`` and the column index
            at 1 when looking up ``cell_tokens``.
            NOTE(review): the visible callers pass ``[table.columns]`` and
            ``table.values`` rather than literal lists of strings.
        cell_tokens: Mapping from ``(row_id, col_id)`` to that cell's
            list of tokens.  It is indexed with ``[]``, so it must either
            contain every key or supply defaults itself.
        row_id_start: Row index assigned to the first row of ``cell_vals``.
        cell_element: Tag name used for the value cells (``"td"``/``"th"``).
        cumulative_cnt: Token total carried in from previously rendered rows.
        table_html: Already rendered HTML that the new rows are appended to.

    Returns:
        A ``(table_html, cumulative_cnt)`` pair: the extended HTML and the
        updated running token total.
    """
    # Collect fragments and join once instead of growing strings with +=.
    html_parts = [table_html]

    for row_id, row in enumerate(cell_vals, start=row_id_start):
        row_parts = []
        row_token_cnt = 0

        for col_id, cell in enumerate(row, start=1):
            cur_cell_tokens = cell_tokens[(row_id, col_id)]
            cell_html = "".join(self.text_to_html(cell, cur_cell_tokens))
            row_parts.append(f"<{cell_element}>{cell_html}</{cell_element}>")
            row_token_cnt += len(cur_cell_tokens)

        cumulative_cnt += row_token_cnt

        # Count cells always use <td>, even in a header row, and get a
        # fresh class list per call in case style_span mutates it.
        for cnt in (row_token_cnt, cumulative_cnt):
            cnt_html = self.style_span(str(cnt), ["non-token", "count"])
            row_parts.append(
                f'<td style="border: none;" align="right">{cnt_html}</td>'
            )

        row_html = "".join(row_parts)
        html_parts.append(f'<tr>{row_html}</tr>')

    return "".join(html_parts), cumulative_cnt
129
+
130
 
131
  def __call__(self, table: pd.DataFrame) -> Any:
132
  tokenized = self.tokenizer(table)
 
138
  # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation' not required
139
  segment_id, col_id, row_id, *_ = tokenized['token_type_ids'][id_ind]
140
  token_text = self.tokenizer._convert_id_to_token(input_id)
141
+ if int(segment_id) == 1:
142
+ cell_tokens[(row_id, col_id)].append(token_text)
143
+
144
+ table_html, cumulative_cnt = self.cells_to_html(cell_vals=[table.columns],
145
+ cell_tokens=cell_tokens,
146
+ row_id_start=0,
147
+ cell_element="th",
148
+ cumulative_cnt=0,
149
+ table_html="")
150
+
151
+ table_html, cumulative_cnt = self.cells_to_html(cell_vals=table.values,
152
+ cell_tokens=cell_tokens,
153
+ row_id_start=1,
154
+ cell_element="td",
155
+ cumulative_cnt=cumulative_cnt,
156
+ table_html=table_html)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  table_html = f'<table>{table_html}</table>'
159
  return HTMLBody(table_html)