Spaces:
Runtime error
Runtime error
Commit
·
f9dd31c
1
Parent(s):
71b2a17
Refactor table_html creation
Browse files- tapas_visualizer.py +42 -34
tapas_visualizer.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import os
|
2 |
-
from typing import Any, List
|
3 |
|
4 |
from collections import defaultdict
|
5 |
|
@@ -102,6 +102,31 @@ class TapasVisualizer:
|
|
102 |
|
103 |
return spans
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
def __call__(self, table: pd.DataFrame) -> Any:
|
107 |
tokenized = self.tokenizer(table)
|
@@ -113,39 +138,22 @@ class TapasVisualizer:
|
|
113 |
# 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation' not required
|
114 |
segment_id, col_id, row_id, *_ = tokenized['token_type_ids'][id_ind]
|
115 |
token_text = self.tokenizer._convert_id_to_token(input_id)
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
table_vals = table.values
|
133 |
-
|
134 |
-
table_html = header_row_html
|
135 |
-
|
136 |
-
for row_id, row in enumerate(table_vals, start=1):
|
137 |
-
row_html = ""
|
138 |
-
row_token_cnt = 0
|
139 |
-
for col_id, cell in enumerate(row, start=1):
|
140 |
-
cur_cell_tokens = cell_tokens[(row_id, col_id)]
|
141 |
-
span_htmls = self.text_to_html(cell, cur_cell_tokens)
|
142 |
-
cell_html = "".join(span_htmls)
|
143 |
-
row_html += f"<td>{cell_html}</td>"
|
144 |
-
row_token_cnt += len(cur_cell_tokens)
|
145 |
-
cumulative_cnt += row_token_cnt
|
146 |
-
cnt_str = f'{row_token_cnt} | {cumulative_cnt}'
|
147 |
-
row_html += f'<td style="border: none;">{self.style_span(cnt_str, ["non-token", "count"])}</td>'
|
148 |
-
table_html += f'<tr>{row_html}</tr>'
|
149 |
|
150 |
table_html = f'<table>{table_html}</table>'
|
151 |
return HTMLBody(table_html)
|
|
|
1 |
import os
|
2 |
+
from typing import Any, Dict, List, Tuple
|
3 |
|
4 |
from collections import defaultdict
|
5 |
|
|
|
102 |
|
103 |
return spans
|
104 |
|
105 |
+
def cells_to_html(self,
                  cell_vals: List[List[str]],
                  cell_tokens: Dict,
                  row_id_start: int = 0,
                  cell_element: str = "td",
                  cumulative_cnt: int = 0,
                  table_html: str = "") -> Tuple[str, int]:
    """Render rows of cells as ``<tr>`` elements appended to ``table_html``.

    Each cell is passed to ``self.text_to_html`` together with the tokens
    stored for it, and the returned pieces are joined and wrapped in
    ``<cell_element>``. Two extra borderless, right-aligned ``<td>`` cells
    are added at the end of every row: the number of tokens in that row and
    the running total of tokens so far.

    Args:
        cell_vals: Rows of cell values to render.
        cell_tokens: Mapping from ``(row_id, col_id)`` to the list of tokens
            for that cell. Looked up by indexing, so a plain ``dict`` must
            contain every rendered cell (presumably a ``defaultdict(list)``
            in practice -- confirm against the caller).
        row_id_start: Row id assigned to the first row of ``cell_vals``;
            later rows count up from it. Column ids always start at 1.
        cell_element: HTML tag name used for the value cells, e.g. ``"td"``
            or ``"th"``.
        cumulative_cnt: Token total carried over from previously rendered
            rows.
        table_html: HTML produced so far; new rows are appended to it.

    Returns:
        A ``(table_html, cumulative_cnt)`` tuple: the extended HTML string
        and the updated running token total.
    """
    count_cell_open = '<td style="border: none;" align="right">'
    # Collect fragments and join once instead of growing a string with +=.
    html_parts = [table_html]

    for row_id, row in enumerate(cell_vals, start=row_id_start):
        row_parts = []
        row_token_cnt = 0
        for col_id, cell in enumerate(row, start=1):
            cur_cell_tokens = cell_tokens[(row_id, col_id)]
            cell_html = "".join(self.text_to_html(cell, cur_cell_tokens))
            row_parts.append(f"<{cell_element}>{cell_html}</{cell_element}>")
            row_token_cnt += len(cur_cell_tokens)
        cumulative_cnt += row_token_cnt

        # Per-row and running token counts are always rendered as <td>,
        # regardless of `cell_element`. A fresh class list is passed on each
        # call, matching the original behavior.
        for count in (row_token_cnt, cumulative_cnt):
            count_span = self.style_span(str(count), ["non-token", "count"])
            row_parts.append(f"{count_cell_open}{count_span}</td>")

        html_parts.append(f"<tr>{''.join(row_parts)}</tr>")

    return "".join(html_parts), cumulative_cnt
|
129 |
+
|
130 |
|
131 |
def __call__(self, table: pd.DataFrame) -> Any:
|
132 |
tokenized = self.tokenizer(table)
|
|
|
138 |
# 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation' not required
|
139 |
segment_id, col_id, row_id, *_ = tokenized['token_type_ids'][id_ind]
|
140 |
token_text = self.tokenizer._convert_id_to_token(input_id)
|
141 |
+
if int(segment_id) == 1:
|
142 |
+
cell_tokens[(row_id, col_id)].append(token_text)
|
143 |
+
|
144 |
+
table_html, cumulative_cnt = self.cells_to_html(cell_vals=[table.columns],
|
145 |
+
cell_tokens=cell_tokens,
|
146 |
+
row_id_start=0,
|
147 |
+
cell_element="th",
|
148 |
+
cumulative_cnt=0,
|
149 |
+
table_html="")
|
150 |
+
|
151 |
+
table_html, cumulative_cnt = self.cells_to_html(cell_vals=table.values,
|
152 |
+
cell_tokens=cell_tokens,
|
153 |
+
row_id_start=1,
|
154 |
+
cell_element="td",
|
155 |
+
cumulative_cnt=cumulative_cnt,
|
156 |
+
table_html=table_html)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
table_html = f'<table>{table_html}</table>'
|
159 |
return HTMLBody(table_html)
|