Spaces: Runtime error

Commit f38dbc0 · Parent: b60285f
Styling fixes

Files changed:
- app.py: +14 -11
- tapas_visualizer.py: +55 -40
app.py
CHANGED
@@ -1,4 +1,4 @@
-import io
+import io

 import pandas as pd
 import streamlit as st
@@ -7,21 +7,19 @@ from transformers import AutoTokenizer

 from tapas_visualizer import TapasVisualizer

-st.set_page_config(page_title="Tapas Tokenizer", page_icon=
+st.set_page_config(page_title="Tapas Tokenizer", page_icon="🍽️", layout="wide")
+

 def set_file_input():
     st.session_state.input_stream = "file"

+
 def set_text_input():
     st.session_state.input_stream = "text"


 def main():
-
-    models = [
-        "google/tapas-base",
-        "deepset/tapas-large-nq-hn-reader"
-    ]
+    models = ["google/tapas-base", "deepset/tapas-large-nq-hn-reader"]

     @st.cache()
     def load_tokenizer():
@@ -31,7 +29,9 @@ def main():
     col1, col2 = st.columns([1, 2])
     with col1:
         selected_model = st.selectbox("Select a tokenizer", models, key=1)
-        text = st.text_area(
+        text = st.text_area(
+            label="", placeholder="Table to tokenize; csv", on_change=set_text_input
+        )
         uploaded_file = st.file_uploader("(Or) Choose a file", on_change=set_file_input)
         button_clicked = st.button("Tokenize")

@@ -42,7 +42,10 @@ def main():
     if text or uploaded_file or button_clicked:
         df: pd.DataFrame

-        if
+        if (
+            "input_stream" not in st.session_state
+            or st.session_state.input_stream == "text"
+        ):
            df = pd.read_csv(io.StringIO(text), sep=",")
         elif st.session_state.input_stream == "file":
            df = pd.read_csv(uploaded_file)
@@ -51,5 +54,5 @@ def main():
        st.components.v1.html(visualizer(df.astype(str)), height=1500)


-if __name__ ==
-    main()
+if __name__ == "__main__":
+    main()
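For context, the text-input branch above parses whatever is pasted into the text area as a comma-separated table. A minimal sketch of that flow, with made-up example data (the table contents and column names below are illustrative, not taken from the app):

import io

import pandas as pd

# Hypothetical CSV a user might paste into the text area
# ("Table to tokenize; csv"); the values are invented for illustration.
text = "Actors,Age,Number of movies\nBrad Pitt,56,87\nLeonardo Di Caprio,45,53"

# Mirrors app.py's text branch: build a DataFrame from the pasted text,
# then cast to str before handing it to the visualizer.
df = pd.read_csv(io.StringIO(text), sep=",")
print(df.astype(str))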
tapas_visualizer.py
CHANGED
@@ -65,8 +65,8 @@ class TapasVisualizer:
             str: html with styling for the tokens
         """
         if len(tokens) == 0:
-            print(f
-            return
+            print(f"Empty tokens for: {org_text}")
+            return ""

         cur_token_id = 0
         cur_token = self.normalize_token_str(tokens[cur_token_id])
@@ -77,17 +77,19 @@ class TapasVisualizer:
         spans = []

         while next_start < len(org_text):
-            candidate = org_text[next_start: next_start + len(cur_token)]
+            candidate = org_text[next_start : next_start + len(cur_token)]

             # The tokenizer performs lowercasing; so check against lowercase
             if candidate.lower() == cur_token:
                 if last_end != next_start:
                     # There was token-less text (probably whitespace)
                     # in the middle
-                    spans.append(
+                    spans.append(
+                        self.style_span(org_text[last_end:next_start], ["non-token"])
+                    )

-                odd_or_even =
-                spans.append(self.style_span(candidate, [
+                odd_or_even = "even-token" if cur_token_id % 2 == 0 else "odd-token"
+                spans.append(self.style_span(candidate, ["token", odd_or_even]))
                 next_start += len(cur_token)
                 last_end = next_start
                 cur_token_id += 1
@@ -96,20 +98,21 @@ class TapasVisualizer:
                 cur_token = self.normalize_token_str(tokens[cur_token_id])
             else:
                 next_start += 1
-
+
         if last_end != len(org_text):
-            spans.append(self.style_span(org_text[last_end:
+            spans.append(self.style_span(org_text[last_end:next_start], ["non-token"]))

         return spans

-    def cells_to_html(
-
-
-
-
-
-
-
+    def cells_to_html(
+        self,
+        cell_vals: List[List[str]],
+        cell_tokens: Dict,
+        row_id_start: int = 0,
+        cell_element: str = "td",
+        cumulative_cnt: int = 0,
+        table_html: str = "",
+    ) -> str:
         for row_id, row in enumerate(cell_vals, start=row_id_start):
             row_html = ""
             row_token_cnt = 0
@@ -120,42 +123,54 @@ class TapasVisualizer:
                 row_html += f"<{cell_element}>{cell_html}</{cell_element}>"
                 row_token_cnt += len(cur_cell_tokens)
             cumulative_cnt += row_token_cnt
-            cnt_html = (
-
+            cnt_html = (
+                f'<td style="border: none;" align="right">'
+                f'{self.style_span(str(cumulative_cnt), ["non-token", "count"])}'
+                '</td>'
+                f'<td style="border: none;" align="right">'
+                f'{self.style_span(f"<+{row_token_cnt}", ["non-token", "count"])}'
+                '</td>'
+            )
             row_html = cnt_html + row_html
-            table_html += f
+            table_html += f"<tr>{row_html}</tr>"

         return table_html, cumulative_cnt

-
     def __call__(self, table: pd.DataFrame) -> Any:
         tokenized = self.tokenizer(table)

         cell_tokens = defaultdict(list)

-        for id_ind, input_id in enumerate(tokenized[
+        for id_ind, input_id in enumerate(tokenized["input_ids"]):
             input_id = int(input_id)
-            # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation'
-
+            # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation'
+            # not required
+            segment_id, col_id, row_id, *_ = tokenized["token_type_ids"][id_ind]
             token_text = self.tokenizer._convert_id_to_token(input_id)
             if int(segment_id) == 1:
                 cell_tokens[(row_id, col_id)].append(token_text)

-        table_html =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        table_html = (
+            '<tr><td style="border: none;" colspan="2" align="left">#Tokens</td></tr>'
+        )
+
+        table_html, cumulative_cnt = self.cells_to_html(
+            cell_vals=[table.columns],
+            cell_tokens=cell_tokens,
+            row_id_start=0,
+            cell_element="th",
+            cumulative_cnt=0,
+            table_html=table_html,
+        )
+
+        table_html, cumulative_cnt = self.cells_to_html(
+            cell_vals=table.values,
+            cell_tokens=cell_tokens,
+            row_id_start=1,
+            cell_element="td",
+            cumulative_cnt=cumulative_cnt,
+            table_html=table_html,
+        )
+
+        table_html = f"<table>{table_html}</table>"
         return HTMLBody(table_html)
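For reference, a minimal sketch of driving the visualizer outside Streamlit. The constructor call below is an assumption (the diff only shows app.py handing the resulting HTML to st.components.v1.html), and the example table is invented:

import pandas as pd
from transformers import AutoTokenizer

from tapas_visualizer import TapasVisualizer

# "google/tapas-base" is one of the models offered in app.py's selectbox.
tokenizer = AutoTokenizer.from_pretrained("google/tapas-base")
visualizer = TapasVisualizer(tokenizer)  # assumed constructor: wraps the tokenizer

# Invented example table; TAPAS expects string cells, hence astype(str) as in app.py.
table = pd.DataFrame(
    {"Actors": ["Brad Pitt", "Leonardo Di Caprio"], "Number of movies": ["87", "53"]}
)

html = visualizer(table.astype(str))  # __call__ tokenizes the table and styles each token
print(html[:200])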