bglearning committed
Commit f38dbc0
1 Parent(s): b60285f

Styling fixes

Files changed (2)
  1. app.py +14 -11
  2. tapas_visualizer.py +55 -40
app.py CHANGED
@@ -1,4 +1,4 @@
-import io
+import io
 
 import pandas as pd
 import streamlit as st
@@ -7,21 +7,19 @@ from transformers import AutoTokenizer
 
 from tapas_visualizer import TapasVisualizer
 
-st.set_page_config(page_title="Tapas Tokenizer", page_icon='‍🍽️', layout="wide")
+st.set_page_config(page_title="Tapas Tokenizer", page_icon="‍🍽️", layout="wide")
+
 
 def set_file_input():
     st.session_state.input_stream = "file"
 
+
 def set_text_input():
     st.session_state.input_stream = "text"
 
 
 def main():
-
-    models = [
-        "google/tapas-base",
-        "deepset/tapas-large-nq-hn-reader"
-    ]
+    models = ["google/tapas-base", "deepset/tapas-large-nq-hn-reader"]
 
     @st.cache()
     def load_tokenizer():
@@ -31,7 +29,9 @@ def main():
     col1, col2 = st.columns([1, 2])
     with col1:
         selected_model = st.selectbox("Select a tokenizer", models, key=1)
-        text = st.text_area(label="", placeholder="Table to tokenize; csv", on_change=set_text_input)
+        text = st.text_area(
+            label="", placeholder="Table to tokenize; csv", on_change=set_text_input
+        )
         uploaded_file = st.file_uploader("(Or) Choose a file", on_change=set_file_input)
         button_clicked = st.button("Tokenize")
 
@@ -42,7 +42,10 @@ def main():
     if text or uploaded_file or button_clicked:
         df: pd.DataFrame
 
-        if 'input_stream' not in st.session_state or st.session_state.input_stream == "text":
+        if (
+            "input_stream" not in st.session_state
+            or st.session_state.input_stream == "text"
+        ):
             df = pd.read_csv(io.StringIO(text), sep=",")
         elif st.session_state.input_stream == "file":
             df = pd.read_csv(uploaded_file)
@@ -51,5 +54,5 @@ def main():
         st.components.v1.html(visualizer(df.astype(str)), height=1500)
 
 
-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    main()
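For reference, a minimal sketch of the text-input path that the hunks above reformat: the pasted CSV is read through an in-memory buffer and cast to strings before being handed to the visualizer. The sample table content below is made up for illustration; it is not part of this commit.

import io

import pandas as pd

# Hypothetical pasted content standing in for the st.text_area value.
text = "Player,Goals\nMessi,672\nRonaldo,674"

# Same parsing as in main(): read the pasted CSV from an in-memory buffer,
# then cast every cell to str, which is what TAPAS expects for tables.
df = pd.read_csv(io.StringIO(text), sep=",")
df = df.astype(str)
print(df)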
tapas_visualizer.py CHANGED
@@ -65,8 +65,8 @@ class TapasVisualizer:
             str: html with styling for the tokens
         """
         if len(tokens) == 0:
-            print(f'Empty tokens for: {org_text}')
-            return ''
+            print(f"Empty tokens for: {org_text}")
+            return ""
 
         cur_token_id = 0
         cur_token = self.normalize_token_str(tokens[cur_token_id])
@@ -77,17 +77,19 @@ class TapasVisualizer:
         spans = []
 
         while next_start < len(org_text):
-            candidate = org_text[next_start: next_start + len(cur_token)]
+            candidate = org_text[next_start : next_start + len(cur_token)]
 
             # The tokenizer performs lowercasing; so check against lowercase
             if candidate.lower() == cur_token:
                 if last_end != next_start:
                     # There was token-less text (probably whitespace)
                     # in the middle
-                    spans.append(self.style_span(org_text[last_end: next_start], ['non-token']))
+                    spans.append(
+                        self.style_span(org_text[last_end:next_start], ["non-token"])
+                    )
 
-                odd_or_even = 'even-token' if cur_token_id % 2 == 0 else 'odd-token'
-                spans.append(self.style_span(candidate, ['token', odd_or_even]))
+                odd_or_even = "even-token" if cur_token_id % 2 == 0 else "odd-token"
+                spans.append(self.style_span(candidate, ["token", odd_or_even]))
                 next_start += len(cur_token)
                 last_end = next_start
                 cur_token_id += 1
@@ -96,20 +98,21 @@ class TapasVisualizer:
                 cur_token = self.normalize_token_str(tokens[cur_token_id])
             else:
                 next_start += 1
-
+
         if last_end != len(org_text):
-            spans.append(self.style_span(org_text[last_end: next_start], ['non-token']))
+            spans.append(self.style_span(org_text[last_end:next_start], ["non-token"]))
 
         return spans
 
-    def cells_to_html(self,
-                      cell_vals: List[List[str]],
-                      cell_tokens: Dict,
-                      row_id_start: int=0,
-                      cell_element: str="td",
-                      cumulative_cnt: int=0,
-                      table_html: str="") -> str:
-
+    def cells_to_html(
+        self,
+        cell_vals: List[List[str]],
+        cell_tokens: Dict,
+        row_id_start: int = 0,
+        cell_element: str = "td",
+        cumulative_cnt: int = 0,
+        table_html: str = "",
+    ) -> str:
         for row_id, row in enumerate(cell_vals, start=row_id_start):
             row_html = ""
             row_token_cnt = 0
@@ -120,42 +123,54 @@ class TapasVisualizer:
                 row_html += f"<{cell_element}>{cell_html}</{cell_element}>"
                 row_token_cnt += len(cur_cell_tokens)
             cumulative_cnt += row_token_cnt
-            cnt_html = (f'<td style="border: none;" align="right">{self.style_span(str(cumulative_cnt), ["non-token", "count"])}</td>'
-                        f'<td style="border: none;" align="right">{self.style_span(f"<+{row_token_cnt}", ["non-token", "count"])}</td>')
+            cnt_html = (
+                f'<td style="border: none;" align="right">'
+                f'{self.style_span(str(cumulative_cnt), ["non-token", "count"])}'
+                '</td>'
+                f'<td style="border: none;" align="right">'
+                f'{self.style_span(f"<+{row_token_cnt}", ["non-token", "count"])}'
+                '</td>'
+            )
             row_html = cnt_html + row_html
-            table_html += f'<tr>{row_html}</tr>'
+            table_html += f"<tr>{row_html}</tr>"
 
         return table_html, cumulative_cnt
 
-
     def __call__(self, table: pd.DataFrame) -> Any:
         tokenized = self.tokenizer(table)
 
         cell_tokens = defaultdict(list)
 
-        for id_ind, input_id in enumerate(tokenized['input_ids']):
+        for id_ind, input_id in enumerate(tokenized["input_ids"]):
            input_id = int(input_id)
-            # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation' not required
-            segment_id, col_id, row_id, *_ = tokenized['token_type_ids'][id_ind]
+            # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation'
+            # not required
+            segment_id, col_id, row_id, *_ = tokenized["token_type_ids"][id_ind]
             token_text = self.tokenizer._convert_id_to_token(input_id)
             if int(segment_id) == 1:
                 cell_tokens[(row_id, col_id)].append(token_text)
 
-        table_html = '<tr><td style="border: none;" colspan="2" align="left">#Tokens</td></tr>'
-
-        table_html, cumulative_cnt = self.cells_to_html(cell_vals=[table.columns],
-                                                        cell_tokens=cell_tokens,
-                                                        row_id_start=0,
-                                                        cell_element="th",
-                                                        cumulative_cnt=0,
-                                                        table_html=table_html)
-
-        table_html, cumulative_cnt = self.cells_to_html(cell_vals=table.values,
-                                                        cell_tokens=cell_tokens,
-                                                        row_id_start=1,
-                                                        cell_element="td",
-                                                        cumulative_cnt=cumulative_cnt,
-                                                        table_html=table_html)
-
-        table_html = f'<table>{table_html}</table>'
+        table_html = (
+            '<tr><td style="border: none;" colspan="2" align="left">#Tokens</td></tr>'
+        )
+
+        table_html, cumulative_cnt = self.cells_to_html(
+            cell_vals=[table.columns],
+            cell_tokens=cell_tokens,
+            row_id_start=0,
+            cell_element="th",
+            cumulative_cnt=0,
+            table_html=table_html,
+        )
+
+        table_html, cumulative_cnt = self.cells_to_html(
+            cell_vals=table.values,
+            cell_tokens=cell_tokens,
+            row_id_start=1,
+            cell_element="td",
+            cumulative_cnt=cumulative_cnt,
+            table_html=table_html,
+        )
+
+        table_html = f"<table>{table_html}</table>"
         return HTMLBody(table_html)
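Outside of Streamlit, the tokenizer/visualizer pairing touched by this commit can be exercised roughly as below. The TapasVisualizer constructor is not part of this diff, so passing the tokenizer as its only argument is an assumption based on how app.py wires load_tokenizer and visualizer together; the sample table is made up.

import pandas as pd
from transformers import AutoTokenizer

from tapas_visualizer import TapasVisualizer

# "google/tapas-base" is the first entry in the app's model list.
tokenizer = AutoTokenizer.from_pretrained("google/tapas-base")

# Assumption: TapasVisualizer wraps the tokenizer; its constructor is defined
# outside the hunks shown in this commit.
visualizer = TapasVisualizer(tokenizer)

table = pd.DataFrame({"Player": ["Messi", "Ronaldo"], "Goals": ["672", "674"]})

# __call__ tokenizes the table and returns styled HTML, the same value that
# app.py passes to st.components.v1.html.
html = visualizer(table.astype(str))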