Commit e9cf3e9 (verified) · committed by zstanjj · 1 parent: 35b6576

Upload modeling_phi3.py

Files changed (1): modeling_phi3.py (+6, -65)

modeling_phi3.py CHANGED
@@ -95,66 +95,6 @@ class TokenIdNode(Node):
         self.prob = kwargs.get('prob', np.float32(0.0))


-def split_tree(soup: bs4.BeautifulSoup, max_node_words=0) -> List[Tuple[bs4.element.Tag, List[str], bool]]:
-    word_count = len(soup.get_text().split())
-    if word_count > max_node_words:
-        possible_trees = [(soup, [])]
-        target_trees = []  # [(tag, path, is_leaf)]
-        # split the entire DOM tree into subtrees, until each subtree holds fewer than max_node_words words
-        # find all possible trees
-        while True:
-            if len(possible_trees) == 0:
-                break
-            tree = possible_trees.pop(0)
-            tag_children = defaultdict(int)
-            bare_word_count = 0
-            # count child tags
-            for child in tree[0].contents:
-                if isinstance(child, bs4.element.Tag):
-                    tag_children[child.name] += 1
-            _tag_children = {k: 0 for k in tag_children.keys()}
-
-            # check if the tree can be split
-            for child in tree[0].contents:
-                if isinstance(child, bs4.element.Tag):
-                    # rename child tags that share a duplicate name
-                    if tag_children[child.name] > 1:
-                        new_name = f"{child.name}{_tag_children[child.name]}"
-                        new_tree = (child, tree[1] + [new_name])
-                        _tag_children[child.name] += 1
-                        child.name = new_name
-                    else:
-                        new_tree = (child, tree[1] + [child.name])
-                    word_count = len(child.get_text().split())
-                    # keep splitting nodes with more than max_node_words words while the recursion depth is less than 64
-                    if word_count > max_node_words and len(new_tree[1]) < 64:
-                        possible_trees.append(new_tree)
-                    else:
-                        target_trees.append((new_tree[0], new_tree[1], True))
-                else:
-                    bare_word_count += len(str(child).split())
-
-            # add leaf node
-            if len(tag_children) == 0:
-                target_trees.append((tree[0], tree[1], True))
-            # add node with more than max_node_words bare words
-            elif bare_word_count > max_node_words:
-                target_trees.append((tree[0], tree[1], False))
-    else:
-        soup_children = [c for c in soup.contents if isinstance(c, bs4.element.Tag)]
-        if len(soup_children) == 1:
-            target_trees = [(soup_children[0], [soup_children[0].name], True)]
-        else:
-            # add an html tag to wrap all children
-            new_soup = bs4.BeautifulSoup("", 'html.parser')
-            new_tag = new_soup.new_tag("html")
-            new_soup.append(new_tag)
-            for child in soup_children:
-                new_tag.append(child)
-            target_trees = [(new_tag, ["html"], True)]
-    return target_trees
-
-
 logger = logging.get_logger(__name__)

 # Transformers scans dependencies in the modeling file, causing issues on conditional loading. The regex only ignores try/catch blocks, but not if statements
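
For context, the helper removed above walks a BeautifulSoup DOM breadth-first and returns a list of (tag, path, is_leaf) tuples, one per block: the tag itself, the list of tag names leading to it from the root, and a flag marking whether it is a leaf block. A minimal sketch of that output shape on a tiny document follows; the variable names are illustrative only and not part of this file.

import bs4

# Tiny document. With a small max_node_words the removed split_tree keeps
# descending into children and emits one (tag, path, is_leaf) tuple per block;
# with a threshold larger than the document's word count it returns a single
# leaf rooted at <html>, which is the case sketched here.
html = "<html><body><p>hello world</p><div><span>more text here</span></div></body></html>"
soup = bs4.BeautifulSoup(html, "html.parser")

# Illustrative block tree for the large-threshold case: one entry whose path is
# the list of tag names from the root and whose flag marks it as a leaf block.
block_tree = [(soup.find("html"), ["html"], True)]

paths = [entry[1] for entry in block_tree]    # [["html"]]
is_leaf = [entry[2] for entry in block_tree]  # [True]
print(paths, is_leaf)
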
@@ -1887,6 +1827,7 @@ class PHI3ForHTMLTreeGeneration(Phi3PreTrainedModel):
                  tokenizer,
                  query: List[str],
                  htmls: List[List[str]],
+                 block_tree: List[Tuple],
                  **kwargs):
         max_seq_length = kwargs.pop("max_seq_length", 131072)
         def apply_html_tree_template(query, htmls):
@@ -1922,11 +1863,11 @@ class PHI3ForHTMLTreeGeneration(Phi3PreTrainedModel):
             soup.append(bs4.BeautifulSoup(html, 'html.parser'))

             token_id_paths = []
-            html_chunk_paths = split_tree(soup, max_node_words=self.max_node_words)
-            is_leaf = [p[2] for p in html_chunk_paths]
-            html_chunk_paths = [p[1] for p in html_chunk_paths]
+            _block_tree = block_tree[idx]
+            is_leaf = [p[2] for p in _block_tree]
+            _block_tree = [p[1] for p in _block_tree]

-            for path in html_chunk_paths:
+            for path in _block_tree:
                 path_str = "<" + "><".join(path) + ">"
                 token_ids = tokenizer.encode(path_str, add_special_tokens=False)
                 token_id_paths.append(token_ids)
@@ -1982,7 +1923,7 @@ class PHI3ForHTMLTreeGeneration(Phi3PreTrainedModel):

             res_html_refs.append({
                 "html": str(soup),
-                "paths": html_chunk_paths,
+                "paths": _block_tree,
                 "is_leaf": is_leaf,
                 "path_token_ids": token_id_paths,
                 "node_tree": list(TokenDotExporter(root, nodenamefunc=nodenamefunc))
 