zstanjj committed on
Commit
35b6576
·
verified ·
1 Parent(s): d2620d0

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +9 -2
README.md CHANGED
@@ -67,6 +67,11 @@ document.write("Hello World!");
67
  </html>
68
  """
69
 
 
 
 
 
 
70
  simplified_html = clean_html(html)
71
  print(simplified_html)
72
 
@@ -80,7 +85,6 @@ print(simplified_html)
80
  # </html>
81
  ```
82
 
83
-
84
  ### 🔧 Configure Pruning Parameters
85
 
86
  The example HTML document is quite short. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:
@@ -107,6 +111,7 @@ MAX_CONTEXT_WINDOW_GEN = 32
107
  from htmlrag import build_block_tree
108
 
109
  block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
 
110
  for block in block_tree:
111
  print("Block Content: ", block[0])
112
  print("Block Path: ", block[1])
@@ -175,6 +180,7 @@ import torch
175
 
176
  # construct a finer block tree
177
  block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
 
178
  for block in block_tree:
179
  print("Block Content: ", block[0])
180
  print("Block Path: ", block[1])
@@ -189,7 +195,7 @@ for block in block_tree:
189
  # Block Path: ['html', 'p']
190
  # Is Leaf: True
191
 
192
- ckpt_path = "zstanjj/HTML-Pruner-Llama-1B"
193
  if torch.cuda.is_available():
194
  device="cuda"
195
  else:
@@ -206,6 +212,7 @@ print(pruned_html)
206
  # <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
207
  ```
208
 
 
209
 
210
 
211
  ## Results
 
67
  </html>
68
  """
69
 
70
+ # Alternatively, you can read HTML files and merge them:
71
+ # html_files=["/path/to/html/file1.html", "/path/to/html/file2.html"]
72
+ # htmls=[open(file).read() for file in html_files]
73
+ # html = "\n".join(htmls)
74
+
75
  simplified_html = clean_html(html)
76
  print(simplified_html)
77
 
 
85
  # </html>
86
  ```
87
 
 
88
  ### 🔧 Configure Pruning Parameters
89
 
90
  The example HTML document is quite short. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:
 
111
  from htmlrag import build_block_tree
112
 
113
  block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
114
+ # block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED, zh_char=True) # for Chinese text
115
  for block in block_tree:
116
  print("Block Content: ", block[0])
117
  print("Block Path: ", block[1])
 
180
 
181
  # construct a finer block tree
182
  block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
183
+ # block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
184
  for block in block_tree:
185
  print("Block Content: ", block[0])
186
  print("Block Path: ", block[1])
 
195
  # Block Path: ['html', 'p']
196
  # Is Leaf: True
197
 
198
+ ckpt_path = "zstanjj/HTML-Pruner-Phi-3.8B"
199
  if torch.cuda.is_available():
200
  device="cuda"
201
  else:
 
212
  # <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
213
  ```
214
 
215
+ ---
216
 
217
 
218
  ## Results