zstanjj committed · Commit d2620d0 · verified · 1 Parent(s): 5b0bdbe

Update README.md

Files changed (1):
  1. README.md +53 -19

README.md CHANGED
@@ -81,12 +81,32 @@ print(simplified_html)
 ```
 
 
+### 🔧 Configure Pruning Parameters
+
+The example HTML document is rather short. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:
+```python
+# Maximum number of words in a node when constructing the block tree for pruning with the embedding model
+MAX_NODE_WORDS_EMBED = 10
+# MAX_NODE_WORDS_EMBED = 256  # a recommended setting for real-world HTML documents
+# Maximum number of tokens in the output HTML document pruned with the embedding model
+MAX_CONTEXT_WINDOW_EMBED = 60
+# MAX_CONTEXT_WINDOW_EMBED = 6144  # a recommended setting for real-world HTML documents
+# Maximum number of words in a node when constructing the block tree for pruning with the generative model
+MAX_NODE_WORDS_GEN = 5
+# MAX_NODE_WORDS_GEN = 128  # a recommended setting for real-world HTML documents
+# Maximum number of tokens in the output HTML document pruned with the generative model
+MAX_CONTEXT_WINDOW_GEN = 32
+# MAX_CONTEXT_WINDOW_GEN = 4096  # a recommended setting for real-world HTML documents
+```
+
+
+
 ### 🌲 Build Block Tree
 
 ```python
 from htmlrag import build_block_tree
 
-block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=10)
+block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
 for block in block_tree:
     print("Block Content: ", block[0])
     print("Block Path: ", block[1])
@@ -114,8 +134,21 @@ for block in block_tree:
 ```python
 from htmlrag import EmbedHTMLPruner
 
-embed_html_pruner = EmbedHTMLPruner(embed_model="bm25")
-block_rankings = embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
+embed_model = "/train_data_load/huggingface/tjj_hf/bge-large-en/"
+query_instruction_for_retrieval = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
+embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=True, query_instruction_for_retrieval=query_instruction_for_retrieval)
+# Alternatively, you can init a remote TEI model; refer to https://github.com/huggingface/text-embeddings-inference.
+# tei_endpoint = "http://YOUR_TEI_ENDPOINT"
+# embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=False, query_instruction_for_retrieval=query_instruction_for_retrieval, endpoint=tei_endpoint)
+block_rankings = embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
+print(block_rankings)
+
+# [0, 2, 1]
+
+# Alternatively, you can use BM25 to rank the blocks
+from htmlrag import BM25HTMLPruner
+bm25_html_pruner = BM25HTMLPruner()
+block_rankings = bm25_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
 print(block_rankings)
 
 # [0, 2, 1]
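This hunk offers three interchangeable rankers: a local embedding model, a remote TEI endpoint, and BM25. A hedged sketch of switching between two of them behind one helper — the `make_pruner` function and the `BAAI/bge-large-en` Hub id are assumptions for illustration, not htmlrag API:

```python
from htmlrag import BM25HTMLPruner, EmbedHTMLPruner

QUERY_INSTRUCTION = ("Instruct: Given a web search query, retrieve relevant "
                     "passages that answer the query\nQuery: ")

def make_pruner(use_embedder: bool):
    """Hypothetical glue code: choose a block ranker (not part of htmlrag)."""
    if use_embedder:
        # Dense ranking with a local embedding model; the Hub id is an assumption.
        return EmbedHTMLPruner(embed_model="BAAI/bge-large-en",
                               local_inference=True,
                               query_instruction_for_retrieval=QUERY_INSTRUCTION)
    # Lexical ranking: no GPU or model download required.
    return BM25HTMLPruner()

pruner = make_pruner(use_embedder=False)
block_rankings = pruner.calculate_block_rankings(question, simplified_html, block_tree)
print(block_rankings)
```

Both classes expose the same `calculate_block_rankings` call, so the rest of the pipeline is unchanged whichever ranker you pick.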
@@ -124,8 +157,7 @@ from transformers import AutoTokenizer
 
 chat_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-70B-Instruct")
 
-max_context_window = 60
-pruned_html = embed_html_pruner.prune_HTML(simplified_html, block_tree, block_rankings, chat_tokenizer, max_context_window)
+pruned_html = embed_html_pruner.prune_HTML(simplified_html, block_tree, block_rankings, chat_tokenizer, MAX_CONTEXT_WINDOW_EMBED)
 print(pruned_html)
 
 # <html>
@@ -141,18 +173,8 @@ print(pruned_html)
 from htmlrag import GenHTMLPruner
 import torch
 
-ckpt_path = "zstanjj/HTML-Pruner-Phi-3.8B"
-if torch.cuda.is_available():
-    device="cuda"
-else:
-    device="cpu"
-gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path, max_node_words=5, device=device)
-block_rankings = gen_embed_pruner.calculate_block_rankings(question, pruned_html)
-print(block_rankings)
-
-# [1, 0]
-
-block_tree, pruned_html = build_block_tree(pruned_html, max_node_words=10)
+# Construct a finer block tree
+block_tree, pruned_html = build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
 for block in block_tree:
     print("Block Content: ", block[0])
     print("Block Path: ", block[1])
@@ -167,13 +189,25 @@ for block in block_tree:
 # Block Path: ['html', 'p']
 # Is Leaf: True
 
-max_context_window = 32
-pruned_html = gen_embed_pruner.prune_HTML(pruned_html, block_tree, block_rankings, chat_tokenizer, max_context_window)
+ckpt_path = "zstanjj/HTML-Pruner-Llama-1B"
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path, max_node_words=MAX_NODE_WORDS_GEN, device=device)
+block_rankings = gen_embed_pruner.calculate_block_rankings(question, pruned_html)
+print(block_rankings)
+
+# [1, 0]
+
+pruned_html = gen_embed_pruner.prune_HTML(pruned_html, block_tree, block_rankings, chat_tokenizer, MAX_CONTEXT_WINDOW_GEN)
 print(pruned_html)
 
 # <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
 ```
 
+
+
 ## Results
 
 - **Results for [HTML-Pruner-Phi-3.8B](https://huggingface.co/zstanjj/HTML-Pruner-Phi-3.8B) and [HTML-Pruner-Llama-1B](https://huggingface.co/zstanjj/HTML-Pruner-Llama-1B) with Llama-3.1-70B-Instruct as chat model**.
 