Update README.md
Browse files
    	
        README.md
    CHANGED
    
    | 
         @@ -67,6 +67,11 @@ document.write("Hello World!"); 
     | 
|
| 67 | 
         
             
            </html>
         
     | 
| 68 | 
         
             
            """
         
     | 
| 69 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 70 | 
         
             
            simplified_html = clean_html(html)
         
     | 
| 71 | 
         
             
            print(simplified_html)
         
     | 
| 72 | 
         | 
| 
         @@ -80,7 +85,6 @@ print(simplified_html) 
     | 
|
| 80 | 
         
             
            # </html>
         
     | 
| 81 | 
         
             
            ```
         
     | 
| 82 | 
         | 
| 83 | 
         
            -
             
     | 
| 84 | 
         
             
            ### 🔧 Configure Pruning Parameters
         
     | 
| 85 | 
         | 
| 86 | 
         
             
            The example HTML document is rather a short one. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:
         
     | 
| 
         @@ -107,6 +111,7 @@ MAX_CONTEXT_WINDOW_GEN = 32 
     | 
|
| 107 | 
         
             
            from htmlrag import build_block_tree
         
     | 
| 108 | 
         | 
| 109 | 
         
             
            block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
         
     | 
| 
         | 
|
| 110 | 
         
             
            for block in block_tree:
         
     | 
| 111 | 
         
             
                print("Block Content: ", block[0])
         
     | 
| 112 | 
         
             
                print("Block Path: ", block[1])
         
     | 
| 
         @@ -175,6 +180,7 @@ import torch 
     | 
|
| 175 | 
         | 
| 176 | 
         
             
            # construct a finer block tree
         
     | 
| 177 | 
         
             
            block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
         
     | 
| 
         | 
|
| 178 | 
         
             
            for block in block_tree:
         
     | 
| 179 | 
         
             
                print("Block Content: ", block[0])
         
     | 
| 180 | 
         
             
                print("Block Path: ", block[1])
         
     | 
| 
         @@ -189,7 +195,7 @@ for block in block_tree: 
     | 
|
| 189 | 
         
             
            # Block Path:  ['html', 'p']
         
     | 
| 190 | 
         
             
            # Is Leaf:  True
         
     | 
| 191 | 
         | 
| 192 | 
         
            -
            ckpt_path = "zstanjj/HTML-Pruner- 
     | 
| 193 | 
         
             
            if torch.cuda.is_available():
         
     | 
| 194 | 
         
             
                device="cuda"
         
     | 
| 195 | 
         
             
            else:
         
     | 
| 
         @@ -206,6 +212,7 @@ print(pruned_html) 
     | 
|
| 206 | 
         
             
            # <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
         
     | 
| 207 | 
         
             
            ```
         
     | 
| 208 | 
         | 
| 
         | 
|
| 209 | 
         | 
| 210 | 
         | 
| 211 | 
         
             
            ## Results
         
     | 
| 
         | 
|
| 67 | 
         
             
            </html>
         
     | 
| 68 | 
         
             
            """
         
     | 
| 69 | 
         | 
| 70 | 
         
            +
            # Alternatively, you can read HTML files and merge them
         
     | 
| 71 | 
         
            +
            # html_files=["/path/to/html/file1.html", "/path/to/html/file2.html"]
         
     | 
| 72 | 
         
            +
            # htmls=[open(file).read() for file in html_files]
         
     | 
| 73 | 
         
            +
            # html = "\n".join(htmls)
         
     | 
| 74 | 
         
            +
             
     | 
| 75 | 
         
             
            simplified_html = clean_html(html)
         
     | 
| 76 | 
         
             
            print(simplified_html)
         
     | 
| 77 | 
         | 
| 
         | 
|
| 85 | 
         
             
            # </html>
         
     | 
| 86 | 
         
             
            ```
         
     | 
| 87 | 
         | 
| 
         | 
|
| 88 | 
         
             
            ### 🔧 Configure Pruning Parameters
         
     | 
| 89 | 
         | 
| 90 | 
         
             
            The example HTML document is rather a short one. Real-world HTML documents can be much longer and more complex. To handle such cases, we can configure the following parameters:
         
     | 
| 
         | 
|
| 111 | 
         
             
            from htmlrag import build_block_tree
         
     | 
| 112 | 
         | 
| 113 | 
         
             
            block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
         
     | 
| 114 | 
         
            +
            # block_tree, simplified_html=build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED, zh_char=True) # for Chinese text
         
     | 
| 115 | 
         
             
            for block in block_tree:
         
     | 
| 116 | 
         
             
                print("Block Content: ", block[0])
         
     | 
| 117 | 
         
             
                print("Block Path: ", block[1])
         
     | 
| 
         | 
|
| 180 | 
         | 
| 181 | 
         
             
            # construct a finer block tree
         
     | 
| 182 | 
         
             
            block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
         
     | 
| 183 | 
         
            +
            # block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
         
     | 
| 184 | 
         
             
            for block in block_tree:
         
     | 
| 185 | 
         
             
                print("Block Content: ", block[0])
         
     | 
| 186 | 
         
             
                print("Block Path: ", block[1])
         
     | 
| 
         | 
|
| 195 | 
         
             
            # Block Path:  ['html', 'p']
         
     | 
| 196 | 
         
             
            # Is Leaf:  True
         
     | 
| 197 | 
         | 
| 198 | 
         
            +
            ckpt_path = "zstanjj/HTML-Pruner-Phi-3.8B"
         
     | 
| 199 | 
         
             
            if torch.cuda.is_available():
         
     | 
| 200 | 
         
             
                device="cuda"
         
     | 
| 201 | 
         
             
            else:
         
     | 
| 
         | 
|
| 212 | 
         
             
            # <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
         
     | 
| 213 | 
         
             
            ```
         
     | 
| 214 | 
         | 
| 215 | 
         
            +
            ---
         
     | 
| 216 | 
         | 
| 217 | 
         | 
| 218 | 
         
             
            ## Results
         
     |