Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
@@ -35,6 +35,7 @@ from playwright.async_api import async_playwright
|
|
35 |
from bs4 import BeautifulSoup
|
36 |
import requests
|
37 |
import trafilatura
|
|
|
38 |
|
39 |
|
40 |
#######################################################################################################################
|
@@ -2102,12 +2103,12 @@ def chunk_on_delimiter(input_string: str,
|
|
2102 |
# This function combines text chunks into larger blocks without exceeding a specified token count.
|
2103 |
# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
|
2104 |
def combine_chunks_with_no_minimum(
|
2105 |
-
chunks:
|
2106 |
max_tokens: int,
|
2107 |
chunk_delimiter="\n\n",
|
2108 |
header: Optional[str] = None,
|
2109 |
add_ellipsis_for_overflow=False,
|
2110 |
-
) -> Tuple[
|
2111 |
dropped_chunk_count = 0
|
2112 |
output = [] # list to hold the final combined chunks
|
2113 |
output_indices = [] # list to hold the indices of the final combined chunks
|
|
|
35 |
from bs4 import BeautifulSoup
|
36 |
import requests
|
37 |
import trafilatura
|
38 |
+
from typing import Callable, Dict, List, Optional, Tuple
|
39 |
|
40 |
|
41 |
#######################################################################################################################
|
|
|
2103 |
# This function combines text chunks into larger blocks without exceeding a specified token count.
|
2104 |
# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
|
2105 |
def combine_chunks_with_no_minimum(
|
2106 |
+
chunks: list[str],
|
2107 |
max_tokens: int,
|
2108 |
chunk_delimiter="\n\n",
|
2109 |
header: Optional[str] = None,
|
2110 |
add_ellipsis_for_overflow=False,
|
2111 |
+
) -> Tuple[list[str], list[int]]:
|
2112 |
dropped_chunk_count = 0
|
2113 |
output = [] # list to hold the final combined chunks
|
2114 |
output_indices = [] # list to hold the indices of the final combined chunks
|