Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,8 @@ from langchain.text_splitter import (
|
|
| 7 |
LABEL_TEXTSPLITTER = "LangChain's CharacterTextSplitter"
|
| 8 |
LABEL_RECURSIVE = "Langchain's RecursiveCharacterTextSplitter"
|
| 9 |
|
|
|
|
|
|
|
| 10 |
def extract_separators_from_string(separators_str):
|
| 11 |
try:
|
| 12 |
separators = separators_str[1:-1].split(", ")
|
|
@@ -18,36 +20,55 @@ def extract_separators_from_string(separators_str):
|
|
| 18 |
Please type it in the correct format: "['separator_1', 'separator_2', etc]"
|
| 19 |
""")
|
| 20 |
|
| 21 |
-
def change_split_selection(text, slider_count, split_selection, separator_selection):
|
| 22 |
print("Updating separator selection interactivity:")
|
| 23 |
return (
|
| 24 |
gr.Textbox.update(visible=(split_selection==LABEL_RECURSIVE)),
|
| 25 |
-
chunk(text, slider_count, split_selection, separator_selection)
|
| 26 |
)
|
| 27 |
|
| 28 |
-
def chunk(text, length, splitter_selection, separators_str):
|
| 29 |
separators = extract_separators_from_string(separators_str)
|
| 30 |
|
| 31 |
if splitter_selection == LABEL_TEXTSPLITTER:
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
elif splitter_selection == LABEL_RECURSIVE:
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
output = [(split, str(i)) for i, split in enumerate(text_splits)]
|
| 53 |
return output
|
|
@@ -105,7 +126,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css="#textbox_id {color: red; font-samily
|
|
| 105 |
"Character count",
|
| 106 |
"Token count",
|
| 107 |
],
|
| 108 |
-
value="Token count",
|
| 109 |
label="Length count",
|
| 110 |
info="How should we count our chunk lengths?",
|
| 111 |
)
|
|
@@ -119,22 +140,22 @@ with gr.Blocks(theme=gr.themes.Soft(), css="#textbox_id {color: red; font-samily
|
|
| 119 |
)
|
| 120 |
text.change(
|
| 121 |
fn=chunk,
|
| 122 |
-
inputs=[text, slider_count, split_selection, separator_selection],
|
| 123 |
outputs=out,
|
| 124 |
)
|
| 125 |
length_unit_selection.change(
|
| 126 |
fn=chunk,
|
| 127 |
-
inputs=[text, slider_count, split_selection, separator_selection],
|
| 128 |
outputs=out,
|
| 129 |
)
|
| 130 |
split_selection.change(
|
| 131 |
fn=change_split_selection,
|
| 132 |
-
inputs=[text, slider_count, split_selection, separator_selection],
|
| 133 |
outputs=[separator_selection, out],
|
| 134 |
)
|
| 135 |
slider_count.change(
|
| 136 |
fn=chunk,
|
| 137 |
-
inputs=[text, slider_count, split_selection, separator_selection],
|
| 138 |
outputs=out,
|
| 139 |
)
|
| 140 |
demo.launch()
|
|
|
|
| 7 |
LABEL_TEXTSPLITTER = "LangChain's CharacterTextSplitter"
|
| 8 |
LABEL_RECURSIVE = "Langchain's RecursiveCharacterTextSplitter"
|
| 9 |
|
| 10 |
+
bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
|
| 11 |
+
|
| 12 |
def extract_separators_from_string(separators_str):
|
| 13 |
try:
|
| 14 |
separators = separators_str[1:-1].split(", ")
|
|
|
|
| 20 |
Please type it in the correct format: "['separator_1', 'separator_2', etc]"
|
| 21 |
""")
|
| 22 |
|
| 23 |
+
def change_split_selection(text, slider_count, split_selection, separator_selection, length_unit_selection):
|
| 24 |
print("Updating separator selection interactivity:")
|
| 25 |
return (
|
| 26 |
gr.Textbox.update(visible=(split_selection==LABEL_RECURSIVE)),
|
| 27 |
+
chunk(text, slider_count, split_selection, separator_selection, length_unit_selection)
|
| 28 |
)
|
| 29 |
|
| 30 |
+
def chunk(text, length, splitter_selection, separators_str, length_unit_selection):
|
| 31 |
separators = extract_separators_from_string(separators_str)
|
| 32 |
|
| 33 |
if splitter_selection == LABEL_TEXTSPLITTER:
|
| 34 |
+
if "token" in length_unit_selection.lower():
|
| 35 |
+
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
|
| 36 |
+
AutoTokenizer.from_pretrained(tokenizer_name),
|
| 37 |
+
separator="",
|
| 38 |
+
chunk_size=length,
|
| 39 |
+
chunk_overlap=0,
|
| 40 |
+
length_function=len,
|
| 41 |
+
is_separator_regex=False,
|
| 42 |
+
)
|
| 43 |
+
else:
|
| 44 |
+
text_splitter = CharacterTextSplitter(
|
| 45 |
+
separator="",
|
| 46 |
+
chunk_size=length,
|
| 47 |
+
chunk_overlap=0,
|
| 48 |
+
length_function=len,
|
| 49 |
+
is_separator_regex=False,
|
| 50 |
+
)
|
| 51 |
elif splitter_selection == LABEL_RECURSIVE:
|
| 52 |
+
if "token" in length_unit_selection.lower():
|
| 53 |
+
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
|
| 54 |
+
AutoTokenizer.from_pretrained(tokenizer_name),
|
| 55 |
+
chunk_size=chunk_size,
|
| 56 |
+
chunk_overlap=0,
|
| 57 |
+
add_start_index=True,
|
| 58 |
+
strip_whitespace=False,
|
| 59 |
+
separators=separators,
|
| 60 |
+
)
|
| 61 |
+
else:
|
| 62 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
| 63 |
+
chunk_size=length,
|
| 64 |
+
chunk_overlap=0,
|
| 65 |
+
length_function=len,
|
| 66 |
+
add_start_index=True,
|
| 67 |
+
strip_whitespace=False,
|
| 68 |
+
separators=separators,
|
| 69 |
+
)
|
| 70 |
+
splits = text_splitter.create_documents([text])
|
| 71 |
+
text_splits = [split.page_content for split in splits]
|
| 72 |
|
| 73 |
output = [(split, str(i)) for i, split in enumerate(text_splits)]
|
| 74 |
return output
|
|
|
|
| 126 |
"Character count",
|
| 127 |
"Token count",
|
| 128 |
],
|
| 129 |
+
value=["Character count", "Token count (BERT tokens)"],
|
| 130 |
label="Length count",
|
| 131 |
info="How should we count our chunk lengths?",
|
| 132 |
)
|
|
|
|
| 140 |
)
|
| 141 |
text.change(
|
| 142 |
fn=chunk,
|
| 143 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
| 144 |
outputs=out,
|
| 145 |
)
|
| 146 |
length_unit_selection.change(
|
| 147 |
fn=chunk,
|
| 148 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
| 149 |
outputs=out,
|
| 150 |
)
|
| 151 |
split_selection.change(
|
| 152 |
fn=change_split_selection,
|
| 153 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
| 154 |
outputs=[separator_selection, out],
|
| 155 |
)
|
| 156 |
slider_count.change(
|
| 157 |
fn=chunk,
|
| 158 |
+
inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
|
| 159 |
outputs=out,
|
| 160 |
)
|
| 161 |
demo.launch()
|