Spaces:

m-ric
/

chunk_visualizer

Running

App Files Files Community

m-ric committed on Feb 15, 2024

Commit

40a40cb

verified ·

1 Parent(s): 51c0840

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -26

app.py CHANGED Viewed

@@ -7,6 +7,8 @@ from langchain.text_splitter import (
 LABEL_TEXTSPLITTER = "LangChain's CharacterTextSplitter"
 LABEL_RECURSIVE = "Langchain's RecursiveCharacterTextSplitter"
 def extract_separators_from_string(separators_str):
     try:
         separators = separators_str[1:-1].split(", ")
@@ -18,36 +20,55 @@ def extract_separators_from_string(separators_str):
         Please type it in the correct format: "['separator_1', 'separator_2', etc]"
         """)
-def change_split_selection(text, slider_count, split_selection, separator_selection):
     print("Updating separator selection interactivity:")
     return (
         gr.Textbox.update(visible=(split_selection==LABEL_RECURSIVE)),
-        chunk(text, slider_count, split_selection, separator_selection)
     )
-def chunk(text, length, splitter_selection, separators_str):
     separators = extract_separators_from_string(separators_str)
     if splitter_selection == LABEL_TEXTSPLITTER:
-        text_splitter = CharacterTextSplitter(
-            separator="",
-            chunk_size=length,
-            chunk_overlap=0,
-            length_function=len,
-            is_separator_regex=False,
-        )
-        splits = text_splitter.create_documents([text])
-        text_splits = [split.page_content for split in splits]
     elif splitter_selection == LABEL_RECURSIVE:
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=length,
-            chunk_overlap=0,
-            length_function=len,
-            add_start_index=True,
-            separators=separators,
-        )
-        splits = text_splitter.create_documents([text])
-        text_splits = [split.page_content for split in splits]
     output = [(split, str(i)) for i, split in enumerate(text_splits)]
     return output
@@ -105,7 +126,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css="#textbox_id {color: red; font-samily
                 "Character count",
                 "Token count",
             ],
-            value="Token count",
             label="Length count",
             info="How should we count our chunk lengths?",
         )
@@ -119,22 +140,22 @@ with gr.Blocks(theme=gr.themes.Soft(), css="#textbox_id {color: red; font-samily
     )
     text.change(
         fn=chunk,
-        inputs=[text, slider_count, split_selection, separator_selection],
         outputs=out,
     )
     length_unit_selection.change(
         fn=chunk,
-        inputs=[text, slider_count, split_selection, separator_selection],
         outputs=out,
     )
     split_selection.change(
         fn=change_split_selection,
-        inputs=[text, slider_count, split_selection, separator_selection],
         outputs=[separator_selection, out],
     )
     slider_count.change(
         fn=chunk,
-        inputs=[text, slider_count, split_selection, separator_selection],
         outputs=out,
     )
 demo.launch()

 LABEL_TEXTSPLITTER = "LangChain's CharacterTextSplitter"
 LABEL_RECURSIVE = "Langchain's RecursiveCharacterTextSplitter"
+bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
 def extract_separators_from_string(separators_str):
     try:
         separators = separators_str[1:-1].split(", ")
         Please type it in the correct format: "['separator_1', 'separator_2', etc]"
         """)
def change_split_selection(text, slider_count, split_selection, separator_selection, length_unit_selection):
    """React to a change of splitter type.

    Returns a 2-tuple:
      * a ``gr.Textbox`` update that makes the separator box visible only
        when the recursive splitter is selected;
      * the chunks recomputed by ``chunk`` with the current settings.
    """
    print("Updating separator selection interactivity:")
    # The separator list is only meaningful for the recursive splitter.
    show_separator_box = split_selection == LABEL_RECURSIVE
    separator_box_update = gr.Textbox.update(visible=show_separator_box)
    refreshed_chunks = chunk(
        text,
        slider_count,
        split_selection,
        separator_selection,
        length_unit_selection,
    )
    return (separator_box_update, refreshed_chunks)
def chunk(text, length, splitter_selection, separators_str, length_unit_selection):
    """Split ``text`` into chunks and label each chunk with its index.

    Args:
        text: The text to split.
        length: Maximum chunk size, measured in characters or in tokens
            depending on ``length_unit_selection``.
        splitter_selection: ``LABEL_TEXTSPLITTER`` or ``LABEL_RECURSIVE``.
        separators_str: Separator list typed by the user, parsed with
            ``extract_separators_from_string``. Only the recursive splitter
            uses the parsed result.
        length_unit_selection: Label of the selected length unit. Any label
            containing "token" (case-insensitive) switches to token counting.

    Returns:
        A list of ``(chunk_text, str(index))`` tuples in document order.

    Raises:
        ValueError: If ``splitter_selection`` is not one of the known labels.
    """
    # Parsed unconditionally so a malformed separator string is reported
    # regardless of which splitter is currently selected.
    separators = extract_separators_from_string(separators_str)
    count_tokens = "token" in length_unit_selection.lower()

    if splitter_selection == LABEL_TEXTSPLITTER:
        splitter_cls = CharacterTextSplitter
        splitter_kwargs = {
            "separator": "",
            "chunk_size": length,
            "chunk_overlap": 0,
            "is_separator_regex": False,
        }
    elif splitter_selection == LABEL_RECURSIVE:
        splitter_cls = RecursiveCharacterTextSplitter
        splitter_kwargs = {
            "chunk_size": length,
            "chunk_overlap": 0,
            "add_start_index": True,
            "strip_whitespace": False,
            "separators": separators,
        }
    else:
        raise ValueError(f"Unknown splitter selection: {splitter_selection!r}")

    if count_tokens:
        # from_huggingface_tokenizer installs its own token-based
        # length_function, so none must be passed here. The module-level
        # tokenizer is reused instead of being reloaded on every UI event.
        text_splitter = splitter_cls.from_huggingface_tokenizer(
            bert_tokenizer,
            **splitter_kwargs,
        )
    else:
        text_splitter = splitter_cls(length_function=len, **splitter_kwargs)

    splits = text_splitter.create_documents([text])
    return [(split.page_content, str(i)) for i, split in enumerate(splits)]
                 "Character count",
                 "Token count",
             ],
+            value=["Character count", "Token count (BERT tokens)"],
             label="Length count",
             info="How should we count our chunk lengths?",
         )
     )
     text.change(
         fn=chunk,
+        inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
         outputs=out,
     )
     length_unit_selection.change(
         fn=chunk,
+        inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
         outputs=out,
     )
     split_selection.change(
         fn=change_split_selection,
+        inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
         outputs=[separator_selection, out],
     )
     slider_count.change(
         fn=chunk,
+        inputs=[text, slider_count, split_selection, separator_selection, length_unit_selection],
         outputs=out,
     )
 demo.launch()