Update app.py
Browse files
app.py
CHANGED
@@ -6,7 +6,8 @@ import nltk
|
|
6 |
import gradio as gr
|
7 |
from langchain.embeddings import HuggingFaceEmbeddings
|
8 |
from langchain.vectorstores import FAISS
|
9 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
10 |
from sentence_transformers import SentenceTransformer
|
11 |
from transformers import AutoTokenizer
|
12 |
from nltk import sent_tokenize
|
@@ -78,8 +79,8 @@ def process_files(model_name, split_strategy, chunk_size=500, overlap_size=50, m
|
|
78 |
text += FileHandler.extract_text(file_path)
|
79 |
|
80 |
# Split text
|
81 |
-
if split_strategy == '
|
82 |
-
splitter =
|
83 |
else:
|
84 |
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
85 |
|
|
|
6 |
import gradio as gr
|
7 |
from langchain.embeddings import HuggingFaceEmbeddings
|
8 |
from langchain.vectorstores import FAISS
|
9 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
10 |
+
from langchain_text_splitters import TokenTextSplitter
|
11 |
from sentence_transformers import SentenceTransformer
|
12 |
from transformers import AutoTokenizer
|
13 |
from nltk import sent_tokenize
|
|
|
79 |
text += FileHandler.extract_text(file_path)
|
80 |
|
81 |
# Split text
|
82 |
+
if split_strategy == 'token':
|
83 |
+
splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
84 |
else:
|
85 |
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
|
86 |
|