Chris4K committed on
Commit
80f5976
·
verified ·
1 Parent(s): 600c08c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -6,7 +6,8 @@ import nltk
6
  import gradio as gr
7
  from langchain.embeddings import HuggingFaceEmbeddings
8
  from langchain.vectorstores import FAISS
9
- from langchain_text_splitters import RecursiveCharacterTextSplitter, SentenceTextSplitter
 
10
  from sentence_transformers import SentenceTransformer
11
  from transformers import AutoTokenizer
12
  from nltk import sent_tokenize
@@ -78,8 +79,8 @@ def process_files(model_name, split_strategy, chunk_size=500, overlap_size=50, m
78
  text += FileHandler.extract_text(file_path)
79
 
80
  # Split text
81
- if split_strategy == 'sentence':
82
- splitter = SentenceTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
83
  else:
84
  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
85
 
 
6
  import gradio as gr
7
  from langchain.embeddings import HuggingFaceEmbeddings
8
  from langchain.vectorstores import FAISS
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
10
+ from langchain.text_splitters import TokenTextSplitter
11
  from sentence_transformers import SentenceTransformer
12
  from transformers import AutoTokenizer
13
  from nltk import sent_tokenize
 
79
  text += FileHandler.extract_text(file_path)
80
 
81
  # Split text
82
+ if split_strategy == 'token':
83
+ splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
84
  else:
85
  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
86