hanzla commited on
Commit
65721af
·
1 Parent(s): a127868
Files changed (1)
  1. src/pdfchatbot.py +9 -3
src/pdfchatbot.py CHANGED
@@ -11,7 +11,7 @@ from langchain.document_loaders import PyPDFLoader
11
  from langchain.prompts import PromptTemplate
12
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
13
  import spaces
14
- from langchain_text_splitters import CharacterTextSplitter
15
 
16
 
17
  class PDFChatBot:
@@ -96,7 +96,12 @@ class PDFChatBot:
96
  """
97
  Load the vector database from the documents and embeddings.
98
  """
99
- text_splitter = CharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=100)
 
 
 
 
 
100
  docs = text_splitter.split_documents(self.documents)
101
  self.vectordb = Chroma.from_documents(docs, self.embeddings)
102
 
@@ -132,7 +137,8 @@ class PDFChatBot:
132
  def create_organic_pipeline(self):
133
  self.pipeline = pipeline(
134
  "text-generation",
135
- model=self.config.get("autoModelForCausalLM"),
 
136
  model_kwargs={"torch_dtype": torch.bfloat16},
137
  device="cuda",
138
  )
 
11
  from langchain.prompts import PromptTemplate
12
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
13
  import spaces
14
+ from langchain_text_splitters import CharacterTextSplitter,RecursiveCharacterTextSplitter
15
 
16
 
17
  class PDFChatBot:
 
96
  """
97
  Load the vector database from the documents and embeddings.
98
  """
99
+ text_splitter = RecursiveCharacterTextSplitter(
100
+ chunk_size=256,
101
+ chunk_overlap=100,
102
+ length_function=len,
103
+ add_start_index=True,
104
+ )
105
  docs = text_splitter.split_documents(self.documents)
106
  self.vectordb = Chroma.from_documents(docs, self.embeddings)
107
 
 
137
  def create_organic_pipeline(self):
138
  self.pipeline = pipeline(
139
  "text-generation",
140
+ model=self.model,
141
+ tokenizer=self.tokenizer,
142
  model_kwargs={"torch_dtype": torch.bfloat16},
143
  device="cuda",
144
  )