Akjava committed on
Commit 96ba5d0 · verified · 1 Parent(s): 9df3361

Update app.py

Files changed (1)
  1. app.py +7 -7
app.py CHANGED
@@ -22,7 +22,7 @@ from exception import CustomExceptionHandling
 cache_file = "docs_processed.joblib"
 if os.path.exists(cache_file):
     docs_processed = joblib.load(cache_file)
-    print("Loaded docs_processed from cache.")
+    #print("Loaded docs_processed from cache.")
 else:
     knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
     source_docs = [
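
For context on what this hunk touches: the app caches the processed knowledge base with joblib so the dataset only has to be loaded and split once. A minimal sketch of that pattern, assuming a hypothetical `build_docs()` that does the `load_dataset`/splitting work:

```python
import os
import joblib

cache_file = "docs_processed.joblib"

def load_or_build_docs(build_docs):
    """Return the processed docs, rebuilding and persisting them only on a cache miss."""
    if os.path.exists(cache_file):
        return joblib.load(cache_file)          # fast path: reuse the pickled docs
    docs_processed = build_docs()               # e.g. load m-ric/huggingface_doc and split it
    joblib.dump(docs_processed, cache_file)     # persist for the next startup
    return docs_processed
```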
@@ -134,7 +134,7 @@ def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
         raise ValueError("llama not initialized")
     try:
         tokens = llama.tokenize(f"{message}".encode("utf-8"))
-        print(f"text length={len(tokens)}")
+        #print(f"text length={len(tokens)}")
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
 
@@ -146,7 +146,7 @@ def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
         top_k = 40
         top_p = 0.95
         repeat_penalty = 1.2
-        print("stepped")
+
         for i in range(iteration):
             for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
                 outputs+= llama.detokenize([token]).decode()
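
The two generate_t5 hunks above belong to the T5 encode/decode loop. Here is a minimal sketch of that loop, assuming the fairydreaming T5 fork of llama-cpp-python; the method names (`tokenize`, `encode`, `decoder_start_token`, `generate`, `detokenize`) are taken from the diff itself, while `run_t5`, `max_new_tokens`, and `temp=0.7` are illustrative choices, not the app's actual values.

```python
def run_t5(llama, message, max_new_tokens=256):
    # Encoder pass: the prompt must fit inside the model's context window (default 512).
    tokens = llama.tokenize(f"{message}".encode("utf-8"))
    llama.encode(tokens)

    # Decoder pass: start from the decoder start token and sample until EOS.
    tokens = [llama.decoder_start_token()]
    outputs = ""
    generated = 0
    for token in llama.generate(tokens, top_k=40, top_p=0.95,
                                temp=0.7, repeat_penalty=1.2):
        if token == llama.token_eos() or generated >= max_new_tokens:
            break
        outputs += llama.detokenize([token]).decode()
        generated += 1
    return outputs
```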
@@ -187,7 +187,7 @@ Search Query:
     global llama
     if llama == None:
         model_id = f"flan-t5-{t5_size}.Q8_0.gguf"
-        llama = Llama(f"models/{model_id}",flash_attn=False,
+        llama = Llama(f"models/{model_id}",flash_attn=False,verbose=False
                       n_gpu_layers=0,
                       n_threads=2,
                       n_threads_batch=2
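
This hunk adds verbose=False to the lazily created flan-t5 model so llama.cpp's load and eval logs stay out of the app's output. A sketch of that initialization with the keyword arguments written out, using the same constructor arguments as the diff; `get_llama` and the `t5_size="base"` default are just illustrative.

```python
from llama_cpp import Llama

llama = None  # created on first use

def get_llama(t5_size="base"):
    global llama
    if llama is None:
        model_id = f"flan-t5-{t5_size}.Q8_0.gguf"
        llama = Llama(
            f"models/{model_id}",
            flash_attn=False,
            verbose=False,       # new in this commit: silence llama.cpp logging
            n_gpu_layers=0,      # CPU-only inference
            n_threads=2,
            n_threads_batch=2,
        )
    return llama
```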
@@ -258,7 +258,7 @@ def respond(
 
     query = to_query(message)
     document = retriever_tool(query=query)
-    print(document)
+    #print(document)
     answer(document,message)
     response = ""
     #do direct in here
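
For orientation, this hunk sits in the RAG flow of respond(): the user message is rewritten into a search query, the retriever pulls matching documentation chunks, and those chunks plus the original message go to the answering model. A rough, non-streaming sketch using only the function names visible in the diff (`to_query`, `retriever_tool`, `answer`):

```python
def respond_once(message):
    query = to_query(message)               # query reformulation with flan-t5
    document = retriever_tool(query=query)  # semantic search over the processed docs
    return answer(document, message)        # final answer from Qwen2.5-0.5B
```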
@@ -270,9 +270,9 @@ def respond(
 
 # Create a chat interface
 # Set the title and description
-title = "Lhama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
+title = "llama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
 description = """
-- I use forked [lhamacpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) which support T5 on server and it's doesn't support new models(like gemma3)
+- I use forked [llama-cpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) which support T5 on server and it's doesn't support new models(like gemma3)
 - Search query generation(query reformulation) Tasks - I use flan-t5-base (large make better result,but too large for just this task)
 - Qwen2.5-0.5B as good as small-size.
 - anyway google T5 series on CPU is amazing
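
The title and description edited above feed a Gradio chat UI ("Create a chat interface" in the comments). A hypothetical wiring of that interface; the real app may pass extra generation options through to respond:

```python
import gradio as gr

demo = gr.ChatInterface(
    respond,                   # the RAG handler sketched above
    title=title,
    description=description,
)

if __name__ == "__main__":
    demo.launch()
```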
 