Update app.py
app.py CHANGED
@@ -22,7 +22,7 @@ from exception import CustomExceptionHandling
 cache_file = "docs_processed.joblib"
 if os.path.exists(cache_file):
     docs_processed = joblib.load(cache_file)
-    print("Loaded docs_processed from cache.")
+    #print("Loaded docs_processed from cache.")
 else:
     knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
     source_docs = [
@@ -134,7 +134,7 @@ def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
         raise ValueError("llama not initialized")
     try:
         tokens = llama.tokenize(f"{message}".encode("utf-8"))
-        print(f"text length={len(tokens)}")
+        #print(f"text length={len(tokens)}")
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
 
@@ -146,7 +146,7 @@ def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
         top_k = 40
         top_p = 0.95
         repeat_penalty = 1.2
-
+
         for i in range(iteration):
             for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
                 outputs+= llama.detokenize([token]).decode()
@@ -187,7 +187,7 @@ Search Query:
     global llama
     if llama == None:
         model_id = f"flan-t5-{t5_size}.Q8_0.gguf"
-        llama = Llama(f"models/{model_id}",flash_attn=False,
+        llama = Llama(f"models/{model_id}",flash_attn=False,verbose=False,
             n_gpu_layers=0,
             n_threads=2,
             n_threads_batch=2
@@ -258,7 +258,7 @@ def respond(
 
     query = to_query(message)
     document = retriever_tool(query=query)
-    print(document)
+    #print(document)
     answer(document,message)
     response = ""
     #do direct in here
@@ -270,9 +270,9 @@ def respond(
 
 # Create a chat interface
 # Set the title and description
-title = "
+title = "llama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
 description = """
-- I use forked [
+- I use forked [llama-cpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) which support T5 on server and it's doesn't support new models(like gemma3)
 - Search query generation(query reformulation) Tasks - I use flan-t5-base (large make better result,but too large for just this task)
 - Qwen2.5-0.5B as good as small-size.
 - anyway google T5 series on CPU is amazing
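For readers unfamiliar with the startup path the first hunk touches: docs_processed is rebuilt only when the joblib cache file is missing. Below is a minimal sketch of that cache-or-rebuild pattern; process_docs() and the placeholder document list are hypothetical stand-ins for the processing work app.py does before caching, which this diff does not show.

import os
import joblib

cache_file = "docs_processed.joblib"

def process_docs(rows):
    # Hypothetical stand-in for the splitting/embedding work done before caching.
    return [row.strip() for row in rows]

if os.path.exists(cache_file):
    # Reuse the previously processed documents, as in the diff.
    docs_processed = joblib.load(cache_file)
else:
    # In app.py the rows come from datasets.load_dataset("m-ric/huggingface_doc", split="train");
    # a small placeholder list keeps this sketch self-contained.
    source_docs = ["first document ", " second document"]
    docs_processed = process_docs(source_docs)
    joblib.dump(docs_processed, cache_file)  # persist so the next start can skip the rebuild

Because joblib serializes arbitrary picklable Python objects, the processed document list can be restored in a single load call on the next start of the Space.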
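The remaining hunks silence debug prints inside generate_t5 and add verbose=False to the Llama constructor. Pulling together the calls visible across those hunks, here is a minimal sketch of the T5 encode-then-decode flow on the forked llama-cpp-python (the t5 branch linked in the description); the temperature value, the eos stop check, and the max_new_tokens cap are assumptions for illustration, not the app's exact logic.

from llama_cpp import Llama  # the T5-capable fork referenced in the description

# Constructor arguments mirror the diff; verbose=False is the setting this commit adds.
llama = Llama(
    "models/flan-t5-base.Q8_0.gguf",  # i.e. t5_size = "base", as the description states
    flash_attn=False,
    verbose=False,
    n_gpu_layers=0,
    n_threads=2,
    n_threads_batch=2,
)

def generate_t5(llama, message, temperature=0.5, max_new_tokens=128):
    # The prompt must fit in the context window (default n_ctx=512), as app.py's comment notes.
    tokens = llama.tokenize(f"{message}".encode("utf-8"))
    llama.encode(tokens)                    # run the T5 encoder (fork-specific call)
    tokens = [llama.decoder_start_token()]  # seed the decoder
    outputs = ""
    for token in llama.generate(
        tokens, top_k=40, top_p=0.95, temp=temperature, repeat_penalty=1.2
    ):
        if token == llama.token_eos():      # assumed stop condition
            break
        outputs += llama.detokenize([token]).decode()
        max_new_tokens -= 1
        if max_new_tokens <= 0:
            break
    return outputs

print(generate_t5(llama, "Summarize: llama.cpp can now run T5 models on CPU."))

Setting verbose=False suppresses llama.cpp's model-loading and timing output, which fits with the commented-out print() calls elsewhere in the commit: both keep the Space logs quiet outside of actual errors.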