Akjava committed on
Commit 29d67eb · verified · 1 Parent(s): 22ce07c

Update app.py

Files changed (1)
  1. app.py +100 -140
app.py CHANGED
@@ -12,28 +12,13 @@ import subprocess
 import sys
 import joblib
 from llama_cpp import Llama
-from llama_cpp_agent import LlamaCppAgent
-from llama_cpp_agent import MessagesFormatterType
-from llama_cpp_agent.providers import LlamaCppPythonProvider
-from llama_cpp_agent.chat_history import BasicChatHistory
-from llama_cpp_agent.chat_history.messages import Roles
+
 import gradio as gr
 from huggingface_hub import hf_hub_download
 from typing import List, Tuple,Dict,Optional
 from logger import logging
 from exception import CustomExceptionHandling
 
-from smolagents.gradio_ui import GradioUI
-from smolagents import (
-    CodeAgent,
-    GoogleSearchTool,
-    Model,
-    Tool,
-    LiteLLMModel,
-    ToolCallingAgent,
-    ChatMessage,tool,MessageRole
-)
-
 cache_file = "docs_processed.joblib"
 if os.path.exists(cache_file):
     docs_processed = joblib.load(cache_file)
@@ -91,24 +76,25 @@ retriever_tool = RetrieverTool(docs_processed)
 # Download gguf model files
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 
+
 hf_hub_download(
-    repo_id="bartowski/google_gemma-3-4b-it-GGUF",
-    filename="google_gemma-3-4b-it-Q4_K_M.gguf",
+    repo_id="mradermacher/Qwen2.5-0.5B-Rag-Thinking-i1-GGUF",
+    filename="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
     local_dir="./models",
 )
+t5_size="base"
 hf_hub_download(
-    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
-    filename="google_gemma-3-1b-it-Q5_K_M.gguf",
+    repo_id=f"Felladrin/gguf-flan-t5-{t5_size}",
+    filename=f"flan-t5-{t5_size}.Q8_0.gguf",
     local_dir="./models",
 )
 
 # Set the title and description
-title = "Gemma3-4B llama.cpp on cpu rag"
-description = """This is prompt version rag.\n fast and stable than [smolagent version](https://huggingface.co/spaces/Akjava/Gemma3-1B-llamacpp-cpu-rag-smolagents).but the prompt needs significant improvement."""
+title = "Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
+description = """My Best CPU Rag Solution"""
+
 
 
-llm = None
-llm_model = None
 
 
 query_system = """
@@ -140,38 +126,101 @@ Search Query: transformer model history
 def clean_text(text):
     cleaned = re.sub(r'[^\x00-\x7F]+', '', text) # Remove non-ASCII chars
     cleaned = re.sub(r'[^a-zA-Z0-9_\- ]', '', cleaned) #Then your original rule
+    cleaned = cleaned.replace("---","")
     return cleaned
 
-def to_query(provider,question):
-
+def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
+    if llama == None:
+        raise ValueError("llama not initialized")
     try:
-        query_agent = LlamaCppAgent(
-            provider,
-            system_prompt=f"{query_system}",
-            predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
-            debug_output=False,
-        )
+        tokens = llama.tokenize(f"{message}".encode("utf-8"))
+        print(f"text length={len(tokens)}")
+        #print(tokens)
+        llama.encode(tokens)
+        tokens = [llama.decoder_start_token()]
+
+
+        outputs =""
+        #TODO support stream
+        iteration = 1
+        temperature = 0.5
+        top_k = 40
+        top_p = 0.95
+        repeat_penalty = 1.2
+        print("stepped")
+        for i in range(iteration):
+            for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
+                outputs+= llama.detokenize([token]).decode()
+                if token == llama.token_eos():
+                    break
+        return outputs
+    except Exception as e:
+        raise CustomExceptionHandling(e, sys) from e
+    return None
+
 
-    message="""
+def to_query(question):
+    system = """
+You are a query rewriter. Your task is to convert a user's question into a concise search query suitable for information retrieval.
+The goal is to identify the most important keywords for a search engine.
+
+Here are some examples:
+User Question: What is transformer?
+Search Query: transformer
+User Question: How does a transformer model work in natural language processing?
+Search Query: transformer model natural language processing
+User Question: What are the advantages of using transformers over recurrent neural networks?
+Search Query: transformer vs recurrent neural network advantages
+User Question: Explain the attention mechanism in transformers.
+Search Query: transformer attention mechanism
+User Question: What are the different types of transformer architectures?
+Search Query: transformer architectures
+User Question: What is the history of the transformer model?
+Search Query: transformer model history
+---
 Now, rewrite the following question:
 User Question: %s
 Search Query:
-"""%question
-
-
-        settings = provider.get_provider_default_settings()
-        messages = BasicChatHistory()
-        result = query_agent.get_chat_response(
-            message,
-            llm_sampling_settings=settings,
-            chat_history=messages,
-            returns_streaming_generator=False,
-            print_output=False,
-        )
-        return clean_text(result)
+"""% question
+    message = system
+    try:
+        global llama
+        if llama == None:
+            model_id = f"flan-t5-{t5_size}.Q8_0.gguf"
+            llama = Llama(f"models/{model_id}",flash_attn=False,
+                n_gpu_layers=0,
+                n_threads=2,
+                n_threads_batch=2
+            )
+        query = generate_t5(llama,message)
+        return clean_text(query)
     except Exception as e:
         # Custom exception handling
         raise CustomExceptionHandling(e, sys) from e
+    return None
+
+
+def answer(document:str,question:str,model:str="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf")->str:
+    global llm
+    global llm_model
+    global provider
+    llm = Llama(
+        model_path=f"models/{model}",
+        flash_attn=False,
+        n_gpu_layers=0,
+        n_batch=1024,
+        n_ctx=2048*4,
+        n_threads=2,
+        n_threads_batch=2,
+        verbose=False
+    )
+    llm_model = model
+    #provider = LlamaCppPythonProvider(llm)
+
+    result = llm(qwen_prompt%(document,question),max_tokens=2048*4)
+    #answer = to_answer(provider,document,question)
+    return result['choices'][0]['text']
+
 
 def respond(
     message: str,
@@ -186,7 +235,6 @@ def respond(
 ):
     """
     Respond to a message using the Gemma3 model via Llama.cpp.
-
     Args:
     - message (str): The message to respond to.
    - history (List[Tuple[str, str]]): The chat history.
@@ -197,101 +245,13 @@ def respond(
     - top_p (float): The top-p of the model.
     - top_k (int): The top-k of the model.
     - repeat_penalty (float): The repetition penalty of the model.
-
     Returns:
         str: The response to the message.
     """
     if model is None:#
         return
-
-    try:
-        # Load the global variables
-        global llm
-        global llm_model
-
-        # Load the model
-        if llm is None or llm_model != model:
-            llm = Llama(
-                model_path=f"models/{model}",
-                flash_attn=False,
-                n_gpu_layers=0,
-                n_batch=16,
-                n_ctx=2048,
-                n_threads=2,
-                n_threads_batch=2,
-                verbose=False
-            )
-            llm_model = model
-        provider = LlamaCppPythonProvider(llm)
-
-        query = to_query(provider,message)
-        text = retriever_tool(query=f"{query}")
-
-
-        #very sensitive against prompt
-        retriever_system="""
-You are an AI assistant that answers questions based on below retrievered documents.
-
-Documents:
----
-%s
----
-Question: %s
-Answer:
-""" % (text,message)
-
-        # Create the agent
-        agent = LlamaCppAgent(
-            provider,
-            #system_prompt=f"{retriever_system}",
-            system_prompt="you are kind assistant",
-            predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
-            debug_output=False,
-        )
-
-        # Set the settings like temperature, top-k, top-p, max tokens, etc.
-        settings = provider.get_provider_default_settings()
-        settings.temperature = temperature
-        settings.top_k = top_k
-        settings.top_p = top_p
-        settings.max_tokens = max_tokens
-        settings.repeat_penalty = repeat_penalty
-        settings.stream = True
-
-        messages = BasicChatHistory()
-
-        # Add the chat history
-        for msn in history:
-            user = {"role": Roles.user, "content": msn[0]}
-            assistant = {"role": Roles.assistant, "content": msn[1]}
-            messages.add_message(user)
-            messages.add_message(assistant)
-
-        # Get the response stream
-        stream = agent.get_chat_response(
-            retriever_system,
-            #retriever_system+text,
-            #retriever_system+text,
-            llm_sampling_settings=settings,
-            chat_history=messages,
-            returns_streaming_generator=True,
-            print_output=False,
-        )
-
-        # Log the success
-        logging.info("Response stream generated successfully")
-
-        # Generate the response
-        outputs = ""
-        for output in stream:
-            outputs += output
-            yield outputs
-
-    # Handle exceptions that may occur during the process
-    except Exception as e:
-        # Custom exception handling
-        raise CustomExceptionHandling(e, sys) from e
 
+    return to_query(message)
 
 # Create a chat interface
 demo = gr.ChatInterface(
@@ -303,12 +263,12 @@ demo = gr.ChatInterface(
     additional_inputs=[
        gr.Dropdown(
            choices=[
-                "google_gemma-3-4b-it-Q4_K_M.gguf",
-                "google_gemma-3-1b-it-Q5_K_M.gguf",
+
+                "Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
            ],
-            value="google_gemma-3-4b-it-Q4_K_M.gguf",
+            value="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
            label="Model",
-            info="Select the AI model to use for chat",
+            info="Select the AI model to use for chat",visible=False
        ),
        gr.Textbox(
            value="You are a helpful assistant.",
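Taken together, the commit swaps the llama-cpp-agent pipeline for three plain llama-cpp-python helpers: to_query() rewrites the user question into search keywords with the flan-t5 GGUF, retriever_tool() (defined earlier in app.py) fetches matching documents, and answer() has Qwen2.5-0.5B-Rag-Thinking respond from them. Below is a minimal sketch of how these pieces could be chained; it assumes only the function names shown in the diff, and rag_answer itself is illustrative rather than part of app.py (in this revision respond() simply returns the rewritten query).

    # Hypothetical glue code, not in the commit: one CPU RAG turn using the helpers from app.py.
    def rag_answer(question: str) -> str:
        query = to_query(question)              # flan-t5 turns the question into search keywords
        document = retriever_tool(query=query)  # RetrieverTool over docs_processed, defined earlier in app.py
        return answer(document, question)       # Qwen2.5-0.5B-Rag-Thinking answers from the retrieved text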
 