bobpopboom committed on
Commit 2bd985b · verified · 1 Parent(s): 5be4cb8

deep seek xD

Files changed (1)
  1. app.py +43 -46
app.py CHANGED
@@ -1,73 +1,70 @@
  import gradio as gr
  from transformers import AutoTokenizer
- import ctranslate2
+ from llama_cpp import Llama
  import torch

- # Determine device (ctranslate2 handles device placement internally)
- device = "cuda" if torch.cuda.is_available() else "cpu" # Still useful for other ops
- model="thrishala/mental_health_chatbot"
- model_download_link = "https://huggingface.co/mradermacher/TinyLlama-Friendly-Psychotherapist-GGUF/resolve/main/TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf"
- model_path = "./TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf" # gguf
+ # Configuration
+ MODEL_PATH = "./TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf"
+ MODEL_REPO = "thrishala/mental_health_chatbot"

  try:
-     # 1. Load the tokenizer (same as before)
-     tokenizer = AutoTokenizer.from_pretrained(model_path)
+     # 1. Load the tokenizer from the original model repo
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
      tokenizer.pad_token = tokenizer.eos_token
      tokenizer.model_max_length = 4096

-     # 2. Load the ctranslate2 model
-     ct_model = ctranslate2.Translator(model_path) # Load the GGUF model
-     ct_model.eval()
+     # 2. Load the GGUF model with llama-cpp-python
+     llm = Llama(
+         model_path=MODEL_PATH,
+         n_ctx=2048, # Context window size
+         n_threads=4, # CPU threads
+         n_gpu_layers=33 if torch.cuda.is_available() else 0, # GPU layers
+     )
+
  except Exception as e:
      print(f"Error loading model: {e}")
      exit()

  def generate_text_streaming(prompt, max_new_tokens=128):
-     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(device)
-
-     generated_tokens = []
-
-     for _ in range(max_new_tokens):
-         # ctranslate2 generation (adjust as needed)
-         outputs = ct_model.translate_batch(
-             inputs.input_ids.tolist(), # ctranslate2 needs list of token ids
-             max_length=1, # Generate one token at a time
-             beam_size=1, # Greedy decoding
-         )
-
-         new_token_id = outputs[0][0][-1] # Extract the generated token ID
-         new_token = tokenizer.decode(new_token_id, skip_special_tokens=True)
-
-         if new_token_id == tokenizer.eos_token_id:
-             break
-
-         generated_tokens.append(new_token_id)
-
-         current_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-         yield current_text
+     # Tokenize using HF tokenizer
+     inputs = tokenizer(
+         prompt,
+         return_tensors="pt",
+         truncation=True,
+         max_length=4096
+     )
+
+     # Convert to string for llama.cpp
+     full_prompt = tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
+
+     # Create generator
+     stream = llm.create_completion(
+         prompt=full_prompt,
+         max_tokens=max_new_tokens,
+         temperature=0.7,
+         stream=True,
+         stop=["User:", "###"], # Stop sequences
+     )

-         inputs["input_ids"] = torch.cat([inputs["input_ids"], torch.tensor([[new_token_id]], device=inputs["input_ids"].device)], dim=-1)
-         inputs["attention_mask"] = torch.cat([inputs["attention_mask"], torch.ones(1, 1, device=inputs["attention_mask"].device)], dim=-1)
+     generated_text = ""
+     for output in stream:
+         chunk = output["choices"][0]["text"]
+         generated_text += chunk
+         yield generated_text

  def respond(message, history, system_message, max_tokens):
-     # Build prompt with full history
+     # Build prompt with history
      prompt = f"{system_message}\n"
      for user_msg, bot_msg in history:
          prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
      prompt += f"User: {message}\nAssistant:"

-     # Keep track of the full response
-     full_response = ""
-
      try:
-         for token_chunk in generate_text_streaming(prompt, max_tokens):
-             # Update the full response and yield incremental changes
-             full_response = token_chunk
-             yield full_response
-
+         for chunk in generate_text_streaming(prompt, max_tokens):
+             yield chunk
      except Exception as e:
-         print(f"Error during generation: {e}")
-         yield "An error occurred."
+         print(f"Error: {e}")
+         yield "An error occurred during generation."

  demo = gr.ChatInterface(
      respond,
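
For reference, a minimal standalone sketch of the llama-cpp-python streaming pattern that the new generate_text_streaming relies on. This is not part of the commit; the prompt text and sampling values are illustrative, and it assumes the GGUF file referenced above has already been downloaded next to the script.

from llama_cpp import Llama

# Assumption: the quantized GGUF file from app.py is present locally.
llm = Llama(
    model_path="./TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf",
    n_ctx=2048,
)

# With stream=True, create_completion yields chunks; each chunk carries the
# next piece of text under ["choices"][0]["text"], as in generate_text_streaming.
for output in llm.create_completion(
    prompt="User: I have trouble sleeping.\nAssistant:",  # illustrative prompt
    max_tokens=64,
    temperature=0.7,
    stream=True,
    stop=["User:"],
):
    print(output["choices"][0]["text"], end="", flush=True)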