bobpopboom committed on
Commit 2bd985b · verified · 1 Parent(s): 5be4cb8

deep seek xD

Files changed (1)
  1. app.py +43 -46
app.py CHANGED
@@ -1,73 +1,70 @@
  import gradio as gr
  from transformers import AutoTokenizer
- import ctranslate2
+ from llama_cpp import Llama
  import torch

- # Determine device (ctranslate2 handles device placement internally)
- device = "cuda" if torch.cuda.is_available() else "cpu" # Still useful for other ops
- model="thrishala/mental_health_chatbot"
- model_download_link = "https://huggingface.co/mradermacher/TinyLlama-Friendly-Psychotherapist-GGUF/resolve/main/TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf"
- model_path = "./TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf" # gguf
+ # Configuration
+ MODEL_PATH = "./TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf"
+ MODEL_REPO = "thrishala/mental_health_chatbot"

  try:
-     # 1. Load the tokenizer (same as before)
-     tokenizer = AutoTokenizer.from_pretrained(model_path)
+     # 1. Load the tokenizer from the original model repo
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
      tokenizer.pad_token = tokenizer.eos_token
      tokenizer.model_max_length = 4096

-     # 2. Load the ctranslate2 model
-     ct_model = ctranslate2.Translator(model_path) # Load the GGUF model
-     ct_model.eval()
+     # 2. Load the GGUF model with llama-cpp-python
+     llm = Llama(
+         model_path=MODEL_PATH,
+         n_ctx=2048, # Context window size
+         n_threads=4, # CPU threads
+         n_gpu_layers=33 if torch.cuda.is_available() else 0, # GPU layers
+     )
+
  except Exception as e:
      print(f"Error loading model: {e}")
      exit()

  def generate_text_streaming(prompt, max_new_tokens=128):
-     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(device)
-
-     generated_tokens = []
-
-     for _ in range(max_new_tokens):
-         # ctranslate2 generation (adjust as needed)
-         outputs = ct_model.translate_batch(
-             inputs.input_ids.tolist(), # ctranslate2 needs list of token ids
-             max_length=1, # Generate one token at a time
-             beam_size=1, # Greedy decoding
-         )
-
-         new_token_id = outputs[0][0][-1] # Extract the generated token ID
-         new_token = tokenizer.decode(new_token_id, skip_special_tokens=True)
-
-         if new_token_id == tokenizer.eos_token_id:
-             break
-
-         generated_tokens.append(new_token_id)
-
-         current_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-         yield current_text
+     # Tokenize using HF tokenizer
+     inputs = tokenizer(
+         prompt,
+         return_tensors="pt",
+         truncation=True,
+         max_length=4096
+     )
+
+     # Convert to string for llama.cpp
+     full_prompt = tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
+
+     # Create generator
+     stream = llm.create_completion(
+         prompt=full_prompt,
+         max_tokens=max_new_tokens,
+         temperature=0.7,
+         stream=True,
+         stop=["User:", "###"], # Stop sequences
+     )

-         inputs["input_ids"] = torch.cat([inputs["input_ids"], torch.tensor([[new_token_id]], device=inputs["input_ids"].device)], dim=-1)
-         inputs["attention_mask"] = torch.cat([inputs["attention_mask"], torch.ones(1, 1, device=inputs["attention_mask"].device)], dim=-1)
+     generated_text = ""
+     for output in stream:
+         chunk = output["choices"][0]["text"]
+         generated_text += chunk
+         yield generated_text

  def respond(message, history, system_message, max_tokens):
-     # Build prompt with full history
+     # Build prompt with history
      prompt = f"{system_message}\n"
      for user_msg, bot_msg in history:
          prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
      prompt += f"User: {message}\nAssistant:"

-     # Keep track of the full response
-     full_response = ""
-
      try:
-         for token_chunk in generate_text_streaming(prompt, max_tokens):
-             # Update the full response and yield incremental changes
-             full_response = token_chunk
-             yield full_response
-
+         for chunk in generate_text_streaming(prompt, max_tokens):
+             yield chunk
      except Exception as e:
-         print(f"Error during generation: {e}")
-         yield "An error occurred."
+         print(f"Error: {e}")
+         yield "An error occurred during generation."

  demo = gr.ChatInterface(
      respond,
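
For reference, a minimal standalone sketch of the llama-cpp-python streaming pattern that the new generate_text_streaming relies on. This is not part of the commit; the prompt text and sampling values are illustrative, and it assumes the GGUF file referenced above has already been downloaded next to the script.

from llama_cpp import Llama

# Assumption: the quantized GGUF file from app.py is present locally.
llm = Llama(
    model_path="./TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf",
    n_ctx=2048,
)

# With stream=True, create_completion yields chunks; each chunk carries the
# next piece of text under ["choices"][0]["text"], as in generate_text_streaming.
for output in llm.create_completion(
    prompt="User: I have trouble sleeping.\nAssistant:",  # illustrative prompt
    max_tokens=64,
    temperature=0.7,
    stream=True,
    stop=["User:"],
):
    print(output["choices"][0]["text"], end="", flush=True)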