Spaces:

bobpopboom
/

Mental_chat

Sleeping

App Files Files Community

bobpopboom committed on Feb 10

Commit

c215361

verified ·

1 Parent(s): 9c7a437

he

Browse files

Files changed (1) hide show

app.py +35 -48

app.py CHANGED Viewed

@@ -1,65 +1,52 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
-# Determine device
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_id = "mradermacher/TinyLlama-Friendly-Psychotherapist-GGUF"
 try:
-    # Load model with appropriate settings
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        device_map="auto",
-        torch_dtype=torch.float16,
-        low_cpu_mem_usage=True,
-        max_memory={0: "15GiB"} if torch.cuda.is_available() else None,
-        offload_folder="offload",
-    ).eval()
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
     tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.model_max_length = 4096  # Set to model's actual context length
 except Exception as e:
     print(f"Error loading model: {e}")
     exit()
 def generate_text_streaming(prompt, max_new_tokens=128):
-    inputs = tokenizer(
-        prompt,
-        return_tensors="pt",
-        truncation=True,
-        max_length=4096  # Match model's context length
-    ).to(model.device)
     generated_tokens = []
-    with torch.no_grad():
-        for _ in range(max_new_tokens):
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=1,
-                do_sample=False,
-                eos_token_id=tokenizer.eos_token_id,
-                return_dict_in_generate=True
-            )
-            new_token = outputs.sequences[0, -1]
-            generated_tokens.append(new_token)
-            # Update inputs for next iteration
-            inputs = {
-                "input_ids": torch.cat([inputs["input_ids"], new_token.unsqueeze(0).unsqueeze(0)], dim=-1),
-                "attention_mask": torch.cat([inputs["attention_mask"], torch.ones(1, 1, device=model.device)], dim=-1)
-            }
-            # Decode the accumulated tokens
-            current_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-            yield current_text  # Yield the full text so far
-            if new_token == tokenizer.eos_token_id:
-                break
 def respond(message, history, system_message, max_tokens):
     # Build prompt with full history

 import gradio as gr
+from transformers import AutoTokenizer
+import ctranslate2
 import torch
+# Determine device (ctranslate2 handles device placement internally)
+device = "cuda" if torch.cuda.is_available() else "cpu"  # Still useful for other ops
+model_path = "mradermacher/TinyLlama-Friendly-Psychotherapist-GGUF"  # Path to your GGUF model
 try:
+    # 1. Load the tokenizer (same as before)
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
     tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.model_max_length = 4096
+    # 2. Load the ctranslate2 model
+    ct_model = ctranslate2.Translator(model_path)  # Load the GGUF model
+    ct_model.eval()
 except Exception as e:
     print(f"Error loading model: {e}")
     exit()
 def generate_text_streaming(prompt, max_new_tokens=128):
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(device)
     generated_tokens = []
+    for _ in range(max_new_tokens):
+        # ctranslate2 generation (adjust as needed)
+        outputs = ct_model.translate_batch(
+            inputs.input_ids.tolist(),  # ctranslate2 needs list of token ids
+            max_length=1, # Generate one token at a time
+            beam_size=1, # Greedy decoding
+        ).eval()
+        new_token_id = outputs[0][0][-1]  # Extract the generated token ID
+        new_token = tokenizer.decode(new_token_id, skip_special_tokens=True)
+        if new_token_id == tokenizer.eos_token_id:
+            break
+        generated_tokens.append(new_token_id)
+        current_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        yield current_text
+        inputs["input_ids"] = torch.cat([inputs["input_ids"], torch.tensor([[new_token_id]], device=inputs["input_ids"].device)], dim=-1)
+        inputs["attention_mask"] = torch.cat([inputs["attention_mask"], torch.ones(1, 1, device=inputs["attention_mask"].device)], dim=-1)
 def respond(message, history, system_message, max_tokens):
     # Build prompt with full history