Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	deep seek xD
Browse files
    	
        app.py
    CHANGED
    
    | @@ -1,73 +1,70 @@ | |
| 1 | 
             
            import gradio as gr
         | 
| 2 | 
             
            from transformers import AutoTokenizer
         | 
| 3 | 
            -
            import  | 
| 4 | 
             
            import torch
         | 
| 5 |  | 
| 6 | 
            -
            #  | 
| 7 | 
            -
             | 
| 8 | 
            -
             | 
| 9 | 
            -
            model_download_link =  "https://huggingface.co/mradermacher/TinyLlama-Friendly-Psychotherapist-GGUF/resolve/main/TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf"
         | 
| 10 | 
            -
            model_path = "./TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf" # gguf
         | 
| 11 |  | 
| 12 | 
             
            try:
         | 
| 13 | 
            -
                # 1. Load the tokenizer  | 
| 14 | 
            -
                tokenizer = AutoTokenizer.from_pretrained( | 
| 15 | 
             
                tokenizer.pad_token = tokenizer.eos_token
         | 
| 16 | 
             
                tokenizer.model_max_length = 4096
         | 
| 17 |  | 
| 18 | 
            -
                # 2. Load the  | 
| 19 | 
            -
                 | 
| 20 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 21 | 
             
            except Exception as e:
         | 
| 22 | 
             
                print(f"Error loading model: {e}")
         | 
| 23 | 
             
                exit()
         | 
| 24 |  | 
| 25 | 
             
            def generate_text_streaming(prompt, max_new_tokens=128):
         | 
| 26 | 
            -
                 | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
                     | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
                     | 
| 40 | 
            -
             | 
| 41 | 
            -
                     | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
                    current_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
         | 
| 47 | 
            -
                    yield current_text
         | 
| 48 |  | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
|  | |
|  | |
|  | |
| 51 |  | 
| 52 | 
             
            def respond(message, history, system_message, max_tokens):
         | 
| 53 | 
            -
                # Build prompt with  | 
| 54 | 
             
                prompt = f"{system_message}\n"
         | 
| 55 | 
             
                for user_msg, bot_msg in history:
         | 
| 56 | 
             
                    prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
         | 
| 57 | 
             
                prompt += f"User: {message}\nAssistant:"
         | 
| 58 |  | 
| 59 | 
            -
                # Keep track of the full response
         | 
| 60 | 
            -
                full_response = ""
         | 
| 61 | 
            -
                
         | 
| 62 | 
             
                try:
         | 
| 63 | 
            -
                    for  | 
| 64 | 
            -
                         | 
| 65 | 
            -
                        full_response = token_chunk
         | 
| 66 | 
            -
                        yield full_response
         | 
| 67 | 
            -
                        
         | 
| 68 | 
             
                except Exception as e:
         | 
| 69 | 
            -
                    print(f"Error | 
| 70 | 
            -
                    yield "An error occurred."
         | 
| 71 |  | 
| 72 | 
             
            demo = gr.ChatInterface(
         | 
| 73 | 
             
                respond,
         | 
|  | |
| 1 | 
             
            import gradio as gr
         | 
| 2 | 
             
            from transformers import AutoTokenizer
         | 
| 3 | 
            +
            from llama_cpp import Llama
         | 
| 4 | 
             
            import torch
         | 
| 5 |  | 
| 6 | 
            +
            # Configuration
         | 
| 7 | 
            +
            MODEL_PATH = "./TinyLlama-Friendly-Psychotherapist.Q4_K_S.gguf"
         | 
| 8 | 
            +
            MODEL_REPO = "thrishala/mental_health_chatbot"
         | 
|  | |
|  | |
| 9 |  | 
| 10 | 
             
            try:
         | 
| 11 | 
            +
                # 1. Load the tokenizer from the original model repo
         | 
| 12 | 
            +
                tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
         | 
| 13 | 
             
                tokenizer.pad_token = tokenizer.eos_token
         | 
| 14 | 
             
                tokenizer.model_max_length = 4096
         | 
| 15 |  | 
| 16 | 
            +
                # 2. Load the GGUF model with llama-cpp-python
         | 
| 17 | 
            +
                llm = Llama(
         | 
| 18 | 
            +
                    model_path=MODEL_PATH,
         | 
| 19 | 
            +
                    n_ctx=2048,  # Context window size
         | 
| 20 | 
            +
                    n_threads=4,  # CPU threads
         | 
| 21 | 
            +
                    n_gpu_layers=33 if torch.cuda.is_available() else 0,  # GPU layers
         | 
| 22 | 
            +
                )
         | 
| 23 | 
            +
             | 
| 24 | 
             
            except Exception as e:
         | 
| 25 | 
             
                print(f"Error loading model: {e}")
         | 
| 26 | 
             
                exit()
         | 
| 27 |  | 
| 28 | 
             
            def generate_text_streaming(prompt, max_new_tokens=128):
         | 
| 29 | 
            +
                # Tokenize using HF tokenizer
         | 
| 30 | 
            +
                inputs = tokenizer(
         | 
| 31 | 
            +
                    prompt,
         | 
| 32 | 
            +
                    return_tensors="pt",
         | 
| 33 | 
            +
                    truncation=True,
         | 
| 34 | 
            +
                    max_length=4096
         | 
| 35 | 
            +
                )
         | 
| 36 | 
            +
                
         | 
| 37 | 
            +
                # Convert to string for llama.cpp
         | 
| 38 | 
            +
                full_prompt = tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
         | 
| 39 | 
            +
                
         | 
| 40 | 
            +
                # Create generator
         | 
| 41 | 
            +
                stream = llm.create_completion(
         | 
| 42 | 
            +
                    prompt=full_prompt,
         | 
| 43 | 
            +
                    max_tokens=max_new_tokens,
         | 
| 44 | 
            +
                    temperature=0.7,
         | 
| 45 | 
            +
                    stream=True,
         | 
| 46 | 
            +
                    stop=["User:", "###"],  # Stop sequences
         | 
| 47 | 
            +
                )
         | 
|  | |
|  | |
|  | |
| 48 |  | 
| 49 | 
            +
                generated_text = ""
         | 
| 50 | 
            +
                for output in stream:
         | 
| 51 | 
            +
                    chunk = output["choices"][0]["text"]
         | 
| 52 | 
            +
                    generated_text += chunk
         | 
| 53 | 
            +
                    yield generated_text
         | 
| 54 |  | 
| 55 | 
             
            def respond(message, history, system_message, max_tokens):
         | 
| 56 | 
            +
                # Build prompt with history
         | 
| 57 | 
             
                prompt = f"{system_message}\n"
         | 
| 58 | 
             
                for user_msg, bot_msg in history:
         | 
| 59 | 
             
                    prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
         | 
| 60 | 
             
                prompt += f"User: {message}\nAssistant:"
         | 
| 61 |  | 
|  | |
|  | |
|  | |
| 62 | 
             
                try:
         | 
| 63 | 
            +
                    for chunk in generate_text_streaming(prompt, max_tokens):
         | 
| 64 | 
            +
                        yield chunk
         | 
|  | |
|  | |
|  | |
| 65 | 
             
                except Exception as e:
         | 
| 66 | 
            +
                    print(f"Error: {e}")
         | 
| 67 | 
            +
                    yield "An error occurred during generation."
         | 
| 68 |  | 
| 69 | 
             
            demo = gr.ChatInterface(
         | 
| 70 | 
             
                respond,
         |