import gradio as gr
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Initialize the Inference API client
client = InferenceClient("01-ai/Yi-Coder-9B-Chat")

# Initialize the local tokenizer and model
model_path = "01-ai/Yi-Coder-9B-Chat"  # Make sure this is correct
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto").eval()
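
# Optional helper, a minimal sketch and not part of the original app: build the
# prompt with the tokenizer's chat template so role boundaries are preserved,
# instead of concatenating raw message contents as respond() does below. This
# assumes the checkpoint ships a chat template, which chat-tuned models such as
# Yi-Coder-9B-Chat normally do.
def build_chat_prompt(chat_messages):
    return tokenizer.apply_chat_template(
        chat_messages,
        tokenize=False,
        add_generation_prompt=True,
    )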

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    use_local_model: bool,
):
    messages = [{"role": "system", "content": system_message}]
    for user, assistant in history:
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    if use_local_model:
        # Use the local model: flatten the conversation into a single prompt string
        prompt = "".join([m["content"] for m in messages])
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        input_ids = input_ids.to(model.device)

        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens, not the echoed prompt
        response = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
        yield response

    else:
        # Use the Hugging Face Inference API with streaming generation
        response = ""
        for token in client.text_generation(
            "".join([m["content"] for m in messages]),
            max_new_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            response += token
            yield response

# Create the Gradio chat interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            # Default system prompt (Polish): "You answer in Polish. You are a
            # coder/developer/programmer and you produce complete code."
            value="Odpowiadasz w Jezyku Polskim jesteś Coder/Developer/Programista tworzysz pełny kod..",
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=2048, value=2048, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
        gr.Checkbox(label="Use Local Model", value=False),
    ],
    title="Advanced Chat Interface",
    description="Chat with an AI model using either the Hugging Face Inference API or a local model.",
)

if __name__ == "__main__":
    demo.launch()