import threading

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Pick the device automatically (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "lambdaindie/lambda-1v-1B"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Shared flag used to interrupt a generation that is in progress
stop_flag = {"stop": False}

# Response function: streams the model's answer back to the UI token by token
def respond(prompt, history):
    stop_flag["stop"] = False

    full_prompt = f"\nThink a bit step-by-step before answering. \nQuestion: {prompt} \nAnswer:"
    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    # The streamer yields decoded text chunks as the model produces them
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Run generation in a background thread so the streamer can be consumed here
    generation_thread = threading.Thread(
        target=model.generate,
        kwargs={
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "max_new_tokens": 512,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
        },
    )
    generation_thread.start()

    reasoning = ""
    for new_text in streamer:
        # Abort the stream if the stop flag was set from the UI
        if stop_flag["stop"]:
            return "", history
        reasoning += new_text
        # Clear the input box and append the partial answer to the chat history
        yield "", (history or []) + [(prompt, reasoning)]
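
# ------------------------------------------------------------------
# Minimal UI wiring sketch (assumption): gradio is imported and
# stop_flag is defined above, but the interface code itself is not
# shown. The component names below (demo, chatbot, msg, stop_btn) and
# the tuple-style Chatbot layout are illustrative, matching the
# (textbox, history) pairs that respond() yields.
# ------------------------------------------------------------------
def stop_generation():
    # Signal the streaming loop in respond() to stop
    stop_flag["stop"] = True

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask a question...")
    stop_btn = gr.Button("Stop")

    # respond() yields (cleared textbox, updated history) pairs while streaming
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    stop_btn.click(stop_generation)

if __name__ == "__main__":
    demo.launch()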