import threading

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "lambdaindie/lambda-1v-1B"

# Load the model on CPU with a reduced memory footprint
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # or torch.bfloat16 if supported
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

stop_flag = {"stop": False}


def respond(prompt, history):
    stop_flag["stop"] = False
    history = history[-3:]  # keep only the last 3 (user, assistant) pairs
    # Note: history is only used for the chat display; it is not fed back into the prompt.

    full_prompt = (
        "\nThink a bit step-by-step before answering."
        f"\nQuestion: {prompt}\nAnswer:"
    )
    inputs = tokenizer(full_prompt, return_tensors="pt")

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # Run generation in a background thread so tokens can be streamed as they arrive
    generation_thread = threading.Thread(
        target=model.generate,
        kwargs={
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "max_new_tokens": 512,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
        },
    )
    generation_thread.start()

    reasoning = ""
    for new_text in streamer:
        if stop_flag["stop"]:
            return "", history
        reasoning += new_text
        # Stream the partial answer back to the chat window
        yield "", history + [(prompt, reasoning)]
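
# --- UI wiring (sketch) ----------------------------------------------------
# The original file is cut off after the streaming loop, so the block below is
# an assumed, minimal wiring of `respond` into a Gradio Blocks app with a stop
# button that flips `stop_flag`. Component names (chatbot, msg, stop_btn) are
# illustrative and not taken from the original source.

def stop_generation():
    stop_flag["stop"] = True


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask a question...")
    stop_btn = gr.Button("Stop")

    # `respond` yields (textbox_value, chat_history), so streaming updates the chatbot
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    stop_btn.click(stop_generation)

if __name__ == "__main__":
    demo.launch()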