import gradio as gr
import spaces
import torch
from PIL import Image

from cumo.model.builder import load_pretrained_model
from cumo.mm_utils import process_images, tokenizer_image_token
from cumo.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN

model_path = "BenkHel/CumoThesis"
model_base = None
model_name = "CumoThesis"  # or "BenkHel/CumoThesis"
load_8bit = False
load_4bit = False
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pass device as a keyword argument: in the LLaVA-style builder signature that
# CuMo follows, the sixth positional parameter is device_map, so passing device
# positionally would bind it to the wrong parameter.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path, model_base, model_name, load_8bit, load_4bit,
    device=device, use_flash_attn=False
)

PROMPT = "What material is this item and how is it disposed of?"
PROMPT_WITH_IMAGE = f"{DEFAULT_IMAGE_TOKEN} {PROMPT}"


@spaces.GPU
def classify_image(image):
    if image is None:
        return "Please upload an image."
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    # process_images may return either a stacked tensor or a list of tensors,
    # depending on the model's image aspect-ratio setting; handle both.
    # float16 requires a GPU; @spaces.GPU provides one at call time on Spaces.
    images = process_images([image], image_processor, model.config)
    if isinstance(images, list):
        images = [img.to(device, dtype=torch.float16) for img in images]
    else:
        images = images.to(device, dtype=torch.float16)

    # Tokenize the prompt, mapping the image placeholder to IMAGE_TOKEN_INDEX.
    input_ids = (
        tokenizer_image_token(PROMPT_WITH_IMAGE, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .to(device)
    )

    with torch.no_grad():
        output_ids = model.generate(
            inputs=input_ids,
            images=images,
            max_new_tokens=128,
            pad_token_id=tokenizer.eos_token_id,
        )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    # Some decoding paths echo the prompt; strip it if present.
    answer = output_text[len(PROMPT):].strip() if output_text.startswith(PROMPT) else output_text
    return answer


iface = gr.Interface(
    fn=classify_image,
    inputs=gr.Image(type="pil", label="Upload an image of a waste item"),
    outputs=gr.Textbox(label="Classification & Disposal Recommendation"),
    title="CuMo Waste Classifier",
    description="Upload a photo of a household waste item. The model will classify the material and recommend how to dispose of it.",
)

if __name__ == "__main__":
    iface.launch()
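
# Optional local smoke test (a sketch, not part of the app): call
# classify_image directly with a PIL image to check the pipeline end to end
# before launching the UI. The file path below is a hypothetical example, and
# a CUDA GPU is assumed since the model weights are loaded in float16.
#
#     from PIL import Image
#     print(classify_image(Image.open("sample_waste_item.jpg")))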