Construction_Snag_Tool_Llama_3.2_Vision

Running

File size: 1,904 Bytes

4ec8ad4

import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, LlamaForCausalLM, LlamaTokenizer

# Load the Llama 2 model and processor
# Note: You'll need to replace these with the actual Llama 3.2 vision model when it becomes available
model_name = "meta-llama/Llama-2-7b-chat-hf"
processor = AutoProcessor.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)
tokenizer = LlamaTokenizer.from_pretrained(model_name)

def analyze_construction_image(image):
    # Process the image
    inputs = processor(images=image, return_tensors="pt")
    
    # Generate text based on the image
    prompt = "Analyze this construction image and identify the snag category, snag description, and steps to desnag."
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    
    # Concatenate the image embeddings with the text input
    combined_inputs = torch.cat([inputs.pixel_values, input_ids], dim=1)
    
    # Generate output
    outputs = model.generate(combined_inputs, max_length=300)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Parse the result (this is a simplified example)
    lines = result.split('\n')
    snag_category = lines[0] if len(lines) > 0 else "N/A"
    snag_description = lines[1] if len(lines) > 1 else "N/A"
    desnag_steps = lines[2:] if len(lines) > 2 else ["N/A"]
    
    return snag_category, snag_description, "\n".join(desnag_steps)

# Create the Gradio interface
iface = gr.Interface(
    fn=analyze_construction_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Snag Category"),
        gr.Textbox(label="Snag Description"),
        gr.Textbox(label="Steps to Desnag")
    ],
    title="Construction Image Analyzer",
    description="Upload a construction site image to identify issues and get desnag steps."
)

# Launch the app
iface.launch()