capradeepgujaran's picture
Create app.py
4ec8ad4 verified
raw
history blame
1.9 kB
import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, LlamaForCausalLM, LlamaTokenizer
# Load the Llama 2 model and processor
# Note: You'll need to replace these with the actual Llama 3.2 vision model when it becomes available
model_name = "meta-llama/Llama-2-7b-chat-hf"
processor = AutoProcessor.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)
tokenizer = LlamaTokenizer.from_pretrained(model_name)
def analyze_construction_image(image):
# Process the image
inputs = processor(images=image, return_tensors="pt")
# Generate text based on the image
prompt = "Analyze this construction image and identify the snag category, snag description, and steps to desnag."
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
# Concatenate the image embeddings with the text input
combined_inputs = torch.cat([inputs.pixel_values, input_ids], dim=1)
# Generate output
outputs = model.generate(combined_inputs, max_length=300)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Parse the result (this is a simplified example)
lines = result.split('\n')
snag_category = lines[0] if len(lines) > 0 else "N/A"
snag_description = lines[1] if len(lines) > 1 else "N/A"
desnag_steps = lines[2:] if len(lines) > 2 else ["N/A"]
return snag_category, snag_description, "\n".join(desnag_steps)
# Create the Gradio interface
iface = gr.Interface(
fn=analyze_construction_image,
inputs=gr.Image(type="pil"),
outputs=[
gr.Textbox(label="Snag Category"),
gr.Textbox(label="Snag Description"),
gr.Textbox(label="Steps to Desnag")
],
title="Construction Image Analyzer",
description="Upload a construction site image to identify issues and get desnag steps."
)
# Launch the app
iface.launch()