import torch
import gradio as gr
from transformers import AutoProcessor, MllamaForConditionalGeneration

# Load the Hugging Face model and processor. Llama 3.2 Vision is a multimodal
# model, so it needs AutoProcessor (which prepares both the image and the text)
# and MllamaForConditionalGeneration, rather than a plain text tokenizer with
# AutoModelForCausalLM.
model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # half precision; the 90B checkpoint is very large
    device_map="auto",           # spread the weights across available devices
)

def input_image_setup(uploaded_image):
    """
    Validates the uploaded image and normalizes it for the model.

    Parameters:
    - uploaded_image: PIL.Image object uploaded via Gradio (type="pil").

    Returns:
    - PIL.Image: The image converted to 3-channel RGB.
    """
    if uploaded_image is not None:
        # Ensure a consistent 3-channel format regardless of the source file.
        return uploaded_image.convert("RGB")
    else:
        raise gr.Error("No image uploaded")

def generate_model_response(image, user_query, assistant_prompt="You are a helpful assistant. Answer the following user query in 1 or 2 sentences: "):
    """
    Sends an image and a query to the model and retrieves the description or answer.

    Parameters:
    - image (PIL.Image): The image to describe or answer questions about.
    - user_query (str): The user's question about the image.
    - assistant_prompt (str): Optional prompt to guide the model's response.

    Returns:
    - str: The model's response for the given image and query.
    """

    # Build a chat-style prompt; the image placeholder tells the processor
    # where to splice the vision tokens into the text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": assistant_prompt + user_query},
            ],
        }
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Preprocess the image and prompt together into model inputs.
    inputs = processor(
        image, input_text, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)

    # Generate the response; cap new tokens so a short answer returns quickly.
    outputs = model.generate(**inputs, max_new_tokens=128)

    # Decode only the newly generated tokens, skipping the echoed prompt.
    response_text = processor.decode(
        outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
    )

    return response_text

def process_image_and_query(uploaded_image, user_query):
    """
    Processes the uploaded image and user query to generate a response from the model.

    Parameters:
    - uploaded_image: The uploaded image (PIL.Image).
    - user_query: The user's question about the image.

    Returns:
    - str: The generated response from the model.
    """

    # Validate and normalize the uploaded image
    image = input_image_setup(uploaded_image)

    # Generate a response using the image and user query
    response = generate_model_response(image, user_query)

    return response

# Create the Gradio interface. The gr.inputs namespace was removed in Gradio 4;
# components are now passed directly, and type="pil" hands the function a PIL image.
iface = gr.Interface(
    fn=process_image_and_query,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="User Query", placeholder="Enter your question about the image..."),
    ],
    outputs=gr.Textbox(label="Response"),
)

# Launch the Gradio app
iface.launch()
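
# A minimal sketch of exercising the pipeline without the UI (the file name
# "example.jpg" is hypothetical):
#
#   from PIL import Image
#   img = Image.open("example.jpg")
#   print(process_image_and_query(img, "What is shown in this picture?"))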