Kilos1 committed on
Commit
d7dbc2c
·
verified ·
1 Parent(s): dbd1c33

Update multimodal_queries.py

Browse files
Files changed (1) hide show
  1. multimodal_queries.py +86 -0
multimodal_queries.py CHANGED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import base64
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ import gradio as gr
5
+
6
# Load the Hugging Face model and tokenizer once, at import time, so every
# request served by the app reuses the same instances.
#
# The Hub repository id is "meta-llama/Llama-3.2-90B-Vision-Instruct". The
# previous lowercase, dash-separated spelling does not name a repository on
# the Hugging Face Hub, so from_pretrained() could not resolve it.
#
# NOTE(review): this appears to be a gated, 90B-parameter checkpoint; loading
# it presumably needs an authenticated Hub token and a very large amount of
# memory -- confirm against the deployment target.
# NOTE(review): AutoTokenizer / AutoModelForCausalLM only cover the text side
# of a vision-language model; images are normally supplied through the
# model's processor class -- confirm whether that is what is wanted here.
model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
10
+
11
def input_image_setup(uploaded_file):
    """
    Encodes the uploaded image into a base64 string.

    Parameters:
    - uploaded_file: The upload to encode. May be a file-like object exposing
      ``read()``, a filesystem path (``str`` or ``os.PathLike``), or the raw
      image bytes (``bytes``, ``bytearray`` or ``memoryview``).

    Returns:
    - encoded_image (str): Base64 encoded string of the image data.

    Raises:
    - FileNotFoundError: If ``uploaded_file`` is None (nothing was uploaded),
      or if a path was given that does not exist.
    """
    import os  # local import: only needed for the path check below

    # Guard clause: nothing to encode.
    if uploaded_file is None:
        raise FileNotFoundError("No file uploaded")

    if isinstance(uploaded_file, (bytes, bytearray, memoryview)):
        # Raw image bytes were passed directly.
        bytes_data = bytes(uploaded_file)
    elif isinstance(uploaded_file, (str, os.PathLike)):
        # A path on disk: open in binary mode and make sure the handle is closed.
        with open(uploaded_file, "rb") as image_file:
            bytes_data = image_file.read()
    else:
        # File-like object. Rewind first when possible: a handle that has just
        # been written to (or already read) sits at end-of-file, and read()
        # would then silently return empty data.
        seekable = getattr(uploaded_file, "seekable", None)
        if callable(seekable) and seekable():
            uploaded_file.seek(0)
        bytes_data = uploaded_file.read()

    return base64.b64encode(bytes_data).decode("utf-8")
27
+
28
def generate_model_response(encoded_image, user_query, assistant_prompt="You are a helpful assistant. Answer the following user query in 1 or 2 sentences: ", max_new_tokens=256):
    """
    Sends an image and a query to the model and retrieves the description or answer.

    Parameters:
    - encoded_image (str): Base64-encoded image string.
    - user_query (str): The user's question about the image.
    - assistant_prompt (str): Optional prompt to guide the model's response.
    - max_new_tokens (int): Upper bound on the number of tokens generated for
      the answer (the prompt does not count towards it).

    Returns:
    - str: The model's response for the given image and query, without the
      prompt text that was fed to the model.
    """

    # Prepare input for the model: instruction, question, then the image as a
    # markdown data URI.
    # NOTE(review): the image is embedded as base64 *text*, so the tokenizer
    # sees it as ordinary characters; a vision model presumably expects pixel
    # inputs from its processor instead -- confirm this is intended.
    input_text = assistant_prompt + user_query + "\n![Image](data:image/jpeg;base64," + encoded_image + ")"

    # Tokenize and place the tensors on the same device as the model so that
    # generate() does not fail with a device mismatch.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Bound the answer length explicitly; the library's default length budget
    # is tiny compared with a prompt that carries a base64 payload.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt tokens; decode only what
    # the model added so the caller gets the answer, not an echo of the prompt.
    generated_tokens = outputs[0][prompt_length:]
    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return response_text
54
+
55
def process_image_and_query(uploaded_file, user_query):
    """
    Answer a question about an uploaded image.

    Encodes the upload with ``input_image_setup`` and forwards the encoded
    image together with the question to ``generate_model_response``.

    Parameters:
    - uploaded_file: The uploaded image file.
    - user_query: The user's question about the image.

    Returns:
    - str: The generated response from the model.
    """
    # Step 1: turn the upload into a base64 string.
    image_b64 = input_image_setup(uploaded_file)

    # Step 2: ask the model and hand its answer straight back.
    return generate_model_response(image_b64, user_query)
74
+
75
# Build the Gradio UI: an image upload plus a free-text question go in,
# plain text comes out.
# NOTE(review): the `gr.inputs.*` namespace and `type="file"` look like the
# older Gradio component API -- confirm they exist in the pinned Gradio version.
_image_input = gr.inputs.Image(type="file", label="Upload Image")
_query_input = gr.inputs.Textbox(
    label="User Query",
    placeholder="Enter your question about the image...",
)

iface = gr.Interface(
    fn=process_image_and_query,
    inputs=[_image_input, _query_input],
    outputs="text",
)

# Start serving the app.
iface.launch()