updates
Browse files- .gradio/cached_examples/8/log.csv +13 -0
- README.md +2 -2
- app.py +88 -0
- car.jpg +0 -0
- green_creature.jpg +0 -0
- requirements.txt +3 -0
- utils.py +41 -0
- verifier_prompt.txt +54 -0
.gradio/cached_examples/8/log.csv
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Grading Output,timestamp
|
2 |
+
"* accuracy_to_prompt: 9.0 (explanation: The image accurately depicts a shiny black SUV with a mountain background, closely following the prompt's description.)
|
3 |
+
* creativity_and_originality: 7.0 (explanation: The image showcases a standard but appealing composition, capturing a realistic scene. While it doesn't venture into highly creative interpretations, it's well-executed.)
|
4 |
+
* visual_quality_and_realism: 9.0 (explanation: The visual quality is excellent, with sharp details and realistic rendering. The lighting and textures are well-handled, contributing to a high level of realism.)
|
5 |
+
* consistency_and_cohesion: 9.0 (explanation: The image maintains a high level of internal consistency. All elements are logically placed, and the lighting and perspective align well, creating a cohesive scene.)
|
6 |
+
* emotional_and_thematic_resonance: 8.0 (explanation: The image conveys a sense of adventure and exploration. The mountain backdrop and the sturdy SUV evoke feelings of freedom and the call of the outdoors.)
|
7 |
+
* overall_score: 8.4 (explanation: Considering all aspects, the image is a strong and well-executed representation of the prompt. It effectively combines realism, accuracy, and emotional resonance.)",2025-02-10 11:28:15.347939
|
8 |
+
"* accuracy_to_prompt: 8.0 (explanation: The image accurately depicts a green creature in a forest, fulfilling the main elements of the prompt. However, 'funny' is subjective and open to interpretation.)
|
9 |
+
* creativity_and_originality: 6.0 (explanation: While the image fulfills the basic requirements, it doesn't show a high level of creativity. The creature's design is somewhat generic.)
|
10 |
+
* visual_quality_and_realism: 9.0 (explanation: The visual quality is high, with good detail and rendering. The creature and forest are realistically depicted, enhancing the overall appeal.)
|
11 |
+
* consistency_and_cohesion: 9.0 (explanation: The image maintains internal consistency with the creature fitting well within its forest surroundings. The lighting and perspective are coherent.)
|
12 |
+
* emotional_and_thematic_resonance: 7.0 (explanation: The image evokes a lighthearted and slightly whimsical feeling. It doesn't strongly resonate emotionally but does align with the 'funny' aspect.)
|
13 |
+
* overall_score: 7.5 (explanation: The overall score reflects a solid execution of the prompt with minor areas for improvement in creativity and emotional depth. The image is well-composed and aligned with the prompt.)",2025-02-10 11:28:22.288757
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: Grade Images
|
3 |
emoji: 🚀
|
4 |
colorFrom: yellow
|
5 |
colorTo: purple
|
@@ -7,7 +7,7 @@ sdk: gradio
|
|
7 |
sdk_version: 5.15.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
-
short_description: Uses Gemini 2.0
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Grade Images with Gemini
|
3 |
emoji: 🚀
|
4 |
colorFrom: yellow
|
5 |
colorTo: purple
|
|
|
7 |
sdk_version: 5.15.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
short_description: Uses Gemini 2.0 Flash to grade images.
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from google import genai
|
3 |
+
from utils import *
|
4 |
+
from PIL import Image
|
5 |
+
import os
|
6 |
+
|
7 |
+
|
8 |
+
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
|
9 |
+
system_instruction = load_verifier_prompt()
|
10 |
+
generation_config = types.GenerateContentConfig(
|
11 |
+
system_instruction=system_instruction,
|
12 |
+
response_mime_type="application/json",
|
13 |
+
response_schema=list[Grading],
|
14 |
+
seed=1994,
|
15 |
+
)
|
16 |
+
|
17 |
+
|
18 |
+
def make_inputs(prompt, image):
    """Build the list of content parts sent to the model for one grading call.

    Delegates to ``prepare_inputs`` and copies its result into a new list.
    """
    return list(prepare_inputs(prompt=prompt, image=image))
|
22 |
+
|
23 |
+
|
24 |
+
def format_response(response: dict):
    """Render a grading dictionary as a Markdown bullet list.

    Args:
        response: Mapping of aspect name to a mapping holding a ``score``
            entry and an ``explanation`` entry.

    Returns:
        One ``* **aspect**: score (explanation: ...)`` line per entry, in the
        mapping's iteration order, each terminated by a newline. An empty
        mapping yields an empty string.

    Raises:
        KeyError: If an entry lacks ``score`` or ``explanation``.
    """
    # Join once instead of repeatedly growing a string with ``+=``.
    return "".join(
        f"* **{key}**: {value['score']} (explanation: {value['explanation']})\n"
        for key, value in response.items()
    )
|
30 |
+
|
31 |
+
|
32 |
+
def grade(prompt, image):
    """Grade an image against the prompt that generated it.

    Args:
        prompt: Text prompt that produced the image.
        image: Image to grade, as delivered by the ``gr.Image`` input
            (configured with ``type="pil"``).

    Returns:
        Markdown text produced by ``format_response`` for the first grading
        in the model's parsed reply.

    Raises:
        gr.Error: If no image was supplied, or if the model's reply could not
            be parsed into at least one grading.
    """
    # An empty image input arrives as None; report it instead of failing
    # later inside the PNG conversion with an AttributeError.
    if image is None:
        raise gr.Error("Please provide an image to grade.")

    inputs = make_inputs(prompt, image)
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=types.Content(parts=inputs, role="user"),
        config=generation_config,
    )

    # NOTE(review): ``parsed`` is expected to be None (or empty) when the
    # reply does not match the response schema -- confirm against the SDK docs.
    gradings = response.parsed
    if not gradings:
        raise gr.Error("The model did not return a parsable grading. Please try again.")
    return format_response(gradings[0])
|
39 |
+
|
40 |
+
|
41 |
+
examples = [
|
42 |
+
["realistic photo a shiny black SUV car with a mountain in the background.", Image.open("car.jpg")],
|
43 |
+
["photo a green and funny creature standing in front a lightweight forest.", Image.open("green_creature.jpg")],
|
44 |
+
]
|
45 |
+
|
46 |
+
css = """
|
47 |
+
#col-container {
|
48 |
+
margin: 0 auto;
|
49 |
+
max-width: 520px;
|
50 |
+
}
|
51 |
+
"""
|
52 |
+
|
53 |
+
with gr.Blocks(css=css) as demo:
|
54 |
+
with gr.Column(elem_id="col-container"):
|
55 |
+
gr.Markdown(
|
56 |
+
f"""# Grade images with Gemini 2.0 Flash
|
57 |
+
|
58 |
+
Following aspects are considered during grading:
|
59 |
+
|
60 |
+
* Accuracy to Prompt
|
61 |
+
* Creativity and Originality
|
62 |
+
* Visual Quality and Realism
|
63 |
+
* Consistency and Cohesion
|
64 |
+
* Emotional or Thematic Resonance
|
65 |
+
|
66 |
+
The [system prompt](./verifier_prompt.txt) comes from the paper: [Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps](https://arxiv.org/abs/2501.09732).
|
67 |
+
"""
|
68 |
+
)
|
69 |
+
|
70 |
+
with gr.Row():
|
71 |
+
prompt = gr.Text(
|
72 |
+
label="Prompt",
|
73 |
+
show_label=False,
|
74 |
+
max_lines=1,
|
75 |
+
placeholder="Enter the prompt that generated the image to be graded.",
|
76 |
+
container=False,
|
77 |
+
)
|
78 |
+
run_button = gr.Button("Run", scale=0)
|
79 |
+
|
80 |
+
image = gr.Image(format="png", type="pil", label="Image", placeholder="The image to be graded.")
|
81 |
+
|
82 |
+
result = gr.Markdown(label="Grading Output")
|
83 |
+
|
84 |
+
gr.Examples(examples=examples, fn=grade, inputs=[prompt, image], outputs=[result], cache_examples=True)
|
85 |
+
|
86 |
+
gr.on(triggers=[run_button.click, prompt.submit], fn=grade, inputs=[prompt, image], outputs=[result])
|
87 |
+
|
88 |
+
demo.launch()
|
car.jpg
ADDED
![]() |
green_creature.jpg
ADDED
![]() |
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
google-genai
|
2 |
+
typing-extensions
|
3 |
+
Pillow
|
utils.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import typing_extensions as typing
|
2 |
+
from PIL import Image
|
3 |
+
import io
|
4 |
+
from google.genai import types
|
5 |
+
|
6 |
+
|
7 |
+
class Score(typing.TypedDict):
    # One graded aspect: a numeric score plus the reasoning behind it.
    # The verifier prompt asks for a score from 0 to 10.
    score: float
    # Short justification for the score (the prompt asks for 1-2 sentences).
    explanation: str
|
10 |
+
|
11 |
+
|
12 |
+
class Grading(typing.TypedDict):
    # Complete grading of one image: one Score per evaluation aspect plus an
    # overall one. Field names match the JSON keys requested in the verifier
    # prompt, so they must stay in sync with verifier_prompt.txt.
    accuracy_to_prompt: Score
    creativity_and_originality: Score
    visual_quality_and_realism: Score
    consistency_and_cohesion: Score
    emotional_and_thematic_resonance: Score
    # Per the verifier prompt: a weighted or plain average of the aspects.
    overall_score: Score
|
19 |
+
|
20 |
+
|
21 |
+
def convert_to_bytes(image: Image.Image) -> bytes:
    """Encode ``image`` as PNG in memory and return the raw bytes."""
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        png_bytes = buffer.getvalue()
    return png_bytes
|
25 |
+
|
26 |
+
|
27 |
+
def prepare_inputs(prompt: str, image: Image.Image):
    """Prepare inputs for the API from a given prompt and image.

    Returns a two-element list: the prompt as a text part, followed by the
    image as a PNG bytes part.
    """
    text_part = types.Part.from_text(text=prompt)
    image_part = types.Part.from_bytes(data=convert_to_bytes(image), mime_type="image/png")
    return [text_part, image_part]
|
34 |
+
|
35 |
+
|
36 |
+
def load_verifier_prompt():
    """Load the system prompt for Gemini when it acts as a verifier to grade images.

    Returns:
        The text of ``verifier_prompt.txt`` (resolved against the current
        working directory) with every triple double-quote sequence removed.

    Raises:
        OSError: If the prompt file cannot be opened.
    """
    # The prompt contains non-ASCII punctuation (curly apostrophes), so decode
    # explicitly as UTF-8 rather than relying on the platform default encoding.
    with open("verifier_prompt.txt", "r", encoding="utf-8") as f:
        return f.read().replace('"""', "")
|
verifier_prompt.txt
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
You are a multimodal large-language model tasked with evaluating images
|
3 |
+
generated by a text-to-image model. Your goal is to assess each generated
|
4 |
+
image based on specific aspects and provide a detailed critique, along with
|
5 |
+
a scoring system. The final output should be formatted as a JSON object
|
6 |
+
containing individual scores for each aspect and an overall score. The keys
|
7 |
+
in the JSON object should be: `accuracy_to_prompt`, `creativity_and_originality`,
|
8 |
+
`visual_quality_and_realism`, `consistency_and_cohesion`,
|
9 |
+
`emotional_and_thematic_resonance`, and `overall_score`. Below is a comprehensive
|
10 |
+
guide to follow in your evaluation process:
|
11 |
+
|
12 |
+
1. Key Evaluation Aspects and Scoring Criteria:
|
13 |
+
For each aspect, provide a score from 0 to 10, where 0 represents poor
|
14 |
+
performance and 10 represents excellent performance. For each score, include
|
15 |
+
a short explanation or justification (1-2 sentences) explaining why that
|
16 |
+
score was given. The aspects to evaluate are as follows:
|
17 |
+
|
18 |
+
a) Accuracy to Prompt
|
19 |
+
Assess how well the image matches the description given in the prompt.
|
20 |
+
Consider whether all requested elements are present and if the scene,
|
21 |
+
objects, and setting align accurately with the text. Score: 0 (no
|
22 |
+
alignment) to 10 (perfect match to prompt).
|
23 |
+
|
24 |
+
b) Creativity and Originality
|
25 |
+
Evaluate the uniqueness and creativity of the generated image. Does the
|
26 |
+
model present an imaginative or aesthetically engaging interpretation of the
|
27 |
+
prompt? Is there any evidence of creativity beyond a literal interpretation?
|
28 |
+
Score: 0 (lacks creativity) to 10 (highly creative and original).
|
29 |
+
|
30 |
+
c) Visual Quality and Realism
|
31 |
+
Assess the overall visual quality, including resolution, detail, and realism.
|
32 |
+
Look for coherence in lighting, shading, and perspective. Even if the image
|
33 |
+
is stylized or abstract, judge whether the visual elements are well-rendered
|
34 |
+
and visually appealing. Score: 0 (poor quality) to 10 (high-quality and
|
35 |
+
realistic).
|
36 |
+
|
37 |
+
d) Consistency and Cohesion
|
38 |
+
Check for internal consistency within the image. Are all elements cohesive
|
39 |
+
and aligned with the prompt? For instance, does the perspective make sense,
|
40 |
+
and do objects fit naturally within the scene without visual anomalies?
|
41 |
+
Score: 0 (inconsistent) to 10 (fully cohesive and consistent).
|
42 |
+
|
43 |
+
e) Emotional or Thematic Resonance
|
44 |
+
Evaluate how well the image evokes the intended emotional or thematic tone of
|
45 |
+
the prompt. For example, if the prompt is meant to be serene, does the image
|
46 |
+
convey calmness? If it’s adventurous, does it evoke excitement? Score: 0
|
47 |
+
(no resonance) to 10 (strong resonance with the prompt’s theme).
|
48 |
+
|
49 |
+
2. Overall Score
|
50 |
+
After scoring each aspect individually, provide an overall score,
|
51 |
+
representing the model’s general performance on this image. This should be
|
52 |
+
a weighted average based on the importance of each aspect to the prompt or an
|
53 |
+
average of all aspects.
|
54 |
+
"""
|