Merge pull request #2 from DeltaZN/feat/improve-image-generation
- src/agent/image_agent.py +82 -0
- src/agent/llm.py +9 -0
- src/agent/llm_agent.py +55 -43
- src/agent/llm_graph.py +12 -5
- src/agent/music_agent.py +47 -0
- src/agent/tools.py +17 -14
- src/audio/audio_generator.py +5 -5
- src/config.py +4 -1
- src/css.py +6 -7
- src/game_constructor.py +9 -2
- src/game_setting.py +13 -0
- src/game_state.py +3 -3
- src/images/image_generator.py +55 -27
- src/main.py +6 -1
src/agent/image_agent.py
ADDED
@@ -0,0 +1,82 @@
+from pydantic import BaseModel, Field
+from typing import Literal, Optional
+from agent.llm import create_light_llm
+from langchain_core.messages import SystemMessage, HumanMessage
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+IMAGE_GENERATION_SYSTEM_PROMPT = """You are an AI agent for a visual novel game. Your role is to process an incoming scene description and determine if the visual scene needs to change. If it does, you will generate a new `scene_description`. This `scene_description` MUST BE a highly detailed image prompt, specifically engineered for an AI image generation model, and it MUST adhere to the strict first-person perspective detailed below.
+
+**Your Core Tasks & Output Structure:**
+Your output MUST be a `ChangeScene` object. You need to:
+1. **Determine Change Type:** Decide if the scene requires a "change_completely", "modify", or "no_change" and set this in the `change_scene` field of the output object.
+2. **Generate FPS Image Prompt:** If your decision is "change_completely" or "modify", you MUST then generate the image prompt and place it in the `scene_description` field of the output object. If "no_change", this field can be null or empty.
+
+**Mandatory: First-Person Perspective (FPS) for Image Prompts**
+The image prompt you generate for the `scene_description` field MUST strictly describe the scene from a first-person perspective (FPS), as if the player is looking directly through the character's eyes.
+* **Viewpoint:** All descriptions must be from the character's eye level, looking forward or as indicated by the scene.
+* **Character Visibility:** The scene must be depicted strictly as if looking through the character's eyes. NO part of the character's own body (e.g., hands, arms, feet, clothing on them) should be visible or described in the prompt. The view is purely what is external to the character.
+* **Immersion:** Focus on what the character directly sees and perceives in their immediate environment. Use phrasing that reflects this, for example: "I see...", "Before me lies...", "Looking through the grimy window...", "The corridor stretches out in front of me."
+
+**Guidelines for Crafting the FPS Image Prompt (for `scene_description` field):**
+When generating the image prompt, ensure it's detailed and considers the following aspects, all from the character's first-person viewpoint:
+
+1. **Subject & Focus (as seen by the character):**
+    * What is the primary subject or point of interest directly in the character's view?
+    * Describe any other characters visible to the POV character: their appearance (from the character's perspective), clothing, expressions, posture, and actions.
+    * Detail key objects, items, or environmental elements the character is interacting with or observing.
+
+2. **Setting & Environment (from the character's perspective):**
+    * Describe the immediate surroundings as the character would see them.
+    * Time of day and weather conditions as perceived by the character.
+    * Specific architectural or natural features visible in the character's field of view.
+
+3. **Art Style & Medium:**
+    * Specify the desired visual style (e.g., photorealistic, anime, manga, watercolor, oil painting, pixel art, 3D render, concept art, comic book).
+    * Mention any specific artist influences if relevant (e.g., "in the style of Studio Ghibli").
+
+4. **Composition & Framing (from the character's viewpoint):**
+    * How is the scene framed from the character's eyes? (e.g., "looking straight ahead at a door," "view through a sniper scope," "gazing up at a tall tower").
+    * Describe the arrangement of elements as perceived by the character. Avoid terms like "medium shot" or "wide shot" unless they can be rephrased from an FPS view (e.g., "a wide vista opens up before me").
+
+5. **Lighting & Atmosphere (as perceived by the character):**
+    * Describe lighting conditions (e.g., "bright sunlight streams through the window in front of me," "only the dim glow of my flashlight illuminates the passage ahead," "neon signs reflect off the wet street I'm looking at").
+    * What is the overall mood or atmosphere from the character's perspective? (e.g., "a tense silence hangs in the air as I look down the dark hallway," "a sense of peace as I gaze at the sunset over the mountains").
+
+6. **Color Palette:**
+    * Specify dominant colors or a color scheme relevant to what the character sees.
+
+7. **Details & Keywords:**
+    * Include crucial details from the input scene description that the character would notice.
+    * Use descriptive adjectives and strong keywords.
+
+**Example for the `scene_description` field (the FPS image prompt):**
+"FPS view. Through the cockpit window of a futuristic hovercar, a sprawling neon-lit cyberpunk city stretches out under a stormy, rain-lashed sky. Rain streaks across the glass. The hum of the engine is palpable. Photorealistic, Blade Runner style. Cool blue and vibrant pink neon palette."
+"""
+
+
+class ChangeScene(BaseModel):
+    change_scene: Literal["change_completely", "modify", "no_change"] = Field(
+        description="Whether the scene should be completely changed, just modified or not changed at all"
+    )
+    scene_description: Optional[str] = None
+
+
+image_prompt_generator_llm = create_light_llm(0.1).with_structured_output(ChangeScene)
+
+async def generate_image_prompt(scene_description: str, request_id: str) -> ChangeScene:
+    """
+    Generates a detailed image prompt string based on a scene description.
+    This prompt is intended for use with an AI image generation model.
+    """
+    logger.info(f"Generating image prompt for the current scene: {request_id}")
+    response = await image_prompt_generator_llm.ainvoke(
+        [
+            SystemMessage(content=IMAGE_GENERATION_SYSTEM_PROMPT),
+            HumanMessage(content=scene_description),
+        ]
+    )
+    logger.info(f"Image prompt generated: {request_id}")
+    return response
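Review note: the agent returns the structured ChangeScene object directly, so callers can branch on change_scene without any parsing. A minimal usage sketch (the scene text and request id are illustrative, not from this PR):

import asyncio
from agent.image_agent import generate_image_prompt

async def demo():
    change = await generate_image_prompt(
        "I push open the tavern door and step into a rain-soaked medieval street.",
        request_id="demo-1",  # illustrative id; production code passes a uuid or user hash
    )
    # change.change_scene is "change_completely", "modify" or "no_change";
    # change.scene_description carries the FPS image prompt when a change is needed.
    print(change.change_scene, change.scene_description)

asyncio.run(demo())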
src/agent/llm.py
CHANGED
@@ -43,6 +43,15 @@ def create_llm(
         top_p=top_p,
         thinking_budget=1024,
     )
+
+
+def create_light_llm(temperature: float = settings.temperature, top_p: float = settings.top_p):
+    return ChatGoogleGenerativeAI(
+        model="gemini-2.0-flash",
+        google_api_key=_get_api_key(),
+        temperature=temperature,
+        top_p=top_p
+    )
 
 
 def create_precise_llm() -> ChatGoogleGenerativeAI:
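Unlike create_llm, the light factory targets gemini-2.0-flash and omits the thinking_budget, which keeps latency and cost down for the auxiliary agents. A sketch of the intended pairing (the Verdict schema is illustrative, not part of this PR):

from pydantic import BaseModel
from agent.llm import create_light_llm

class Verdict(BaseModel):  # illustrative schema
    ok: bool

router = create_light_llm(temperature=0.0).with_structured_output(Verdict)
# router.ainvoke(...) now returns a parsed Verdict instead of raw text.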
src/agent/llm_agent.py
CHANGED
@@ -1,61 +1,73 @@
-"""Simple interface for querying the LLM directly."""
-
-import logging
-from typing import List, Optional
-
-from pydantic import BaseModel, Field
-
 from agent.llm import create_llm
+from pydantic import BaseModel, Field
+from typing import List
+import logging
+from agent.image_agent import ChangeScene
+import asyncio
+from agent.music_agent import generate_music_prompt
+from agent.image_agent import generate_image_prompt
+import uuid
 
 logger = logging.getLogger(__name__)
 
 
-class ChangeScene(BaseModel):
-    """Information about a scene change."""
-
-    change_scene: bool = Field(description="Whether the scene should change")
-    scene_description: Optional[str] = None
-
-
-class ChangeMusic(BaseModel):
-    """Information about a music change."""
-
-    change_music: bool = Field(description="Whether the music should change")
-    music_description: Optional[str] = None
-
-
 class PlayerOption(BaseModel):
-    """Single option for the player."""
-
     option_description: str = Field(
-        description=
-        "Description of the option, e.g. '[Say] Hello!' "
-        "or 'Go to the forest'"
-    )
+        description="The description of the option, Examples: [Change location] Go to the forest; [Say] Hello!"
     )
 
 
 class LLMOutput(BaseModel):
-    """Expected structure returned by the LLM."""
-
-    change_scene: ChangeScene
-    change_music: ChangeMusic
     game_message: str = Field(
-        description=
-        "Message shown to the player, e.g. 'You entered the forest...'"
-    )
+        description="The message to the player, Example: You entered the forest, and you see unknown scary creatures. What do you do?"
     )
     player_options: List[PlayerOption] = Field(
-        description="
+        description="The list of up to 3 options for the player to choose from."
     )
 
 
+class MultiAgentResponse(BaseModel):
+    game_message: str = Field(
+        description="The message to the player, Example: You entered the forest, and you see unknown scary creatures. What do you do?"
+    )
+    player_options: List[PlayerOption] = Field(
+        description="The list of up to 3 options for the player to choose from."
+    )
+    music_prompt: str = Field(description="The prompt for the music generation model.")
+    change_scene: ChangeScene = Field(description="The change to the scene.")
+
+llm = create_llm().with_structured_output(MultiAgentResponse)
+
+
+async def process_user_input(input: str) -> MultiAgentResponse:
+    """
+    Process user input and update the state.
+    """
+    request_id = str(uuid.uuid4())
+    logger.info(f"LLM input received: {request_id}")
+
+    response: LLMOutput = await llm.ainvoke(input)
+
+    # return response
+    current_state = f"""{input}
+
+Game reaction: {response.game_message}
+Player options: {response.player_options}
+"""
+
+    music_prompt_task = generate_music_prompt(current_state, request_id)
+
+    change_scene_task = generate_image_prompt(current_state, request_id)
+
+    music_prompt, change_scene = await asyncio.gather(music_prompt_task, change_scene_task)
+
+    multi_agent_response = MultiAgentResponse(
+        game_message=response.game_message,
+        player_options=response.player_options,
+        music_prompt=music_prompt,
+        change_scene=change_scene,
+    )
+
+    logger.info(f"LLM responded: {request_id}")
 
-    """Send user text to the LLM and return the parsed response."""
-    logger.info("User choice: %s", text)
-    response: LLMOutput = await _llm.ainvoke(text)
-    logger.info("LLM response: %s", response)
-    return response
+    return multi_agent_response
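Review notes: the annotation `response: LLMOutput` disagrees with the structured-output schema bound to `llm` (MultiAgentResponse); one of the two is presumably stale. The stray `# return response` also looks like leftover debugging. The fan-out itself is the point of the change: both sub-agents consume the same composed state and run concurrently. A hypothetical call site:

import asyncio
from agent.llm_agent import process_user_input

async def demo():
    response = await process_user_input("[Say] Who goes there?")  # input text illustrative
    print(response.game_message)
    print([o.option_description for o in response.player_options])
    print(response.music_prompt)
    print(response.change_scene.change_scene)

asyncio.run(demo())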
src/agent/llm_graph.py
CHANGED
@@ -3,8 +3,9 @@
 import logging
 from dataclasses import dataclass
 from typing import Any, Dict, Optional
-
+import asyncio
 from langgraph.graph import END, StateGraph
+from agent.image_agent import generate_image_prompt
 
 from agent.tools import (
     check_ending,
@@ -14,7 +15,7 @@ from agent.tools import (
     update_state_with_choice,
 )
 from agent.state import get_user_state
-
+from audio.audio_generator import change_music_tone
 logger = logging.getLogger(__name__)
 
 
@@ -59,11 +60,13 @@ async def node_init_game(state: GraphState) -> GraphState:
     first_scene = await generate_scene.ainvoke(
         {"user_hash": state.user_hash, "last_choice": "start"}
     )
+    change_scene = await generate_image_prompt(first_scene["description"], state.user_hash)
+    logger.info(f"Change scene: {change_scene}")
     await generate_scene_image.ainvoke(
         {
             "user_hash": state.user_hash,
             "scene_id": first_scene["scene_id"],
-            "
+            "change_scene": change_scene,
         }
     )
     state.scene = first_scene
@@ -91,13 +94,17 @@ async def node_player_step(state: GraphState) -> GraphState:
             "last_choice": state.choice_text,
         }
     )
-    await generate_scene_image.ainvoke(
+    change_scene = await generate_image_prompt(next_scene["description"], state.user_hash)
+    image_task = generate_scene_image.ainvoke(
         {
             "user_hash": state.user_hash,
             "scene_id": next_scene["scene_id"],
-            "
+            "current_image": user_state.assets[scene_id],
+            "change_scene": change_scene,
        }
     )
+    music_task = change_music_tone(state.user_hash, next_scene["music"])
+    await asyncio.gather(image_task, music_task)
     state.scene = next_scene
     return state
 
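Review note: the hunk references user_state and scene_id, which are presumably defined earlier in node_player_step and not shown in this diff. The ordering matters here: generate_image_prompt must complete before the image task can be built, but the image and music updates are independent, so they are awaited together. Restated as a sketch (payload as in the hunk above):

change_scene = await generate_image_prompt(next_scene["description"], state.user_hash)
image_task = generate_scene_image.ainvoke({
    "user_hash": state.user_hash,
    "scene_id": next_scene["scene_id"],
    "current_image": user_state.assets[scene_id],
    "change_scene": change_scene,
})
music_task = change_music_tone(state.user_hash, next_scene["music"])
await asyncio.gather(image_task, music_task)  # latency is the max of the two, not the sum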
src/agent/music_agent.py
ADDED
@@ -0,0 +1,47 @@
+from pydantic import BaseModel
+from agent.llm import create_light_llm
+from langchain_core.messages import SystemMessage, HumanMessage
+import logging
+
+logger = logging.getLogger(__name__)
+
+music_options = """Instruments: 303 Acid Bass, 808 Hip Hop Beat, Accordion, Alto Saxophone, Bagpipes, Balalaika Ensemble, Banjo, Bass Clarinet, Bongos, Boomy Bass, Bouzouki, Buchla Synths, Cello, Charango, Clavichord, Conga Drums, Didgeridoo, Dirty Synths, Djembe, Drumline, Dulcimer, Fiddle, Flamenco Guitar, Funk Drums, Glockenspiel, Guitar, Hang Drum, Harmonica, Harp, Harpsichord, Hurdy-gurdy, Kalimba, Koto, Lyre, Mandolin, Maracas, Marimba, Mbira, Mellotron, Metallic Twang, Moog Oscillations, Ocarina, Persian Tar, Pipa, Precision Bass, Ragtime Piano, Rhodes Piano, Shamisen, Shredding Guitar, Sitar, Slide Guitar, Smooth Pianos, Spacey Synths, Steel Drum, Synth Pads, Tabla, TR-909 Drum Machine, Trumpet, Tuba, Vibraphone, Viola Ensemble, Warm Acoustic Guitar, Woodwinds, ...
+Music Genre: Acid Jazz, Afrobeat, Alternative Country, Baroque, Bengal Baul, Bhangra, Bluegrass, Blues Rock, Bossa Nova, Breakbeat, Celtic Folk, Chillout, Chiptune, Classic Rock, Contemporary R&B, Cumbia, Deep House, Disco Funk, Drum & Bass, Dubstep, EDM, Electro Swing, Funk Metal, G-funk, Garage Rock, Glitch Hop, Grime, Hyperpop, Indian Classical, Indie Electronic, Indie Folk, Indie Pop, Irish Folk, Jam Band, Jamaican Dub, Jazz Fusion, Latin Jazz, Lo-Fi Hip Hop, Marching Band, Merengue, New Jack Swing, Minimal Techno, Moombahton, Neo-Soul, Orchestral Score, Piano Ballad, Polka, Post-Punk, 60s Psychedelic Rock, Psytrance, R&B, Reggae, Reggaeton, Renaissance Music, Salsa, Shoegaze, Ska, Surf Rock, Synthpop, Techno, Trance, Trap Beat, Trip Hop, Vaporwave, Witch house, ...
+Mood/Description: Acoustic Instruments, Ambient, Bright Tones, Chill, Crunchy Distortion, Danceable, Dreamy, Echo, Emotional, Ethereal Ambience, Experimental, Fat Beats, Funky, Glitchy Effects, Huge Drop, Live Performance, Lo-fi, Ominous Drone, Psychedelic, Rich Orchestration, Saturated Tones, Subdued Melody, Sustained Chords, Swirling Phasers, Tight Groove, Unsettling, Upbeat, Virtuoso, Weird Noises, ...
+"""
+system_prompt = f"""
+You are a music agent responsible for generating appropriate music tones for scenes in a visual novel game.
+
+Your task is to analyze the current scene description and generate a detailed music prompt that captures:
+1. The emotional atmosphere
+2. The intensity level
+3. The genre/style that best fits the scene
+4. Specific instruments that would enhance the mood
+
+You have access to a wide range of musical elements including:
+{music_options}
+
+When generating a music prompt:
+- Consider the scene's context, mood, and any suspense elements
+- Choose instruments that complement the scene's atmosphere
+- Select a genre that matches the story's setting and tone
+- Include specific mood descriptors to guide the music generation
+
+Your output should be a concise but detailed prompt that the music generation model can use to create an appropriate soundtrack for the scene.
+"""
+
+
+class MusicPrompt(BaseModel):
+    prompt: str
+
+
+llm = create_light_llm(0.1).with_structured_output(MusicPrompt)
+
+
+async def generate_music_prompt(scene_description: str, request_id: str) -> str:
+    logger.info(f"Generating music prompt for the current scene: {request_id}")
+    response = await llm.ainvoke(
+        [SystemMessage(content=system_prompt), HumanMessage(content=scene_description)]
+    )
+    logger.info(f"Music prompt generated: {request_id}")
+    return response.prompt
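A usage sketch (scene text illustrative); the returned string is what change_music_tone later feeds to the Lyria session as a weighted prompt:

import asyncio
from agent.music_agent import generate_music_prompt

async def demo():
    prompt = await generate_music_prompt(
        "A tense standoff in a rain-soaked cyberpunk alley.", "demo-req"
    )
    print(prompt)  # e.g. "Minimal Techno, Ominous Drone, Unsettling" (model-dependent)

asyncio.run(demo())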
src/agent/tools.py
CHANGED
@@ -18,7 +18,8 @@ from agent.models import (
 )
 from agent.prompts import ENDING_CHECK_PROMPT, SCENE_PROMPT, STORY_FRAME_PROMPT
 from agent.state import get_user_state, set_user_state
-from images.image_generator import generate_image
+from images.image_generator import modify_image, generate_image
+from agent.image_agent import ChangeScene
 
 logger = logging.getLogger(__name__)
 
@@ -71,11 +72,9 @@ async def generate_scene(
     prompt = SCENE_PROMPT.format(
         lore=state.story_frame.lore,
         goal=state.story_frame.goal,
-        milestones=
-        endings=
-        history=
-            f"{c.scene_id}:{c.choice_text}" for c in state.user_choices
-        ),
+        milestones=",".join(m.id for m in state.story_frame.milestones),
+        endings=",".join(e.id for e in state.story_frame.endings),
+        history="; ".join(f"{c.scene_id}:{c.choice_text}" for c in state.user_choices),
         last_choice=last_choice,
     )
     resp: SceneLLM = await llm.ainvoke(prompt)
@@ -107,11 +106,19 @@
 async def generate_scene_image(
     user_hash: Annotated[str, "User session ID"],
     scene_id: Annotated[str, "Scene ID"],
-
+    change_scene: Annotated[ChangeScene, "Prompt for image generation"],
+    current_image: Annotated[str, "Current image"] | None = None,
 ) -> Annotated[str, "Path to generated image"]:
     """Generate an image for a scene and save the path in the state."""
     try:
-        image_path
+        image_path = current_image
+        if change_scene.change_scene == "change_completely" or change_scene.change_scene == "modify":
+            image_path, _ = await (
+                generate_image(change_scene.scene_description)
+                if current_image is None
+                # for now always modify the image to avoid generating an update in a completely wrong style
+                else modify_image(current_image, change_scene.scene_description)
+            )
         state = get_user_state(user_hash)
         if scene_id in state.scenes:
             state.scenes[scene_id].image = image_path
@@ -152,14 +159,10 @@ async def check_ending(
     if not state.story_frame:
         return _err("No story frame")
     llm = create_llm().with_structured_output(EndingCheckResult)
-    history =
-        f"{c.scene_id}:{c.choice_text}" for c in state.user_choices
-    )
+    history = "; ".join(f"{c.scene_id}:{c.choice_text}" for c in state.user_choices)
     prompt = ENDING_CHECK_PROMPT.format(
         history=history,
-        endings=
-        f"{e.id}:{e.condition}" for e in state.story_frame.endings
-        ),
+        endings=",".join(f"{e.id}:{e.condition}" for e in state.story_frame.endings),
     )
     resp: EndingCheckResult = await llm.ainvoke(prompt)
     if resp.ending_reached and resp.ending:
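Review note: when change_scene is "no_change", image_path stays equal to current_image, so callers simply get the old asset back. The intent of the branch, restated as a standalone sketch (assumes the same generate_image/modify_image helpers):

async def pick_image(change_scene, current_image):
    # no_change -> keep the current asset
    if change_scene.change_scene == "no_change":
        return current_image
    if current_image is None:
        path, _ = await generate_image(change_scene.scene_description)
    else:
        # modifying the existing image keeps the art style consistent
        path, _ = await modify_image(current_image, change_scene.scene_description)
    return path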
src/audio/audio_generator.py
CHANGED
@@ -13,10 +13,12 @@ logger = logging.getLogger(__name__)
 client = genai.Client(api_key=settings.gemini_api_key.get_secret_value(), http_options={'api_version': 'v1alpha'})
 
 async def generate_music(user_hash: str, music_tone: str, receive_audio):
-
+    if user_hash in sessions:
+        return
+    async with (
         client.aio.live.music.connect(model='models/lyria-realtime-exp') as session,
         asyncio.TaskGroup() as tg,
-
+    ):
         # Set up task to receive server messages.
         tg.create_task(receive_audio(session, user_hash))
 
@@ -31,10 +33,9 @@ async def generate_music(user_hash: str, music_tone: str, receive_audio):
         )
         await session.play()
         logger.info(f"Started music generation for user hash {user_hash}, music tone: {music_tone}")
-        await cleanup_music_session(user_hash)
         sessions[user_hash] = {
             'session': session,
-            'queue': queue.Queue(
+            'queue': queue.Queue()
         }
 
 async def change_music_tone(user_hash: str, new_tone):
@@ -43,7 +44,6 @@ async def change_music_tone(user_hash: str, new_tone):
     if not session:
         logger.error(f"No session found for user hash {user_hash}")
         return
-    await session.reset_context()
     await session.set_weighted_prompts(
         prompts=[types.WeightedPrompt(text=new_tone, weight=1.0)]
     )
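Two behavioral changes worth noting: the early return makes generate_music idempotent per user (a second call no longer opens a second Lyria session), and dropping session.reset_context() means a new tone is blended in via prompt re-weighting rather than restarting the stream. A hypothetical driver (user hash illustrative):

import asyncio
from audio.audio_generator import generate_music, change_music_tone

async def demo(receive_audio):
    # start one session per user; a repeated call is now a no-op
    await generate_music("user-123", "neutral", receive_audio)
    # later: steer the running stream by re-weighting prompts, not resetting it
    await change_music_tone("user-123", "Ominous Drone, Minimal Techno, Unsettling")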
src/config.py
CHANGED
@@ -24,8 +24,11 @@ class BaseAppSettings(BaseSettings):
 
 class AppSettings(BaseAppSettings):
     gemini_api_key: SecretStr
+    gemini_api_keys: SecretStr
+    # assistant_api_key: SecretStr
     top_p: float = 0.95
     temperature: float = 0.5
-
+    pregenerate_next_scene: bool = True
+
 
 settings = AppSettings()
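Since these are pydantic-settings fields, both new entries load from the environment, and gemini_api_keys has no default, so it becomes required at startup. A sketch of how the values resolve (placeholder values, field names map to env vars case-insensitively):

import os
os.environ["GEMINI_API_KEY"] = "placeholder"    # maps to gemini_api_key
os.environ["GEMINI_API_KEYS"] = "key-1,key-2"   # maps to gemini_api_keys

from config import settings
print(settings.pregenerate_next_scene)  # True unless PREGENERATE_NEXT_SCENE overrides it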
src/css.py
CHANGED
@@ -33,11 +33,11 @@ custom_css = """
     background: rgba(0,0,0,0.7) !important;
     border: none !important;
     color: white !important;
-    font-size:
+    font-size: 15px !important;
     line-height: 1.5 !important;
-    padding:
+    padding: 10px !important;
     border-radius: 10px !important;
-    margin-bottom:
+    margin-bottom: 10px !important;
 }
 
 img {
@@ -49,7 +49,7 @@ img {
     border: none !important;
     color: white !important;
     -webkit-text-fill-color: white !important;
-    font-size:
+    font-size: 15px !important;
     resize: none !important;
 }
 
@@ -57,13 +57,12 @@
 .choice-buttons {
     background: rgba(0,0,0,0.7) !important;
     border-radius: 10px !important;
-    padding:
+    padding: 10px !important;
 }
 
 .choice-buttons label {
     color: white !important;
-    font-size:
-    margin-bottom: 10px !important;
+    font-size: 14px !important;
 }
 
 /* Fix radio button backgrounds */
src/game_constructor.py
CHANGED
@@ -1,10 +1,16 @@
 import gradio as gr
 import json
 import uuid
+from game_setting import Character, GameSetting, get_user_story
+from game_state import story, state, get_current_scene
+from agent.llm_agent import process_user_input
+from images.image_generator import generate_image
 from game_setting import Character, GameSetting
 from agent.runner import process_step
 from audio.audio_generator import start_music_generation
 import asyncio
+from config import settings
+
 
 # Predefined suggestions for demo
 SETTING_SUGGESTIONS = [
@@ -105,6 +111,7 @@ def save_game_config(
     except Exception as e:
         return f"❌ Error saving configuration: {str(e)}"
 
+
 async def start_game_with_settings(
     user_hash: str,
     setting_desc: str,
@@ -139,6 +146,8 @@ async def start_game_with_settings(
     )
 
     game_setting = GameSetting(character=character, setting=setting_desc, genre=genre)
+
+    asyncio.create_task(start_music_generation(user_hash, "neutral"))
 
     # Start the LLM graph to initialize the story
     result = await process_step(
@@ -149,8 +158,6 @@ async def start_game_with_settings(
         genre=game_setting.genre,
     )
 
-    asyncio.create_task(start_music_generation(user_hash, "neutral"))
-
     scene = result["scene"]
     scene_text = scene["description"]
     scene_image = scene.get("image", "")
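Review notes: the widened import on the first added line makes the pre-existing `from game_setting import Character, GameSetting` context line redundant, and the music task now starts before the slow story-initialization step rather than after it, so the two overlap. Sketch of the reordering (other process_step arguments as in the diff):

music_task = asyncio.create_task(start_music_generation(user_hash, "neutral"))
result = await process_step(      # story init runs while the music session opens
    user_hash=user_hash,          # remaining keyword arguments as in the diff above
    genre=game_setting.genre,
)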
src/game_setting.py
CHANGED
@@ -1,12 +1,25 @@
 from pydantic import BaseModel
 
+
 class Character(BaseModel):
     name: str
     age: str
     background: str
     personality: str
 
+
 class GameSetting(BaseModel):
     character: Character
     setting: str
     genre: str
+
+
+def get_user_story(
+    scene_description: str, scene_image_description: str, user_choice: str
+) -> str:
+    return f"""Current scene description:
+{scene_description}
+Current scene image description: {scene_image_description}
+
+User's choice: {user_choice}
+"""
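get_user_story just composes the prompt context handed to the agents; for example (inputs illustrative):

from game_setting import get_user_story

text = get_user_story(
    scene_description="A mysterious forest at dusk.",
    scene_image_description="forest in the fog",
    user_choice="Explore",
)
# -> "Current scene description:\nA mysterious forest at dusk.\nCurrent scene image description: forest in the fog\n\nUser's choice: Explore\n"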
src/game_state.py
CHANGED
@@ -1,10 +1,10 @@
-
 story = {
     "start": {
         "text": "You wake up in a mysterious forest. What do you do?",
         "image": "forest.jpg",
-        "choices":
+        "choices": {"Explore": None, "Wait": None},
         "music_tone": "neutral",
+        "img_description": "forest in the fog",
     },
 }
 
@@ -12,4 +12,4 @@ state = {"scene": "start"}
 
 def get_current_scene():
     scene = story[state["scene"]]
-    return scene["text"], scene["image"], scene["choices"]
+    return scene["text"], scene["image"], scene["choices"].keys()
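choices is now a mapping (label to payload, currently None) rather than a flat list, and get_current_scene exposes only the labels. A minimal sketch of the consumer side:

from game_state import get_current_scene

text, image, choices = get_current_scene()
print(list(choices))  # ['Explore', 'Wait'] — a dict_keys view, so wrap in list() for indexing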
src/images/image_generator.py
CHANGED
@@ -6,25 +6,47 @@ from io import BytesIO
 from datetime import datetime
 from config import settings
 import logging
+import asyncio
+import gradio as gr
 
 logger = logging.getLogger(__name__)
 
 client = genai.Client(api_key=settings.gemini_api_key.get_secret_value()).aio
 
+safety_settings = [
+    types.SafetySetting(
+        category="HARM_CATEGORY_HARASSMENT",
+        threshold="BLOCK_NONE",  # Block none
+    ),
+    types.SafetySetting(
+        category="HARM_CATEGORY_HATE_SPEECH",
+        threshold="BLOCK_NONE",  # Block none
+    ),
+    types.SafetySetting(
+        category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
+        threshold="BLOCK_NONE",  # Block none
+    ),
+    types.SafetySetting(
+        category="HARM_CATEGORY_DANGEROUS_CONTENT",
+        threshold="BLOCK_NONE",  # Block none
+    ),
+]
+
+
 async def generate_image(prompt: str) -> tuple[str, str] | None:
     """
     Generate an image using Google's Gemini model and save it to generated/images directory.
 
     Args:
         prompt (str): The text prompt to generate the image from
 
     Returns:
         str: Path to the generated image file, or None if generation failed
     """
     # Ensure the generated/images directory exists
     output_dir = "generated/images"
     os.makedirs(output_dir, exist_ok=True)
 
     logger.info(f"Generating image with prompt: {prompt}")
 
     try:
@@ -32,8 +54,9 @@ async def generate_image(prompt: str) -> tuple[str, str] | None:
             model="gemini-2.0-flash-preview-image-generation",
             contents=prompt,
             config=types.GenerateContentConfig(
-                response_modalities=[
+                response_modalities=["TEXT", "IMAGE"],
+                safety_settings=safety_settings,
+            ),
         )
 
         # Process the response parts
@@ -44,19 +67,20 @@ async def generate_image(prompt: str) -> tuple[str, str] | None:
                     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                     filename = f"gemini_{timestamp}.png"
                     filepath = os.path.join(output_dir, filename)
 
                     # Save the image
                     image = Image.open(BytesIO(part.inline_data.data))
-                    image.save
+                    await asyncio.to_thread(image.save, filepath, "PNG")
                     logger.info(f"Image saved to: {filepath}")
                     image_saved = True
 
-                    return filepath,
+                    return filepath, prompt
 
         if not image_saved:
+            gr.Warning("Image was censored by Google!")
             logger.error("No image was generated in the response.")
             return None, None
 
     except Exception as e:
         logger.error(f"Error generating image: {e}")
         return None, None
@@ -65,38 +89,41 @@
 async def modify_image(image_path: str, modification_prompt: str) -> str | None:
     """
     Modify an existing image using Google's Gemini model based on a text prompt.
 
     Args:
         image_path (str): Path to the existing image file
         modification_prompt (str): The text prompt describing how to modify the image
 
     Returns:
         str: Path to the modified image file, or None if modification failed
     """
     # Ensure the generated/images directory exists
     output_dir = "generated/images"
     os.makedirs(output_dir, exist_ok=True)
 
+    logger.info(f"Modifying current scene image with prompt: {modification_prompt}")
+
     # Check if the input image exists
     if not os.path.exists(image_path):
         logger.error(f"Error: Image file not found at {image_path}")
         return None
 
     key = settings.gemini_api_key.get_secret_value()
 
     client = genai.Client(api_key=key).aio
 
     try:
         # Load the input image
         input_image = Image.open(image_path)
 
         # Make the API call with both text and image
         response = await client.models.generate_content(
             model="gemini-2.0-flash-preview-image-generation",
             contents=[modification_prompt, input_image],
             config=types.GenerateContentConfig(
-                response_modalities=[
+                response_modalities=["TEXT", "IMAGE"],
+                safety_settings=safety_settings,
+            ),
         )
 
         # Process the response parts
@@ -107,19 +134,20 @@ async def modify_image(image_path: str, modification_prompt: str) -> str | None:
                     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                     filename = f"gemini_modified_{timestamp}.png"
                     filepath = os.path.join(output_dir, filename)
 
                     # Save the modified image
                     modified_image = Image.open(BytesIO(part.inline_data.data))
-                    modified_image.save
+                    await asyncio.to_thread(modified_image.save, filepath, "PNG")
                     logger.info(f"Modified image saved to: {filepath}")
                     image_saved = True
 
-                    return filepath,
+                    return filepath, modification_prompt
 
         if not image_saved:
+            gr.Warning("Updated image was censored by Google!")
             logger.error("No modified image was generated in the response.")
             return None, None
 
     except Exception as e:
         logger.error(f"Error modifying image: {e}")
         return None, None
@@ -129,10 +157,10 @@ if __name__ == "__main__":
     # Example usage
     sample_prompt = "A Luke Skywalker half height sprite with white background for visual novel game"
     generated_image_path = generate_image(sample_prompt)
 
     # if generated_image_path:
     #     # Example modification
     #     modification_prompt = "Now the house is destroyed, and the jawas are running away"
     #     modified_image_path = modify_image(generated_image_path, modification_prompt)
     #     if modified_image_path:
     #         print(f"Successfully modified image: {modified_image_path}")
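Review notes: Image.save is blocking file I/O, so wrapping it in asyncio.to_thread keeps the event loop responsive while images are written to disk. Separately, the `__main__` example calls the now-async generate_image without await or asyncio.run, so it returns a coroutine rather than running (pre-existing, unchanged by this PR). The save pattern in isolation:

import asyncio
from PIL import Image

async def save_png(img: Image.Image, path: str) -> None:
    # Run the blocking save in a worker thread so the event loop stays free.
    await asyncio.to_thread(img.save, path, "PNG")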
src/main.py
CHANGED
@@ -5,6 +5,8 @@ from audio.audio_generator import (
     cleanup_music_session,
 )
 import logging
+from agent.llm_agent import process_user_input
+from images.image_generator import modify_image
 from agent.runner import process_step
 import uuid
 from game_constructor import (
@@ -15,6 +17,9 @@ from game_constructor import (
     load_character_suggestion,
     start_game_with_settings,
 )
+import asyncio
+from game_setting import get_user_story
+from config import settings
 
 logger = logging.getLogger(__name__)
 
@@ -125,7 +130,7 @@ with gr.Blocks(
     # Fullscreen Loading Indicator (hidden by default)
     with gr.Column(visible=False, elem_id="loading-indicator") as loading_indicator:
         gr.HTML("<div class='loading-text'>🚀 Starting your adventure...</div>")
 
     local_storage = gr.BrowserState(str(uuid.uuid4()), "user_hash")
 
     # Constructor Interface (visible by default)