Nikita committed · Commit c4c426e · unverified · 2 parents: ccccaf7 2999669

Merge pull request #2 from DeltaZN/feat/improve-image-generation

src/agent/image_agent.py ADDED
@@ -0,0 +1,82 @@
+ from pydantic import BaseModel, Field
+ from typing import Literal, Optional
+ from agent.llm import create_light_llm
+ from langchain_core.messages import SystemMessage, HumanMessage
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ IMAGE_GENERATION_SYSTEM_PROMPT = """You are an AI agent for a visual novel game. Your role is to process an incoming scene description and determine if the visual scene needs to change. If it does, you will generate a new `scene_description`. This `scene_description` MUST be a highly detailed image prompt, specifically engineered for an AI image generation model, and it MUST adhere to the strict first-person perspective detailed below.
+
+ **Your Core Tasks & Output Structure:**
+ Your output MUST be a `ChangeScene` object. You need to:
+ 1. **Determine Change Type:** Decide whether the scene requires "change_completely", "modify", or "no_change" and set this in the `change_scene` field of the output object.
+ 2. **Generate FPS Image Prompt:** If your decision is "change_completely" or "modify", you MUST then generate the image prompt and place it in the `scene_description` field of the output object. If "no_change", this field can be null or empty.
+
+ **Mandatory: First-Person Perspective (FPS) for Image Prompts**
+ The image prompt you generate for the `scene_description` field MUST strictly describe the scene from a first-person perspective (FPS), as if the player is looking directly through the character's eyes.
+ * **Viewpoint:** All descriptions must be from the character's eye level, looking forward or as indicated by the scene.
+ * **Character Visibility:** The scene must be depicted strictly as if looking through the character's eyes. NO part of the character's own body (e.g., hands, arms, feet, clothing on them) should be visible or described in the prompt. The view is purely what is external to the character.
+ * **Immersion:** Focus on what the character directly sees and perceives in their immediate environment. Use phrasing that reflects this, for example: "I see...", "Before me lies...", "Looking through the grimy window...", "The corridor stretches out in front of me."
+
+ **Guidelines for Crafting the FPS Image Prompt (for `scene_description` field):**
+ When generating the image prompt, ensure it is detailed and considers the following aspects, all from the character's first-person viewpoint:
+
+ 1. **Subject & Focus (as seen by the character):**
+    * What is the primary subject or point of interest directly in the character's view?
+    * Describe any other characters visible to the POV character: their appearance (from the character's perspective), clothing, expressions, posture, and actions.
+    * Detail key objects, items, or environmental elements the character is interacting with or observing.
+
+ 2. **Setting & Environment (from the character's perspective):**
+    * Describe the immediate surroundings as the character would see them.
+    * Time of day and weather conditions as perceived by the character.
+    * Specific architectural or natural features visible in the character's field of view.
+
+ 3. **Art Style & Medium:**
+    * Specify the desired visual style (e.g., photorealistic, anime, manga, watercolor, oil painting, pixel art, 3D render, concept art, comic book).
+    * Mention any specific artist influences if relevant (e.g., "in the style of Studio Ghibli").
+
+ 4. **Composition & Framing (from the character's viewpoint):**
+    * How is the scene framed from the character's eyes? (e.g., "looking straight ahead at a door," "view through a sniper scope," "gazing up at a tall tower").
+    * Describe the arrangement of elements as perceived by the character. Avoid terms like "medium shot" or "wide shot" unless they can be rephrased from an FPS view (e.g., "a wide vista opens up before me").
+
+ 5. **Lighting & Atmosphere (as perceived by the character):**
+    * Describe lighting conditions (e.g., "bright sunlight streams through the window in front of me," "only the dim glow of my flashlight illuminates the passage ahead," "neon signs reflect off the wet street I'm looking at").
+    * What is the overall mood or atmosphere from the character's perspective? (e.g., "a tense silence hangs in the air as I look down the dark hallway," "a sense of peace as I gaze at the sunset over the mountains").
+
+ 6. **Color Palette:**
+    * Specify dominant colors or a color scheme relevant to what the character sees.
+
+ 7. **Details & Keywords:**
+    * Include crucial details from the input scene description that the character would notice.
+    * Use descriptive adjectives and strong keywords.
+
+ **Example for the `scene_description` field (the FPS image prompt):**
+ "FPS view. Through the cockpit window of a futuristic hovercar, a sprawling neon-lit cyberpunk city stretches out under a stormy, rain-lashed sky. Rain streaks across the glass. The hum of the engine is palpable. Photorealistic, Blade Runner style. Cool blue and vibrant pink neon palette."
+ """
+
+
+ class ChangeScene(BaseModel):
+     change_scene: Literal["change_completely", "modify", "no_change"] = Field(
+         description="Whether the scene should be completely changed, just modified, or not changed at all"
+     )
+     scene_description: Optional[str] = None
+
+
+ image_prompt_generator_llm = create_light_llm(0.1).with_structured_output(ChangeScene)
+
+
+ async def generate_image_prompt(scene_description: str, request_id: str) -> ChangeScene:
+     """
+     Decides whether the scene should change and, if so, generates a detailed
+     image prompt for an AI image generation model.
+     """
+     logger.info(f"Generating image prompt for the current scene: {request_id}")
+     response = await image_prompt_generator_llm.ainvoke(
+         [
+             SystemMessage(content=IMAGE_GENERATION_SYSTEM_PROMPT),
+             HumanMessage(content=scene_description),
+         ]
+     )
+     logger.info(f"Image prompt generated: {request_id}")
+     return response
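
A minimal sketch of how this new agent is driven; the scene text and request id are made-up examples, while the names match the module above:

    # Hypothetical driver for generate_image_prompt (illustrative inputs only).
    import asyncio
    from agent.image_agent import generate_image_prompt

    async def demo():
        change = await generate_image_prompt(
            "The player opens the cellar door and descends into darkness.",
            request_id="demo-1",
        )
        # change.change_scene is "change_completely" | "modify" | "no_change";
        # change.scene_description carries the FPS image prompt when a change is needed.
        if change.change_scene != "no_change":
            print(change.scene_description)

    asyncio.run(demo())
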
src/agent/llm.py CHANGED
@@ -43,6 +43,15 @@ def create_llm(
      top_p=top_p,
      thinking_budget=1024,
  )
+
+
+ def create_light_llm(temperature: float = settings.temperature, top_p: float = settings.top_p):
+     return ChatGoogleGenerativeAI(
+         model="gemini-2.0-flash",
+         google_api_key=_get_api_key(),
+         temperature=temperature,
+         top_p=top_p,
+     )
 
 
  def create_precise_llm() -> ChatGoogleGenerativeAI:
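
A sketch of how the new light model is meant to be consumed; the pattern is taken from the agents in this PR, and the `Tone` schema is a stand-in:

    # create_light_llm targets the cheaper gemini-2.0-flash model (no thinking
    # budget); the sub-agents bind it to a Pydantic schema at temperature 0.1
    # to keep structured output nearly deterministic.
    from pydantic import BaseModel
    from agent.llm import create_light_llm

    class Tone(BaseModel):  # stand-in schema for illustration
        prompt: str

    tone_llm = create_light_llm(0.1).with_structured_output(Tone)
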
src/agent/llm_agent.py CHANGED
@@ -1,61 +1,73 @@
- """Simple interface for querying the LLM directly."""
-
- import logging
- from typing import List, Optional
-
- from pydantic import BaseModel, Field
-
  from agent.llm import create_llm
+ from pydantic import BaseModel, Field
+ from typing import List
+ import logging
+ from agent.image_agent import ChangeScene
+ import asyncio
+ from agent.music_agent import generate_music_prompt
+ from agent.image_agent import generate_image_prompt
+ import uuid
 
  logger = logging.getLogger(__name__)
 
 
- class ChangeScene(BaseModel):
-     """Information about a scene change."""
-
-     change_scene: bool = Field(description="Whether the scene should change")
-     scene_description: Optional[str] = None
-
-
- class ChangeMusic(BaseModel):
-     """Information about a music change."""
-
-     change_music: bool = Field(description="Whether the music should change")
-     music_description: Optional[str] = None
-
-
  class PlayerOption(BaseModel):
-     """Single option for the player."""
-
      option_description: str = Field(
-         description=(
-             "Description of the option, e.g. '[Say] Hello!' "
-             "or 'Go to the forest'"
-         )
+         description="The description of the option, Examples: [Change location] Go to the forest; [Say] Hello!"
      )
 
 
  class LLMOutput(BaseModel):
-     """Expected structure returned by the LLM."""
-
-     change_scene: ChangeScene
-     change_music: ChangeMusic
      game_message: str = Field(
-         description=(
-             "Message shown to the player, e.g. 'You entered the forest...'"
-         )
+         description="The message to the player, Example: You entered the forest, and you see unknown scary creatures. What do you do?"
      )
      player_options: List[PlayerOption] = Field(
-         description="Up to three options for the player"
+         description="The list of up to 3 options for the player to choose from."
      )
 
 
- _llm = create_llm().with_structured_output(LLMOutput)
+ class MultiAgentResponse(BaseModel):
+     game_message: str = Field(
+         description="The message to the player, Example: You entered the forest, and you see unknown scary creatures. What do you do?"
+     )
+     player_options: List[PlayerOption] = Field(
+         description="The list of up to 3 options for the player to choose from."
+     )
+     music_prompt: str = Field(description="The prompt for the music generation model.")
+     change_scene: ChangeScene = Field(description="The change to the scene.")
+
+ llm = create_llm().with_structured_output(MultiAgentResponse)
 
 
- async def process_user_input(text: str) -> LLMOutput:
-     """Send user text to the LLM and return the parsed response."""
-     logger.info("User choice: %s", text)
-     response: LLMOutput = await _llm.ainvoke(text)
-     logger.info("LLM response: %s", response)
-     return response
+ async def process_user_input(input: str) -> MultiAgentResponse:
+     """
+     Process user input and update the state.
+     """
+     request_id = str(uuid.uuid4())
+     logger.info(f"LLM input received: {request_id}")
+
+     response: MultiAgentResponse = await llm.ainvoke(input)
+
+     current_state = f"""{input}
+
+ Game reaction: {response.game_message}
+ Player options: {response.player_options}
+ """
+
+     music_prompt_task = generate_music_prompt(current_state, request_id)
+     change_scene_task = generate_image_prompt(current_state, request_id)
+     music_prompt, change_scene = await asyncio.gather(music_prompt_task, change_scene_task)
+
+     multi_agent_response = MultiAgentResponse(
+         game_message=response.game_message,
+         player_options=response.player_options,
+         music_prompt=music_prompt,
+         change_scene=change_scene,
+     )
+
+     logger.info(f"LLM responded: {request_id}")
+
+     return multi_agent_response
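
Once the story model has answered, the music and image sub-agents run concurrently on the combined state via `asyncio.gather`. A minimal sketch of calling the new entry point (the input string is illustrative):

    # Hypothetical call site for process_user_input.
    import asyncio
    from agent.llm_agent import process_user_input

    async def demo():
        result = await process_user_input("[Say] Hello!")
        print(result.game_message)               # story text for the player
        print(result.music_prompt)               # fed to the music generator
        print(result.change_scene.change_scene)  # scene-change decision

    asyncio.run(demo())
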
src/agent/llm_graph.py CHANGED
@@ -3,8 +3,9 @@
  import logging
  from dataclasses import dataclass
  from typing import Any, Dict, Optional
-
+ import asyncio
  from langgraph.graph import END, StateGraph
+ from agent.image_agent import generate_image_prompt
 
  from agent.tools import (
      check_ending,
@@ -14,7 +15,7 @@ from agent.tools import (
      update_state_with_choice,
  )
  from agent.state import get_user_state
-
+ from audio.audio_generator import change_music_tone
  logger = logging.getLogger(__name__)
 
 
@@ -59,11 +60,13 @@ async def node_init_game(state: GraphState) -> GraphState:
      first_scene = await generate_scene.ainvoke(
          {"user_hash": state.user_hash, "last_choice": "start"}
      )
+     change_scene = await generate_image_prompt(first_scene["description"], state.user_hash)
+     logger.info(f"Change scene: {change_scene}")
      await generate_scene_image.ainvoke(
          {
              "user_hash": state.user_hash,
              "scene_id": first_scene["scene_id"],
-             "prompt": first_scene["description"],
+             "change_scene": change_scene,
          }
      )
      state.scene = first_scene
@@ -91,13 +94,17 @@ async def node_player_step(state: GraphState) -> GraphState:
              "last_choice": state.choice_text,
          }
      )
-     await generate_scene_image.ainvoke(
+     change_scene = await generate_image_prompt(next_scene["description"], state.user_hash)
+     image_task = generate_scene_image.ainvoke(
          {
              "user_hash": state.user_hash,
              "scene_id": next_scene["scene_id"],
-             "prompt": next_scene["description"],
+             "current_image": user_state.assets[scene_id],
+             "change_scene": change_scene,
          }
      )
+     music_task = change_music_tone(state.user_hash, next_scene["music"])
+     await asyncio.gather(image_task, music_task)
      state.scene = next_scene
      return state
 
src/agent/music_agent.py ADDED
@@ -0,0 +1,47 @@
+ from pydantic import BaseModel
+ from agent.llm import create_light_llm
+ from langchain_core.messages import SystemMessage, HumanMessage
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ music_options = """Instruments: 303 Acid Bass, 808 Hip Hop Beat, Accordion, Alto Saxophone, Bagpipes, Balalaika Ensemble, Banjo, Bass Clarinet, Bongos, Boomy Bass, Bouzouki, Buchla Synths, Cello, Charango, Clavichord, Conga Drums, Didgeridoo, Dirty Synths, Djembe, Drumline, Dulcimer, Fiddle, Flamenco Guitar, Funk Drums, Glockenspiel, Guitar, Hang Drum, Harmonica, Harp, Harpsichord, Hurdy-gurdy, Kalimba, Koto, Lyre, Mandolin, Maracas, Marimba, Mbira, Mellotron, Metallic Twang, Moog Oscillations, Ocarina, Persian Tar, Pipa, Precision Bass, Ragtime Piano, Rhodes Piano, Shamisen, Shredding Guitar, Sitar, Slide Guitar, Smooth Pianos, Spacey Synths, Steel Drum, Synth Pads, Tabla, TR-909 Drum Machine, Trumpet, Tuba, Vibraphone, Viola Ensemble, Warm Acoustic Guitar, Woodwinds, ...
+ Music Genre: Acid Jazz, Afrobeat, Alternative Country, Baroque, Bengal Baul, Bhangra, Bluegrass, Blues Rock, Bossa Nova, Breakbeat, Celtic Folk, Chillout, Chiptune, Classic Rock, Contemporary R&B, Cumbia, Deep House, Disco Funk, Drum & Bass, Dubstep, EDM, Electro Swing, Funk Metal, G-funk, Garage Rock, Glitch Hop, Grime, Hyperpop, Indian Classical, Indie Electronic, Indie Folk, Indie Pop, Irish Folk, Jam Band, Jamaican Dub, Jazz Fusion, Latin Jazz, Lo-Fi Hip Hop, Marching Band, Merengue, New Jack Swing, Minimal Techno, Moombahton, Neo-Soul, Orchestral Score, Piano Ballad, Polka, Post-Punk, 60s Psychedelic Rock, Psytrance, R&B, Reggae, Reggaeton, Renaissance Music, Salsa, Shoegaze, Ska, Surf Rock, Synthpop, Techno, Trance, Trap Beat, Trip Hop, Vaporwave, Witch house, ...
+ Mood/Description: Acoustic Instruments, Ambient, Bright Tones, Chill, Crunchy Distortion, Danceable, Dreamy, Echo, Emotional, Ethereal Ambience, Experimental, Fat Beats, Funky, Glitchy Effects, Huge Drop, Live Performance, Lo-fi, Ominous Drone, Psychedelic, Rich Orchestration, Saturated Tones, Subdued Melody, Sustained Chords, Swirling Phasers, Tight Groove, Unsettling, Upbeat, Virtuoso, Weird Noises, ...
+ """
+ system_prompt = f"""
+ You are a music agent responsible for generating appropriate music tones for scenes in a visual novel game.
+
+ Your task is to analyze the current scene description and generate a detailed music prompt that captures:
+ 1. The emotional atmosphere
+ 2. The intensity level
+ 3. The genre/style that best fits the scene
+ 4. Specific instruments that would enhance the mood
+
+ You have access to a wide range of musical elements including:
+ {music_options}
+
+ When generating a music prompt:
+ - Consider the scene's context, mood, and any suspense elements
+ - Choose instruments that complement the scene's atmosphere
+ - Select a genre that matches the story's setting and tone
+ - Include specific mood descriptors to guide the music generation
+
+ Your output should be a concise but detailed prompt that the music generation model can use to create an appropriate soundtrack for the scene.
+ """
+
+
+ class MusicPrompt(BaseModel):
+     prompt: str
+
+
+ llm = create_light_llm(0.1).with_structured_output(MusicPrompt)
+
+
+ async def generate_music_prompt(scene_description: str, request_id: str) -> str:
+     logger.info(f"Generating music prompt for the current scene: {request_id}")
+     response = await llm.ainvoke(
+         [SystemMessage(content=system_prompt), HumanMessage(content=scene_description)]
+     )
+     logger.info(f"Music prompt generated: {request_id}")
+     return response.prompt
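
A hypothetical invocation of the music agent; the scene text is invented, and the expected output shape follows the system prompt above:

    import asyncio
    from agent.music_agent import generate_music_prompt

    async def demo():
        prompt = await generate_music_prompt(
            "A tense standoff in a rain-soaked alley at midnight.", "demo-2"
        )
        # Expected shape per the system prompt: mood + genre + instruments,
        # e.g. "Ominous Drone, Trip Hop, Dirty Synths, Subdued Melody".
        print(prompt)

    asyncio.run(demo())
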
src/agent/tools.py CHANGED
@@ -18,7 +18,8 @@ from agent.models import (
  )
  from agent.prompts import ENDING_CHECK_PROMPT, SCENE_PROMPT, STORY_FRAME_PROMPT
  from agent.state import get_user_state, set_user_state
- from images.image_generator import generate_image
+ from images.image_generator import modify_image, generate_image
+ from agent.image_agent import ChangeScene
 
  logger = logging.getLogger(__name__)
 
@@ -71,11 +72,9 @@ async def generate_scene(
      prompt = SCENE_PROMPT.format(
          lore=state.story_frame.lore,
          goal=state.story_frame.goal,
-         milestones=','.join(m.id for m in state.story_frame.milestones),
-         endings=','.join(e.id for e in state.story_frame.endings),
-         history='; '.join(
-             f"{c.scene_id}:{c.choice_text}" for c in state.user_choices
-         ),
+         milestones=",".join(m.id for m in state.story_frame.milestones),
+         endings=",".join(e.id for e in state.story_frame.endings),
+         history="; ".join(f"{c.scene_id}:{c.choice_text}" for c in state.user_choices),
          last_choice=last_choice,
      )
      resp: SceneLLM = await llm.ainvoke(prompt)
@@ -107,11 +106,19 @@ async def generate_scene(
  async def generate_scene_image(
      user_hash: Annotated[str, "User session ID"],
      scene_id: Annotated[str, "Scene ID"],
-     prompt: Annotated[str, "Prompt for image generation"],
+     change_scene: Annotated[ChangeScene, "Prompt for image generation"],
+     current_image: Annotated[str, "Current image"] | None = None,
  ) -> Annotated[str, "Path to generated image"]:
      """Generate an image for a scene and save the path in the state."""
      try:
-         image_path, _ = await generate_image(prompt)
+         image_path = current_image
+         if change_scene.change_scene == "change_completely" or change_scene.change_scene == "modify":
+             image_path, _ = await (
+                 generate_image(change_scene.scene_description)
+                 if current_image is None
+                 # for now, always modify the existing image to avoid generating an update in a completely wrong style
+                 else modify_image(current_image, change_scene.scene_description)
+             )
          state = get_user_state(user_hash)
          if scene_id in state.scenes:
              state.scenes[scene_id].image = image_path
@@ -152,14 +159,10 @@ async def check_ending(
      if not state.story_frame:
          return _err("No story frame")
      llm = create_llm().with_structured_output(EndingCheckResult)
-     history = '; '.join(
-         f"{c.scene_id}:{c.choice_text}" for c in state.user_choices
-     )
+     history = "; ".join(f"{c.scene_id}:{c.choice_text}" for c in state.user_choices)
      prompt = ENDING_CHECK_PROMPT.format(
          history=history,
-         endings=','.join(
-             f"{e.id}:{e.condition}" for e in state.story_frame.endings
-         ),
+         endings=",".join(f"{e.id}:{e.condition}" for e in state.story_frame.endings),
      )
      resp: EndingCheckResult = await llm.ainvoke(prompt)
      if resp.ending_reached and resp.ending:
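
Condensed, the new dispatch inside `generate_scene_image` works as sketched below (signatures as in `images/image_generator.py`; the helper name is hypothetical):

    from images.image_generator import generate_image, modify_image

    # no_change                                  -> keep current_image untouched
    # modify/change_completely, no current image -> generate_image(prompt)
    # modify/change_completely, image exists     -> modify_image(image, prompt)
    async def pick_image(change_scene, current_image):
        if change_scene.change_scene == "no_change":
            return current_image
        if current_image is None:
            path, _ = await generate_image(change_scene.scene_description)
        else:
            # "change_completely" is also routed through modify_image for now,
            # to keep the art style consistent across scenes.
            path, _ = await modify_image(current_image, change_scene.scene_description)
        return path
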
src/audio/audio_generator.py CHANGED
@@ -13,10 +13,12 @@ logger = logging.getLogger(__name__)
  client = genai.Client(api_key=settings.gemini_api_key.get_secret_value(), http_options={'api_version': 'v1alpha'})
 
  async def generate_music(user_hash: str, music_tone: str, receive_audio):
+     if user_hash in sessions:
+         return
      async with (
          client.aio.live.music.connect(model='models/lyria-realtime-exp') as session,
          asyncio.TaskGroup() as tg,
      ):
          # Set up task to receive server messages.
          tg.create_task(receive_audio(session, user_hash))
 
@@ -31,10 +33,9 @@ async def generate_music(user_hash: str, music_tone: str, receive_audio):
          )
          await session.play()
          logger.info(f"Started music generation for user hash {user_hash}, music tone: {music_tone}")
-         await cleanup_music_session(user_hash)
          sessions[user_hash] = {
              'session': session,
-             'queue': queue.Queue(maxsize=3)
+             'queue': queue.Queue()
          }
 
  async def change_music_tone(user_hash: str, new_tone):
@@ -43,7 +44,6 @@ async def change_music_tone(user_hash: str, new_tone):
      if not session:
          logger.error(f"No session found for user hash {user_hash}")
          return
-     await session.reset_context()
      await session.set_weighted_prompts(
          prompts=[types.WeightedPrompt(text=new_tone, weight=1.0)]
      )
src/config.py CHANGED
@@ -24,8 +24,11 @@ class BaseAppSettings(BaseSettings):
 
  class AppSettings(BaseAppSettings):
      gemini_api_key: SecretStr
+     gemini_api_keys: SecretStr
+     # assistant_api_key: SecretStr
      top_p: float = 0.95
      temperature: float = 0.5
+     pregenerate_next_scene: bool = True
 
 
  settings = AppSettings()
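
Since `AppSettings` extends pydantic's `BaseSettings`, the new fields are populated from the environment. A sketch of how they surface at runtime (the purpose of `pregenerate_next_scene` is inferred from its name; its consumer is not in this diff):

    # Values come from env vars / .env, e.g. GEMINI_API_KEYS, PREGENERATE_NEXT_SCENE.
    from config import settings

    keys = settings.gemini_api_keys.get_secret_value()  # SecretStr, like gemini_api_key
    if settings.pregenerate_next_scene:
        pass  # presumably gates speculative generation of the next scene
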
src/css.py CHANGED
@@ -33,11 +33,11 @@ custom_css = """
      background: rgba(0,0,0,0.7) !important;
      border: none !important;
      color: white !important;
-     font-size: 18px !important;
+     font-size: 15px !important;
      line-height: 1.5 !important;
-     padding: 20px !important;
+     padding: 10px !important;
      border-radius: 10px !important;
-     margin-bottom: 20px !important;
+     margin-bottom: 10px !important;
  }
 
  img {
@@ -49,7 +49,7 @@ img {
      border: none !important;
      color: white !important;
      -webkit-text-fill-color: white !important;
-     font-size: 18px !important;
+     font-size: 15px !important;
      resize: none !important;
  }
 
@@ -57,13 +57,12 @@ img {
  .choice-buttons {
      background: rgba(0,0,0,0.7) !important;
      border-radius: 10px !important;
-     padding: 15px !important;
+     padding: 10px !important;
  }
 
  .choice-buttons label {
      color: white !important;
-     font-size: 16px !important;
-     margin-bottom: 10px !important;
+     font-size: 14px !important;
  }
 
  /* Fix radio button backgrounds */
src/game_constructor.py CHANGED
@@ -1,10 +1,16 @@
  import gradio as gr
  import json
  import uuid
+ from game_setting import Character, GameSetting, get_user_story
+ from game_state import story, state, get_current_scene
+ from agent.llm_agent import process_user_input
+ from images.image_generator import generate_image
  from game_setting import Character, GameSetting
  from agent.runner import process_step
  from audio.audio_generator import start_music_generation
  import asyncio
+ from config import settings
+
 
  # Predefined suggestions for demo
  SETTING_SUGGESTIONS = [
@@ -105,6 +111,7 @@ def save_game_config(
      except Exception as e:
          return f"❌ Error saving configuration: {str(e)}"
 
+
  async def start_game_with_settings(
      user_hash: str,
      setting_desc: str,
@@ -139,6 +146,8 @@ async def start_game_with_settings(
      )
 
      game_setting = GameSetting(character=character, setting=setting_desc, genre=genre)
+
+     asyncio.create_task(start_music_generation(user_hash, "neutral"))
 
      # Start the LLM graph to initialize the story
      result = await process_step(
@@ -149,8 +158,6 @@ async def start_game_with_settings(
          genre=game_setting.genre,
      )
 
-     asyncio.create_task(start_music_generation(user_hash, "neutral"))
-
      scene = result["scene"]
      scene_text = scene["description"]
      scene_image = scene.get("image", "")
src/game_setting.py CHANGED
@@ -1,12 +1,25 @@
  from pydantic import BaseModel
 
+
  class Character(BaseModel):
      name: str
      age: str
      background: str
      personality: str
 
+
  class GameSetting(BaseModel):
      character: Character
      setting: str
      genre: str
+
+
+ def get_user_story(
+     scene_description: str, scene_image_description: str, user_choice: str
+ ) -> str:
+     return f"""Current scene description:
+ {scene_description}
+ Current scene image description: {scene_image_description}
+
+ User's choice: {user_choice}
+ """
src/game_state.py CHANGED
@@ -1,10 +1,10 @@
-
  story = {
      "start": {
          "text": "You wake up in a mysterious forest. What do you do?",
          "image": "forest.jpg",
-         "choices": ["Explore", "Wait"],
+         "choices": {"Explore": None, "Wait": None},
          "music_tone": "neutral",
+         "img_description": "forest in the fog",
      },
  }
 
@@ -12,4 +12,4 @@ state = {"scene": "start"}
 
  def get_current_scene():
      scene = story[state["scene"]]
-     return scene["text"], scene["image"], scene["choices"]
+     return scene["text"], scene["image"], scene["choices"].keys()
src/images/image_generator.py CHANGED
@@ -6,25 +6,47 @@ from io import BytesIO
  from datetime import datetime
  from config import settings
  import logging
+ import asyncio
+ import gradio as gr
 
  logger = logging.getLogger(__name__)
 
  client = genai.Client(api_key=settings.gemini_api_key.get_secret_value()).aio
 
+ safety_settings = [
+     types.SafetySetting(
+         category="HARM_CATEGORY_HARASSMENT",
+         threshold="BLOCK_NONE",  # Block none
+     ),
+     types.SafetySetting(
+         category="HARM_CATEGORY_HATE_SPEECH",
+         threshold="BLOCK_NONE",  # Block none
+     ),
+     types.SafetySetting(
+         category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
+         threshold="BLOCK_NONE",  # Block none
+     ),
+     types.SafetySetting(
+         category="HARM_CATEGORY_DANGEROUS_CONTENT",
+         threshold="BLOCK_NONE",  # Block none
+     ),
+ ]
+
+
  async def generate_image(prompt: str) -> tuple[str, str] | None:
      """
      Generate an image using Google's Gemini model and save it to generated/images directory.
 
      Args:
          prompt (str): The text prompt to generate the image from
 
      Returns:
          str: Path to the generated image file, or None if generation failed
      """
      # Ensure the generated/images directory exists
      output_dir = "generated/images"
      os.makedirs(output_dir, exist_ok=True)
 
      logger.info(f"Generating image with prompt: {prompt}")
 
      try:
@@ -32,8 +54,9 @@ async def generate_image(prompt: str) -> tuple[str, str] | None:
              model="gemini-2.0-flash-preview-image-generation",
              contents=prompt,
              config=types.GenerateContentConfig(
-                 response_modalities=['TEXT', 'IMAGE'],
-             )
+                 response_modalities=["TEXT", "IMAGE"],
+                 safety_settings=safety_settings,
+             ),
          )
 
          # Process the response parts
@@ -44,19 +67,20 @@ async def generate_image(prompt: str) -> tuple[str, str] | None:
                  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                  filename = f"gemini_{timestamp}.png"
                  filepath = os.path.join(output_dir, filename)
 
                  # Save the image
                  image = Image.open(BytesIO(part.inline_data.data))
-                 image.save(filepath, "PNG")
+                 await asyncio.to_thread(image.save, filepath, "PNG")
                  logger.info(f"Image saved to: {filepath}")
                  image_saved = True
 
-                 return filepath, part.text
+                 return filepath, prompt
 
          if not image_saved:
+             gr.Warning("Image was censored by Google!")
              logger.error("No image was generated in the response.")
              return None, None
 
      except Exception as e:
          logger.error(f"Error generating image: {e}")
          return None, None
@@ -65,38 +89,41 @@ async def generate_image(prompt: str) -> tuple[str, str] | None:
  async def modify_image(image_path: str, modification_prompt: str) -> str | None:
      """
      Modify an existing image using Google's Gemini model based on a text prompt.
 
      Args:
          image_path (str): Path to the existing image file
          modification_prompt (str): The text prompt describing how to modify the image
 
      Returns:
          str: Path to the modified image file, or None if modification failed
      """
      # Ensure the generated/images directory exists
      output_dir = "generated/images"
      os.makedirs(output_dir, exist_ok=True)
 
+     logger.info(f"Modifying current scene image with prompt: {modification_prompt}")
+
      # Check if the input image exists
      if not os.path.exists(image_path):
          logger.error(f"Error: Image file not found at {image_path}")
          return None
 
      key = settings.gemini_api_key.get_secret_value()
 
      client = genai.Client(api_key=key).aio
 
      try:
          # Load the input image
          input_image = Image.open(image_path)
 
          # Make the API call with both text and image
          response = await client.models.generate_content(
              model="gemini-2.0-flash-preview-image-generation",
              contents=[modification_prompt, input_image],
              config=types.GenerateContentConfig(
-                 response_modalities=['TEXT', 'IMAGE']
-             )
+                 response_modalities=["TEXT", "IMAGE"],
+                 safety_settings=safety_settings,
+             ),
          )
 
          # Process the response parts
@@ -107,19 +134,20 @@ async def modify_image(image_path: str, modification_prompt: str) -> str | None:
                  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                  filename = f"gemini_modified_{timestamp}.png"
                  filepath = os.path.join(output_dir, filename)
 
                  # Save the modified image
                  modified_image = Image.open(BytesIO(part.inline_data.data))
-                 modified_image.save(filepath, "PNG")
+                 await asyncio.to_thread(modified_image.save, filepath, "PNG")
                  logger.info(f"Modified image saved to: {filepath}")
                  image_saved = True
 
-                 return filepath, part.text
+                 return filepath, modification_prompt
 
          if not image_saved:
+             gr.Warning("Updated image was censored by Google!")
              logger.error("No modified image was generated in the response.")
              return None, None
 
      except Exception as e:
          logger.error(f"Error modifying image: {e}")
          return None, None
@@ -129,10 +157,10 @@ if __name__ == "__main__":
      # Example usage
      sample_prompt = "A Luke Skywalker half height sprite with white background for visual novel game"
      generated_image_path = generate_image(sample_prompt)
 
      # if generated_image_path:
      #     # Example modification
      #     modification_prompt = "Now the house is destroyed, and the jawas are running away"
      #     modified_image_path = modify_image(generated_image_path, modification_prompt)
      #     if modified_image_path:
      #         print(f"Successfully modified image: {modified_image_path}")
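
A hypothetical smoke test for the updated pair of functions; prompts are invented, and both functions now return `(path, prompt)` and save PNGs off the event loop via `asyncio.to_thread`:

    import asyncio
    from images.image_generator import generate_image, modify_image

    async def demo():
        path, _ = await generate_image("Foggy forest clearing at dawn, FPS view")
        if path is None:
            return  # generation failed or was blocked despite the BLOCK_NONE settings
        new_path, _ = await modify_image(path, "The fog lifts; warm morning light")
        print(path, new_path)

    asyncio.run(demo())
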
src/main.py CHANGED
@@ -5,6 +5,8 @@ from audio.audio_generator import (
      cleanup_music_session,
  )
  import logging
+ from agent.llm_agent import process_user_input
+ from images.image_generator import modify_image
  from agent.runner import process_step
  import uuid
  from game_constructor import (
@@ -15,6 +17,9 @@ from game_constructor import (
      load_character_suggestion,
      start_game_with_settings,
  )
+ import asyncio
+ from game_setting import get_user_story
+ from config import settings
 
  logger = logging.getLogger(__name__)
 
@@ -125,7 +130,7 @@ with gr.Blocks(
      # Fullscreen Loading Indicator (hidden by default)
      with gr.Column(visible=False, elem_id="loading-indicator") as loading_indicator:
          gr.HTML("<div class='loading-text'>🚀 Starting your adventure...</div>")
 
      local_storage = gr.BrowserState(str(uuid.uuid4()), "user_hash")
 
      # Constructor Interface (visible by default)