"""Gradio demo: generate an image containing text with FLUX, then rewrite that text via Gemini.

Step 1 renders an image with the FLUX pipeline (Hyper-SD 8-step LoRA fused in).
Step 2 sends the generated image to a Google GenAI (Gemini) model with an
instruction to replace the text inside the image.
"""
import os
import time
from os import path
import tempfile
import uuid
import base64
import mimetypes
import json
import io

import torch
from PIL import Image
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download

# Diffusers / UI libraries
import gradio as gr
from diffusers import FluxPipeline

# Google GenAI libraries
from google import genai
from google.genai import types

#######################################
# 0. Environment setup
#######################################

BASE_DIR = path.dirname(path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
CACHE_PATH = path.join(BASE_DIR, "models")

# NOTE(review): these variables are assigned after huggingface_hub/diffusers
# have already been imported above; if those libraries read them at import
# time the assignments may have no effect — confirm before relying on them.
os.environ["TRANSFORMERS_CACHE"] = CACHE_PATH
os.environ["HF_HUB_CACHE"] = CACHE_PATH
os.environ["HF_HOME"] = CACHE_PATH

# Token in the user's prompt that is replaced by the text to render.
# NOTE(review): the token was missing (empty string) in the reviewed source,
# apparently stripped as markup; "<text>" is the presumed original — confirm.
TEXT_PLACEHOLDER = "<text>"


class timer:
    """Context manager that prints how long the wrapped block took.

    Args:
        method_name: Label used in the start/finish messages.
    """

    def __init__(self, method_name="timed process"):
        self.method = method_name

    def __enter__(self):
        self.start = time.time()
        print(f"{self.method} starts")

    def __exit__(self, exc_type, exc_val, exc_tb):
        end = time.time()
        print(f"{self.method} took {round(end - self.start, 2)}s")


#######################################
# 1. Load the FLUX pipeline
#######################################

# exist_ok=True already makes this safe when the directory exists.
os.makedirs(CACHE_PATH, exist_ok=True)

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16
)
lora_path = hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors")
pipe.load_lora_weights(lora_path)
pipe.fuse_lora(lora_scale=0.125)
pipe.to(device="cuda", dtype=torch.bfloat16)


#######################################
# 2. Text replacement inside an image via Google GenAI
#######################################

def save_binary_file(file_name, data):
    """Write the binary payload `data` to `file_name`, overwriting any existing file."""
    with open(file_name, "wb") as f:
        f.write(data)


def _remove_file_quietly(file_name):
    """Best-effort removal of a temporary file; a failure is reported, not raised."""
    if not file_name:
        return
    try:
        os.remove(file_name)
    except FileNotFoundError:
        pass  # already gone, nothing to clean up
    except OSError as err:
        print(f"Could not remove temporary file {file_name}: {err}")


def generate_by_google_genai(text, file_name, model="gemini-2.0-flash-exp"):
    """Send an image plus a text instruction to a Gemini model and collect the reply.

    Args:
        text: Prompt / instruction sent along with the image.
        file_name: Path of the source image (e.g. a .png) to upload.
        model: Name of the Gemini model to call.

    Returns:
        A tuple ``(image_path, text_response)``. ``image_path`` is the path of
        a temporary .png holding the first image found in the response stream,
        or ``None`` when no image was returned. The caller owns that file and
        is responsible for deleting it. ``text_response`` is the text received
        before the image (or all text, when there is no image), one chunk per
        line.

    Raises:
        ValueError: If the ``GAPI_TOKEN`` environment variable is not set.
    """
    # (1) The API key is mandatory and comes from the environment.
    api_key = os.getenv("GAPI_TOKEN")
    if not api_key:
        raise ValueError(
            "GAPI_TOKEN 환경 변수가 설정되지 않았습니다. "
            "Google GenAI API를 사용하기 위해서는 GAPI_TOKEN이 필요합니다."
        )

    # (2) Initialise the Google client.
    client = genai.Client(api_key=api_key)

    # (3) Upload the image.
    # NOTE(review): the uploaded file is not deleted on the service side here
    # (the original only dropped the local reference) — confirm whether an
    # explicit remote delete is wanted.
    uploaded = client.files.upload(file=file_name)

    # (4) Build the request content: image first, then the prompt.
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_uri(
                    file_uri=uploaded.uri,
                    mime_type=uploaded.mime_type,
                ),
                types.Part.from_text(text=text),
            ],
        ),
    ]

    # (5) Generation settings.
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_modalities=["image", "text"],
        response_mime_type="text/plain",
    )

    text_chunks = []
    image_path = None

    # Walk the response stream, separating image payloads from text.
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        if not chunk.candidates or not chunk.candidates[0].content or not chunk.candidates[0].content.parts:
            continue

        # The image is not guaranteed to be the first part, so scan them all.
        parts = chunk.candidates[0].content.parts
        image_part = next(
            (p for p in parts if p.inline_data and p.inline_data.data),
            None,
        )

        if image_part is not None:
            # Create the temp file only once there is something to store, so a
            # text-only response leaves no stray file behind.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                image_path = tmp.name
            save_binary_file(image_path, image_part.inline_data.data)
            print(f"File of mime type {image_part.inline_data.mime_type} saved to: {image_path}")
            break

        # Text-only chunk; chunk.text can be None, so guard before appending.
        chunk_text = chunk.text
        if chunk_text:
            text_chunks.append(chunk_text)

    text_response = "".join(piece + "\n" for piece in text_chunks)
    return image_path, text_response


#######################################
# 3. Gradio callbacks
#    (1) generate an image with FLUX -> (2) replace its text via Google GenAI
#######################################

def generate_initial_image(prompt, text, height, width, steps, scale, seed):
    """Generate the base image (with text in it) using the FLUX pipeline.

    If ``prompt`` contains the ``<text>`` placeholder, every occurrence is
    replaced by ``text``. Otherwise
    ``" with clear readable text that says '<text>'"`` (with the actual text
    substituted) is appended to the prompt.

    Args:
        prompt: Scene / background description, optionally with ``<text>``.
        text: Text that should appear in the image.
        height: Image height in pixels (converted with ``int``).
        width: Image width in pixels (converted with ``int``).
        steps: Number of inference steps (converted with ``int``).
        scale: Guidance scale (converted with ``float``).
        seed: Seed for the random generator (converted with ``int``).

    Returns:
        The first image produced by the pipeline.
    """
    # An empty-string placeholder would match every prompt and make
    # str.replace insert `text` between all characters, so the placeholder
    # must be a real token.
    if TEXT_PLACEHOLDER in prompt:
        combined_prompt = prompt.replace(TEXT_PLACEHOLDER, text)
    else:
        combined_prompt = f"{prompt} with clear readable text that says '{text}'"

    # Debug aid: show the final prompt handed to the pipeline.
    print(f"[DEBUG] Final combined_prompt: {combined_prompt}")

    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16), timer("inference"):
        result = pipe(
            prompt=[combined_prompt],
            generator=torch.Generator().manual_seed(int(seed)),
            num_inference_steps=int(steps),
            guidance_scale=float(scale),
            height=int(height),
            width=int(width),
            max_sequence_length=256
        ).images[0]

    return result


def change_text_in_image(original_image, new_text):
    """Ask the Gemini model to replace the text inside ``original_image`` with ``new_text``.

    Args:
        original_image: PIL image whose text should be changed.
        new_text: Replacement text.

    Returns:
        ``(modified_image, "")`` when the model returned an image, otherwise
        ``(None, text_response)`` with whatever text the model returned.

    Raises:
        gr.Error: If no image was supplied or any step of the round trip fails.
    """
    if original_image is None:
        raise gr.Error("Error: no image to modify. Generate an image in Step 1 first.")

    original_path = None
    image_path = None
    try:
        # Reserve a path, then save after the handle is closed so the write
        # does not compete with an open file handle.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            original_path = tmp.name
        original_image.save(original_path)

        # Call the Gemini model.
        image_path, text_response = generate_by_google_genai(
            text=f"Change the text in this image to: '{new_text}'",
            file_name=original_path
        )

        if not image_path:
            # Only text came back.
            return None, text_response

        # Older Gradio versions lack decode_base64_to_image, so go through
        # PIL. The bytes are read fully into memory, so the temp file can be
        # deleted afterwards.
        with open(image_path, "rb") as f:
            image_data = f.read()
        modified_img = Image.open(io.BytesIO(image_data))
        modified_img.load()
        return modified_img, ""

    except Exception as e:
        raise gr.Error(f"Error: {e}") from e
    finally:
        # Neither temp file is needed once this call returns or fails.
        _remove_file_quietly(original_path)
        _remove_file_quietly(image_path)


#######################################
# 4. Gradio interface
#######################################

with gr.Blocks(title="Flux + Google GenAI Text Replacement") as demo:
    gr.Markdown(
        f"""
        # Flux 기반 이미지 생성 + Google GenAI를 통한 텍스트 변환

        **Usage**:
        - You can include `{TEXT_PLACEHOLDER}` in the prompt. For example: `white cat with speech bubble says {TEXT_PLACEHOLDER}`
        - Then, type the actual text in "Text to Include in the Image" (ex: "Hello" or "안녕").
        - If `{TEXT_PLACEHOLDER}` is not found in your prompt, the text will be automatically appended as:
          `with clear readable text that says '{TEXT_PLACEHOLDER}'`.
        - Finally, you can optionally change the text again via Gemini.

        ---
        """
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## 1) Step 1: FLUX로 텍스트 포함 이미지 생성")
            prompt_input = gr.Textbox(
                lines=3,
                label=f"이미지 장면/배경 Prompt (use `{TEXT_PLACEHOLDER}` placeholder if you like)",
                placeholder=f"e.g. A white cat with speech bubble says {TEXT_PLACEHOLDER}"
            )
            text_input = gr.Textbox(
                lines=1,
                label="이미지 안에 들어갈 텍스트",
                placeholder="e.g. Hello or 안녕"
            )
            with gr.Accordion("고급 설정 (확장)", open=False):
                height = gr.Slider(label="Height", minimum=256, maximum=1152, step=64, value=512)
                width = gr.Slider(label="Width", minimum=256, maximum=1152, step=64, value=512)
                steps = gr.Slider(label="Inference Steps", minimum=6, maximum=25, step=1, value=8)
                scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=10.0, step=0.5, value=3.5)
                seed = gr.Number(label="Seed (reproducibility)", value=1234, precision=0)

            generate_btn = gr.Button("Generate Base Image", variant="primary")
            generated_image = gr.Image(label="Generated Image (with text)", type="pil")

        with gr.Column():
            gr.Markdown("## 2) Step 2: 생성된 이미지 내 텍스트 수정")
            new_text_input = gr.Textbox(
                label="새로 바꿀 텍스트",
                placeholder="예) Hello world"
            )
            modify_btn = gr.Button("Change Text in Image via Gemini", variant="secondary")
            output_img = gr.Image(label="Modified Image", type="pil")
            output_txt = gr.Textbox(label="(If only text returned)")

    # Wire the buttons to their callbacks.
    generate_btn.click(
        fn=generate_initial_image,
        inputs=[prompt_input, text_input, height, width, steps, scale, seed],
        outputs=[generated_image]
    )

    modify_btn.click(
        fn=change_text_in_image,
        inputs=[generated_image, new_text_input],
        outputs=[output_img, output_txt]
    )

demo.launch(max_threads=20)