# Every-Text
#
# Runtime error
#
# File size: 9,624 Bytes

import os
import time
from os import path
import tempfile
import uuid
import base64
import mimetypes
import json
import io

import torch
from PIL import Image

from safetensors.torch import load_file
from huggingface_hub import hf_hub_download

# Diffusers 관련 라이브러리
import gradio as gr
from diffusers import FluxPipeline

# Google GenAI 라이브러리
from google import genai
from google.genai import types

#######################################
# 0. Environment setup
#######################################

# Resolve the directory that holds this file; fall back to the current
# working directory when __file__ is not defined.
if "__file__" in globals():
    BASE_DIR = path.dirname(path.abspath(__file__))
else:
    BASE_DIR = os.getcwd()
CACHE_PATH = path.join(BASE_DIR, "models")

# Point the Hugging Face cache variables at the local "models" directory.
# NOTE(review): these are assigned after huggingface_hub / diffusers have
# already been imported at the top of the file -- confirm the libraries still
# honor them at this point.
os.environ.update(
    {
        "TRANSFORMERS_CACHE": CACHE_PATH,
        "HF_HUB_CACHE": CACHE_PATH,
        "HF_HOME": CACHE_PATH,
    }
)

# Simple timing context manager
class timer:
    """Context manager that prints how long the wrapped block took.

    Prints ``"<name> starts"`` on entry and ``"<name> took <seconds>s"`` on
    exit, with the duration rounded to two decimals.

    Attributes:
        method: Label used in the printed messages.
        start: Wall-clock timestamp (``time.time()``) taken on entry.
        elapsed: Duration of the block in seconds; ``None`` until the block
            has exited.
    """

    def __init__(self, method_name="timed process"):
        self.method = method_name
        self.start = None
        self.elapsed = None
        self._t0 = None

    def __enter__(self):
        self.start = time.time()
        # Measure the duration with a monotonic clock so that system clock
        # adjustments cannot produce negative or skewed timings.
        self._t0 = time.perf_counter()
        print(f"{self.method} starts")
        # Return self so `with timer(...) as t:` gives access to `t.elapsed`.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.elapsed = time.perf_counter() - self._t0
        print(f"{self.method} took {round(self.elapsed, 2)}s")
        # Never suppress an exception raised inside the timed block.
        return False

#######################################
# 1. Load the FLUX pipeline
#######################################

# exist_ok=True already makes this a no-op when the directory exists, so no
# separate existence check is needed.
os.makedirs(CACHE_PATH, exist_ok=True)

# cache_dir is passed explicitly: the cache environment variables in this file
# are assigned after huggingface_hub and diffusers were imported, which may be
# too late for them to take effect. Passing the directory here guarantees the
# downloads land in CACHE_PATH either way.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
    cache_dir=CACHE_PATH,
)

# Fetch the Hyper-SD 8-step LoRA weights, load them into the pipeline and fuse
# them with a scale of 0.125.
lora_path = hf_hub_download(
    "ByteDance/Hyper-SD",
    "Hyper-FLUX.1-dev-8steps-lora.safetensors",
    cache_dir=CACHE_PATH,
)
pipe.load_lora_weights(lora_path)
pipe.fuse_lora(lora_scale=0.125)

# Move the pipeline to the CUDA device in bfloat16.
pipe.to(device="cuda", dtype=torch.bfloat16)

#######################################
# 2. Text replacement inside an image via Google GenAI
#######################################

def save_binary_file(file_name, data):
    """Write the raw bytes ``data`` to ``file_name``, replacing existing content.

    Used to persist binary payloads (such as image data) received from
    Google GenAI.
    """
    with open(file_name, mode="wb") as sink:
        sink.write(data)

def generate_by_google_genai(text, file_name, model="gemini-2.0-flash-exp"):
    """Send an image plus an instruction to a Gemini model and collect the reply.

    The image at ``file_name`` is uploaded through the Google GenAI client and
    sent together with ``text``. The streamed response is scanned until the
    first image payload arrives; that payload is written to a temporary file.
    Text received before that point is accumulated.

    Args:
        text: Prompt or instruction for the model (e.g. the text to change).
        file_name: Path of the source image (e.g. a ``.png`` file).
        model: Name of the Gemini model to call.

    Returns:
        A ``(image_path, text_response)`` tuple. ``image_path`` is the path of
        a temporary file holding the returned image, or ``None`` when the
        model returned no image. ``text_response`` is the received text with a
        newline after each chunk (empty string if there was none). The caller
        owns the temporary file and is responsible for removing it.

    Raises:
        ValueError: If the ``GAPI_TOKEN`` environment variable is not set.
    """
    # The API key is read from the GAPI_TOKEN environment variable (required).
    api_key = os.getenv("GAPI_TOKEN")
    if not api_key:
        raise ValueError(
            "GAPI_TOKEN 환경 변수가 설정되지 않았습니다. "
            "Google GenAI API를 사용하기 위해서는 GAPI_TOKEN이 필요합니다."
        )

    client = genai.Client(api_key=api_key)

    # Upload the source image so it can be referenced by URI in the request.
    uploaded = client.files.upload(file=file_name)

    # Build the user message: the uploaded image followed by the instruction.
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_uri(
                    file_uri=uploaded.uri,
                    mime_type=uploaded.mime_type,
                ),
                types.Part.from_text(text=text),
            ],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_modalities=["image", "text"],
        response_mime_type="text/plain",
    )

    text_lines = []
    image_path = None

    stream = client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    )
    for chunk in stream:
        candidates = chunk.candidates
        if not candidates or not candidates[0].content or not candidates[0].content.parts:
            continue

        chunk_text = []
        # Inspect every part: an image may follow a text part in the same chunk.
        for part in candidates[0].content.parts:
            inline = part.inline_data
            if inline is not None and inline.data:
                # Create the temp file only once image data actually exists, so
                # no empty file is left behind when the model answers with text.
                suffix = mimetypes.guess_extension(inline.mime_type or "") or ".png"
                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                    tmp.write(inline.data)
                image_path = tmp.name
                print(f"File of mime type {inline.mime_type} saved to: {image_path}")
                break
            # Parts without text (text is None/empty) are skipped instead of
            # being concatenated, which would raise TypeError.
            if part.text:
                chunk_text.append(part.text)

        if chunk_text:
            text_lines.append("".join(chunk_text) + "\n")
        if image_path is not None:
            # Stop streaming as soon as the first image has been stored.
            break

    return image_path, "".join(text_lines)

#######################################
# 3. Gradio callbacks
#######################################

def generate_initial_image(prompt, text, height, width, steps, scale, seed):
    """Render an image containing ``text`` with the FLUX pipeline.

    When ``prompt`` contains the ``<text>`` placeholder, every occurrence is
    replaced by ``text``. Otherwise the phrase
    "with clear readable text that says '...'" is appended to the prompt.

    The numeric arguments are cast to ``int`` / ``float`` before being passed
    to the pipeline. Returns the first image of the pipeline output.
    """
    placeholder = "<text>"
    if placeholder not in prompt:
        combined_prompt = f"{prompt} with clear readable text that says '{text}'"
    else:
        combined_prompt = prompt.replace(placeholder, text)

    with torch.inference_mode():
        with torch.autocast("cuda", dtype=torch.bfloat16):
            with timer("inference"):
                output = pipe(
                    prompt=[combined_prompt],
                    generator=torch.Generator().manual_seed(int(seed)),
                    num_inference_steps=int(steps),
                    guidance_scale=float(scale),
                    height=int(height),
                    width=int(width),
                    max_sequence_length=256,
                )
                first_image = output.images[0]

    return first_image

def change_text_in_image(original_image, new_text):
    """Ask the Gemini model to replace the text inside an image with ``new_text``.

    Args:
        original_image: PIL image whose text should be changed.
        new_text: Text that should appear in the image instead.

    Returns:
        ``(modified_image, "")`` when the model returned an image, otherwise
        ``(None, text_response)`` with whatever text the model sent back.

    Raises:
        gr.Error: If no image was supplied, or if saving the image, calling
            the model, or reading the result fails.
    """
    # Checked outside the try block so the message is not re-wrapped below.
    if original_image is None:
        raise gr.Error("Please generate or upload an image first.")

    original_path = None
    image_path = None
    try:
        # Write the input image to a temporary PNG. Saving through the open
        # handle avoids reopening the file by name while it is still open.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            original_path = tmp.name
            original_image.save(tmp, format="PNG")

        # Call the Gemini model.
        image_path, text_response = generate_by_google_genai(
            text=f"Change the text in this image to: '{new_text}'",
            file_name=original_path
        )

        if not image_path:
            return None, text_response

        # Older Gradio versions lack decode_base64_to_image, so PIL is used
        # directly. The bytes are read fully into memory, which keeps the
        # image usable after the temporary file is removed below.
        with open(image_path, "rb") as f:
            image_data = f.read()
        modified_img = Image.open(io.BytesIO(image_data))
        return modified_img, ""

    except Exception as e:
        raise gr.Error(f"Error: {e}") from e

    finally:
        # Both temp files are created with delete=False, so remove them here
        # to avoid accumulating files across requests. Cleanup is best effort
        # and must not mask the real result or error.
        for leftover in (original_path, image_path):
            if leftover and path.exists(leftover):
                try:
                    os.remove(leftover)
                except OSError:
                    pass

#######################################
# 4. Gradio interface
#######################################

# Two-column page: the left column generates the base image with FLUX, the
# right column sends the generated image to Gemini to change its text.
with gr.Blocks(title="Flux + Google GenAI Text Replacement") as demo:
    gr.Markdown(
        """
        # Flux Image Generation + Google GenAI Text Replacement

        **Usage Instructions (in English)**  
        1. Write a prompt that may contain the special placeholder `<text>`.  
           - Example: `A white cat says <text> in a cartoon style`.  
        2. Enter the actual text in the "Text to Include in the Image" field.  
           - Example: `안녕`  
        3. Click the "Generate Base Image" button.  
           - The prompt will be transformed so that `<text>` is replaced with your actual text.  
           - If `<text>` is **not** found, the text will be appended automatically as `with clear readable text that says ...`.  
        4. (Optional) If you want to change the text again, use the "Change Text in Image" button.

        ---
        """
    )

    with gr.Row():
        # Left column: prompt, text and generation settings for FLUX.
        with gr.Column():
            gr.Markdown("## 1) Generate the Base Image (FLUX)")
            prompt_input = gr.Textbox(
                lines=3,
                label="Prompt (with optional `<text>` placeholder)",
                placeholder="e.g. A white cat says <text> in a cartoon style"
            )
            text_input = gr.Textbox(
                lines=1,
                label="Text to Include in the Image",
                placeholder="e.g. 안녕"
            )
            with gr.Accordion("Advanced Settings", open=False):
                height = gr.Slider(label="Height", minimum=256, maximum=1152, step=64, value=512)
                width = gr.Slider(label="Width", minimum=256, maximum=1152, step=64, value=512)
                steps = gr.Slider(label="Inference Steps", minimum=6, maximum=25, step=1, value=8)
                scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=5.0, step=0.1, value=3.5)
                seed = gr.Number(label="Seed (reproducibility)", value=1234, precision=0)

            generate_btn = gr.Button("Generate Base Image", variant="primary")
            # type="pil" so the callbacks receive/return PIL images.
            generated_image = gr.Image(label="Generated Image", type="pil")

        # Right column: replacement text and the Gemini result.
        with gr.Column():
            gr.Markdown("## 2) (Optional) Change Text in the Generated Image (Gemini)")
            new_text_input = gr.Textbox(
                label="New Text to Insert",
                placeholder="e.g. Hello"
            )
            modify_btn = gr.Button("Change Text in Image via Gemini", variant="secondary")
            output_img = gr.Image(label="Modified Image", type="pil")
            # Shows the model's text reply when no image came back.
            output_txt = gr.Textbox(label="(If only text returned)")

    # Wire the buttons to their callbacks
    generate_btn.click(
        fn=generate_initial_image,
        inputs=[prompt_input, text_input, height, width, steps, scale, seed],
        outputs=[generated_image]
    )

    # The image shown in the left column is the input of the Gemini step.
    modify_btn.click(
        fn=change_text_in_image,
        inputs=[generated_image, new_text_input],
        outputs=[output_img, output_txt]
    )

# Start the Gradio app when the module is executed (runs at import time too).
demo.launch(max_threads=20)