# Every-Text
#
# Status: Running
#
# File size: 13,668 Bytes

import os
import re
import time
from os import path
import tempfile
import uuid
import base64
import mimetypes
import json
import io
import random
import string

import torch
from PIL import Image

from transformers import pipeline
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download

# Diffusers
import gradio as gr
from diffusers import FluxPipeline

# (Internal) text-modification library
from google import genai
from google.genai import types

#######################################
# 0. Environment & Translation Pipeline
#######################################

# Directory that contains this script; falls back to the current working
# directory when `__file__` is not defined in the module globals.
BASE_DIR = path.dirname(path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
# Local "models" folder next to the script, used as the Hugging Face cache root.
CACHE_PATH = path.join(BASE_DIR, "models")

# Point the Hugging Face cache-related environment variables at CACHE_PATH.
# NOTE(review): these are assigned after `transformers` / `huggingface_hub`
# have already been imported at the top of the file; if those libraries read
# the cache variables at import time, the settings may not take effect in this
# process — confirm, and move the assignments above the imports if needed.
os.environ["TRANSFORMERS_CACHE"] = CACHE_PATH
os.environ["HF_HUB_CACHE"] = CACHE_PATH
os.environ["HF_HOME"] = CACHE_PATH

# Translation (Korean -> English), CPU only.
# Built once at import time and reused by maybe_translate_to_english().
translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-ko-en",
    device=-1  # force CPU
)

def maybe_translate_to_english(text: str) -> str:
    """
    Return an English version of ``text`` when it contains Hangul syllables.

    Text without any character in the range 가-힣 is returned unchanged.
    Otherwise the whole string is passed to the module-level ``translator``
    pipeline, the translation is logged to stdout, and the translated string
    is returned.
    """
    contains_korean = re.search("[가-힣]", text) is not None
    if not contains_korean:
        # Nothing to translate: hand the prompt back untouched.
        return text

    first_result = translator(text)[0]
    translated = first_result["translation_text"]
    print(f"[TRANSLATE] Detected Korean -> '{text}' -> '{translated}'")
    return translated

# Simple Timer Class
class timer:
    """
    Context manager that prints when a block starts and how long it took.

    Usage::

        with timer("Flux Generation"):
            ...

    Args:
        method_name: Label used in the ``[TIMER]`` log lines.

    Attributes:
        method: The label passed as ``method_name``.
        start: Monotonic timestamp taken on entry (``None`` before entry).
    """

    def __init__(self, method_name="timed process"):
        self.method = method_name
        # Set in __enter__; defined here so the attribute always exists.
        self.start = None

    def __enter__(self):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments while the block runs.
        self.start = time.perf_counter()
        print(f"[TIMER] {self.method} starts")
        # Return the instance so `with timer(...) as t:` is usable.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        elapsed = time.perf_counter() - self.start
        print(f"[TIMER] {self.method} took {round(elapsed, 2)}s")
        # Implicitly returns None, so exceptions from the block propagate.

#######################################
# 1. Load FLUX Pipeline
#######################################

# Create the local cache directory if it does not exist yet.
if not path.exists(CACHE_PATH):
    os.makedirs(CACHE_PATH, exist_ok=True)

# Load the FLUX.1-dev pipeline with bfloat16 weights. This runs at import
# time, so the module cannot be imported without the model being available.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16
)

# Fetch the Hyper-SD 8-step LoRA file, load it into the pipeline, fuse it at
# scale 0.125, and only then move the pipeline to the CUDA device.
lora_path = hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors")
pipe.load_lora_weights(lora_path)
pipe.fuse_lora(lora_scale=0.125)
pipe.to(device="cuda", dtype=torch.bfloat16)

#######################################
# 2. Internal Text Modification Functions
#######################################

def save_binary_file(file_name, data):
    """
    Write ``data`` (bytes) to ``file_name``, replacing any existing content.
    """
    with open(file_name, "wb") as out_file:
        out_file.write(data)

def generate_by_google_genai(text, file_name, model="gemini-2.0-flash-exp"):
    """
    Internally modifies text within an image, returning a new image path.
    (Screen instructions do not mention 'Google'.)

    The image at ``file_name`` is uploaded, sent together with ``text`` as a
    single user message to the streaming generation endpoint, and the first
    image found in the stream is stored in a temporary ``.png`` file.

    Args:
        text: Instruction describing the edit to apply to the image.
        file_name: Path of the image file to upload.
        model: Model identifier passed to the generation call.

    Returns:
        A tuple ``(image_path, text_response)``:
        ``image_path`` is the path of the temporary file holding the returned
        image, or ``None`` when the stream contained no image (no temporary
        file is created in that case). The caller owns the file and should
        delete it when done. ``text_response`` is the text of the chunks
        received before the image, one chunk per line.

    Raises:
        ValueError: If the ``GAPI_TOKEN`` environment variable is not set.
    """
    api_key = os.getenv("GAPI_TOKEN", None)
    if not api_key:
        raise ValueError(
            "GAPI_TOKEN is missing. Please set an API key."
        )

    client = genai.Client(api_key=api_key)
    uploaded = client.files.upload(file=file_name)

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_uri(
                    file_uri=uploaded.uri,
                    mime_type=uploaded.mime_type,
                ),
                types.Part.from_text(text=text),
            ],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_modalities=["image", "text"],
        response_mime_type="text/plain",
    )

    text_chunks = []
    image_path = None

    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        if not chunk.candidates or not chunk.candidates[0].content:
            continue

        # A chunk may carry no parts, or carry the image after a text part,
        # so look at every part instead of indexing parts[0].
        parts = chunk.candidates[0].content.parts or []
        image_part = next((part for part in parts if part.inline_data), None)

        if image_part is not None:
            # Create the temp file only once there is image data to store,
            # so a text-only response does not leave an empty file behind.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                tmp.write(image_part.inline_data.data)
                image_path = tmp.name
            print(f"[DEBUG] Returned new image -> {image_path}")
            break

        # chunk.text can be None for chunks without text; skip those rather
        # than failing on None + str.
        chunk_text = chunk.text
        if chunk_text is not None:
            text_chunks.append(f"{chunk_text}\n")

    return image_path, "".join(text_chunks)


#######################################
# 3. Diffusion Utility
#######################################

def generate_random_letters(length: int) -> str:
    """
    Build a string of ``length`` randomly chosen ASCII letters (a-z, A-Z).

    A non-positive ``length`` yields an empty string.
    """
    # ascii_letters is ascii_lowercase followed by ascii_uppercase.
    alphabet = string.ascii_letters
    picked = [random.choice(alphabet) for _ in range(length)]
    return "".join(picked)

def is_all_english(text: str) -> bool:
    """
    Tell whether ``text`` uses only the characters allowed for direct rendering.

    Allowed: ASCII letters, digits, whitespace, and the punctuation
    ``. , ! ? '``. The empty string counts as allowed. Anything else
    (Korean, other scripts, other symbols) makes the result False.
    """
    allowed_only = re.match(r'^[a-zA-Z0-9\s\.,!\?\']*$', text)
    return allowed_only is not None

def maybe_use_random_or_original(final_text: str) -> str:
    """
    Choose the text that is rendered in the initial image.

    - Empty / missing input -> empty string.
    - Input accepted by ``is_all_english`` -> returned unchanged.
    - Anything else (Korean, other characters) -> random letters with the
      same length as the input.
    """
    if not final_text:
        return ""
    return (
        final_text
        if is_all_english(final_text)
        else generate_random_letters(len(final_text))
    )

def fill_prompt_with_random_texts(prompt: str, r1: str, r2: str, r3: str) -> str:
    """
    Substitute the ``<text1>``, ``<text2>``, ``<text3>`` placeholders in ``prompt``.

    ``<text1>`` is mandatory: when the prompt does not contain it, a clause
    asking for readable text saying ``r1`` is appended instead.
    ``<text2>`` and ``<text3>`` are optional and only replaced when present.
    """
    if "<text1>" not in prompt:
        prompt = f"{prompt} with clear readable text that says '{r1}'"
    else:
        prompt = prompt.replace("<text1>", r1)

    optional_slots = (("<text2>", r2), ("<text3>", r3))
    for placeholder, replacement in optional_slots:
        if placeholder in prompt:
            prompt = prompt.replace(placeholder, replacement)

    return prompt

def generate_initial_image(prompt, height, width, steps, scale, seed):
    """
    Render the first-pass image for ``prompt`` with the module-level Flux pipeline.

    Runs under inference mode and CUDA bfloat16 autocast, logs the duration
    through ``timer``, and returns the first image of the pipeline output.
    ``seed``, ``steps``, ``height`` and ``width`` are cast to int and
    ``scale`` to float before being passed to the pipeline.
    """
    with torch.inference_mode():
        with torch.autocast("cuda", dtype=torch.bfloat16):
            with timer("Flux Generation"):
                # Seeded generator so the same seed is passed for reproducibility.
                rng = torch.Generator().manual_seed(int(seed))
                output = pipe(
                    prompt=[prompt],
                    generator=rng,
                    num_inference_steps=int(steps),
                    guidance_scale=float(scale),
                    height=int(height),
                    width=int(width),
                    max_sequence_length=256
                )
                first_image = output.images[0]
    return first_image


#######################################
# 4. Creating 2 Final Images
#######################################

def build_multi_change_instruction(r1, f1, r2, f2, r3, f3):
    """
    Compose the edit instruction for up to three (rendered -> final) text pairs.

    A pair contributes one sentence only when both its rendered text ``rN``
    and its final text ``fN`` are non-empty. Sentences are joined with a
    single space; when no pair qualifies the result is
    "No text changes needed.".
    """
    text_pairs = ((r1, f1), (r2, f2), (r3, f3))
    sentences = [
        f"Change any text reading '{rendered}' in this image to '{final}'."
        for rendered, final in text_pairs
        if rendered and final
    ]
    if not sentences:
        return "No text changes needed."
    return " ".join(sentences)

def _remove_temp_file(file_path):
    """Delete ``file_path`` if it is set; cleanup is best-effort."""
    if not file_path:
        return
    try:
        os.remove(file_path)
    except OSError:
        # Deliberately ignored: a leftover temp file must not fail the request.
        pass


def change_text_in_image_two_times(original_image, instruction):
    """
    Call the text modification function twice,
    returning 2 final variations.

    The source image is written to a temporary PNG once and reused for both
    requests. Each request gets ``instruction`` followed by a version tag
    ("(A)" / "(B)"). When a request returns no image, ``original_image``
    itself is used for that slot. All temporary files created or received
    here are removed before returning.

    Args:
        original_image: PIL image to edit.
        instruction: Edit instruction sent with each request.

    Returns:
        A list with two PIL images.

    Raises:
        gr.Error: If saving, requesting, or reading an image fails; the
            original exception is attached as the cause.
    """
    results = []
    original_path = None
    try:
        # Saving is loop-invariant, so do it once. Writing through the open
        # handle avoids reopening the file by name while it is still open.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            original_path = tmp.name
            original_image.save(tmp, format="PNG")

        for version_tag in ["(A)", "(B)"]:
            mod_instruction = f"{instruction} {version_tag}"
            image_path, _text_response = generate_by_google_genai(
                text=mod_instruction,
                file_name=original_path
            )
            if not image_path:
                results.append(original_image)
                continue

            try:
                with open(image_path, "rb") as f:
                    image_data = f.read()
            finally:
                # The bytes are in memory now; the returned file is not needed.
                _remove_temp_file(image_path)
            results.append(Image.open(io.BytesIO(image_data)))
    except Exception as e:
        raise gr.Error(f"Error: {e}") from e
    finally:
        _remove_temp_file(original_path)
    return results


#######################################
# 5. Main Process
#######################################

def run_process(
    prompt,
    final_text1,
    final_text2,
    final_text3,
    height,
    width,
    steps,
    scale,
    seed
):
    """
    End-to-end handler behind the "Generate" button.

    Steps:
      1. Translate the prompt to English when it contains Korean.
      2. For each final text, keep it if it is plain English, otherwise
         substitute random letters of the same length.
      3. Fill the prompt placeholders and render the initial image.
      4. Ask the text-modification step twice to swap the rendered texts for
         the real final texts, and return those two images.
    """
    prompt_en = maybe_translate_to_english(prompt)

    # Text actually rendered in the first pass (original or random stand-in).
    r1, r2, r3 = (
        maybe_use_random_or_original(candidate)
        for candidate in (final_text1, final_text2, final_text3)
    )
    print(f"[DEBUG] Using placeholders: r1='{r1}', r2='{r2}', r3='{r3}'")

    final_prompt = fill_prompt_with_random_texts(prompt_en, r1, r2, r3)
    print(f"[DEBUG] final_prompt = {final_prompt}")

    # First pass: image containing the stand-in texts (never shown to the user).
    base_image = generate_initial_image(final_prompt, height, width, steps, scale, seed)

    # Second pass: two edited variants with the real texts.
    instruction = build_multi_change_instruction(
        r1, final_text1,
        r2, final_text2,
        r3, final_text3,
    )
    final_imgs = change_text_in_image_two_times(base_image, instruction)

    first_img, second_img = final_imgs[0], final_imgs[1]
    return [first_img, second_img]

#######################################
# 6. Gradio UI
#######################################

# Build the Gradio interface: inputs and settings in the left column,
# the two result images in the right column.
with gr.Blocks(title="Eevery Text Imaginator: FLUX") as demo:
    # Page header and short usage explanation (rendered as HTML/Markdown).
    gr.Markdown(
        """
        <h2 style="text-align:center; margin-bottom: 15px;">
            <strong>Eevery Text Imaginator: FLUX</strong>
        </h2>
        
        <p style="text-align:center;">
            This tool generates two final images from a prompt
            containing placeholders <code>&lt;text1&gt;</code>, <code>&lt;text2&gt;</code>, <code>&lt;text3&gt;</code>.
            If your chosen text is purely English, it will appear directly;
            otherwise it becomes random letters in the initial phase.
        </p>
        
        <hr style="margin: 15px 0;">
        """
    )

    # 5 example prompts (focusing on <text1>, <text2>)
    # Each row is [prompt, text1, text2, text3], matching the `inputs` order
    # given to gr.Examples below; text3 is left empty in every example.
    examples = [
        [
            "On a grand stage, <text1> in big letters and <text2> on the left side",
            "HELLO", "WORLD", ""
        ],
        [
            "Futuristic neon sign with <text1>, plus <text2> near the bottom",
            "WELCOME", "SALE", ""
        ],
        [
            "A classical poster reading <text1> in bold, <text2> as a subtitle",
            "MUSICFEST", "2025", ""
        ],
        [
            "In a cartoon style, a speech bubble with <text1> and another text <text2>",
            "HI!", "OhYes", ""
        ],
        [
            "Large billboard featuring <text1>, smaller text <text2> in the corner",
            "ANNOUNCEMENT", "OPENNOW", ""
        ],
    ]

    with gr.Row():
        # Left column: prompt, texts, advanced settings, run button, examples.
        with gr.Column():
            # NOTE(review): gr.Box is not available in newer Gradio releases
            # (gr.Group is the usual replacement) — confirm against the
            # Gradio version this app is pinned to.
            with gr.Box():
                prompt_input = gr.Textbox(
                    lines=3,
                    label="Prompt (Korean or English)",
                    placeholder="On a grand stage, <text1> in big letters..."
                )
                final_text1 = gr.Textbox(
                    label="New Text #1 (Required)",
                    placeholder="Example: HELLO or 안녕하세요"
                )
                final_text2 = gr.Textbox(
                    label="New Text #2 (Optional)",
                    placeholder="Example: WORLD or 반갑습니다"
                )
                final_text3 = gr.Textbox(
                    label="New Text #3 (Optional)",
                    placeholder="(Leave blank if not used)"
                )

            # Generation parameters forwarded unchanged to run_process().
            with gr.Accordion("Advanced Settings (optional)", open=False):
                height = gr.Slider(label="Height", minimum=256, maximum=1152, step=64, value=512)
                width = gr.Slider(label="Width", minimum=256, maximum=1152, step=64, value=512)
                steps = gr.Slider(label="Inference Steps", minimum=6, maximum=25, step=1, value=8)
                scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=10.0, step=0.5, value=3.5)
                seed = gr.Number(label="Seed", value=1234, precision=0)

            run_btn = gr.Button("Generate 2 Final Images", variant="primary")

            # Clicking an example fills the prompt and the three text boxes.
            gr.Examples(
                examples=examples,
                inputs=[prompt_input, final_text1, final_text2, final_text3],
                label="Example Prompts"
            )

        # Right column: the two final images returned by run_process().
        with gr.Column():
            final_image_output1 = gr.Image(label="Final Image #1", type="pil")
            final_image_output2 = gr.Image(label="Final Image #2", type="pil")

    # We only display the 2 final images, not the initial random image
    # The order of `inputs` must match the parameter order of run_process().
    run_btn.click(
        fn=run_process,
        inputs=[
            prompt_input,
            final_text1,
            final_text2,
            final_text3,
            height,
            width,
            steps,
            scale,
            seed
        ],
        outputs=[final_image_output1, final_image_output2]
    )

# Start the web server when the module is executed/imported.
demo.launch(max_threads=20)