import os
import re
import time
from os import path
import tempfile
import uuid
import base64
import mimetypes
import json
import io
import random
import string

import torch
from PIL import Image
from transformers import pipeline
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download

# Diffusers
import gradio as gr
from diffusers import FluxPipeline

# (Internal) text-modification library
from google import genai
from google.genai import types

#######################################
# 0. Environment & Translation Pipeline
#######################################

BASE_DIR = path.dirname(path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
CACHE_PATH = path.join(BASE_DIR, "models")

# NOTE(review): these variables are assigned after transformers / huggingface_hub /
# diffusers have already been imported. Those libraries appear to resolve their
# cache locations at import time, so these assignments may have no effect --
# confirm. If the local "models" directory is really required, the assignments
# must move above the imports; be aware that HF_HOME also relocates the stored
# Hugging Face auth token, which matters for gated models.
os.environ["TRANSFORMERS_CACHE"] = CACHE_PATH
os.environ["HF_HUB_CACHE"] = CACHE_PATH
os.environ["HF_HOME"] = CACHE_PATH

# Translation (Korean -> English), CPU only
translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-ko-en",
    device=-1,  # force CPU
)

# Matches any precomposed Hangul syllable.
_KOREAN_RE = re.compile("[가-힣]")

# Placeholder tokens a prompt may contain; they are substituted before diffusion.
_PLACEHOLDER_1 = "<text1>"
_PLACEHOLDER_2 = "<text2>"
_PLACEHOLDER_3 = "<text3>"


def maybe_translate_to_english(text: str) -> str:
    """
    If the prompt contains any Korean characters, translate to English.
    Otherwise, return as-is.
    """
    if _KOREAN_RE.search(text):
        translated = translator(text)[0]["translation_text"]
        print(f"[TRANSLATE] Detected Korean -> '{text}' -> '{translated}'")
        return translated
    return text


# Simple Timer Class
class timer:
    """Context manager that prints how long the wrapped block took."""

    def __init__(self, method_name="timed process"):
        self.method = method_name

    def __enter__(self):
        self.start = time.time()
        print(f"[TIMER] {self.method} starts")

    def __exit__(self, exc_type, exc_val, exc_tb):
        end = time.time()
        print(f"[TIMER] {self.method} took {round(end - self.start, 2)}s")


#######################################
# 1. Load FLUX Pipeline
#######################################

os.makedirs(CACHE_PATH, exist_ok=True)

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
)

lora_path = hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors")
pipe.load_lora_weights(lora_path)
pipe.fuse_lora(lora_scale=0.125)
pipe.to(device="cuda", dtype=torch.bfloat16)

#######################################
# 2. Internal Text Modification Functions
#######################################


def save_binary_file(file_name, data):
    """Write raw bytes ``data`` to ``file_name``, overwriting any existing file."""
    with open(file_name, "wb") as f:
        f.write(data)


def _remove_quietly(file_path):
    """Best-effort deletion of a temporary file; a missing file is not an error."""
    if not file_path:
        return
    try:
        os.remove(file_path)
    except OSError:
        # Cleanup only: failing to delete a temp file must not mask the real result.
        pass


def generate_by_google_genai(text, file_name, model="gemini-2.0-flash-exp"):
    """
    Internally modifies text within an image, returning a new image path.
    (Screen instructions do not mention 'Google'.)

    Args:
        text: Instruction describing the text changes to apply.
        file_name: Path of the image file to upload and modify.
        model: Model identifier passed to the streaming generate call.

    Returns:
        ``(image_path, text_response)``. ``image_path`` is the path of a newly
        created temporary PNG holding the first image the model returned, or
        ``None`` when no image was returned; the caller owns (and should
        delete) that file. ``text_response`` is the text streamed before the
        image, one chunk per line.

    Raises:
        ValueError: If the ``GAPI_TOKEN`` environment variable is not set.
    """
    api_key = os.getenv("GAPI_TOKEN", None)
    if not api_key:
        raise ValueError(
            "GAPI_TOKEN is missing. Please set an API key."
        )

    client = genai.Client(api_key=api_key)
    uploaded = client.files.upload(file=file_name)

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_uri(
                    file_uri=uploaded.uri,
                    mime_type=uploaded.mime_type,
                ),
                types.Part.from_text(text=text),
            ],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_modalities=["image", "text"],
        response_mime_type="text/plain",
    )

    text_chunks = []
    image_path = None

    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        if not chunk.candidates or not chunk.candidates[0].content:
            continue

        # A chunk may carry no parts at all, or put the image after a text part.
        parts = chunk.candidates[0].content.parts or []
        image_part = next((p for p in parts if p.inline_data), None)

        if image_part is not None:
            # Create the temp file only once there is something to store, so
            # no empty file is left behind when the model returns text only.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                image_path = tmp.name
            save_binary_file(image_path, image_part.inline_data.data)
            print(f"[DEBUG] Returned new image -> {image_path}")
            break

        # chunk.text can be None (e.g. a chunk without text parts).
        if chunk.text:
            text_chunks.append(chunk.text)

    text_response = "".join(f"{piece}\n" for piece in text_chunks)
    return image_path, text_response


#######################################
# 3. Diffusion Utility
#######################################

# Letters, digits, whitespace and a few basic punctuation characters.
_ENGLISH_ONLY_RE = re.compile(r'[a-zA-Z0-9\s\.,!\?\']*')


def generate_random_letters(length: int) -> str:
    """
    Create a random sequence of uppercase/lowercase letters of given length.
    """
    letters = string.ascii_lowercase + string.ascii_uppercase
    return "".join(random.choice(letters) for _ in range(length))


def is_all_english(text: str) -> bool:
    """
    Check if text consists only of English letters (a-z, A-Z), digits, spaces,
    and a few basic punctuation characters. If so, return True.
    Otherwise, False (includes Korean or other characters).
    """
    return _ENGLISH_ONLY_RE.fullmatch(text) is not None


def maybe_use_random_or_original(final_text: str) -> str:
    """
    If final_text is strictly English/allowed chars, use it as-is.
    If it contains other chars (like Korean, etc.),
    replace with random letters of the same length.
    """
    if not final_text:
        return ""
    if is_all_english(final_text):
        return final_text
    return generate_random_letters(len(final_text))


def fill_prompt_with_random_texts(prompt: str, r1: str, r2: str, r3: str) -> str:
    """
    Replace <text1>, <text2>, <text3> with r1, r2, r3 respectively.
    <text1> is required; if missing, we append something.
    """
    if _PLACEHOLDER_1 in prompt:
        prompt = prompt.replace(_PLACEHOLDER_1, r1)
    else:
        prompt = f"{prompt} with clear readable text that says '{r1}'"

    # str.replace is a no-op when the optional placeholder is absent.
    prompt = prompt.replace(_PLACEHOLDER_2, r2)
    prompt = prompt.replace(_PLACEHOLDER_3, r3)
    return prompt


def generate_initial_image(prompt, height, width, steps, scale, seed):
    """
    Use Flux Pipeline to generate the initial image from the prompt.

    The numeric arguments arrive from UI widgets, so each is coerced to the
    type the pipeline call is given (int / float) before use.
    """
    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16), timer("Flux Generation"):
        result = pipe(
            prompt=[prompt],
            generator=torch.Generator().manual_seed(int(seed)),
            num_inference_steps=int(steps),
            guidance_scale=float(scale),
            height=int(height),
            width=int(width),
            max_sequence_length=256,
        ).images[0]
    return result


#######################################
# 4. Creating 2 Final Images
#######################################


def build_multi_change_instruction(r1, f1, r2, f2, r3, f3):
    """
    Summarize instructions to replace (r1->f1), (r2->f2), (r3->f3).

    Pairs in which either side is empty are skipped. Returns a fixed
    "no changes" sentence when nothing remains.
    """
    instructions = [
        f"Change any text reading '{shown}' in this image to '{wanted}'."
        for shown, wanted in ((r1, f1), (r2, f2), (r3, f3))
        if shown and wanted
    ]
    if instructions:
        return " ".join(instructions)
    return "No text changes needed."


def change_text_in_image_two_times(original_image, instruction):
    """
    Call the text modification function twice, returning 2 final variations.

    Args:
        original_image: PIL image to modify.
        instruction: Text-change instruction; a version tag is appended per call.

    Returns:
        A list with two PIL images. When a call returns no image, the
        original image is used for that slot.

    Raises:
        gr.Error: If saving the image or calling the modification function fails.
    """
    results = []
    original_path = None
    try:
        # The input image is identical for both variations: save it once.
        # The handle is closed before PIL writes to the path so this also
        # works on platforms that forbid reopening an open temp file.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            original_path = tmp.name
        original_image.save(original_path)

        for version_tag in ("(A)", "(B)"):
            mod_instruction = f"{instruction} {version_tag}"
            image_path, _text_response = generate_by_google_genai(
                text=mod_instruction,
                file_name=original_path,
            )
            if not image_path:
                results.append(original_image)
                continue

            try:
                with open(image_path, "rb") as f:
                    image_data = f.read()
            finally:
                # Bytes are in memory now; the temp file is no longer needed.
                _remove_quietly(image_path)
            results.append(Image.open(io.BytesIO(image_data)))
    except Exception as e:
        raise gr.Error(f"Error: {e}") from e
    finally:
        _remove_quietly(original_path)

    return results


#######################################
# 5. Main Process
#######################################


def run_process(
    prompt,
    final_text1,
    final_text2,
    final_text3,
    height,
    width,
    steps,
    scale,
    seed
):
    """
    1) If prompt has Korean, translate to English
    2) For each <textN>, if it's purely English, use as-is,
       else generate random letters of the same length.
    3) Generate initial image with these placeholders
    4) Then produce 2 final images by replacing placeholders with real texts
    """
    prompt_en = maybe_translate_to_english(prompt)

    # Decide random vs original for each text
    r1 = maybe_use_random_or_original(final_text1)
    r2 = maybe_use_random_or_original(final_text2)
    r3 = maybe_use_random_or_original(final_text3)
    print(f"[DEBUG] Using placeholders: r1='{r1}', r2='{r2}', r3='{r3}'")

    # Fill prompt
    final_prompt = fill_prompt_with_random_texts(prompt_en, r1, r2, r3)
    print(f"[DEBUG] final_prompt = {final_prompt}")

    # Generate initial "random/original" image
    _random_image = generate_initial_image(final_prompt, height, width, steps, scale, seed)

    # Build final instructions & call twice -> 2 final images
    instruction = build_multi_change_instruction(r1, final_text1, r2, final_text2, r3, final_text3)
    final_imgs = change_text_in_image_two_times(_random_image, instruction)

    # Return only the 2 final images (don't show the random image)
    return [final_imgs[0], final_imgs[1]]


#######################################
# 6. Gradio UI
#######################################

# gr.Box is not available in newer Gradio releases; fall back to gr.Group there.
_InputContainer = getattr(gr, "Box", None) or gr.Group

with gr.Blocks(title="Eevery Text Imaginator: FLUX") as demo:
    # Placeholders are shown in backticks so the Markdown renderer does not
    # treat them as (unknown) HTML tags and drop them.
    gr.Markdown(
        """

Eevery Text Imaginator: FLUX

This tool generates two final images from a prompt containing placeholders `<text1>`, `<text2>`, `<text3>`. If your chosen text is purely English, it will appear directly; otherwise it becomes random letters in the initial phase.


"""
    )

    # 5 example prompts (focusing on <text1>, <text2>)
    examples = [
        [
            "On a grand stage, <text1> in big letters and <text2> on the left side",
            "HELLO", "WORLD", ""
        ],
        [
            "Futuristic neon sign with <text1>, plus <text2> near the bottom",
            "WELCOME", "SALE", ""
        ],
        [
            "A classical poster reading <text1> in bold, <text2> as a subtitle",
            "MUSICFEST", "2025", ""
        ],
        [
            "In a cartoon style, a speech bubble with <text1> and another text <text2>",
            "HI!", "OhYes", ""
        ],
        [
            "Large billboard featuring <text1>, smaller text <text2> in the corner",
            "ANNOUNCEMENT", "OPENNOW", ""
        ],
    ]

    with gr.Row():
        with gr.Column():
            with _InputContainer():
                prompt_input = gr.Textbox(
                    lines=3,
                    label="Prompt (Korean or English)",
                    placeholder="On a grand stage, <text1> in big letters..."
                )
                final_text1 = gr.Textbox(
                    label="New Text #1 (Required)",
                    placeholder="Example: HELLO or 안녕하세요"
                )
                final_text2 = gr.Textbox(
                    label="New Text #2 (Optional)",
                    placeholder="Example: WORLD or 반갑습니다"
                )
                final_text3 = gr.Textbox(
                    label="New Text #3 (Optional)",
                    placeholder="(Leave blank if not used)"
                )

            with gr.Accordion("Advanced Settings (optional)", open=False):
                height = gr.Slider(label="Height", minimum=256, maximum=1152, step=64, value=512)
                width = gr.Slider(label="Width", minimum=256, maximum=1152, step=64, value=512)
                steps = gr.Slider(label="Inference Steps", minimum=6, maximum=25, step=1, value=8)
                scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=10.0, step=0.5, value=3.5)
                seed = gr.Number(label="Seed", value=1234, precision=0)

            run_btn = gr.Button("Generate 2 Final Images", variant="primary")

            gr.Examples(
                examples=examples,
                inputs=[prompt_input, final_text1, final_text2, final_text3],
                label="Example Prompts"
            )

        with gr.Column():
            final_image_output1 = gr.Image(label="Final Image #1", type="pil")
            final_image_output2 = gr.Image(label="Final Image #2", type="pil")

    # We only display the 2 final images, not the initial random image
    run_btn.click(
        fn=run_process,
        inputs=[
            prompt_input,
            final_text1,
            final_text2,
            final_text3,
            height,
            width,
            steps,
            scale,
            seed
        ],
        outputs=[final_image_output1, final_image_output2]
    )

demo.launch(max_threads=20)