Spaces:

prithivMLmods
/

FLUX-REALISM

Running on Zero

App Files Files Community

FLUX-REALISM / app.py

prithivMLmods

Update app.py

2230883 verified 4 months ago

raw

history blame

12.8 kB

	import os
	import random
	import uuid
	import json
	import time
	import asyncio
	from threading import Thread

	import gradio as gr
	import spaces
	import torch
	import numpy as np
	from PIL import Image
	import edge_tts

	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	TextIteratorStreamer,
	Qwen2_5_VLForConditionalGeneration,
	AutoProcessor,
	)
	from transformers.image_utils import load_image
	from diffusers import DiffusionPipeline

	# Markdown heading shown at the top of the chat UI; a warning paragraph is
	# appended when no CUDA device is visible to torch.
	DESCRIPTION = "# Flux.1 Realism 🥖"
	DESCRIPTION += "" if torch.cuda.is_available() else "\n<p>⚠️Running on CPU, This may not work on CPU.</p>"

	# Custom stylesheet handed to the Gradio interface via `css=css`:
	# centres every <h1> and styles the element with id "duplicate-button"
	# as a blue, fully rounded, horizontally centred button.
	css = '''
	h1 {
	text-align: center;
	display: block;
	}

	#duplicate-button {
	margin: auto;
	color: #fff;
	background: #1565c0;
	border-radius: 100vh;
	}
	'''

	# Generation limits: upper bound and default for the "Max new tokens" slider.
	MAX_MAX_NEW_TOKENS = 2048
	DEFAULT_MAX_NEW_TOKENS = 1024
	# Longest prompt (in tokens) fed to the text model; overridable through the
	# MAX_INPUT_TOKEN_LENGTH environment variable.
	MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))
	# Prefer the first CUDA device when torch can see one, otherwise the CPU.
	_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
	device = torch.device(_device_name)


	# Text-only chat model and its tokenizer. The model is loaded in bfloat16,
	# placed automatically across available devices and switched to eval mode.
	model_id = "prithivMLmods/FastThink-0.5B-Tiny"
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16).eval()

	# Edge TTS voices, selected by chat command: index 0 -> "@tts1", index 1 -> "@tts2".
	TTS_VOICES = ["en-US-JennyNeural", "en-US-GuyNeural"]

	# Multimodal (image + text) Qwen2.5-VL model and its processor. The model is
	# loaded in float16, moved to the CUDA device and switched to eval mode.
	MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
	processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
	model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	    MODEL_ID,
	    trust_remote_code=True,
	    torch_dtype=torch.float16,
	)
	model_m = model_m.to("cuda")
	model_m = model_m.eval()

	async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
	    """Synthesize *text* with the given Edge TTS *voice* into *output_file*.

	    Returns the path that was written (the *output_file* argument).
	    """
	    await edge_tts.Communicate(text, voice).save(output_file)
	    return output_file

	def clean_chat_history(chat_history):
	    """Return only the history entries that are dicts with a string "content".

	    Entries whose content is anything else are dropped so that the remaining
	    messages can be passed to the text chat template. Order is preserved and
	    the input list is not modified.
	    """
	    return [
	        entry
	        for entry in chat_history
	        if isinstance(entry, dict) and isinstance(entry.get("content"), str)
	    ]

	def progress_bar_html(label: str) -> str:
	    """Build the HTML for a labelled, thin, animated loading bar.

	    *label* is interpolated verbatim next to a 110x5 px track whose inner
	    bar (colour #ff5900) slides left-to-right using the inline "loading"
	    keyframes animation.
	    """
	    snippet = f'''
	<div style="display: flex; align-items: center;">
	<span style="margin-right: 10px; font-size: 14px;">{label}</span>
	<div style="width: 110px; height: 5px; background-color: #f0f0f0; border-radius: 2px; overflow: hidden;">
	<div style="width: 100%; height: 100%; background-color: #ff5900; animation: loading 1.5s linear infinite;"></div>
	</div>
	</div>
	<style>
	@keyframes loading {{
	0% {{ transform: translateX(-100%); }}
	100% {{ transform: translateX(100%); }}
	}}
	</style>
	'''
	    return snippet

	# FLUX.1 IMAGE GENERATION SETUP
	# Largest seed offered to the pipeline: the maximum signed 32-bit integer.
	_INT32_LIMITS = np.iinfo(np.int32)
	MAX_SEED = _INT32_LIMITS.max

	def save_image(img: Image.Image) -> str:
	    """Write *img* to a freshly named PNG in the working directory.

	    The filename is a random UUID4 plus ".png"; that filename is returned.
	    """
	    filename = f"{uuid.uuid4()}.png"
	    img.save(filename)
	    return filename

	def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
	    """Return a random seed in [0, MAX_SEED] when requested, else *seed* unchanged."""
	    return random.randint(0, MAX_SEED) if randomize_seed else seed

	# Flux.1 pipeline setup: base checkpoint, realism LoRA and the trigger
	# phrase that gets prepended to every image prompt.
	base_model = "black-forest-labs/FLUX.1-dev"
	lora_repo = "strangerzonehf/Flux-Super-Realism-LoRA"
	trigger_word = "Super Realism"  # Leave blank if no trigger word is needed.

	# Load the base pipeline in bfloat16, attach the LoRA, then move to CUDA.
	pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.bfloat16)
	pipe.load_lora_weights(lora_repo)
	pipe.to("cuda")

	# Prompt templates for Flux.1. Each entry has a display name and a template
	# in which the literal "{prompt}" marks where the user's text is inserted.
	style_list = [
	    {
	        "name": "3840 x 2160",
	        "prompt": "hyper-realistic 8K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
	    },
	    {
	        "name": "2560 x 1440",
	        "prompt": "hyper-realistic 4K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
	    },
	    {
	        "name": "HD+",
	        "prompt": "hyper-realistic 2K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
	    },
	    {
	        "name": "Style Zero",
	        "prompt": "{prompt}",
	    },
	]
	# Name -> template lookup, the fallback style, and the names in declaration order.
	styles = {entry["name"]: entry["prompt"] for entry in style_list}
	DEFAULT_STYLE_NAME = "3840 x 2160"
	STYLE_NAMES = list(styles)

	def apply_style(style_name: str, positive: str) -> str:
	    """Insert *positive* into the template registered under *style_name*.

	    Unknown style names fall back to the DEFAULT_STYLE_NAME template.
	    """
	    fallback = styles[DEFAULT_STYLE_NAME]
	    template = styles.get(style_name, fallback)
	    return template.replace("{prompt}", positive)

	@spaces.GPU(duration=60, enable_queue=True)
	def generate_image_flux(
	    prompt: str,
	    seed: int = 0,
	    width: int = 1024,
	    height: int = 1024,
	    guidance_scale: float = 3,
	    randomize_seed: bool = False,
	    style_name: str = DEFAULT_STYLE_NAME,
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Generate one image with the Flux.1 pipeline and save it to disk.

	    Args:
	        prompt: User prompt; it is wrapped in the selected style template
	            and prefixed with ``trigger_word`` when that is non-empty.
	        seed: Seed to use when ``randomize_seed`` is false.
	        width: Output width in pixels.
	        height: Output height in pixels.
	        guidance_scale: Guidance scale forwarded to the pipeline.
	        randomize_seed: When true, ``seed`` is replaced by a random value.
	        style_name: Key into ``styles``; unknown names use the default.
	        progress: Gradio progress tracker; not referenced in the body
	            (presumably present so Gradio mirrors the tqdm bar - confirm).

	    Returns:
	        ``(image_paths, seed)``: the saved PNG paths and the seed used.
	    """
	    seed = int(randomize_seed_fn(seed, randomize_seed))
	    positive_prompt = apply_style(style_name, prompt)
	    if trigger_word:
	        positive_prompt = f"{trigger_word} {positive_prompt}"
	    # Seed the sampler explicitly; previously the seed was computed and
	    # returned but never reached the pipeline, so it had no effect.
	    generator = torch.Generator(device="cpu").manual_seed(seed)
	    images = pipe(
	        prompt=positive_prompt,
	        width=width,
	        height=height,
	        guidance_scale=guidance_scale,
	        num_inference_steps=28,
	        num_images_per_prompt=1,
	        output_type="pil",
	        generator=generator,
	    ).images
	    image_paths = [save_image(img) for img in images]
	    return image_paths, seed

	# CHAT GENERATION FUNCTION (TEXT & MULTIMODAL)

	@spaces.GPU
	def generate(
	    input_dict: dict,
	    chat_history: list[dict],
	    max_new_tokens: int = 1024,
	    temperature: float = 0.6,
	    top_p: float = 0.9,
	    top_k: int = 50,
	    repetition_penalty: float = 1.2,
	):
	    """Stream a chatbot reply for a multimodal Gradio chat message.

	    The message text may start with a command:
	    - "@image": generate an image with the Flux.1 pipeline and stop.
	    - "@tts1" / "@tts2": answer with the text model (ignoring prior
	      history), then also yield the spoken answer as audio.

	    Without a command, attached files are answered by the multimodal model;
	    otherwise the text model answers using the cleaned chat history.

	    Args:
	        input_dict: Message with a "text" entry and an optional "files" list.
	        chat_history: Previous messages in Gradio "messages" format.
	        max_new_tokens: Generation length limit (used by both models).
	        temperature, top_p, top_k, repetition_penalty: Sampling settings;
	            only forwarded on the text-only path.

	    Yields:
	        Progress HTML, progressively longer response strings, and finally a
	        ``gr.Image`` or ``gr.Audio`` component where applicable.
	    """
	    # Normalise once so that command detection and prefix removal agree even
	    # when the message has leading whitespace or upper-case commands.
	    text = input_dict["text"].strip()
	    files = input_dict.get("files", [])
	    lowered = text.lower()

	    # "@image": hand the remainder of the message to Flux and exit early.
	    if lowered.startswith("@image"):
	        prompt = text[len("@image"):].strip()
	        yield progress_bar_html("Hold Tight Generating Flux.1 Image")
	        image_paths, _ = generate_image_flux(
	            prompt=prompt,
	            seed=1,
	            width=1024,
	            height=1024,
	            guidance_scale=3,
	            randomize_seed=True,
	            style_name=DEFAULT_STYLE_NAME,
	            progress=gr.Progress(track_tqdm=True),
	        )
	        yield gr.Image(image_paths[0])
	        return

	    # "@tts<N>": N is the 1-based index into TTS_VOICES.
	    tts_prefix = "@tts"
	    voice_index = next(
	        (i for i in range(1, len(TTS_VOICES) + 1) if lowered.startswith(f"{tts_prefix}{i}")),
	        None,
	    )

	    if voice_index is not None:
	        voice = TTS_VOICES[voice_index - 1]
	        # Cut the command off by length so the removal is case-insensitive,
	        # matching the detection above.
	        text = text[len(f"{tts_prefix}{voice_index}"):].strip()
	        # Clear previous chat history for a fresh TTS request.
	        conversation = [{"role": "user", "content": text}]
	    else:
	        voice = None
	        # Remove any stray @tts tags and build the conversation history.
	        text = text.replace(tts_prefix, "").strip()
	        conversation = clean_chat_history(chat_history)
	        conversation.append({"role": "user", "content": text})

	    if files:
	        # Multimodal path: every attached file is loaded as an image and
	        # sent together with the text as a single user turn.
	        images = [load_image(image) for image in files]
	        messages = [{
	            "role": "user",
	            "content": [
	                *[{"type": "image", "image": image} for image in images],
	                {"type": "text", "text": text},
	            ],
	        }]
	        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	        inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
	        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
	        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
	        # Generation runs in a worker thread; this generator drains the streamer.
	        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
	        thread.start()

	        buffer = ""
	        yield progress_bar_html("Thinking...")
	        for new_text in streamer:
	            buffer += new_text
	            # Drop the end-of-turn marker if it leaks into the decoded text.
	            buffer = buffer.replace("<|im_end|>", "")
	            time.sleep(0.01)
	            yield buffer
	    else:
	        # Text-only path.
	        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
	        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
	            # Keep the most recent tokens when the conversation is too long.
	            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
	            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
	        input_ids = input_ids.to(model.device)
	        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
	        generation_kwargs = {
	            "input_ids": input_ids,
	            "streamer": streamer,
	            "max_new_tokens": max_new_tokens,
	            "do_sample": True,
	            "top_p": top_p,
	            "top_k": top_k,
	            "temperature": temperature,
	            "num_beams": 1,
	            "repetition_penalty": repetition_penalty,
	        }
	        t = Thread(target=model.generate, kwargs=generation_kwargs)
	        t.start()

	        outputs = []
	        yield progress_bar_html("Thinking...")
	        for new_text in streamer:
	            outputs.append(new_text)
	            yield "".join(outputs)

	        final_response = "".join(outputs)
	        yield final_response

	        # If TTS was requested, convert the final response to speech.
	        if voice:
	            output_file = asyncio.run(text_to_speech(final_response, voice))
	            yield gr.Audio(output_file, autoplay=True)

	# GRADIO CHAT INTERFACE
	# Chat UI wired to `generate`. The sliders are passed to `generate` as its
	# extra positional arguments, in the order listed here.
	demo = gr.ChatInterface(
	    fn=generate,
	    additional_inputs=[
	        gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
	        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
	        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
	        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
	        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
	    ],
	    # One inner list per example. Every entry needs a trailing comma:
	    # without it two adjacent lists parse as a subscript and raise TypeError.
	    examples=[
	        ["@image Chocolate dripping from a donut against a yellow background, in the style of hyper-realistic 8K"],
	        ["@image Super Realism, High-resolution photograph, woman, UHD, photorealistic, shot on a Sony A7III --chaos 20 --ar 1:2 --style raw --stylize 250"],
	        ["@image Woman in a red jacket, snowy, in the style of hyper-realistic portraiture, caninecore, mountainous vistas, timeless beauty, palewave, iconic, distinctive noses --ar 72:101 --stylize 750 --v 6"],
	        ["@image Super-realism, Purple Dreamy, a medium-angle shot of a young woman with long brown hair, wearing a pair of eye-level glasses, stands in front of a backdrop of purple and white lights. The womans eyes are closed, her lips are slightly parted, as if she is looking up at the sky. Her hair is cascading over her shoulders, framing her face. She is wearing a sleeveless top, adorned with tiny white dots, and a gold chain necklace around her neck. Her left earrings are dangling from her ears, adding a pop of color to the scene."],
	        ["Python Program for Array Rotation"],
	        ["@tts1 Who is Nikola Tesla, and why did he die?"],
	        [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
	        [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
	        ["@tts2 What causes rainbows to form?"],
	    ],
	    cache_examples=False,
	    type="messages",
	    description=DESCRIPTION,
	    css=css,
	    fill_height=True,
	    # Multimodal input box: text plus any number of image attachments.
	    textbox=gr.MultimodalTextbox(
	        label="Query Input",
	        file_types=["image"],
	        file_count="multiple",
	        placeholder="‎ @image-flux.1 image gen, @tts1, @tts2-voices, default [text, vision]"
	    ),
	    stop_btn="Stop Generation",
	    multimodal=True,
	)

	# Script entry point: enable a request queue capped at 20 pending jobs,
	# then start the app with a public share link requested.
	if __name__ == "__main__":
	    queued_demo = demo.queue(max_size=20)
	    queued_demo.launch(share=True)