prithivMLmods committed
Commit 9fbf1ed · verified · 1 parent: b51bdbc

Delete app.py

Files changed (1)
  1. app.py +0 -196
app.py DELETED
@@ -1,196 +0,0 @@
import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, pipeline
from diffusers import DiffusionPipeline
import random
import numpy as np
import os
from qwen_vl_utils import process_vision_info
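# process_vision_info (from qwen_vl_utils) extracts the image/video entries
# from chat-style messages in the format the Qwen2-VL processor expects.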

# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# FLUX.1-dev model
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=dtype, token=huggingface_token
).to(device)
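# Note: FLUX.1-dev is a gated model on the Hugging Face Hub, so the
# HUGGINGFACE_TOKEN read above must belong to an account with access.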

# Initialize Qwen2VL model
qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
    "prithivMLmods/JSONify-Flux", trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()
qwen_processor = AutoProcessor.from_pretrained("prithivMLmods/JSONify-Flux", trust_remote_code=True)

# Prompt Enhancer
enhancer_long = pipeline("summarization", model="prithivMLmods/t5-Flan-Prompt-Enhance", device=device)

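# MAX_SEED is the largest 32-bit signed integer (2**31 - 1 = 2147483647);
# it bounds the seed slider and the random.randint call below.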
MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 2048

# Qwen2VL caption function – updated to request plain text caption instead of JSON
@spaces.GPU
def qwen_caption(image):
    # Convert image to PIL if needed
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Generate a detailed and optimized caption for the given image."},
            ],
        }
    ]

    text = qwen_processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = qwen_processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    generated_ids = qwen_model.generate(**inputs, max_new_tokens=1024)
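    # Strip the prompt tokens from each output sequence so that only the
    # newly generated caption text is decoded.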
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = qwen_processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    return output_text
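# Hypothetical standalone usage (assumes an "example.jpg" on disk):
#     caption = qwen_caption(Image.open("example.jpg"))
#     print(caption)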

# Prompt Enhancer function (unchanged)
def enhance_prompt(input_prompt):
    result = enhancer_long("Enhance the description: " + input_prompt)
    enhanced_text = result[0]['summary_text']
    return enhanced_text
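# Hypothetical usage: enhance_prompt("a cat on a sofa") returns a longer,
# more descriptive prompt from the t5-Flan-Prompt-Enhance summarizer.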

@spaces.GPU
def process_workflow(image, text_prompt, use_enhancer, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, progress=gr.Progress(track_tqdm=True)):
    if image is not None:
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        prompt = qwen_caption(image)
        print(prompt)
    else:
        prompt = text_prompt

    if use_enhancer:
        prompt = enhance_prompt(prompt)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    generator = torch.Generator(device=device).manual_seed(seed)
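    # A fixed-seed generator makes the diffusion sampling reproducible; the
    # seed actually used is returned to the UI together with the image.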

    torch.cuda.empty_cache()

    try:
        image = pipe(
            prompt=prompt,
            generator=generator,
            num_inference_steps=num_inference_steps,
            width=width,
            height=height,
            guidance_scale=guidance_scale,
        ).images[0]
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            raise RuntimeError("CUDA out of memory. Try reducing image size or inference steps.")
        else:
            raise e

    return image, prompt, seed

custom_css = """
.input-group, .output-group {

}
.submit-btn {
    background: linear-gradient(90deg, #4B79A1 0%, #283E51 100%) !important;
    border: none !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #3498db !important;
}
"""

title = """<h1 align="center">FLUX.1-dev with Qwen2VL Captioner and Prompt Enhancer</h1>
<div align="center">
<a href="https://huggingface.co/black-forest-labs/FLUX.1-dev" target="_blank">[FLUX.1-dev Model]</a>
<a href="https://huggingface.co/prithivMLmods/JSONify-Flux" target="_blank">[JSONify Flux Model]</a>
<a href="https://huggingface.co/prithivMLmods/t5-Flan-Prompt-Enhance" target="_blank">[Prompt Enhancer t5]</a>
<p>Create long prompts from images, or enhance your short prompts with the prompt enhancer.</p>
</div>
"""

with gr.Blocks(css=custom_css) as demo:
    gr.HTML(title)

    with gr.Sidebar(label="Parameters", open=True):
        gr.Markdown(
            """
            ### About

            #### Flux.1-Dev
            FLUX.1 [dev] is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. FLUX.1 [dev] is an open-weight, guidance-distilled model for non-commercial applications. Directly distilled from FLUX.1 [pro], FLUX.1 [dev] obtains similar quality and prompt adherence capabilities, while being more efficient than a standard model of the same size.
            [FLUX.1-dev Model](https://huggingface.co/black-forest-labs/FLUX.1-dev)

            #### JSONify-Flux
            JSONify-Flux is a multimodal image-text-to-text model based on the Qwen2VL architecture, trained on a dataset of FLUX-generated images with context-rich captions. The JSON-based instruction has been removed manually so that captions come out as plain text rather than JSON.
            [JSONify-Flux Model](https://huggingface.co/prithivMLmods/JSONify-Flux)

            #### t5-Flan-Prompt-Enhance
            t5-Flan-Prompt-Enhance is a prompt summarization model that enriches synthetic FLUX prompts with more detailed descriptions.
            [t5-Flan-Prompt-Enhance Model](https://huggingface.co/prithivMLmods/t5-Flan-Prompt-Enhance)
            """
        )
163
-
164
- with gr.Row():
165
- with gr.Column(scale=1):
166
- with gr.Group(elem_classes="input-group"):
167
- input_image = gr.Image(label="Input Image (Qwen2VL Captioner)")
168
-
169
- with gr.Accordion("Advanced Settings", open=False):
170
- text_prompt = gr.Textbox(label="Text Prompt (optional, used if no image is uploaded)")
171
- use_enhancer = gr.Checkbox(label="Use Prompt Enhancer", value=False)
172
- seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
173
- randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
174
- width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=512)
175
- height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=512)
176
- guidance_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=15, step=0.1, value=3.5)
177
- num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=32)
178
-
179
- generate_btn = gr.Button("Generate Image & Prompt", elem_classes="submit-btn")
180
-
181
- with gr.Column(scale=1):
182
- with gr.Group(elem_classes="output-group"):
183
- output_image = gr.Image(label="result", elem_id="gallery", show_label=False)
184
- final_prompt = gr.Textbox(label="prompt")
185
- used_seed = gr.Number(label="seed")
186
-
187
- generate_btn.click(
188
- fn=process_workflow,
189
- inputs=[
190
- input_image, text_prompt, use_enhancer, seed, randomize_seed,
191
- width, height, guidance_scale, num_inference_steps
192
- ],
193
- outputs=[output_image, final_prompt, used_seed]
194
- )

demo.launch(debug=True)
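# To run this (now deleted) Space locally, a setup along these lines should
# work (assumed, not verified): pip install spaces gradio torch diffusers
# transformers qwen-vl-utils, export HUGGINGFACE_TOKEN, then `python app.py`.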