prithivMLmods committed
Commit 8110123 · verified · 1 Parent(s): 8aa0ea7

Update app.py

Files changed (1)
  1. app.py +227 -165
app.py CHANGED
@@ -1,172 +1,234 @@
  import gradio as gr
- from transformers.image_utils import load_image
- from threading import Thread
- import time
- import torch
  import spaces
- import cv2
  import numpy as np
  from PIL import Image
- from transformers import (
-     Qwen2VLForConditionalGeneration,
-     AutoProcessor,
-     TextIteratorStreamer,
- )
- from transformers import Qwen2_5_VLForConditionalGeneration
-
- # Helper Functions
- def progress_bar_html(label: str, primary_color: str = "#FF69B4", secondary_color: str = "#FFB6C1") -> str:
-     """
-     Returns an HTML snippet for a thin animated progress bar with a label.
-     Colors can be customized; default colors are used for Qwen2VL/Aya-Vision.
-     """
-     return f'''
-     <div style="display: flex; align-items: center;">
-         <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-         <div style="width: 110px; height: 5px; background-color: {secondary_color}; border-radius: 2px; overflow: hidden;">
-             <div style="width: 100%; height: 100%; background-color: {primary_color}; animation: loading 1.5s linear infinite;"></div>
-         </div>
-     </div>
-     <style>
-     @keyframes loading {{
-         0% {{ transform: translateX(-100%); }}
-         100% {{ transform: translateX(100%); }}
-     }}
-     </style>
-     '''
-
- def downsample_video(video_path):
-     """
-     Downsamples a video file by extracting 10 evenly spaced frames.
-     Returns a list of tuples (PIL.Image, timestamp).
-     """
-     vidcap = cv2.VideoCapture(video_path)
-     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-     fps = vidcap.get(cv2.CAP_PROP_FPS)
-     frames = []
-     if total_frames <= 0 or fps <= 0:
-         vidcap.release()
-         return frames
-     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-     for i in frame_indices:
-         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-         success, image = vidcap.read()
-         if success:
-             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-             pil_image = Image.fromarray(image)
-             timestamp = round(i / fps, 2)
-             frames.append((pil_image, timestamp))
-     vidcap.release()
-     return frames
-
- # Model and Processor Setup
- QV_MODEL_ID = "prithivMLmods/Qwen2-VL-Ocrtest-2B-Instruct"
- qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
- qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
-     QV_MODEL_ID,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to("cuda").eval()
-
- DOCSCOPEOCR_MODEL_ID = "prithivMLmods/docscopeOCR-7B-050425-exp"
- docscopeocr_processor = AutoProcessor.from_pretrained(DOCSCOPEOCR_MODEL_ID, trust_remote_code=True)
- docscopeocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     DOCSCOPEOCR_MODEL_ID,
-     trust_remote_code=True,
-     torch_dtype=torch.bfloat16
- ).to("cuda").eval()
-
- # Main Inference Function
- @spaces.GPU
- def model_inference(message, history, use_docscopeocr):
-     text = message["text"].strip()
-     files = message.get("files", [])
-
-     if not text and not files:
-         yield "Error: Please input a text query or provide image or video files."
-         return
-
-     # Process files: images and videos
-     image_list = []
-     for idx, file in enumerate(files):
-         if file.lower().endswith((".mp4", ".avi", ".mov")):
-             frames = downsample_video(file)
-             if not frames:
-                 yield "Error: Could not extract frames from the video."
-                 return
-             for frame, timestamp in frames:
-                 label = f"Video {idx+1} Frame {timestamp}:"
-                 image_list.append((label, frame))
-         else:
-             try:
-                 img = load_image(file)
-                 label = f"Image {idx+1}:"
-                 image_list.append((label, img))
-             except Exception as e:
-                 yield f"Error loading image: {str(e)}"
-                 return
-
-     # Build content list
-     content = [{"type": "text", "text": text}]
-     for label, img in image_list:
-         content.append({"type": "text", "text": label})
-         content.append({"type": "image", "image": img})
-
-     messages = [{"role": "user", "content": content}]
-
-     # Select processor and model
-     if use_docscopeocr:
-         processor = docscopeocr_processor
-         model = docscopeocr_model
-         model_name = "DocScopeOCR"
-     else:
-         processor = qwen_processor
-         model = qwen_model
-         model_name = "Qwen2VL OCR"
-
-     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     all_images = [item["image"] for item in content if item["type"] == "image"]
-     inputs = processor(
-         text=[prompt_full],
-         images=all_images if all_images else None,
-         return_tensors="pt",
-         padding=True,
-     ).to("cuda")
-
-     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
-     buffer = ""
-     yield progress_bar_html(f"Processing with {model_name}")
-     for new_text in streamer:
-         buffer += new_text
-         buffer = buffer.replace("<|im_end|>", "")
-         time.sleep(0.01)
-         yield buffer
-
- # Gradio Interface
  examples = [
-     [{"text": "OCR the text in the image", "files": ["example/image1.jpg"]}],
-     [{"text": "Describe the content of the image", "files": ["example/image2.jpg"]}],
-     [{"text": "Extract the image content", "files": ["example/image3.jpg"]}],
  ]

- demo = gr.ChatInterface(
-     fn=model_inference,
-     description="# **DocScope OCR `VL/OCR`**",
-     examples=examples,
-     textbox=gr.MultimodalTextbox(
-         label="Query Input",
-         file_types=["image", "video"],
-         file_count="multiple",
-         placeholder="Input your query and optionally upload image(s) or video(s). Select the model using the checkbox."
-     ),
-     stop_btn="Stop Generation",
-     multimodal=True,
-     cache_examples=False,
-     theme="bethecloud/storj_theme",
-     additional_inputs=[gr.Checkbox(label="Use DocScopeOCR", value=True, info="Check to use DocScopeOCR, uncheck to use Qwen2VL OCR")],
- )
-
- demo.launch(debug=True, ssr_mode=False)
  import gradio as gr
  import spaces
  import numpy as np
+ import random
+ from diffusers import DiffusionPipeline
+ import torch
  from PIL import Image
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model_repo_id = "stabilityai/stable-diffusion-3.5-large-turbo"
+
+ torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+
+ pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
+ pipe = pipe.to(device)
+
+ pipe.load_lora_weights("strangerzonehf/SD3.5-Turbo-Portrait-LoRA", weight_name="SD3.5-Turbo-Portrait.safetensors")
+ trigger_word = "Turbo Portrait"
+ pipe.fuse_lora(lora_scale=1.0)
+
+ MAX_SEED = np.iinfo(np.int32).max
+ MAX_IMAGE_SIZE = 1024
+
+ # Define styles
+ style_list = [
+     {
+         "name": "3840 x 2160",
+         "prompt": "hyper-realistic 8K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
+         "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
+     },
+     {
+         "name": "2560 x 1440",
+         "prompt": "hyper-realistic 4K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
+         "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
+     },
+     {
+         "name": "HD+",
+         "prompt": "hyper-realistic 2K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
+         "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
+     },
+     {
+         "name": "Style Zero",
+         "prompt": "{prompt}",
+         "negative_prompt": "",
+     },
+ ]
+
+ STYLE_NAMES = [style["name"] for style in style_list]
+ DEFAULT_STYLE_NAME = STYLE_NAMES[0]
+
+ grid_sizes = {
+     "2x1": (2, 1),
+     "1x2": (1, 2),
+     "2x2": (2, 2),
+     "2x3": (2, 3),
+     "3x2": (3, 2),
+     "1x1": (1, 1)
+ }
+
+ @spaces.GPU(duration=60)
+ def infer(
+     prompt,
+     negative_prompt="",
+     seed=42,
+     randomize_seed=False,
+     width=1024,
+     height=1024,
+     guidance_scale=7.5,
+     num_inference_steps=10,
+     style="Style Zero",
+     grid_size="1x1",
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     selected_style = next(s for s in style_list if s["name"] == style)
+     styled_prompt = selected_style["prompt"].format(prompt=prompt)
+     styled_negative_prompt = selected_style["negative_prompt"]
+
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+
+     generator = torch.Generator().manual_seed(seed)
+
+     grid_size_x, grid_size_y = grid_sizes.get(grid_size, (1, 1))
+     num_images = grid_size_x * grid_size_y
+
+     options = {
+         "prompt": styled_prompt,
+         "negative_prompt": styled_negative_prompt,
+         "guidance_scale": guidance_scale,
+         "num_inference_steps": num_inference_steps,
+         "width": width,
+         "height": height,
+         "generator": generator,
+         "num_images_per_prompt": num_images,
+     }
+
+     torch.cuda.empty_cache()  # Clear GPU memory
+     result = pipe(**options)
+
+     grid_img = Image.new('RGB', (width * grid_size_x, height * grid_size_y))
+
+     for i, img in enumerate(result.images[:num_images]):
+         grid_img.paste(img, (i % grid_size_x * width, i // grid_size_x * height))
+
+     return grid_img, seed
+
  examples = [
+     "A tiny astronaut hatching from an egg on the moon, 4k, planet theme",
+     "An anime-style illustration of a delicious, golden-brown wiener schnitzel on a plate, served with fresh lemon slices, parsley --style raw5",
+     "Cold coffee in a cup bokeh --ar 85:128 --v 6.0 --style raw5, 4K, Photo-Realistic",
+     "A cat holding a sign that says hello world --ar 85:128 --v 6.0 --style raw"
  ]

+ css = '''
+ .gradio-container{max-width: 585px !important}
+ h1{text-align:center}
+ footer {
+     visibility: hidden
+ }
+ '''
+
+ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+     with gr.Column(elem_id="col-container"):
+         gr.Markdown("## GRID 6X🪨")
+
+         with gr.Row():
+             prompt = gr.Text(
+                 label="Prompt",
+                 show_label=False,
+                 max_lines=1,
+                 placeholder="Enter your prompt",
+                 container=False,
+             )
+
+             run_button = gr.Button("Run", scale=0, variant="primary")
+
+         result = gr.Image(label="Result", show_label=False)
+
+         with gr.Row(visible=True):
+             grid_size_selection = gr.Dropdown(
+                 choices=["2x1", "1x2", "2x2", "2x3", "3x2", "1x1"],
+                 value="1x1",
+                 label="Grid Size"
+             )
+
+         with gr.Accordion("Advanced Settings", open=False):
+             negative_prompt = gr.Text(
+                 label="Negative prompt",
+                 max_lines=1,
+                 placeholder="Enter a negative prompt",
+                 value="(deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, (mutated hands and fingers:1.4), disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation",
+                 visible=False,
+             )
+
+             seed = gr.Slider(
+                 label="Seed",
+                 minimum=0,
+                 maximum=MAX_SEED,
+                 step=1,
+                 value=0,
+             )
+
+             randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+
+             with gr.Row():
+                 width = gr.Slider(
+                     label="Width",
+                     minimum=512,
+                     maximum=MAX_IMAGE_SIZE,
+                     step=32,
+                     value=1024,
+                 )
+
+                 height = gr.Slider(
+                     label="Height",
+                     minimum=512,
+                     maximum=MAX_IMAGE_SIZE,
+                     step=32,
+                     value=1024,
+                 )
+
+             with gr.Row():
+                 guidance_scale = gr.Slider(
+                     label="Guidance scale",
+                     minimum=0.0,
+                     maximum=7.5,
+                     step=0.1,
+                     value=0.0,
+                 )
+
+                 num_inference_steps = gr.Slider(
+                     label="Number of inference steps",
+                     minimum=1,
+                     maximum=50,
+                     step=1,
+                     value=8,
+                 )
+
+             style_selection = gr.Radio(
+                 show_label=True,
+                 container=True,
+                 interactive=True,
+                 choices=STYLE_NAMES,
+                 value=DEFAULT_STYLE_NAME,
+                 label="Quality Style",
+             )
+
+         gr.Examples(examples=examples,
+                     inputs=[prompt],
+                     outputs=[result, seed],
+                     fn=infer,
+                     cache_examples=False)
+
+     gr.on(
+         triggers=[run_button.click, prompt.submit],
+         fn=infer,
+         inputs=[
+             prompt,
+             negative_prompt,
+             seed,
+             randomize_seed,
+             width,
+             height,
+             guidance_scale,
+             num_inference_steps,
+             style_selection,
+             grid_size_selection,
+         ],
+         outputs=[result, seed],
+     )
+
+ if __name__ == "__main__":
+     demo.launch(ssr_mode=False, show_error=True)
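
For context on the grid feature introduced by this change: the new infer() pastes its num_images outputs row-major onto one canvas via grid_img.paste(img, (i % grid_size_x * width, i // grid_size_x * height)). Below is a minimal, self-contained sketch of just that placement step; the solid-color placeholder tiles, the 3x2 grid choice, the 256-pixel tile size, and the output filename are illustrative assumptions rather than values taken from the commit.

    from PIL import Image

    # Illustrative values; the app derives these from the "Grid Size" dropdown
    # and the width/height sliders (defaults 1024x1024).
    grid_size_x, grid_size_y = 3, 2   # a "3x2" grid
    width, height = 256, 256          # per-tile size for this sketch

    # Placeholder tiles standing in for pipe(**options).images
    colors = ["red", "green", "blue", "yellow", "purple", "orange"]
    images = [Image.new("RGB", (width, height), c) for c in colors]

    # Row-major placement: column index is i % grid_size_x, row index is i // grid_size_x
    grid_img = Image.new("RGB", (width * grid_size_x, height * grid_size_y))
    for i, img in enumerate(images[:grid_size_x * grid_size_y]):
        grid_img.paste(img, (i % grid_size_x * width, i // grid_size_x * height))

    grid_img.save("grid_preview.png")  # hypothetical output path for the sketch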