not-lain committed on
Commit 94d1b20 · 1 Parent(s): f65ed8a
Files changed (1)
  1. app.py +233 -369
app.py CHANGED
@@ -1,21 +1,19 @@
 import gradio as gr
 import spaces
 import torch
-from loadimg import load_img  # Assuming loadimg.py exists with load_img function
 from torchvision import transforms
 from transformers import AutoModelForImageSegmentation, pipeline
 from diffusers import FluxFillPipeline
 from PIL import Image, ImageOps
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
 import gc

-# --- Add Translation Imports ---
-from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
-
-
-# --- Utility Functions ---
 @contextmanager
 def float32_high_matmul_precision():
     torch.set_float32_matmul_precision("high")
@@ -25,33 +23,14 @@ def float32_high_matmul_precision():
     torch.set_float32_matmul_precision("highest")


-# --- Model Loading ---
-# Use context manager for precision during model loading if needed
-with float32_high_matmul_precision():
-    pipe = FluxFillPipeline.from_pretrained(
-        "black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16
-    ).to("cuda")
-
-birefnet = AutoModelForImageSegmentation.from_pretrained(
-    "ZhengPeng7/BiRefNet", trust_remote_code=True
-).to("cuda")
-
-simple_lama = SimpleLama()  # Initialize Lama globally if used often
-
-# --- Translation Model and Tokenizer Loading ---
-translation_model_name = "facebook/mbart-large-50-many-to-many-mmt"
-try:
-    translation_model = MBartForConditionalGeneration.from_pretrained(
-        translation_model_name
-    ).to("cuda")  # Move to GPU
-    translation_tokenizer = MBart50TokenizerFast.from_pretrained(translation_model_name)
-except Exception as e:
-    print(f"Error loading translation model/tokenizer: {e}")
-    # Consider exiting or disabling the translation tab if loading fails
-    translation_model = None
-    translation_tokenizer = None

-# --- Image Processing Functions ---

 transform_image = transforms.Compose(
     [
@@ -70,6 +49,7 @@ def prepare_image_and_mask(
     padding_right=0,
 ):
     image = load_img(image).convert("RGB")
     background = ImageOps.expand(
         image,
         border=(padding_left, padding_top, padding_right, padding_bottom),
@@ -97,19 +77,19 @@ def outpaint(
     background, mask = prepare_image_and_mask(
         image, padding_top, padding_bottom, padding_left, padding_right
     )
-    with (
-        float32_high_matmul_precision()
-    ):  # Apply precision context if needed for inference
-        result = pipe(
-            prompt=prompt,
-            height=background.height,
-            width=background.width,
-            image=background,
-            mask_image=mask,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-        ).images[0]
     result = result.convert("RGBA")
     return result
@@ -122,391 +102,275 @@ def inpaint(
 ):
     background = image.convert("RGB")
     mask = mask.convert("L")
-    with (
-        float32_high_matmul_precision()
-    ):  # Apply precision context if needed for inference
-        result = pipe(
-            prompt=prompt,
-            height=background.height,
-            width=background.width,
-            image=background,
-            mask_image=mask,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-        ).images[0]
     result = result.convert("RGBA")
     return result


 def rmbg(image=None, url=None):
-    if image is None and url:
-        # Basic check for URL format, improve as needed
-        if not url.startswith(("http://", "https://")):
-            return "Invalid URL provided."
-        image = url  # load_img should handle URLs if configured correctly
-    elif image is None:
-        return "Please provide an image or a URL."
-
-    try:
-        image_pil = load_img(image).convert("RGB")
-    except Exception as e:
-        return f"Error loading image: {e}"
-
-    image_size = image_pil.size
-    input_images = transform_image(image_pil).unsqueeze(0).to("cuda")
     with float32_high_matmul_precision():
         with torch.no_grad():
             preds = birefnet(input_images)[-1].sigmoid().cpu()
     pred = preds[0].squeeze()
     pred_pil = transforms.ToPILImage()(pred)
     mask = pred_pil.resize(image_size)
-    image_pil.putalpha(mask)
-    # Clean up GPU memory if needed
-    del input_images, preds, pred
-    torch.cuda.empty_cache()
-    gc.collect()
-    return image_pil

  def erase(image=None, mask=None):
-    if image is None or mask is None:
-        return "Please provide both an image and a mask."
-    try:
-        image_pil = load_img(image)
-        mask_pil = load_img(mask).convert("L")
-        result = simple_lama(image_pil, mask_pil)
-        # Clean up
-        gc.collect()
-        return result
-    except Exception as e:
-        return f"Error during erase operation: {e}"
-
-
-# --- Translation Functionality ---
-
-# Language Mapping
-lang_data = {
-    "Arabic": "ar_AR",
-    "Czech": "cs_CZ",
-    "German": "de_DE",
-    "English": "en_XX",
-    "Spanish": "es_XX",
-    "Estonian": "et_EE",
-    "Finnish": "fi_FI",
-    "French": "fr_XX",
-    "Gujarati": "gu_IN",
-    "Hindi": "hi_IN",
-    "Italian": "it_IT",
-    "Japanese": "ja_XX",
-    "Kazakh": "kk_KZ",
-    "Korean": "ko_KR",
-    "Lithuanian": "lt_LT",
-    "Latvian": "lv_LV",
-    "Burmese": "my_MM",
-    "Nepali": "ne_NP",
-    "Dutch": "nl_XX",
-    "Romanian": "ro_RO",
-    "Russian": "ru_RU",
-    "Sinhala": "si_LK",
-    "Turkish": "tr_TR",
-    "Vietnamese": "vi_VN",
-    "Chinese": "zh_CN",
-    "Afrikaans": "af_ZA",
-    "Azerbaijani": "az_AZ",
-    "Bengali": "bn_IN",
-    "Persian": "fa_IR",
-    "Hebrew": "he_IL",
-    "Croatian": "hr_HR",
-    "Indonesian": "id_ID",
-    "Georgian": "ka_GE",
-    "Khmer": "km_KH",
-    "Macedonian": "mk_MK",
-    "Malayalam": "ml_IN",
-    "Mongolian": "mn_MN",
-    "Marathi": "mr_IN",
-    "Polish": "pl_PL",
-    "Pashto": "ps_AF",
-    "Portuguese": "pt_XX",
-    "Swedish": "sv_SE",
-    "Swahili": "sw_KE",
-    "Tamil": "ta_IN",
-    "Telugu": "te_IN",
-    "Thai": "th_TH",
-    "Tagalog": "tl_XX",
-    "Ukrainian": "uk_UA",
-    "Urdu": "ur_PK",
-    "Xhosa": "xh_ZA",
-    "Galician": "gl_ES",
-    "Slovene": "sl_SI",
-}
-language_names = sorted(list(lang_data.keys()))
-
-
-def translate_text(text_to_translate, source_language_name, target_language_name):
-    """
-    Translates text using the loaded mBART model.
-    """
-    if translation_model is None or translation_tokenizer is None:
-        return "Translation model not loaded. Cannot perform translation."
-    if not text_to_translate:
-        return "Please enter text to translate."
-    if not source_language_name:
-        return "Please select a source language."
-    if not target_language_name:
-        return "Please select a target language."
-
-    try:
-        source_lang_code = lang_data[source_language_name]
-        target_lang_code = lang_data[target_language_name]
-
-        translation_tokenizer.src_lang = source_lang_code
-        encoded_text = translation_tokenizer(text_to_translate, return_tensors="pt").to(
-            "cuda"
-        )  # Move input to GPU
-        target_lang_id = translation_tokenizer.lang_code_to_id[target_lang_code]
-
-        # Generate translation on GPU
-        with torch.no_grad():  # Use no_grad for inference
-            generated_tokens = translation_model.generate(
-                **encoded_text, forced_bos_token_id=target_lang_id, max_length=200
-            )
-
-        translated_text = translation_tokenizer.batch_decode(
-            generated_tokens, skip_special_tokens=True
-        )
-
-        # Clean up GPU memory
-        del encoded_text, generated_tokens
-        torch.cuda.empty_cache()
-        gc.collect()
-
-        return translated_text[0]
-
-    except KeyError as e:
-        return f"Error: Language code not found for {e}. Check language mappings."
-    except Exception as e:
-        print(f"Translation error: {e}")
-        # Clean up GPU memory on error too
-        torch.cuda.empty_cache()
-        gc.collect()
-        return f"An error occurred during translation: {e}"
-
-
-# --- Main Function Router (for image tasks) ---
-# Note: Translation uses its own function directly
-@spaces.GPU(duration=120)  # Keep GPU decorator if needed for image tasks
  def main(*args):
     api_num = args[0]
     args = args[1:]
-    gc.collect()  # Try to collect garbage before starting task
-    torch.cuda.empty_cache()  # Clear cache before starting task

-    result = None
-    try:
-        if api_num == 1:
-            result = rmbg(*args)
-        elif api_num == 2:
-            result = outpaint(*args)
-        elif api_num == 3:
-            result = inpaint(*args)
-        # elif api_num == 4:  # Keep commented out as in original
-        #     return mask_generation(*args)
-        elif api_num == 5:
-            result = erase(*args)
-        else:
-            result = "Invalid API number."
-    except Exception as e:
-        print(f"Error in main task routing (api_num={api_num}): {e}")
-        result = f"An error occurred: {e}"
-    finally:
-        # Ensure memory cleanup happens even if there's an error
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    return result
-
-
-# --- Define Gradio Interfaces for Each Tab ---

-# Image Task Tabs
  rmbg_tab = gr.Interface(
     fn=main,
     inputs=[
-        gr.Number(1, interactive=False, visible=False),  # Hide API number
-        gr.Image(label="Input Image", type="pil", sources=["upload", "clipboard"]),
-        gr.Text(label="Or Image URL (optional)"),
     ],
-    outputs=gr.Image(label="Output Image", type="pil"),
-    title="Remove Background",
-    description="Upload an image or provide a URL to remove its background.",
     api_name="rmbg",
-    # examples=[[1, "./assets/sample_rmbg.png", ""]],  # Update example path if needed
     cache_examples=False,
 )

  outpaint_tab = gr.Interface(
     fn=main,
     inputs=[
-        gr.Number(2, interactive=False, visible=False),
-        gr.Image(label="Input Image", type="pil", sources=["upload", "clipboard"]),
-        gr.Number(value=0, label="Padding Top (pixels)"),
-        gr.Number(value=0, label="Padding Bottom (pixels)"),
-        gr.Number(value=0, label="Padding Left (pixels)"),
-        gr.Number(value=0, label="Padding Right (pixels)"),
-        gr.Text(
-            label="Prompt (optional)",
-            info="Describe what to fill the extended area with",
-        ),
-        gr.Slider(
-            minimum=10, maximum=100, step=1, value=28, label="Inference Steps"
-        ),  # Use slider for steps
-        gr.Slider(
-            minimum=1, maximum=100, step=1, value=50, label="Guidance Scale"
-        ),  # Use slider for guidance
     ],
-    outputs=gr.Image(label="Outpainted Image", type="pil"),
-    title="Outpainting",
-    description="Extend an image by adding padding and filling the new area using a diffusion model.",
     api_name="outpainting",
-    # examples=[[2, "./assets/rocket.png", 100, 0, 0, 0, "", 28, 50]],  # Update example path
     cache_examples=False,
 )

  inpaint_tab = gr.Interface(
     fn=main,
     inputs=[
-        gr.Number(3, interactive=False, visible=False),
-        gr.Image(label="Input Image", type="pil", sources=["upload", "clipboard"]),
-        gr.Image(
-            label="Mask Image (White=Inpaint Area)",
-            type="pil",
-            sources=["upload", "clipboard"],
-        ),
-        gr.Text(
-            label="Prompt (optional)", info="Describe what to fill the masked area with"
-        ),
-        gr.Slider(minimum=10, maximum=100, step=1, value=28, label="Inference Steps"),
-        gr.Slider(minimum=1, maximum=100, step=1, value=50, label="Guidance Scale"),
     ],
-    outputs=gr.Image(label="Inpainted Image", type="pil"),
-    title="Inpainting",
-    description="Fill in the white areas of a mask applied to an image using a diffusion model.",
     api_name="inpaint",
-    # examples=[[3, "./assets/rocket.png", "./assets/Inpainting_mask.png", "", 28, 50]],  # Update example paths
     cache_examples=False,
 )

  erase_tab = gr.Interface(
-    fn=main,
     inputs=[
-        gr.Number(5, interactive=False, visible=False),
-        gr.Image(label="Input Image", type="pil", sources=["upload", "clipboard"]),
-        gr.Image(
-            label="Mask Image (White=Erase Area)",
-            type="pil",
-            sources=["upload", "clipboard"],
-        ),
     ],
-    outputs=gr.Image(label="Result Image", type="pil"),
-    title="Erase Object (LAMA)",
-    description="Erase objects from an image based on a mask using the LaMa inpainting model.",
     api_name="erase",
-    # examples=[[5, "./assets/rocket.png", "./assets/Inpainting_mask.png"]],  # Update example paths
     cache_examples=False,
 )

-# --- Define Translation Tab using gr.Blocks ---
-with gr.Blocks() as translation_tab:
-    gr.Markdown(
-        """
-        ## Multilingual Translation (mBART-50)
-        Translate text between 50 different languages.
-        Select the source and target languages, enter your text, and click Translate.
-        """
-    )
-    with gr.Row():
-        with gr.Column(scale=1):
-            source_lang_dropdown = gr.Dropdown(
-                label="Source Language",
-                choices=language_names,
-                info="Select the language of your input text.",
-            )
-            target_lang_dropdown = gr.Dropdown(
-                label="Target Language",
-                choices=language_names,
-                info="Select the language you want to translate to.",
-            )
-        with gr.Column(scale=2):
-            input_textbox = gr.Textbox(
-                label="Text to Translate",
-                lines=6,  # Increased lines
-                placeholder="Enter text here...",
-            )
-            translate_button = gr.Button(
-                "Translate", variant="primary"
-            )  # Added variant
-            output_textbox = gr.Textbox(
-                label="Translated Text",
-                lines=6,  # Increased lines
-                interactive=False,  # Make output read-only
-            )
-
-    # Connect components to the translation function directly
-    translate_button.click(
-        fn=translate_text,
-        inputs=[input_textbox, source_lang_dropdown, target_lang_dropdown],
-        outputs=output_textbox,
-        api_name="translate",  # Add API name for the translation endpoint
-    )
-
-    # Add translation examples
-    gr.Examples(
-        examples=[
-            [
-                "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है",
-                "Hindi",
-                "French",
-            ],
-            [
-                "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا.",
-                "Arabic",
-                "English",
-            ],
-            [
-                "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie.",
-                "French",
-                "German",
-            ],
-            ["Hello world! How are you today?", "English", "Spanish"],
-            ["Guten Tag!", "German", "Japanese"],
-            ["これはテストです", "Japanese", "English"],
-        ],
-        inputs=[input_textbox, source_lang_dropdown, target_lang_dropdown],
-        outputs=output_textbox,
-        fn=translate_text,
-        cache_examples=False,
-    )
-
-# --- Combine all tabs ---
 demo = gr.TabbedInterface(
     [
         rmbg_tab,
         outpaint_tab,
         inpaint_tab,
         erase_tab,
-        translation_tab,  # Add the translation tab
-        # sam2_tab,  # Keep commented out
     ],
     [
-        "Remove Background",  # Tab title
-        "Outpainting",  # Tab title
-        "Inpainting",  # Tab title
-        "Erase (LAMA)",  # Tab title
-        "Translate",  # Tab title for translation
         # "sam2",
     ],
-    title="Image & Text Utilities (GPU)",  # Updated title
 )

-demo.launch()
app.py (new version)

 import gradio as gr
 import spaces
 import torch
+from loadimg import load_img
 from torchvision import transforms
 from transformers import AutoModelForImageSegmentation, pipeline
 from diffusers import FluxFillPipeline
 from PIL import Image, ImageOps
+
+# from sam2.sam2_image_predictor import SAM2ImagePredictor
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
+# import whisperx
 import gc


 @contextmanager
 def float32_high_matmul_precision():
     torch.set_float32_matmul_precision("high")

     torch.set_float32_matmul_precision("highest")
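
The diff viewer collapses the middle of this context manager; only its first and last statements are visible. A minimal sketch of the presumed full definition, assuming the elided lines wrap a `yield` in `try`/`finally`:

```python
from contextlib import contextmanager

import torch

@contextmanager
def float32_high_matmul_precision():
    # Allow faster, lower-precision float32 matmuls inside the block
    torch.set_float32_matmul_precision("high")
    try:
        yield
    finally:
        # Restore full float32 matmul precision on exit
        torch.set_float32_matmul_precision("highest")
```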

+pipe = FluxFillPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16
+).to("cuda")

+birefnet = AutoModelForImageSegmentation.from_pretrained(
+    "ZhengPeng7/BiRefNet", trust_remote_code=True
+)
+birefnet.to("cuda")

  transform_image = transforms.Compose(
     [

     padding_right=0,
 ):
     image = load_img(image).convert("RGB")
+    # expand image (left,top,right,bottom)
     background = ImageOps.expand(
         image,
         border=(padding_left, padding_top, padding_right, padding_bottom),
     background, mask = prepare_image_and_mask(
         image, padding_top, padding_bottom, padding_left, padding_right
     )
+
+    result = pipe(
+        prompt=prompt,
+        height=background.height,
+        width=background.width,
+        image=background,
+        mask_image=mask,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+    ).images[0]
+
     result = result.convert("RGBA")
+
     return result


 ):
     background = image.convert("RGB")
     mask = mask.convert("L")
+
+    result = pipe(
+        prompt=prompt,
+        height=background.height,
+        width=background.width,
+        image=background,
+        mask_image=mask,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+    ).images[0]
+
     result = result.convert("RGBA")
+
     return result

  def rmbg(image=None, url=None):
+    if image is None:
+        image = url
+    image = load_img(image).convert("RGB")
+    image_size = image.size
+    input_images = transform_image(image).unsqueeze(0).to("cuda")
     with float32_high_matmul_precision():
+        # Prediction
         with torch.no_grad():
             preds = birefnet(input_images)[-1].sigmoid().cpu()
     pred = preds[0].squeeze()
     pred_pil = transforms.ToPILImage()(pred)
     mask = pred_pil.resize(image_size)
+    image.putalpha(mask)
+    return image
+
+
+# def mask_generation(image=None, d=None):
+#     # use bfloat16 for the entire notebook
+#     # torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
+#     # # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
+#     # if torch.cuda.get_device_properties(0).major >= 8:
+#     #     torch.backends.cuda.matmul.allow_tf32 = True
+#     #     torch.backends.cudnn.allow_tf32 = True
+#     d = eval(d)  # convert this to dictionary
+#     with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+#         predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2.1-hiera-large")
+#         predictor.set_image(image)
+#         input_point = np.array(d["input_points"])
+#         input_label = np.array(d["input_labels"])
+#         masks, scores, logits = predictor.predict(
+#             point_coords=input_point,
+#             point_labels=input_label,
+#             multimask_output=True,
+#         )
+#         sorted_ind = np.argsort(scores)[::-1]
+#         masks = masks[sorted_ind]
+#         scores = scores[sorted_ind]
+#         logits = logits[sorted_ind]
+
+#     out = []
+#     for i in range(len(masks)):
+#         m = Image.fromarray(masks[i] * 255).convert("L")
+#         comp = Image.composite(image, m, m)
+#         out.append((comp, f"image {i}"))
+
+#     return out

  def erase(image=None, mask=None):
+    simple_lama = SimpleLama()
+    image = load_img(image)
+    mask = load_img(mask).convert("L")
+    return simple_lama(image, mask)
+
+
+# def transcribe(audio):
+#     if audio is None:
+#         raise gr.Error("No audio file submitted!")
+
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#     compute_type = "float16"
+#     batch_size = 8  # reduced batch size to be conservative with memory
+
+#     try:
+#         # 1. Load model and transcribe
+#         model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+#         audio_input = whisperx.load_audio(audio)
+#         result = model.transcribe(audio_input, batch_size=batch_size)
+
+#         # Clear GPU memory
+#         del model
+#         gc.collect()
+#         torch.cuda.empty_cache()
+
+#         # 2. Align whisper output
+#         model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+#         result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
+
+#         # Clear GPU memory
+#         del model_a
+#         gc.collect()
+#         torch.cuda.empty_cache()
+
+#         # 3. Assign speaker labels
+#         diarize_model = whisperx.DiarizationPipeline(device=device)
+#         diarize_segments = diarize_model(audio_input)
+
+#         # Combine transcription with speaker diarization
+#         result = whisperx.assign_word_speakers(diarize_segments, result)
+
+#         # Format output with speaker labels and timestamps
+#         formatted_text = []
+#         for segment in result["segments"]:
+#             if not isinstance(segment, dict):
+#                 continue
+
+#             speaker = f"[Speaker {segment.get('speaker', 'Unknown')}]"
+#             start_time = f"{float(segment.get('start', 0)):.2f}"
+#             end_time = f"{float(segment.get('end', 0)):.2f}"
+#             text = segment.get('text', '').strip()
+#             formatted_text.append(f"[{start_time}s - {end_time}s] {speaker}: {text}")
+
+#         return "\n".join(formatted_text)
+
+#     except Exception as e:
+#         raise gr.Error(f"Transcription failed: {str(e)}")
+#     finally:
+#         # Ensure GPU memory is cleared even if an error occurs
+#         gc.collect()
+#         torch.cuda.empty_cache()
+
+
+@spaces.GPU(duration=120)
  def main(*args):
     api_num = args[0]
     args = args[1:]
+    if api_num == 1:
+        return rmbg(*args)
+    elif api_num == 2:
+        return outpaint(*args)
+    elif api_num == 3:
+        return inpaint(*args)
+    # elif api_num == 4:
+    #     return mask_generation(*args)
+    elif api_num == 5:
+        return erase(*args)
+    # elif api_num == 6:
+    #     return transcribe(*args)

  rmbg_tab = gr.Interface(
     fn=main,
     inputs=[
+        gr.Number(1, interactive=False),
+        "image",
+        gr.Text("", label="url"),
     ],
+    outputs=["image"],
     api_name="rmbg",
+    examples=[[1, "./assets/Inpainting mask.png", ""]],
     cache_examples=False,
+    description="pass an image or a url of an image",
 )
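
Each tab registers an `api_name`, so the Space can also be driven programmatically. A minimal sketch using `gradio_client`, where the Space id and file path are placeholders (older client versions accept a plain filepath instead of `handle_file`):

```python
from gradio_client import Client, handle_file

client = Client("user/space-name")  # placeholder Space id
result = client.predict(
    1,                         # hidden api_num that routes main() to rmbg()
    handle_file("input.png"),  # placeholder image path
    "",                        # url field, unused when an image is given
    api_name="/rmbg",
)
print(result)  # filepath of the background-removed image
```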

 outpaint_tab = gr.Interface(
     fn=main,
     inputs=[
+        gr.Number(2, interactive=False),
+        gr.Image(label="image", type="pil"),
+        gr.Number(label="padding top"),
+        gr.Number(label="padding bottom"),
+        gr.Number(label="padding left"),
+        gr.Number(label="padding right"),
+        gr.Text(label="prompt"),
+        gr.Number(value=50, label="num_inference_steps"),
+        gr.Number(value=28, label="guidance_scale"),
     ],
+    outputs=["image"],
     api_name="outpainting",
+    examples=[[2, "./assets/rocket.png", 100, 0, 0, 0, "", 50, 28]],
     cache_examples=False,
 )
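
The mask construction inside `prepare_image_and_mask` is collapsed in this view; only the `ImageOps.expand` call on the image is visible. A plausible sketch of the idea, assuming the mask is white over the new border and black over the original content:

```python
from PIL import Image, ImageOps

image = Image.open("./assets/rocket.png").convert("RGB")
border = (0, 100, 0, 0)  # (left, top, right, bottom) padding in pixels

# Grow the canvas; the pipeline fills the new border region
background = ImageOps.expand(image, border=border, fill="white")

# White (255) marks what the pipeline may repaint, black (0) is preserved
mask = ImageOps.expand(Image.new("L", image.size, 0), border=border, fill=255)
```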

+
 inpaint_tab = gr.Interface(
     fn=main,
     inputs=[
+        gr.Number(3, interactive=False),
+        gr.Image(label="image", type="pil"),
+        gr.Image(label="mask", type="pil"),
+        gr.Text(label="prompt"),
+        gr.Number(value=50, label="num_inference_steps"),
+        gr.Number(value=28, label="guidance_scale"),
     ],
+    outputs=["image"],
     api_name="inpaint",
+    examples=[[3, "./assets/rocket.png", "./assets/Inpainting mask.png"]],
     cache_examples=False,
+    description="it is recommended that you use https://github.com/la-voliere/react-mask-editor when creating an image mask in JS and then invert it before sending it to this space",
 )
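
The description asks callers to invert editor-produced masks before submitting, since the pipeline repaints the white region of the mask. A minimal sketch with PIL, using a placeholder filename:

```python
from PIL import Image, ImageOps

mask = Image.open("mask_from_editor.png").convert("L")  # placeholder path
ImageOps.invert(mask).save("mask_inverted.png")
```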

+# sam2_tab = gr.Interface(
+#     main,
+#     inputs=[
+#         gr.Number(4, interactive=False),
+#         gr.Image(type="pil"),
+#         gr.Text(),
+#     ],
+#     outputs=gr.Gallery(),
+#     examples=[
+#         [
+#             4,
+#             "./assets/truck.jpg",
+#             '{"input_points": [[500, 375], [1125, 625]], "input_labels": [1, 0]}',
+#         ]
+#     ],
+#     api_name="sam2",
+#     cache_examples=False,
+# )
+
 erase_tab = gr.Interface(
+    main,
     inputs=[
+        gr.Number(5, interactive=False),
+        gr.Image(type="pil"),
+        gr.Image(type="pil"),
+    ],
+    outputs=gr.Image(),
+    examples=[
+        [
+            5,
+            "./assets/rocket.png",
+            "./assets/Inpainting mask.png",
+        ]
     ],
     api_name="erase",
     cache_examples=False,
 )
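
`erase()` instantiates `SimpleLama` on every call; the same operation works outside Gradio. A minimal sketch with placeholder paths (the LaMa weights are downloaded on first use):

```python
from PIL import Image
from simple_lama_inpainting import SimpleLama

simple_lama = SimpleLama()
image = Image.open("./assets/rocket.png").convert("RGB")
mask = Image.open("./assets/Inpainting mask.png").convert("L")  # white = erase
result = simple_lama(image, mask)
result.save("erased.png")
```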

+transcribe_tab = gr.Interface(
+    fn=main,
+    inputs=[
+        gr.Number(value=6, interactive=False),  # API number
+        gr.Audio(type="filepath", label="Audio File"),
+    ],
+    outputs=gr.Textbox(label="Transcription"),
+    title="Audio Transcription",
+    description="Upload an audio file to extract text using WhisperX with speaker diarization",
+    api_name="transcribe",
+    examples=[]
+)

 demo = gr.TabbedInterface(
     [
         rmbg_tab,
         outpaint_tab,
         inpaint_tab,
+        # sam2_tab,
         erase_tab,
+        transcribe_tab,
     ],
     [
+        "remove background",
+        "outpainting",
+        "inpainting",
         # "sam2",
+        "erase",
+        # "transcribe",
     ],
+    title="Utilities that require GPU",
 )


+demo.launch()