prithivMLmods committed on
Commit
0dfb052
·
verified ·
1 Parent(s): b15b59b
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -29,8 +29,8 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
29
 
30
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
31
 
32
- # Load Behemoth-3B-070225-post0.1
33
- MODEL_ID_N = "prithivMLmods/Behemoth-3B-070225-post0.1"
34
  processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
35
  model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
36
  MODEL_ID_N,
@@ -110,7 +110,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
110
  if model_name == "SkyCaptioner-V1":
111
  processor = processor_m
112
  model = model_m
113
- elif model_name == "Behemoth-3B-070225-post0.1":
114
  processor = processor_n
115
  model = model_n
116
  elif model_name == "SpaceThinker-3B":
@@ -171,7 +171,7 @@ def generate_video(model_name: str, text: str, video_path: str,
171
  if model_name == "SkyCaptioner-V1":
172
  processor = processor_m
173
  model = model_m
174
- elif model_name == "Behemoth-3B-070225-post0.1":
175
  processor = processor_n
176
  model = model_n
177
  elif model_name == "SpaceThinker-3B":
@@ -293,7 +293,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
293
  with gr.Accordion("(Result.md)", open=False):
294
  markdown_output = gr.Markdown(label="Formatted Result")
295
  model_choice = gr.Radio(
296
- choices=["SkyCaptioner-V1", "Behemoth-3B-070225-post0.1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "SpaceOm-3B"],
297
  label="Select Model",
298
  value="SkyCaptioner-V1"
299
  )
@@ -302,7 +302,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
302
  gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
303
  gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
304
  gr.Markdown("> [SpaceOm](https://huggingface.co/remyxai/SpaceOm): SpaceOm, the reasoning traces in the spacethinker dataset average ~200 thinking tokens, so now included longer reasoning traces in the training data to help the model use more tokens in reasoning.")
305
- gr.Markdown("> [Behemoth-3B-070225-post0.1](https://huggingface.co/prithivMLmods/Behemoth-3B-070225-post0.1): The behemoth-3b-070225-post0.1 model is a fine-tuned version of qwen2.5-vl-3b-instruct, optimized for detailed image captioning, OCR tasks, and chain-of-thought reasoning.")
306
  gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
307
 
308
  image_submit.click(
 
29
 
30
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
31
 
32
+ # Load DeepCaption-VLA-7B
33
+ MODEL_ID_N = "prithivMLmods/DeepCaption-VLA-7B"
34
  processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
35
  model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
36
  MODEL_ID_N,
 
110
  if model_name == "SkyCaptioner-V1":
111
  processor = processor_m
112
  model = model_m
113
+ elif model_name == "DeepCaption-VLA-7B":
114
  processor = processor_n
115
  model = model_n
116
  elif model_name == "SpaceThinker-3B":
 
171
  if model_name == "SkyCaptioner-V1":
172
  processor = processor_m
173
  model = model_m
174
+ elif model_name == "DeepCaption-VLA-7B":
175
  processor = processor_n
176
  model = model_n
177
  elif model_name == "SpaceThinker-3B":
 
293
  with gr.Accordion("(Result.md)", open=False):
294
  markdown_output = gr.Markdown(label="Formatted Result")
295
  model_choice = gr.Radio(
296
+ choices=["SkyCaptioner-V1", "DeepCaption-VLA-7B", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "SpaceOm-3B"],
297
  label="Select Model",
298
  value="SkyCaptioner-V1"
299
  )
 
302
  gr.Markdown("> [SpaceThinker-Qwen2.5VL-3B](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B): thinking/reasoning multimodal/vision-language model (VLM) trained to enhance spatial reasoning.")
303
  gr.Markdown("> [coreOCR-7B-050325-preview](https://huggingface.co/prithivMLmods/coreOCR-7B-050325-preview): model is a fine-tuned version of qwen/qwen2-vl-7b, optimized for document-level optical character recognition (ocr), long-context vision-language understanding.")
304
  gr.Markdown("> [SpaceOm](https://huggingface.co/remyxai/SpaceOm): SpaceOm, the reasoning traces in the spacethinker dataset average ~200 thinking tokens, so now included longer reasoning traces in the training data to help the model use more tokens in reasoning.")
305
+ gr.Markdown("> [DeepCaption-VLA-7B](https://huggingface.co/prithivMLmods/DeepCaption-VLA-7B): DeepCaption-VLA-7B model is a fine-tuned version of Qwen2.5-VL-7B-Instruct, tailored for Image Captioning and VLA. This variant is designed to generate precise, highly descriptive captions.")
306
  gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
307
 
308
  image_submit.click(