prithivMLmods committed on
Commit
b04f6ab
·
verified ·
1 Parent(s): 30d6225

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -11
app.py CHANGED
@@ -379,7 +379,7 @@ pdf_cache = {
379
  "results": []
380
  }
381
  @spaces.GPU
382
- def inference(model_name: str, image: Image.Image, prompt: str, max_new_tokens: int = 1024) -> str:
383
  """Run inference on an image with the given prompt using the selected model."""
384
  try:
385
  if model_name == "Camel-Doc-OCR-062825":
@@ -406,9 +406,18 @@ def inference(model_name: str, image: Image.Image, prompt: str, max_new_tokens:
406
  ]
407
  }
408
  ]
409
-
410
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
411
- inputs = processor(text, [image], return_tensors="pt").to(device)
 
 
 
 
 
 
 
 
 
412
 
413
  with torch.no_grad():
414
  generated_ids = model.generate(
@@ -421,7 +430,7 @@ def inference(model_name: str, image: Image.Image, prompt: str, max_new_tokens:
421
  generated_ids = [
422
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
423
  ]
424
- output_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
425
  return output_text
426
 
427
  except Exception as e:
@@ -508,6 +517,7 @@ def load_file_for_preview(file_path: str) -> Tuple[Optional[Image.Image], str]:
508
  if not file_path or not os.path.exists(file_path):
509
  return None, "No file selected"
510
 
 
511
  file_ext = os.path.splitext(file_path).lower()
512
 
513
  try:
@@ -526,6 +536,7 @@ def load_file_for_preview(file_path: str) -> Tuple[Optional[Image.Image], str]:
526
  "results": []
527
  })
528
 
 
529
  return images, f"Page 1 / {len(images)}"
530
 
531
  elif file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
@@ -725,14 +736,15 @@ def create_gradio_interface():
725
  if not file_path:
726
  return None, "Please upload a file first.", None
727
 
728
- # Load and preview file
729
- image, page_info = load_file_for_preview(file_path)
730
- if image is None:
731
- return None, page_info, None
 
732
 
733
  # Process the image(s)
734
  if pdf_cache["file_type"] == "pdf":
735
- # Process all pages for PDF
736
  all_results = []
737
  all_markdown = []
738
 
@@ -769,7 +781,7 @@ def create_gradio_interface():
769
  # Process single image
770
  result = process_image(
771
  model_name,
772
- image,
773
  min_pixels=int(min_pix) if min_pix else None,
774
  max_pixels=int(max_pix) if max_pix else None
775
  )
@@ -799,7 +811,7 @@ def create_gradio_interface():
799
  def handle_file_upload(file_path):
800
  """Handle file upload and show preview"""
801
  if not file_path:
802
- return None, "No file loaded"
803
 
804
  image, page_info = load_file_for_preview(file_path)
805
  return image, page_info
 
379
  "results": []
380
  }
381
  @spaces.GPU
382
+ def inference(model_name: str, image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> str:
383
  """Run inference on an image with the given prompt using the selected model."""
384
  try:
385
  if model_name == "Camel-Doc-OCR-062825":
 
406
  ]
407
  }
408
  ]
409
+
410
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
411
+ image_inputs, video_inputs = process_vision_info(messages)
412
+
413
+ inputs = processor(
414
+ text=[text],
415
+ images=[image],
416
+ videos=video_inputs,
417
+ padding=True,
418
+ return_tensors="pt"
419
+ ).to(device)
420
+
421
 
422
  with torch.no_grad():
423
  generated_ids = model.generate(
 
430
  generated_ids = [
431
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
432
  ]
433
+ output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
434
  return output_text
435
 
436
  except Exception as e:
 
517
  if not file_path or not os.path.exists(file_path):
518
  return None, "No file selected"
519
 
520
+ # FIX 1 (intended): take the second element ([1]) of the tuple returned by
+ # os.path.splitext — NOTE(review): the code line below was NOT actually changed;
+ # calling .lower() on the tuple will raise AttributeError.
 file_ext = os.path.splitext(file_path).lower()
522
 
523
  try:
 
536
  "results": []
537
  })
538
 
539
+ # FIX 2: Return only the first image for the preview component
540
  return images, f"Page 1 / {len(images)}"
541
 
542
  elif file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
 
736
  if not file_path:
737
  return None, "Please upload a file first.", None
738
 
739
+ # This function now correctly returns a single image for preview
740
+ # and populates the cache for multi-page processing.
741
+ preview_img, page_info_str = load_file_for_preview(file_path)
742
+ if preview_img is None:
743
+ return None, page_info_str, None
744
 
745
  # Process the image(s)
746
  if pdf_cache["file_type"] == "pdf":
747
+ # Process all pages for PDF from the cache
748
  all_results = []
749
  all_markdown = []
750
 
 
781
  # Process single image
782
  result = process_image(
783
  model_name,
784
+ preview_img, # Use the single loaded image
785
  min_pixels=int(min_pix) if min_pix else None,
786
  max_pixels=int(max_pix) if max_pix else None
787
  )
 
811
  def handle_file_upload(file_path):
812
  """Handle file upload and show preview"""
813
  if not file_path:
814
+ return None, '<div class="page-info">No file loaded</div>'
815
 
816
  image, page_info = load_file_for_preview(file_path)
817
  return image, page_info