Spaces:

mknolan
/

internvl25-slide-analyzer-simple

Paused

App Files Files Community

mknolan committed on Mar 18

Commit

6bda7a2

verified ·

1 Parent(s): 97ab90d

Upload app.py

Browse files

Simplified version with minimal dependencies

Files changed (1) hide show

app.py +238 -0

app.py ADDED Viewed

	@@ -0,0 +1,238 @@

+import os
+import sys
+import torch
+import tempfile
+from PIL import Image
+import gradio as gr
+import pdf2image
+from transformers import AutoModel, AutoTokenizer
+import torchvision.transforms as transforms
+# Configuration
+# Hugging Face model identifier passed to AutoModel/AutoTokenizer.from_pretrained in load_model().
+MODEL_NAME = "OpenGVLab/InternVL2_5-8B"
+# Side length, in pixels, of the square that preprocess_image() resizes every input image to.
+IMAGE_SIZE = 448
+# Model loading function
+def load_model():
+    print(f"\n=== Loading {MODEL_NAME} ===")
+    print(f"CUDA available: {torch.cuda.is_available()}")
+    # Set device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+    # Load model and tokenizer with minimal options to avoid compatibility issues
+    try:
+        model = AutoModel.from_pretrained(
+            MODEL_NAME,
+            trust_remote_code=True,
+            device_map="auto" if torch.cuda.is_available() else None
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_NAME,
+            use_fast=False,
+            trust_remote_code=True
+        )
+        print(f"✓ Model and tokenizer loaded successfully!")
+        return model, tokenizer
+    except Exception as e:
+        print(f"❌ Error loading model: {e}")
+        import traceback
+        traceback.print_exc()
+        return None, None
+# Extract slides from uploaded PDF file
+def extract_slides_from_pdf(file_obj):
+    try:
+        file_bytes = file_obj.read()
+        file_extension = os.path.splitext(file_obj.name)[1].lower()
+        # Check if it's a PDF
+        if file_extension != '.pdf':
+            return []
+        # Create temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
+            temp_file.write(file_bytes)
+            temp_path = temp_file.name
+        # Extract images from PDF using pdf2image
+        slides = []
+        try:
+            images = pdf2image.convert_from_path(temp_path, dpi=300)
+            slides = [(f"Slide {i+1}", img) for i, img in enumerate(images)]
+        except Exception as e:
+            print(f"Error converting PDF: {e}")
+        # Clean up temporary file
+        os.unlink(temp_path)
+        return slides
+    except Exception as e:
+        import traceback
+        error_msg = f"Error extracting slides: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        return []
+# Simple preprocessing for a single image
+def preprocess_image(image):
+    # Resize image to expected size
+    img = image.resize((IMAGE_SIZE, IMAGE_SIZE))
+    # Convert PIL image to tensor and normalize
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    ])
+    # Apply transformation and add batch dimension
+    img_tensor = transform(img).unsqueeze(0)
+    # Move tensor to GPU if available
+    if torch.cuda.is_available():
+        img_tensor = img_tensor.cuda()
+    return img_tensor
+# Image analysis function - using simple approach
+def analyze_image(model, tokenizer, image, prompt):
+    try:
+        # Check if image is valid
+        if image is None:
+            return "Please upload an image first."
+        # Process the image with simple preprocessing
+        processed_image = preprocess_image(image)
+        # Simple prompt format
+        question = f"<image>\n{prompt}"
+        # Use the model's chat method
+        response, _ = model.chat(
+            tokenizer=tokenizer,
+            pixel_values=processed_image,
+            question=question,
+            history=None,
+            return_history=True
+        )
+        return response
+    except Exception as e:
+        import traceback
+        error_msg = f"Error analyzing image: {str(e)}\n{traceback.format_exc()}"
+        return error_msg
+# Analyze multiple slides from a PDF
+def analyze_pdf_slides(model, tokenizer, file_obj, prompt, num_slides=2):
+    try:
+        if file_obj is None:
+            return "Please upload a PDF file."
+        # Extract slides from PDF
+        slides = extract_slides_from_pdf(file_obj)
+        if not slides:
+            return "No slides were extracted from the file. Please check that it's a valid PDF."
+        # Limit to the requested number of slides
+        slides = slides[:num_slides]
+        # Analyze each slide
+        analyses = []
+        for slide_title, slide_image in slides:
+            analysis = analyze_image(model, tokenizer, slide_image, prompt)
+            analyses.append((slide_title, analysis))
+        # Format the results
+        result = ""
+        for slide_title, analysis in analyses:
+            result += f"## {slide_title}\n\n{analysis}\n\n---\n\n"
+        return result
+    except Exception as e:
+        import traceback
+        error_msg = f"Error analyzing slides: {str(e)}\n{traceback.format_exc()}"
+        return error_msg
+# Main function
+def main():
+    # Load the model
+    model, tokenizer = load_model()
+    if model is None:
+        # Create an error interface if model loading failed
+        demo = gr.Interface(
+            fn=lambda x: "Model loading failed. Please check the logs for details.",
+            inputs=gr.Textbox(),
+            outputs=gr.Textbox(),
+            title="InternVL2.5 Slide Analyzer - Error",
+            description="The model failed to load. Please check the logs for more information."
+        )
+        return demo
+    # Create a simple interface
+    with gr.Blocks(title="InternVL2.5 PDF Slide Analyzer") as demo:
+        gr.Markdown("# InternVL2.5 PDF Slide Analyzer")
+        gr.Markdown("Upload a PDF file and analyze multiple slides")
+        # PDF Analysis tab
+        slide_prompts = [
+            "Analyze this slide and describe its contents.",
+            "What is the main message of this slide?",
+            "Extract all the text visible in this slide.",
+            "What are the key points presented in this slide?",
+            "Describe the visual elements and layout of this slide."
+        ]
+        with gr.Row():
+            file_input = gr.File(label="Upload PDF")
+            slide_prompt = gr.Dropdown(
+                choices=slide_prompts,
+                value=slide_prompts[0],
+                label="Select a prompt",
+                allow_custom_value=True
+            )
+        num_slides = gr.Slider(
+            minimum=1,
+            maximum=5,
+            value=2,
+            step=1,
+            label="Number of Slides to Analyze"
+        )
+        slides_analyze_btn = gr.Button("Analyze Slides")
+        slides_output = gr.Markdown(label="Analysis Results")
+        # Handle the slides analysis action
+        slides_analyze_btn.click(
+            fn=lambda file, prompt, num: analyze_pdf_slides(model, tokenizer, file, prompt, num),
+            inputs=[file_input, slide_prompt, num_slides],
+            outputs=slides_output
+        )
+        # Add example if available
+        if os.path.exists("example_slides/test_slides.pdf"):
+            gr.Examples(
+                examples=[
+                    ["example_slides/test_slides.pdf", "Extract all the text visible in this slide.", 2]
+                ],
+                inputs=[file_input, slide_prompt, num_slides]
+            )
+    return demo
+# Run the application
+if __name__ == "__main__":
+    try:
+        # Create and launch the interface
+        demo = main()
+        demo.launch(server_name="0.0.0.0")
+    except Exception as e:
+        print(f"Error starting the application: {e}")
+        import traceback
+        traceback.print_exc()