mknolan committed on
Commit
6bda7a2
·
verified ·
1 Parent(s): 97ab90d

Upload app.py

Browse files

Simplified version with minimal dependencies

Files changed (1) hide show
  1. app.py +238 -0
app.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ import tempfile
5
+ from PIL import Image
6
+ import gradio as gr
7
+ import pdf2image
8
+ from transformers import AutoModel, AutoTokenizer
9
+ import torchvision.transforms as transforms
10
+
11
+ # Configuration
12
# Hugging Face Hub identifier passed to ``from_pretrained`` for both the
# model and the tokenizer.
MODEL_NAME = "OpenGVLab/InternVL2_5-8B"

# Side length, in pixels, of the square every image is resized to before it
# is converted to a tensor.
IMAGE_SIZE = 448
14
+
15
+ # Model loading function
16
def load_model():
    """Load the model and tokenizer named by ``MODEL_NAME``.

    Progress and the selected device are printed to stdout.

    Returns:
        A ``(model, tokenizer)`` tuple on success. If anything raises while
        loading, the error and its traceback are printed and
        ``(None, None)`` is returned instead.
    """
    import traceback

    print(f"\n=== Loading {MODEL_NAME} ===")
    has_cuda = torch.cuda.is_available()
    print(f"CUDA available: {has_cuda}")

    # The device string is informational only; placement is delegated to
    # ``device_map`` below.
    device = "cuda" if has_cuda else "cpu"
    print(f"Using device: {device}")

    # Deliberately few options, to avoid compatibility issues.
    placement = "auto" if has_cuda else None
    try:
        model = AutoModel.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            device_map=placement,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            use_fast=False,
            trust_remote_code=True,
        )
        print("✓ Model and tokenizer loaded successfully!")
        return model, tokenizer
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        traceback.print_exc()
        return None, None
45
+
46
+ # Extract slides from uploaded PDF file
47
def extract_slides_from_pdf(file_obj):
    """Render every page of an uploaded PDF to an image.

    Args:
        file_obj: Either a filesystem path (``str`` or ``os.PathLike``) or a
            file-like object exposing ``read()`` and ``name``. Path inputs
            are accepted because newer Gradio ``File`` components hand the
            callback a path string rather than an open file wrapper
            (NOTE(review): confirm against the Gradio version in use).

    Returns:
        A list of ``(title, image)`` tuples, titled ``"Slide 1"``,
        ``"Slide 2"``, ... in page order. An empty list is returned when the
        input is ``None``, does not have a ``.pdf`` extension, or cannot be
        read or converted; the reason is printed.
    """
    import traceback

    temp_path = None
    try:
        if file_obj is None:
            return []

        if isinstance(file_obj, (str, os.PathLike)):
            # Already on disk: convert it in place, no copy needed.
            pdf_path = os.fspath(file_obj)
            source_name = pdf_path
        else:
            pdf_path = None
            source_name = file_obj.name

        # Only PDFs are supported; decide by extension.
        if os.path.splitext(source_name)[1].lower() != ".pdf":
            return []

        if pdf_path is None:
            # pdf2image is used through its path-based API here, so spool the
            # upload to a temporary file first.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                # Record the path before writing so a failed write is still
                # cleaned up in ``finally``.
                temp_path = temp_file.name
                temp_file.write(file_obj.read())
            pdf_path = temp_path

        try:
            images = pdf2image.convert_from_path(pdf_path, dpi=300)
        except Exception as e:
            print(f"Error converting PDF: {e}")
            return []

        return [(f"Slide {i + 1}", img) for i, img in enumerate(images)]

    except Exception as e:
        error_msg = f"Error extracting slides: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return []

    finally:
        # Remove only the file this function created, never a caller's path.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the
                # real result or error.
                pass
79
+
80
+ # Simple preprocessing for a single image
81
def preprocess_image(image):
    """Turn a PIL image into a normalized, batched tensor.

    The image is converted to RGB if needed, resized to
    ``IMAGE_SIZE`` x ``IMAGE_SIZE``, converted to a tensor, normalized
    per channel, and given a leading batch dimension.

    Args:
        image: A PIL image in any mode.

    Returns:
        A tensor with a leading batch dimension of 1, moved to the GPU when
        CUDA is available.
    """
    # ``Normalize`` below is configured for exactly three channels, so
    # grayscale, palette, or RGBA input must be converted first.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Resize image to expected size
    img = image.resize((IMAGE_SIZE, IMAGE_SIZE))

    # Convert PIL image to tensor and normalize (the mean/std values are the
    # commonly used ImageNet channel statistics).
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Apply transformation and add batch dimension
    img_tensor = transform(img).unsqueeze(0)

    # Move tensor to GPU if available
    if torch.cuda.is_available():
        img_tensor = img_tensor.cuda()

    return img_tensor
99
+
100
+ # Image analysis function - using simple approach
101
def analyze_image(model, tokenizer, image, prompt):
    """Ask the model a question about a single image.

    Args:
        model: Loaded model object exposing a ``chat`` method.
        tokenizer: Tokenizer passed through to ``model.chat``.
        image: PIL image to analyze, or ``None``.
        prompt: Instruction text; it is sent after an ``<image>`` placeholder
            line.

    Returns:
        The model's response, a fixed message when ``image`` is ``None``, or
        an error string containing the exception text and traceback if
        anything raises.
    """
    try:
        # Check if image is valid
        if image is None:
            return "Please upload an image first."

        # Process the image with simple preprocessing
        processed_image = preprocess_image(image)

        # Simple prompt format
        question = f"<image>\n{prompt}"

        # InternVL's ``chat`` takes ``generation_config`` as a required
        # argument (see the model card usage example); omitting it made every
        # call fail with a TypeError. A plain dict is used, as in that
        # example. Greedy decoding keeps repeated runs reproducible.
        generation_config = {"max_new_tokens": 1024, "do_sample": False}

        # Use the model's chat method
        response, _ = model.chat(
            tokenizer=tokenizer,
            pixel_values=processed_image,
            question=question,
            generation_config=generation_config,
            history=None,
            return_history=True
        )

        return response
    except Exception as e:
        import traceback
        error_msg = f"Error analyzing image: {str(e)}\n{traceback.format_exc()}"
        return error_msg
127
+
128
+ # Analyze multiple slides from a PDF
129
def analyze_pdf_slides(model, tokenizer, file_obj, prompt, num_slides=2):
    """Analyze the first ``num_slides`` pages of an uploaded PDF.

    Args:
        model: Loaded model, forwarded to ``analyze_image``.
        tokenizer: Tokenizer, forwarded to ``analyze_image``.
        file_obj: Uploaded file, forwarded to ``extract_slides_from_pdf``;
            ``None`` when nothing was uploaded.
        prompt: Instruction applied to every slide.
        num_slides: Maximum number of slides to analyze, counted from the
            first page.

    Returns:
        Markdown text with one ``## Slide N`` section per analyzed slide, each
        followed by a horizontal rule; or a user-facing message when no file
        was given or no slides could be extracted; or an error string with
        traceback if anything raises.
    """
    try:
        if file_obj is None:
            return "Please upload a PDF file."

        # Extract slides from PDF
        slides = extract_slides_from_pdf(file_obj)

        if not slides:
            return "No slides were extracted from the file. Please check that it's a valid PDF."

        # Limit to the requested number of slides. The value comes from a UI
        # slider and may arrive as a float (e.g. 2.0) depending on the Gradio
        # version; slice bounds must be integers.
        slides = slides[:int(num_slides)]

        # Analyze each slide in order and format the results as Markdown.
        return "".join(
            f"## {slide_title}\n\n"
            f"{analyze_image(model, tokenizer, slide_image, prompt)}\n\n---\n\n"
            for slide_title, slide_image in slides
        )

    except Exception as e:
        import traceback
        error_msg = f"Error analyzing slides: {str(e)}\n{traceback.format_exc()}"
        return error_msg
160
+
161
+ # Main function
162
def main():
    """Build the Gradio application.

    Loads the model first. If loading fails, a minimal text-only interface
    that reports the failure is returned; otherwise the PDF slide analyzer
    UI is built.

    Returns:
        The Gradio app object; the caller is responsible for launching it.
    """
    model, tokenizer = load_model()

    if model is None:
        # Fallback UI so the failure is visible instead of the app not
        # starting at all.
        return gr.Interface(
            fn=lambda x: "Model loading failed. Please check the logs for details.",
            inputs=gr.Textbox(),
            outputs=gr.Textbox(),
            title="InternVL2.5 Slide Analyzer - Error",
            description="The model failed to load. Please check the logs for more information."
        )

    # Preset prompts offered in the dropdown (custom text is also allowed).
    prompt_choices = [
        "Analyze this slide and describe its contents.",
        "What is the main message of this slide?",
        "Extract all the text visible in this slide.",
        "What are the key points presented in this slide?",
        "Describe the visual elements and layout of this slide."
    ]
    example_pdf = "example_slides/test_slides.pdf"

    with gr.Blocks(title="InternVL2.5 PDF Slide Analyzer") as app:
        gr.Markdown("# InternVL2.5 PDF Slide Analyzer")
        gr.Markdown("Upload a PDF file and analyze multiple slides")

        # Upload and prompt selection side by side.
        with gr.Row():
            pdf_upload = gr.File(label="Upload PDF")
            prompt_picker = gr.Dropdown(
                choices=prompt_choices,
                value=prompt_choices[0],
                label="Select a prompt",
                allow_custom_value=True
            )

        slide_count = gr.Slider(
            minimum=1,
            maximum=5,
            value=2,
            step=1,
            label="Number of Slides to Analyze"
        )

        run_button = gr.Button("Analyze Slides")
        results_view = gr.Markdown(label="Analysis Results")

        # The lambda closes over the loaded model and tokenizer so the UI
        # only has to supply the three user inputs.
        run_button.click(
            fn=lambda file, prompt, num: analyze_pdf_slides(model, tokenizer, file, prompt, num),
            inputs=[pdf_upload, prompt_picker, slide_count],
            outputs=results_view
        )

        # Offer a ready-made example only when the sample PDF is present.
        if os.path.exists(example_pdf):
            gr.Examples(
                examples=[
                    [example_pdf, "Extract all the text visible in this slide.", 2]
                ],
                inputs=[pdf_upload, prompt_picker, slide_count]
            )

    return app
228
+
229
+ # Run the application
230
if __name__ == "__main__":
    try:
        # Create and launch the interface; "0.0.0.0" makes the server listen
        # on all network interfaces rather than only localhost.
        demo = main()
        demo.launch(server_name="0.0.0.0")
    except Exception as e:
        print(f"Error starting the application: {e}")
        import traceback
        traceback.print_exc()
        # Report the failure to the process supervisor; without this the
        # script would exit with status 0 even though startup failed.
        sys.exit(1)