Spaces:

AffordableAI
/

Real_Time_Safety_Monitoring

Sleeping

App Files Files Community

capradeepgujaran committed on Oct 23, 2024

Commit

740f7c7

verified ·

1 Parent(s): 7e6153d

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -57

app.py CHANGED Viewed

@@ -16,27 +16,32 @@ def create_monitor_interface():
         def __init__(self):
             self.client = Groq()
             self.model_name = "llama-3.2-90b-vision-preview"
-            self.max_image_size = (640, 640)  # Increased size for better visibility
-            self.colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]
         def resize_image(self, image):
             height, width = image.shape[:2]
-            aspect = width / height
-            if width > height:
-                new_width = min(self.max_image_size[0], width)
-                new_height = int(new_width / aspect)
-            else:
-                new_height = min(self.max_image_size[1], height)
-                new_width = int(new_height * aspect)
-            return cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
         def analyze_frame(self, frame: np.ndarray) -> str:
             if frame is None:
                 return "No frame received"
-            # Convert and resize image
             if len(frame.shape) == 2:
                 frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
             elif len(frame.shape) == 3 and frame.shape[2] == 4:
@@ -45,11 +50,11 @@ def create_monitor_interface():
             frame = self.resize_image(frame)
             frame_pil = PILImage.fromarray(frame)
-            # Convert to base64 with minimal quality
             buffered = io.BytesIO()
             frame_pil.save(buffered,
                          format="JPEG",
-                         quality=30,
                          optimize=True)
             img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
             image_url = f"data:image/jpeg;base64,{img_base64}"
@@ -63,9 +68,10 @@ def create_monitor_interface():
                             "content": [
                                 {
                                     "type": "text",
-                                    "text": """Analyze this workplace image and describe each safety concern in this format:
-                                    - <location>Description</location>
-                                    Use one line per issue, starting with a dash and location in tags."""
                                 },
                                 {
                                     "type": "image_url",
@@ -81,7 +87,7 @@ def create_monitor_interface():
                         }
                     ],
                     temperature=0.1,
-                    max_tokens=150,
                     top_p=1,
                     stream=False,
                     stop=None
@@ -91,59 +97,97 @@ def create_monitor_interface():
                 print(f"Detailed error: {str(e)}")
                 return f"Analysis Error: {str(e)}"
         def draw_observations(self, image, observations):
             height, width = image.shape[:2]
             font = cv2.FONT_HERSHEY_SIMPLEX
-            font_scale = 0.5
             thickness = 2
-            # Generate random positions for each observation
             for idx, obs in enumerate(observations):
                 color = self.colors[idx % len(self.colors)]
-                # Generate random box position
-                box_width = width // 3
-                box_height = height // 3
-                x = random.randint(0, width - box_width)
-                y = random.randint(0, height - box_height)
                 # Draw rectangle
-                cv2.rectangle(image, (x, y), (x + box_width, y + box_height), color, 2)
                 # Add label with background
-                label = obs[:40] + "..." if len(obs) > 40 else obs
                 label_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
-                cv2.rectangle(image, (x, y - 20), (x + label_size[0], y), color, -1)
-                cv2.putText(image, label, (x, y - 5), font, font_scale, (255, 255, 255), thickness)
             return image
         def process_frame(self, frame: np.ndarray) -> tuple[np.ndarray, str]:
             if frame is None:
                 return None, "No image provided"
-            analysis = self.analyze_frame(frame)
-            display_frame = self.resize_image(frame.copy())
-            # Parse observations from the analysis
-            observations = []
-            for line in analysis.split('\n'):
-                line = line.strip()
-                if line.startswith('-'):
-                    # Extract text between <location> tags if present
-                    if '<location>' in line and '</location>' in line:
-                        start = line.find('<location>') + len('<location>')
-                        end = line.find('</location>')
-                        observation = line[end + len('</location>'):].strip()
-                    else:
-                        observation = line[1:].strip()  # Remove the dash
-                    if observation:
-                        observations.append(observation)
-            # Draw observations on the image
-            annotated_frame = self.draw_observations(display_frame, observations)
-            return annotated_frame, analysis
     # Create the main interface
     monitor = SafetyMonitor()
@@ -152,12 +196,12 @@ def create_monitor_interface():
         gr.Markdown("# Safety Analysis System powered by Llama 3.2 90b vision")
         with gr.Row():
-            input_image = gr.Image(label="Upload Image")
-            output_image = gr.Image(label="Annotated Results")
-        analysis_text = gr.Textbox(label="Detailed Analysis", lines=5)
-        def analyze_image(image):
             if image is None:
                 return None, "No image provided"
             try:
@@ -167,10 +211,10 @@ def create_monitor_interface():
                 print(f"Processing error: {str(e)}")
                 return None, f"Error processing image: {str(e)}"
-        input_image.change(
-            fn=analyze_image,
-            inputs=input_image,
-            outputs=[output_image, analysis_text]
         )
     return demo

         def __init__(self):
             """Set up drawing/throttling configuration and create the Groq client."""
             # Static configuration.
             self.model_name = "llama-3.2-90b-vision-preview"
             self.max_image_size = (800, 800)  # (max width, max height) before downscaling
             palette = [
                 (0, 0, 255),
                 (255, 0, 0),
                 (0, 255, 0),
                 (255, 255, 0),
                 (255, 0, 255),
             ]
             self.colors = palette  # box colours, cycled by observation index

             # Throttling state: process_frame re-runs the analysis only after
             # analysis_interval seconds have elapsed since last_analysis_time.
             self.analysis_interval = 2
             self.last_analysis_time = 0
             self.last_observations = []  # observations from the latest analysis

             self.client = Groq()
         def resize_image(self, image):
             """Downscale *image* to fit inside ``self.max_image_size``, keeping aspect ratio.

             Args:
                 image: Array whose first two ``shape`` entries are (height, width).

             Returns:
                 The very same object when it already fits or has a zero-length
                 side; otherwise a new array produced by ``cv2.resize`` using
                 ``INTER_AREA`` interpolation.
             """
             height, width = image.shape[:2]
             max_width, max_height = self.max_image_size

             # An empty frame has nothing to scale and would divide by zero below.
             if height == 0 or width == 0:
                 return image

             # Only resize if the image is too large.
             if height <= max_height and width <= max_width:
                 return image

             aspect = width / height
             if width > height:
                 new_width = max_width
                 new_height = int(new_width / aspect)
             else:
                 new_height = max_height
                 new_width = int(new_height * aspect)

             # Extreme aspect ratios can truncate the short side to 0, which
             # cv2.resize rejects; keep at least one pixel per side.
             new_width = max(1, new_width)
             new_height = max(1, new_height)
             return cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
         def analyze_frame(self, frame: np.ndarray) -> str:
             if frame is None:
                 return "No frame received"
+            # Convert image
             if len(frame.shape) == 2:
                 frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
             elif len(frame.shape) == 3 and frame.shape[2] == 4:
             frame = self.resize_image(frame)
             frame_pil = PILImage.fromarray(frame)
+            # Convert to base64 with better quality
             buffered = io.BytesIO()
             frame_pil.save(buffered,
                          format="JPEG",
+                         quality=85,  # Higher quality
                          optimize=True)
             img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
             image_url = f"data:image/jpeg;base64,{img_base64}"
                             "content": [
                                 {
                                     "type": "text",
+                                    "text": """Analyze this image for safety hazards. For each issue, describe:
+                                    1. The location (top-left, center, bottom-right, etc.)
+                                    2. The specific safety concern
+                                    Format: - <location>position:description</location>"""
                                 },
                                 {
                                     "type": "image_url",
                         }
                     ],
                     temperature=0.1,
+                    max_tokens=200,
                     top_p=1,
                     stream=False,
                     stop=None
                 print(f"Detailed error: {str(e)}")
                 return f"Analysis Error: {str(e)}"
+        def get_region_coordinates(self, position: str, image_shape: tuple) -> tuple:
+            height, width = image_shape[:2]
+            regions = {
+                'top-left': (0, 0, width//3, height//3),
+                'top': (width//3, 0, 2*width//3, height//3),
+                'top-right': (2*width//3, 0, width, height//3),
+                'left': (0, height//3, width//3, 2*height//3),
+                'center': (width//3, height//3, 2*width//3, 2*height//3),
+                'right': (2*width//3, height//3, width, 2*height//3),
+                'bottom-left': (0, 2*height//3, width//3, height),
+                'bottom': (width//3, 2*height//3, 2*width//3, height),
+                'bottom-right': (2*width//3, 2*height//3, width, height)
+            }
+            # Find the best matching region
+            for region_name, coords in regions.items():
+                if region_name in position.lower():
+                    return coords
+            # Default to center if no match
+            return regions['center']
         def draw_observations(self, image, observations):
             """Draw a coloured box and text label for each observation.

             Each observation is split at its first ':' into a position and a
             description; entries without a ':' are placed in the 'center'
             region. Drawing happens directly on *image*, which is also returned.

             Args:
                 image: Array to draw on; its first two ``shape`` entries are
                     (height, width).
                 observations: Iterable of observation strings.

             Returns:
                 The same *image* object after drawing.
             """
             _, width = image.shape[:2]
             text_font = cv2.FONT_HERSHEY_SIMPLEX
             text_scale = 0.6
             stroke = 2

             for index, entry in enumerate(observations):
                 box_color = self.colors[index % len(self.colors)]

                 # Everything before the first ':' names the region; the rest is the label.
                 position, colon, remainder = entry.partition(':')
                 if colon:
                     description = remainder
                 else:
                     position = 'center'
                     description = entry

                 left, top, right, bottom = self.get_region_coordinates(position, image.shape)

                 # Region outline.
                 cv2.rectangle(image, (left, top), (right, bottom), box_color, 2)

                 # Label text, truncated to 50 characters plus an ellipsis.
                 if len(description) > 50:
                     label = description[:50] + "..."
                 else:
                     label = description
                 text_width = cv2.getTextSize(label, text_font, text_scale, stroke)[0][0]

                 # Keep the label inside the image horizontally and leave room
                 # for the 20-pixel background strip above the baseline.
                 label_x = max(0, min(left, width - text_width))
                 label_y = max(20, top - 5)

                 cv2.rectangle(
                     image,
                     (label_x, label_y - 20),
                     (label_x + text_width, label_y),
                     box_color,
                     -1,
                 )
                 cv2.putText(
                     image,
                     label,
                     (label_x, label_y - 5),
                     text_font,
                     text_scale,
                     (255, 255, 255),
                     stroke,
                 )
             return image
         def process_frame(self, frame: np.ndarray) -> tuple[np.ndarray, str]:
             """Annotate *frame* with the most recent safety observations.

             A new analysis is requested only when at least
             ``self.analysis_interval`` seconds have passed since
             ``self.last_analysis_time``; in between, the observations from the
             previous analysis are redrawn on each incoming frame.

             Observations are taken from reply lines that start with '-' and
             contain ``<location>...</location>``. When a reply yields no such
             lines, its raw text is reported instead of an empty string, so
             messages such as "Analysis Error: ..." returned by
             ``analyze_frame`` reach the user rather than being dropped.

             Args:
                 frame: Image array, or ``None``.

             Returns:
                 ``(annotated_frame, text)``. ``annotated_frame`` is a drawn-on
                 copy of *frame*; ``text`` is one "- observation" line per
                 observation, or the raw reply of the last analysis when it
                 contained no observations. ``(None, "No image provided")`` is
                 returned when *frame* is ``None``.
             """
             if frame is None:
                 return None, "No image provided"

             now = time.time()
             # Only perform analysis if enough time has passed.
             if now - self.last_analysis_time >= self.analysis_interval:
                 analysis = self.analyze_frame(frame)
                 self.last_analysis_time = now

                 open_tag, close_tag = '<location>', '</location>'
                 observations = []
                 for raw_line in analysis.split('\n'):
                     line = raw_line.strip()
                     if not line.startswith('-'):
                         continue
                     start = line.find(open_tag)
                     end = line.find(close_tag)
                     if start == -1 or end == -1:
                         continue
                     observation = line[start + len(open_tag):end].strip()
                     if observation:
                         observations.append(observation)
                 self.last_observations = observations
                 # Remember the raw reply when nothing could be parsed so it can
                 # still be shown on the throttled frames that follow.
                 self._last_status_text = '' if observations else analysis.strip()

             # Draw on a copy so the caller's frame is left untouched.
             display_frame = frame.copy()
             annotated_frame = self.draw_observations(display_frame, self.last_observations)

             if self.last_observations:
                 report = '\n'.join(f"- {obs}" for obs in self.last_observations)
             else:
                 # The attribute is absent until the first analysis has run.
                 report = getattr(self, '_last_status_text', '')
             return annotated_frame, report
     # Create the main interface
     monitor = SafetyMonitor()
         gr.Markdown("# Safety Analysis System powered by Llama 3.2 90b vision")
         with gr.Row():
+            webcam = gr.Image(source="webcam", streaming=True, label="Live Feed")
+            output_image = gr.Image(label="Analysis")
+        analysis_text = gr.Textbox(label="Safety Concerns", lines=5)
+        def analyze_stream(image):
             if image is None:
                 return None, "No image provided"
             try:
                 print(f"Processing error: {str(e)}")
                 return None, f"Error processing image: {str(e)}"
+        webcam.stream(
+            fn=analyze_stream,
+            outputs=[output_image, analysis_text],
+            show_progress=False
         )
     return demo