willwade committed
Commit 87a5eeb · 1 Parent(s): 13f27c7

fix frames as video - more time
Files changed (1): app.py (+13, -10)
app.py CHANGED
@@ -22,7 +22,8 @@ class ChaplinGradio:
 
         # Frame buffer
         self.frame_buffer = []
-        self.buffer_size = 16  # Number of frames to accumulate before processing
+        self.min_frames = 32  # 2 seconds of video at 16 fps
+        self.last_prediction = ""
 
     def download_models(self):
         """Download required model files from HuggingFace"""
@@ -65,7 +66,7 @@
         current_time = time.time()
 
         if current_time - self.last_frame_time < self.frame_interval:
-            return None
+            return self.last_prediction
 
         self.last_frame_time = current_time
 
@@ -80,8 +81,8 @@
             # Add frame to buffer
            self.frame_buffer.append(frame)
 
-            # Only process when we have enough frames
-            if len(self.frame_buffer) >= self.buffer_size:
+            # Process when we have enough frames
+            if len(self.frame_buffer) >= self.min_frames:
                 # Create temp directory if it doesn't exist
                 os.makedirs("temp", exist_ok=True)
 
@@ -105,13 +106,15 @@
                     out.write(f)
                 out.release()
 
-                # Clear buffer
-                self.frame_buffer = []
+                # Clear buffer but keep last few frames for continuity
+                self.frame_buffer = self.frame_buffer[-8:]  # Keep last 0.5 seconds
 
                 try:
                     # Process the video file using the pipeline
                     predicted_text = self.vsr_model(temp_video)
-                    return predicted_text
+                    if predicted_text:
+                        self.last_prediction = predicted_text
+                    return self.last_prediction
 
                 except Exception as e:
                     print(f"Error during inference: {str(e)}")
@@ -121,7 +124,7 @@
                     if os.path.exists(temp_video):
                         os.remove(temp_video)
 
-            return "Collecting frames..."  # Return status while collecting frames
+            return self.last_prediction or "Waiting for speech..."
 
         except Exception as e:
             print(f"Error processing: {str(e)}")
@@ -134,9 +137,9 @@ chaplin = ChaplinGradio()
 iface = gr.Interface(
     fn=chaplin.process_frame,
     inputs=gr.Image(sources=["webcam"], streaming=True),
-    outputs=gr.Textbox(label="Predicted Text"),
+    outputs=gr.Textbox(label="Predicted Text", interactive=False),
     title="Chaplin - Live Visual Speech Recognition",
-    description="Use your webcam to perform real-time visual speech recognition.",
+    description="Speak clearly into the webcam. The model will process your speech in ~2 second chunks.",
     live=True
 )
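For readers skimming the diff, the change amounts to a small sliding-window buffer: collect roughly two seconds of webcam frames (32 at 16 fps), run the VSR pipeline on that chunk, carry the last 8 frames over into the next chunk for continuity, and always return the latest prediction rather than None so the live textbox never goes blank. The sketch below illustrates that flow in isolation; FrameChunker, push, and run_inference are hypothetical stand-ins for ChaplinGradio.process_frame and the self.vsr_model call, not code from this commit.

# Illustrative sketch only -- hypothetical names, not part of app.py.
MIN_FRAMES = 32  # ~2 seconds of video at 16 fps before running inference
OVERLAP = 8      # ~0.5 seconds of frames carried over between chunks


class FrameChunker:
    def __init__(self, run_inference):
        # run_inference: callable taking a list of frames and returning text
        self.run_inference = run_inference
        self.frame_buffer = []
        self.last_prediction = ""

    def push(self, frame):
        """Called once per incoming webcam frame; always returns display text."""
        self.frame_buffer.append(frame)

        if len(self.frame_buffer) >= MIN_FRAMES:
            chunk = self.frame_buffer
            # Keep the tail of the chunk so speech spanning a chunk
            # boundary is seen again in the next inference window.
            self.frame_buffer = chunk[-OVERLAP:]
            text = self.run_inference(chunk)
            if text:
                self.last_prediction = text

        # Never return None: the Gradio textbox keeps showing the
        # most recent prediction (or a waiting message) between chunks.
        return self.last_prediction or "Waiting for speech..."


if __name__ == "__main__":
    # Fake "model" that just reports the chunk size instead of lipreading.
    chunker = FrameChunker(lambda frames: f"processed {len(frames)} frames")
    for i in range(70):
        result = chunker.push(frame=i)
    print(result)  # -> "processed 32 frames"

Keeping an 8-frame overlap trades a little duplicated inference work for smoother output at chunk boundaries, and returning last_prediction on every call is what lets outputs=gr.Textbox(..., interactive=False) with live=True behave like a continuously updating caption.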
145