willwade committed
Commit 87a5eeb · 1 Parent(s): 13f27c7

fix frames as video - more time
Files changed (1): app.py (+13, -10)
app.py CHANGED
@@ -22,7 +22,8 @@ class ChaplinGradio:
 
         # Frame buffer
         self.frame_buffer = []
-        self.buffer_size = 16  # Number of frames to accumulate before processing
+        self.min_frames = 32  # 2 seconds of video at 16 fps
+        self.last_prediction = ""
 
     def download_models(self):
         """Download required model files from HuggingFace"""
@@ -65,7 +66,7 @@
         current_time = time.time()
 
         if current_time - self.last_frame_time < self.frame_interval:
-            return None
+            return self.last_prediction
 
         self.last_frame_time = current_time
 
@@ -80,8 +81,8 @@
             # Add frame to buffer
            self.frame_buffer.append(frame)
 
-            # Only process when we have enough frames
-            if len(self.frame_buffer) >= self.buffer_size:
+            # Process when we have enough frames
+            if len(self.frame_buffer) >= self.min_frames:
                 # Create temp directory if it doesn't exist
                 os.makedirs("temp", exist_ok=True)
 
@@ -105,13 +106,15 @@
                     out.write(f)
                 out.release()
 
-                # Clear buffer
-                self.frame_buffer = []
+                # Clear buffer but keep last few frames for continuity
+                self.frame_buffer = self.frame_buffer[-8:]  # Keep last 0.5 seconds
 
                 try:
                     # Process the video file using the pipeline
                     predicted_text = self.vsr_model(temp_video)
-                    return predicted_text
+                    if predicted_text:
+                        self.last_prediction = predicted_text
+                    return self.last_prediction
 
                 except Exception as e:
                     print(f"Error during inference: {str(e)}")
@@ -121,7 +124,7 @@
                     if os.path.exists(temp_video):
                         os.remove(temp_video)
 
-            return "Collecting frames..."  # Return status while collecting frames
+            return self.last_prediction or "Waiting for speech..."
 
         except Exception as e:
             print(f"Error processing: {str(e)}")
@@ -134,9 +137,9 @@ chaplin = ChaplinGradio()
 iface = gr.Interface(
     fn=chaplin.process_frame,
     inputs=gr.Image(sources=["webcam"], streaming=True),
-    outputs=gr.Textbox(label="Predicted Text"),
+    outputs=gr.Textbox(label="Predicted Text", interactive=False),
     title="Chaplin - Live Visual Speech Recognition",
-    description="Use your webcam to perform real-time visual speech recognition.",
+    description="Speak clearly into the webcam. The model will process your speech in ~2 second chunks.",
     live=True
 )
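For readers skimming the diff, the change amounts to a small sliding-window buffer: collect roughly two seconds of webcam frames (32 at 16 fps), run the VSR pipeline on that chunk, carry the last 8 frames over into the next chunk for continuity, and always return the latest prediction rather than None so the live textbox never goes blank. The sketch below illustrates that flow in isolation; FrameChunker, push, and run_inference are hypothetical stand-ins for ChaplinGradio.process_frame and the self.vsr_model call, not code from this commit.

# Illustrative sketch only -- hypothetical names, not part of app.py.
MIN_FRAMES = 32  # ~2 seconds of video at 16 fps before running inference
OVERLAP = 8      # ~0.5 seconds of frames carried over between chunks


class FrameChunker:
    def __init__(self, run_inference):
        # run_inference: callable taking a list of frames and returning text
        self.run_inference = run_inference
        self.frame_buffer = []
        self.last_prediction = ""

    def push(self, frame):
        """Called once per incoming webcam frame; always returns display text."""
        self.frame_buffer.append(frame)

        if len(self.frame_buffer) >= MIN_FRAMES:
            chunk = self.frame_buffer
            # Keep the tail of the chunk so speech spanning a chunk
            # boundary is seen again in the next inference window.
            self.frame_buffer = chunk[-OVERLAP:]
            text = self.run_inference(chunk)
            if text:
                self.last_prediction = text

        # Never return None: the Gradio textbox keeps showing the
        # most recent prediction (or a waiting message) between chunks.
        return self.last_prediction or "Waiting for speech..."


if __name__ == "__main__":
    # Fake "model" that just reports the chunk size instead of lipreading.
    chunker = FrameChunker(lambda frames: f"processed {len(frames)} frames")
    for i in range(70):
        result = chunker.push(frame=i)
    print(result)  # -> "processed 32 frames"

Keeping an 8-frame overlap trades a little duplicated inference work for smoother output at chunk boundaries, and returning last_prediction on every call is what lets outputs=gr.Textbox(..., interactive=False) with live=True behave like a continuously updating caption.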
145