willwade committed on
Commit
fea9cb0
·
1 Parent(s): 87a5eeb
Files changed (1) hide show
  1. app.py +19 -2
app.py CHANGED
@@ -24,6 +24,7 @@ class ChaplinGradio:
24
  self.frame_buffer = []
25
  self.min_frames = 32 # 2 seconds of video at 16 fps
26
  self.last_prediction = ""
 
27
 
28
  def download_models(self):
29
  """Download required model files from HuggingFace"""
@@ -71,26 +72,34 @@ class ChaplinGradio:
71
  self.last_frame_time = current_time
72
 
73
  if frame is None:
 
74
  return "No video input detected"
75
 
76
  try:
 
 
77
  # Convert frame to grayscale if it's not already
78
  if len(frame.shape) == 3:
79
  frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
 
80
 
81
  # Add frame to buffer
82
  self.frame_buffer.append(frame)
 
83
 
84
  # Process when we have enough frames
85
  if len(self.frame_buffer) >= self.min_frames:
 
86
  # Create temp directory if it doesn't exist
87
  os.makedirs("temp", exist_ok=True)
88
 
89
  # Generate temporary video file path
90
  temp_video = f"temp/frames_{time.time_ns()}.mp4"
 
91
 
92
  # Get frame dimensions from first frame
93
  frame_height, frame_width = self.frame_buffer[0].shape[:2]
 
94
 
95
  # Create video writer
96
  out = cv2.VideoWriter(
@@ -102,16 +111,20 @@ class ChaplinGradio:
102
  )
103
 
104
  # Write all frames to video
105
- for f in self.frame_buffer:
106
  out.write(f)
 
107
  out.release()
108
 
109
  # Clear buffer but keep last few frames for continuity
110
  self.frame_buffer = self.frame_buffer[-8:] # Keep last 0.5 seconds
 
111
 
112
  try:
113
  # Process the video file using the pipeline
 
114
  predicted_text = self.vsr_model(temp_video)
 
115
  if predicted_text:
116
  self.last_prediction = predicted_text
117
  return self.last_prediction
@@ -123,6 +136,7 @@ class ChaplinGradio:
123
  # Clean up temp file
124
  if os.path.exists(temp_video):
125
  os.remove(temp_video)
 
126
 
127
  return self.last_prediction or "Waiting for speech..."
128
 
@@ -137,7 +151,10 @@ chaplin = ChaplinGradio()
137
  iface = gr.Interface(
138
  fn=chaplin.process_frame,
139
  inputs=gr.Image(sources=["webcam"], streaming=True),
140
- outputs=gr.Textbox(label="Predicted Text", interactive=False),
 
 
 
141
  title="Chaplin - Live Visual Speech Recognition",
142
  description="Speak clearly into the webcam. The model will process your speech in ~2 second chunks.",
143
  live=True
 
24
  self.frame_buffer = []
25
  self.min_frames = 32 # 2 seconds of video at 16 fps
26
  self.last_prediction = ""
27
+ print(f"Initialized with device: {self.device}, fps: {self.fps}, min_frames: {self.min_frames}")
28
 
29
  def download_models(self):
30
  """Download required model files from HuggingFace"""
 
72
  self.last_frame_time = current_time
73
 
74
  if frame is None:
75
+ print("Received None frame")
76
  return "No video input detected"
77
 
78
  try:
79
+ print(f"Received frame with shape: {frame.shape}")
80
+
81
  # Convert frame to grayscale if it's not already
82
  if len(frame.shape) == 3:
83
  frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
84
+ print("Converted frame to grayscale")
85
 
86
  # Add frame to buffer
87
  self.frame_buffer.append(frame)
88
+ print(f"Buffer size now: {len(self.frame_buffer)}/{self.min_frames}")
89
 
90
  # Process when we have enough frames
91
  if len(self.frame_buffer) >= self.min_frames:
92
+ print("Processing buffer - have enough frames")
93
  # Create temp directory if it doesn't exist
94
  os.makedirs("temp", exist_ok=True)
95
 
96
  # Generate temporary video file path
97
  temp_video = f"temp/frames_{time.time_ns()}.mp4"
98
+ print(f"Created temp video path: {temp_video}")
99
 
100
  # Get frame dimensions from first frame
101
  frame_height, frame_width = self.frame_buffer[0].shape[:2]
102
+ print(f"Video dimensions: {frame_width}x{frame_height}")
103
 
104
  # Create video writer
105
  out = cv2.VideoWriter(
 
111
  )
112
 
113
  # Write all frames to video
114
+ for i, f in enumerate(self.frame_buffer):
115
  out.write(f)
116
+ print(f"Wrote {i+1} frames to video")
117
  out.release()
118
 
119
  # Clear buffer but keep last few frames for continuity
120
  self.frame_buffer = self.frame_buffer[-8:] # Keep last 0.5 seconds
121
+ print(f"Cleared buffer, kept {len(self.frame_buffer)} frames")
122
 
123
  try:
124
  # Process the video file using the pipeline
125
+ print("Starting model inference...")
126
  predicted_text = self.vsr_model(temp_video)
127
+ print(f"Model prediction: {predicted_text}")
128
  if predicted_text:
129
  self.last_prediction = predicted_text
130
  return self.last_prediction
 
136
  # Clean up temp file
137
  if os.path.exists(temp_video):
138
  os.remove(temp_video)
139
+ print("Cleaned up temp video file")
140
 
141
  return self.last_prediction or "Waiting for speech..."
142
 
 
151
  iface = gr.Interface(
152
  fn=chaplin.process_frame,
153
  inputs=gr.Image(sources=["webcam"], streaming=True),
154
+ outputs=[
155
+ gr.Textbox(label="Predicted Text", interactive=False),
156
+ gr.Textbox(label="Debug Log", interactive=False)
157
+ ],
158
  title="Chaplin - Live Visual Speech Recognition",
159
  description="Speak clearly into the webcam. The model will process your speech in ~2 second chunks.",
160
  live=True