Commit 4e75e2b · Parent: 8acaa5d
Updated html to json
shared.py
CHANGED
@@ -455,43 +455,100 @@ class RealtimeSpeakerDiarization:
         return f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"

     def get_formatted_conversation(self):
-        """Get the formatted conversation"""
+        """Get the formatted conversation with structured data"""
+        try:
+            # Create conversation HTML format as before
+            html_content = self.current_conversation
+
+            # Create structured data
+            structured_data = {
+                "html_content": html_content,
+                "sentences": [],
+                "current_transcript": self.last_transcription,
+                "current_speaker": self.speaker_detector.current_speaker if self.speaker_detector else 0
+            }
+
+            # Add sentence data
+            for sentence_text, speaker_id in self.full_sentences:
+                color = self.speaker_detector.get_color_for_speaker(speaker_id) if self.speaker_detector else "#FFFFFF"
+                structured_data["sentences"].append({
+                    "text": sentence_text,
+                    "speaker_id": speaker_id,
+                    "speaker_name": f"Speaker {speaker_id + 1}",
+                    "color": color
+                })
+
+            return html_content
+        except Exception as e:
+            logger.error(f"Error formatting conversation: {e}")
+            return f"<i>Error formatting conversation: {str(e)}</i>"

     def get_status_info(self):
-        """Get current status information"""
+        """Get current status information as structured data"""
         if not self.speaker_detector:
-            return "Speaker detector not initialized"
+            return {"error": "Speaker detector not initialized"}

         try:
+            speaker_status = self.speaker_detector.get_status_info()

+            # Format speaker activity
+            speaker_activity = []
+            for i in range(speaker_status['max_speakers']):
+                color_name = SPEAKER_COLOR_NAMES[i] if i < len(SPEAKER_COLOR_NAMES) else f"Speaker {i+1}"
+                count = speaker_status['speaker_counts'][i]
+                active = count > 0
+                speaker_activity.append({
+                    "id": i,
+                    "name": f"Speaker {i+1}",
+                    "color": SPEAKER_COLORS[i] if i < len(SPEAKER_COLORS) else "#FFFFFF",
+                    "color_name": color_name,
+                    "segment_count": count,
+                    "active": active
+                })
+
+            # Create structured status object
+            status = {
+                "current_speaker": speaker_status['current_speaker'],
+                "current_speaker_name": f"Speaker {speaker_status['current_speaker'] + 1}",
+                "active_speakers_count": speaker_status['active_speakers'],
+                "max_speakers": speaker_status['max_speakers'],
+                "last_similarity": speaker_status['last_similarity'],
+                "change_threshold": speaker_status['threshold'],
+                "total_sentences": len(self.full_sentences),
+                "segments_processed": speaker_status['segment_counter'],
+                "speaker_activity": speaker_activity,
+                "timestamp": time.time()
+            }
+
+            # Also create a formatted text version for UI display
             status_lines = [
                 f"**Current Speaker:** {status['current_speaker'] + 1}",
-                f"**Active Speakers:** {status['
+                f"**Active Speakers:** {status['active_speakers_count']} of {status['max_speakers']}",
                 f"**Last Similarity:** {status['last_similarity']:.3f}",
-                f"**Change Threshold:** {status['
-                f"**Total Sentences:** {
-                f"**Segments Processed:** {status['
+                f"**Change Threshold:** {status['change_threshold']:.2f}",
+                f"**Total Sentences:** {status['total_sentences']}",
+                f"**Segments Processed:** {status['segments_processed']}",
                 "",
                 "**Speaker Activity:**"
             ]

-            for
-                active = "🟢" if count > 0 else "⚫"
-                status_lines.append(f"{active} Speaker {i+1} ({color_name}): {count} segments")
+            for speaker in status["speaker_activity"]:
+                active = "🟢" if speaker["active"] else "⚫"
+                status_lines.append(f"{active} Speaker {speaker['id']+1} ({speaker['color_name']}): {speaker['segment_count']} segments")

+            status["formatted_text"] = "\n".join(status_lines)
+
+            return status

         except Exception as e:
+            error_msg = f"Error getting status: {e}"
+            logger.error(error_msg)
+            return {"error": error_msg, "formatted_text": error_msg}

     def process_audio_chunk(self, audio_data, sample_rate=16000):
         """Process audio chunk from WebSocket input"""
         if not self.is_running or self.audio_processor is None:
-            return
+            return {"status": "not_running"}

         try:
             # Convert bytes to numpy array if needed

@@ -517,6 +574,10 @@ class RealtimeSpeakerDiarization:
             self.audio_processor.add_audio_chunk(audio_data)

             # Periodically extract embeddings for speaker detection
+            embedding = None
+            speaker_id = self.speaker_detector.current_speaker
+            similarity = 1.0
+
             if len(self.audio_processor.audio_buffer) % (SAMPLE_RATE // 2) == 0:  # Every 0.5 seconds
                 embedding = self.audio_processor.extract_embedding_from_buffer()
                 if embedding is not None:

@@ -527,9 +588,18 @@ class RealtimeSpeakerDiarization:
                     with self.transcription_lock:
                         self.full_sentences.append((f"[Audio segment {self.speaker_detector.segment_counter}]", speaker_id))
                         self.update_conversation_display()
+
+            # Return processing result
+            return {
+                "status": "processed",
+                "buffer_size": len(self.audio_processor.audio_buffer),
+                "speaker_id": speaker_id,
+                "similarity": similarity if embedding is not None else None
+            }

         except Exception as e:
             logger.error(f"Error processing audio chunk: {e}")
+            return {"status": "error", "message": str(e)}

     def resample_audio(self, audio_bytes, from_rate, to_rate):
         """Resample audio to target sample rate"""
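Worth noting: after this change `get_status_info()` and `process_audio_chunk()` return plain dictionaries, while `get_formatted_conversation()` still returns the HTML string (the `structured_data` dict it builds is not yet returned). The sketch below is a minimal illustration, not code from this commit, of how those return values could be wrapped into the JSON frames that the updated `ui.py` handler (next file) parses. The `build_ws_frames` helper and the `status_update` message type are assumptions; only `conversation_update` and `conversation_html` actually appear in the handler.

# Minimal sketch (not part of this commit): wrapping the structured results
# in JSON envelopes of the shape the updated ui.py onmessage handler reads.
import json


def build_ws_frames(diarization):
    """Serialize diarization state into JSON strings for the WebSocket.

    `diarization` is assumed to be a RealtimeSpeakerDiarization instance
    exposing the methods shown in the diff above.
    """
    # Dict with speaker_activity, formatted_text, timestamp, etc.
    status = diarization.get_status_info()

    # Still an HTML string in this commit; ui.py reads "conversation_html".
    conversation_msg = {
        "type": "conversation_update",
        "conversation_html": diarization.get_formatted_conversation(),
    }

    # Hypothetical envelope for the status dict; ui.py has no case for it yet.
    status_msg = {"type": "status_update", "data": status}

    return json.dumps(conversation_msg), json.dumps(status_msg)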
ui.py
CHANGED
@@ -173,10 +173,57 @@ def build_ui():
             };

             wsConnection.onmessage = (event) => {
+                try {
+                    // Parse the JSON message
+                    const message = JSON.parse(event.data);
+
+                    // Process different message types
+                    switch(message.type) {
+                        case 'transcription':
+                            // Handle transcription data
+                            if (message.data && typeof message.data === 'object') {
+                                document.getElementById("conversation").innerHTML = message.data.conversation_html ||
+                                    JSON.stringify(message.data);
+                            }
+                            break;
+
+                        case 'connection':
+                            console.log('Connection status:', message.status);
+                            updateStatus(message.status === 'connected' ? 'connected' : 'warning');
+                            break;
+
+                        case 'conversation_update':
+                            if (message.conversation_html) {
+                                document.getElementById("conversation").innerHTML = message.conversation_html;
+                            }
+                            break;
+
+                        case 'conversation_cleared':
+                            document.getElementById("conversation").innerHTML =
+                                "<i>Conversation cleared. Start speaking again...</i>";
+                            break;
+
+                        case 'error':
+                            console.error('Error message from server:', message.message);
+                            updateStatus('warning', message.message);
+                            break;
+
+                        default:
+                            // If it's just HTML content without proper JSON structure (legacy format)
+                            document.getElementById("conversation").innerHTML = event.data;
+                    }
+
+                    // Auto-scroll to bottom
+                    const container = document.getElementById("conversation");
+                    container.scrollTop = container.scrollHeight;
+                } catch (e) {
+                    // Fallback for non-JSON messages (legacy format)
+                    document.getElementById("conversation").innerHTML = event.data;
+
+                    // Auto-scroll to bottom
+                    const container = document.getElementById("conversation");
+                    container.scrollTop = container.scrollHeight;
+                }
             };

             wsConnection.onerror = (error) => {
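For reference, the handler above expects every WebSocket frame to be a JSON object with a `type` field. The snippet below is illustrative only: it prints one example payload per branch of the switch, using the keys the handler actually reads (`data.conversation_html`, `status`, `conversation_html`, `message`); all values are made up.

# Illustrative only: example frames for each message type handled by the
# updated onmessage handler in ui.py. All field values here are made up.
import json

example_frames = [
    {"type": "transcription", "data": {"conversation_html": "<p>Speaker 1: hello</p>"}},
    {"type": "connection", "status": "connected"},
    {"type": "conversation_update", "conversation_html": "<p>Speaker 2: hi there</p>"},
    {"type": "conversation_cleared"},
    {"type": "error", "message": "Speaker detector not initialized"},
]

for frame in example_frames:
    print(json.dumps(frame))  # the string that would be sent over the WebSocket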