rcastriotta committed
Commit 1a3fc6f · 1 Parent(s): 86dac0c
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .DS_Store +0 -0
  2. Dockerfile +93 -0
  3. docker-compose.yml +17 -0
  4. node-server/.gitignore +2 -0
  5. node-server/package-lock.json +0 -0
  6. node-server/package.json +26 -0
  7. node-server/server.js +33 -0
  8. node-server/transcription-client.js +172 -0
  9. node-server/websocket.js +30 -0
  10. seamless-server/.DS_Store +0 -0
  11. seamless-server/.gitignore +5 -0
  12. seamless-server/models/Seamless/vad_s2st_sc_24khz_main.yaml +25 -0
  13. seamless-server/models/SeamlessStreaming/vad_s2st_sc_main.yaml +21 -0
  14. seamless-server/old_server.py +874 -0
  15. seamless-server/requirements.txt +34 -0
  16. seamless-server/run_docker.sh +5 -0
  17. seamless-server/server.py +288 -0
  18. seamless-server/src/auth.py +20 -0
  19. seamless-server/src/client.py +23 -0
  20. seamless-server/src/context.py +83 -0
  21. seamless-server/src/logging.py +58 -0
  22. seamless-server/src/room.py +65 -0
  23. seamless-server/src/simuleval_agent_directory.py +171 -0
  24. seamless-server/src/simuleval_transcoder.py +409 -0
  25. seamless-server/src/speech_and_text_output.py +15 -0
  26. seamless-server/src/transcoder_helpers.py +44 -0
  27. seamless-server/src/transcriber.py +128 -0
  28. seamless-server/src/translate.py +21 -0
  29. seamless-server/whl/seamless_communication-1.0.0-py3-none-any.whl +0 -0
  30. streaming-test-app/.eslintrc.cjs +18 -0
  31. streaming-test-app/.gitignore +24 -0
  32. streaming-test-app/index.html +13 -0
  33. streaming-test-app/package-lock.json +0 -0
  34. streaming-test-app/package.json +53 -0
  35. streaming-test-app/src/App.tsx +57 -0
  36. streaming-test-app/src/Blink.tsx +41 -0
  37. streaming-test-app/src/DebugSection.tsx +62 -0
  38. streaming-test-app/src/RoomConfig.tsx +271 -0
  39. streaming-test-app/src/SocketWrapper.tsx +218 -0
  40. streaming-test-app/src/StreamingInterface.css +56 -0
  41. streaming-test-app/src/StreamingInterface.tsx +1219 -0
  42. streaming-test-app/src/URLParams.ts +50 -0
  43. streaming-test-app/src/assets/Roboto-msdf.json +0 -0
  44. streaming-test-app/src/assets/Roboto-msdf.png +0 -0
  45. streaming-test-app/src/assets/RobotoMono-Regular-msdf.json +0 -0
  46. streaming-test-app/src/assets/RobotoMono-Regular.png +0 -0
  47. streaming-test-app/src/assets/seamless.svg +6 -0
  48. streaming-test-app/src/createBufferedSpeechPlayer.ts +173 -0
  49. streaming-test-app/src/cursorBlinkInterval.ts +1 -0
  50. streaming-test-app/src/debug.ts +257 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
Dockerfile ADDED
@@ -0,0 +1,93 @@
+ # build frontend with node
+ FROM node:20-alpine AS frontend
+ RUN apk add --no-cache libc6-compat
+ WORKDIR /app
+
+ COPY streaming-test-app .
+ RUN \
+     if [ -f yarn.lock ]; then yarn --frozen-lockfile; \
+     elif [ -f package-lock.json ]; then npm ci; \
+     elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i --frozen-lockfile; \
+     else echo "Lockfile not found." && exit 1; \
+     fi
+
+ RUN npm run build
+
+ # build backend on CUDA
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS backend
+ WORKDIR /app
+
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV NODE_MAJOR=20
+
+ RUN apt-get update && \
+     apt-get upgrade -y && \
+     apt-get install -y --no-install-recommends \
+     git \
+     git-lfs \
+     wget \
+     curl \
+     # python build dependencies \
+     build-essential \
+     libssl-dev \
+     zlib1g-dev \
+     libbz2-dev \
+     libreadline-dev \
+     libsqlite3-dev \
+     libncursesw5-dev \
+     xz-utils \
+     tk-dev \
+     libxml2-dev \
+     libxmlsec1-dev \
+     libffi-dev \
+     liblzma-dev \
+     sox libsox-fmt-all \
+     # gradio dependencies \
+     ffmpeg \
+     # fairseq2 dependencies \
+     libjpeg8-dev \
+     libpng-dev \
+     libsndfile-dev && \
+     apt-get clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ USER root
+ RUN ln -s /usr/lib/x86_64-linux-gnu/libsox.so.3 /usr/lib/x86_64-linux-gnu/libsox.so
+ # install older versions: libjpeg62-turbo and libpng15
+ RUN wget http://ftp.us.debian.org/debian/pool/main/libj/libjpeg-turbo/libjpeg62-turbo_2.1.5-2_amd64.deb && \
+     dpkg -i libjpeg62-turbo_2.1.5-2_amd64.deb && \
+     rm libjpeg62-turbo_2.1.5-2_amd64.deb
+ RUN wget https://master.dl.sourceforge.net/project/libpng/libpng15/1.5.30/libpng-1.5.30.tar.gz && \
+     tar -xvf libpng-1.5.30.tar.gz && cd libpng-1.5.30 && ./configure && make && make install && cd .. && rm -rf libpng-1.5.30.tar.gz libpng-1.5.30
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+
+ RUN curl https://pyenv.run | bash
+ ENV PATH=$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH
+ ARG PYTHON_VERSION=3.10.12
+ RUN pyenv install $PYTHON_VERSION && \
+     pyenv global $PYTHON_VERSION && \
+     pyenv rehash && \
+     pip install --no-cache-dir -U pip setuptools wheel
+
+ COPY --chown=user:user ./seamless-server ./seamless-server
+ # change dir since pip needs to see the whl folder
+ RUN cd seamless-server && \
+     pip install fairseq2 --pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.1.1/cu118 && \
+     pip install --no-cache-dir --upgrade -r requirements.txt
+ COPY --from=frontend /app/dist ./streaming-test-app/dist
+
+ WORKDIR $HOME/app/seamless-server
+ RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=false \
+     huggingface-cli login --token $(cat /run/secrets/HF_TOKEN) || echo "HF_TOKEN error" && \
+     huggingface-cli download meta-private/SeamlessExpressive pretssel_melhifigan_wm-final.pt --local-dir ./models/Seamless/ || echo "HF_TOKEN error" && \
+     ln -s $(readlink -f models/Seamless/pretssel_melhifigan_wm-final.pt) models/Seamless/pretssel_melhifigan_wm.pt || true;
+
+ USER user
+ RUN ["chmod", "+x", "./run_docker.sh"]
+ CMD ./run_docker.sh
+
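Note on the HF_TOKEN secret mount above: with BuildKit enabled, the token can be supplied at build time, e.g. (hypothetical token path)

    docker build --secret id=HF_TOKEN,src=./hf_token.txt -t seamless .

If the token is missing, the `|| echo "HF_TOKEN error"` fallbacks let the build continue without the gated SeamlessExpressive checkpoint.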
docker-compose.yml ADDED
@@ -0,0 +1,17 @@
+ version: '3'
+ services:
+   seamless:
+     build: .
+     volumes:
+       - ./seamless-server:/home/user/app/seamless-server # for hot reload in DEV
+     ports:
+       - "80:7860"
+     environment:
+       - NODE_ENV=development
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: 1
+               capabilities: [gpu]
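For local use, something like `docker compose up --build` should build the image and expose the server on port 80 (mapped to uvicorn's 7860); the GPU reservation assumes the NVIDIA Container Toolkit is installed on the host.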
node-server/.gitignore ADDED
@@ -0,0 +1,2 @@
+ /node_modules
+ .env
node-server/package-lock.json ADDED
The diff for this file is too large to render.
 
node-server/package.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "name": "server",
+   "version": "1.0.0",
+   "description": "",
+   "main": "server.js",
+   "scripts": {
+     "test": "echo \"Error: no test specified\" && exit 1",
+     "start": "node server.js",
+     "dev": "nodemon server.js"
+   },
+   "engines": {
+     "node": "v20.5.0"
+   },
+   "keywords": [],
+   "author": "",
+   "license": "ISC",
+   "dependencies": {
+     "@deepgram/sdk": "^3.0.1",
+     "cors": "^2.8.5",
+     "crypto": "^1.0.1",
+     "dotenv": "^16.4.1",
+     "express": "^4.18.2",
+     "nodemon": "^3.0.3",
+     "socket.io": "^4.7.4"
+   }
+ }
node-server/server.js ADDED
@@ -0,0 +1,33 @@
+ require("dotenv").config();
+ const express = require("express");
+ const cors = require("cors");
+ const app = express();
+ const http = require("http").Server(app);
+ const initializeWebSocket = require("./websocket");
+
+ // TODO redis store
+ const io = require("socket.io")(http, {
+   cors: {
+     origin: "http://localhost:5173",
+     methods: ["GET", "POST"],
+   },
+ });
+
+ initializeWebSocket(io);
+
+ app.use(cors({ credentials: false, origin: "http://localhost:5173" }));
+
+ app.use(express.json());
+
+ app.use((req, _, next) => {
+   req.io = io;
+   next();
+ });
+
+
+ app.get("/", (req, res) => res.send("worked"));
+
+ const PORT = process.env.PORT || 3002;
+ http.listen(PORT, () => {
+   console.log(`Server listening at http://localhost:${PORT}`);
+ });
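Once running, a quick smoke test on the default port is `curl http://localhost:3002/`, which should return "worked"; Socket.IO clients connect on the same port.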
node-server/transcription-client.js ADDED
@@ -0,0 +1,172 @@
+ const { createClient, LiveTranscriptionEvents } = require("@deepgram/sdk");
+ const EventEmitter = require("events");
+ const crypto = require("crypto");
+
+ class TranscriptionClient extends EventEmitter {
+   constructor() {
+     super();
+     this.deepgramStream = null;
+     this.deepgramSessionId = null;
+     this.currentTranscript = "";
+     this.currentDiarization = {};
+     this.releaseTimeout = null;
+     this.killTimeout = null;
+     this.releaseThresholdMS = 4000;
+     this.killThresholdMS = 1000 * 60 * 2;
+     this.diarize = false;
+     this.speakerLabels = {};
+   }
+
+   startTranscriptionStream(language) {
+     console.log("started deepgram");
+     const localSessionId = crypto.randomUUID();
+     this.deepgramSessionId = localSessionId;
+     const deepgram = createClient(process.env.DEEPGRAM_API_KEY);
+     this.deepgramStream = deepgram.listen.live({
+       model: "nova-2",
+       punctuate: true,
+       language,
+       interim_results: true,
+       diarize: this.diarize,
+       smart_format: true,
+       endpointing: "2",
+     });
+
+     this.deepgramStream.on(LiveTranscriptionEvents.Error, (err) => {
+       console.log("Deepgram error: ", err);
+     });
+
+     this.deepgramStream.on(LiveTranscriptionEvents.Warning, (err) => {
+       console.log("Deepgram warning: ", err);
+     });
+
+     this.deepgramStream.on(LiveTranscriptionEvents.Open, () => {
+       this.resetKillTimeout();
+
+       this.deepgramStream.on(
+         LiveTranscriptionEvents.Transcript,
+         async (data) => {
+           try {
+             const response = data.channel.alternatives[0];
+             const text = response?.transcript || "";
+             if (text.length > 1) {
+               clearTimeout(this.releaseTimeout);
+               this.releaseTimeout = setTimeout(() => {
+                 this.releaseTranslations(true);
+               }, this.releaseThresholdMS);
+               this.resetKillTimeout();
+             }
+
+             // important not to translate interim results
+             if (response.transcript && data.is_final) {
+               // console.log(response.transcript);
+               const words = data.channel?.alternatives[0]?.words || [];
+               words.forEach(({ punctuated_word, speaker, start, end }) => {
+                 if (!this.currentDiarization[speaker])
+                   this.currentDiarization[speaker] = "";
+                 this.currentDiarization[speaker] += " " + punctuated_word;
+               });
+               this.emit("transcript", text);
+               this.currentTranscript += " " + text;
+               this.releaseTranslations();
+               // this.fullTranscript += " " + this.currentTranscript;
+             }
+           } catch (err) {
+             console.log(
+               "TranscribeTranslate.LiveTranscriptionEvents.Transcript:",
+               err
+             );
+           }
+         }
+       );
+     });
+     return this.deepgramSessionId;
+   }
+
+   resetKillTimeout = () => {
+     clearTimeout(this.killTimeout);
+     this.killTimeout = setTimeout(
+       () => this.endTranscriptionStream(),
+       this.killThresholdMS
+     );
+   };
+
+   releaseTranslations = async (triggeredByPause = false) => {
+     try {
+       let segment = "";
+       let speaker = null;
+       if (this.diarize) {
+         const processedSpeakers = Object.entries(this.currentDiarization).map(
+           ([speaker, transcript]) => ({
+             ...this.checkShouldSegment(transcript, triggeredByPause ? 5 : 50),
+             speaker,
+           })
+         );
+         const chosen = processedSpeakers.find((s) => s.canRelease);
+         if (!chosen) return;
+         this.currentDiarization = { [chosen.speaker]: chosen.secondPart };
+         segment = chosen.firstPart;
+         speaker = this.getSpeakerLabel(chosen.speaker);
+       } else {
+         const { canRelease, firstPart, secondPart } = this.checkShouldSegment(
+           this.currentTranscript,
+           triggeredByPause ? 5 : 50
+         );
+         if (!canRelease) return;
+         this.currentTranscript = secondPart;
+         segment = firstPart;
+       }
+
+       // translate segment
+       this.emit("translation", segment);
+
+
+       this.lastEmittedSpeaker = speaker;
+     } catch (err) {
+       console.log("TranscribeTranslate.releaseTranslations:", err);
+     }
+   };
+
+   endTranscriptionStream() {
+     try {
+       clearTimeout(this.releaseTimeout);
+       clearTimeout(this.killTimeout);
+       if (!this.deepgramStream) return;
+       this.deepgramStream.finish();
+       this.deepgramStream = null;
+       this.currentTranscript = "";
+     } catch (err) {
+       console.log("Failed to end deepgram stream", err);
+     }
+   }
+
+   checkShouldSegment = (str, minCharLimit = 25) => {
+     let firstPart = "";
+     let secondPart = "";
+     const punct = new Set([".", "!", "?", "。", "۔"]);
+     for (let i = 0; i < str.length; i += 1) {
+       const char = str[i];
+       if (i > minCharLimit) {
+         if (punct.has(char)) {
+           firstPart = str.slice(0, i + 1);
+           secondPart = str.slice(i + 1);
+         }
+       }
+     }
+
+     return { canRelease: !!firstPart.length, firstPart, secondPart };
+   };
+
+   send(payload) {
+     try {
+       if (!this.deepgramStream) return;
+       if (this.deepgramStream.getReadyState() === 1) {
+         this.deepgramStream.send(payload);
+       }
+     } catch (err) {
+       console.log("Failed to send to deepgram stream", err);
+     }
+   }
+ }
+
+ module.exports = TranscriptionClient;
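For reference, a minimal usage sketch of this class outside socket.io (hypothetical: assumes DEEPGRAM_API_KEY is set and audioChunk is a Buffer of raw audio):

    const TranscriptionClient = require("./transcription-client");

    const client = new TranscriptionClient();
    // "transcript" fires on each finalized Deepgram result; "translation" fires
    // when a sentence-sized chunk is released by checkShouldSegment()
    client.on("transcript", (text) => console.log("transcript:", text));
    client.on("translation", (segment) => console.log("segment:", segment));

    client.startTranscriptionStream("en-US"); // open the live Deepgram socket
    // client.send(audioChunk);               // forward audio as it arrives
    // client.endTranscriptionStream();       // clear timers, close the socket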
node-server/websocket.js ADDED
@@ -0,0 +1,30 @@
+ const TranscriptClient = require("./transcription-client");
+
+ // TODO remove x seconds after host left (in case of reconnect)
+ const initializeWebSocket = (io) => {
+   io.on("connection", (socket) => {
+     console.log(`connection made (${socket.id})`);
+     const transcriptClient = new TranscriptClient();
+
+     transcriptClient.on("translation", (result) => {
+       console.log(result);
+       io.to(socket.id).emit("translation", result);
+     });
+
+     socket.on("configure_stream", ({ language }) => {
+       transcriptClient.startTranscriptionStream(language || "en-US");
+     });
+
+     socket.on("incoming_audio", (data) => {
+       transcriptClient.send(data);
+     });
+
+     socket.on("disconnect", () => {
+       transcriptClient.endTranscriptionStream();
+     });
+   });
+
+   return io;
+ };
+
+ module.exports = initializeWebSocket;
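A matching client-side sketch (hypothetical, not part of this commit) that exercises these events against the Express server on its default port 3002:

    const { io } = require("socket.io-client");

    const socket = io("http://localhost:3002");
    socket.on("connect", () => {
      socket.emit("configure_stream", { language: "en-US" });
    });
    socket.on("translation", (segment) => console.log("translation:", segment));
    // socket.emit("incoming_audio", chunk); // stream raw audio Buffers to Deepgram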
seamless-server/.DS_Store ADDED
Binary file (6.15 kB).
 
seamless-server/.gitignore ADDED
@@ -0,0 +1,5 @@
+ __pycache__/
+ src/__pycache__/
+ debug/
+ .vscode/
+ .env
seamless-server/models/Seamless/vad_s2st_sc_24khz_main.yaml ADDED
@@ -0,0 +1,25 @@
+ agent_class: seamless_communication.streaming.agents.seamless_s2st.SeamlessS2STDualVocoderVADAgent
+ monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
+ unity_model_name: seamless_streaming_unity
+ sentencepiece_model: spm_256k_nllb100.model
+
+ task: s2st
+ tgt_lang: "eng"
+ min_unit_chunk_size: 50
+ decision_threshold: 0.7
+ no_early_stop: True
+ block_ngrams: True
+ vocoder_name: vocoder_v2
+ expr_vocoder_name: vocoder_pretssel
+ gated_model_dir: .
+ expr_vocoder_gain: 3.0
+ upstream_idx: 1
+ wav2vec_yaml: wav2vec.yaml
+ min_starting_wait_w2vbert: 192
+
+ config_yaml: cfg_fbank_u2t.yaml
+ upstream_idx: 1
+ detokenize_only: True
+ device: cuda:0
+ max_len_a: 0
+ max_len_b: 1000
seamless-server/models/SeamlessStreaming/vad_s2st_sc_main.yaml ADDED
@@ -0,0 +1,21 @@
+ agent_class: seamless_communication.streaming.agents.seamless_streaming_s2st.SeamlessStreamingS2STJointVADAgent
+ monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
+ unity_model_name: seamless_streaming_unity
+ sentencepiece_model: spm_256k_nllb100.model
+
+ task: s2st
+ tgt_lang: "eng"
+ min_unit_chunk_size: 50
+ decision_threshold: 0.7
+ no_early_stop: True
+ block_ngrams: True
+ vocoder_name: vocoder_v2
+ wav2vec_yaml: wav2vec.yaml
+ min_starting_wait_w2vbert: 192
+
+ config_yaml: cfg_fbank_u2t.yaml
+ upstream_idx: 1
+ detokenize_only: True
+ device: cuda:0
+ max_len_a: 0
+ max_len_b: 1000
seamless-server/old_server.py ADDED
@@ -0,0 +1,874 @@
+ from operator import itemgetter
+ import os
+ from typing import Any, Optional, Tuple, Dict, TypedDict
+ from urllib import parse
+ from uuid import uuid4
+ from pprint import pformat
+ import socketio
+ import time
+ import random
+ import string
+ import logging
+ from starlette.applications import Starlette
+ from starlette.routing import Mount, Route
+ from starlette.staticfiles import StaticFiles
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ from src.auth import google_auth_check
+ from src.room import Room, Member
+ from src.context import ContextManager
+ from src.transcriber import Transcriber
+
+ from src.simuleval_agent_directory import NoAvailableAgentException
+ from src.simuleval_agent_directory import SimulevalAgentDirectory
+ from src.simuleval_transcoder import SimulevalTranscoder
+ from src.transcoder_helpers import get_transcoder_output_events
+ from src.logging import initialize_logger
+
+ DEBUG = True
+ ALL_ROOM_ID = "ALL"
+ ROOM_ID_USABLE_CHARACTERS = string.ascii_uppercase
+ ROOM_ID_LENGTH = 4
+ ROOM_LISTENERS_SUFFIX = "_listeners"
+ ROOM_SPEAKERS_SUFFIX = "_speakers"
+ ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME = "remove_server_lock"
+
+ logger = initialize_logger("socketio_server_pubsub", level=logging.WARNING)
+
+ print("=" * 20 + " ⭐️ Starting Server... ⭐️ " + "=" * 20)
+
+ CLIENT_BUILD_PATH = "../streaming-test-app/dist/"
+ static_files = {
+     "/": CLIENT_BUILD_PATH,
+     "/assets/seamless-db6a2555.svg": {
+         "filename": CLIENT_BUILD_PATH + "assets/seamless-db6a2555.svg",
+         "content_type": "image/svg+xml",
+     },
+ }
+
+ # sio is the main socket.io entrypoint
+ sio = socketio.AsyncServer(
+     async_mode="asgi",
+     cors_allowed_origins="*",
+     logger=logger,
+     # engineio_logger=logger,
+ )
+ # sio.logger.setLevel(logging.DEBUG)
+ socketio_app = socketio.ASGIApp(sio)
+
+ app_routes = [
+     Mount("/ws", app=socketio_app),  # Mount Socket.IO server under /ws
+     Mount(
+         "/", app=StaticFiles(directory=CLIENT_BUILD_PATH, html=True)
+     ),  # Serve static files from root
+ ]
+ app = Starlette(debug=True, routes=app_routes)
+
+ # rooms is indexed by room_id
+ rooms: Dict[str, Room] = {}
+
+
+ class MemberDirectoryObject(TypedDict):
+     room: Room
+     member_object: Member
+
+
+ # member_directory is indexed by client_id
+ # NOTE: client_id is really "client session id", meaning that it is unique to a single browser session.
+ # If a user opens a new tab, they will have a different client_id and can join another room, join
+ # the same room with different roles, etc.
+ # NOTE: For a long-running production server we would want to clean up members after a certain timeout
+ # but for this limited application we can just keep them around
+ member_directory: Dict[str, MemberDirectoryObject] = {}
+
+
+ class ServerLock(TypedDict):
+     name: str
+     client_id: str
+     member_object: Member
+
+
+ SINGLE_USER = os.environ.get("SINGLE_USER")
+
+ if os.environ.get("LOCK_SERVER_COMPLETELY", "0") == "1":
+     logger.info("LOCK_SERVER_COMPLETELY is set. Server will be locked on startup.")
+ if SINGLE_USER == "1":
+     logger.info(
+         f"SINGLE_USER mode is set. Server will only allow one speaker or listener at a time."
+     )
+ dummy_server_lock_member_object = Member(
+     client_id="seamless_user", session_id="dummy", name="Seamless User"
+ )
+ # Normally this would be an actual transcoder, but it's fine putting True here since currently we only check for the presence of the transcoder
+ dummy_server_lock_member_object.transcoder = True
+ server_lock: Optional[ServerLock] = (
+     {
+         "name": "Seamless User",
+         "client_id": "seamless_user",
+         "member_object": dummy_server_lock_member_object,
+     }
+     if os.environ.get("LOCK_SERVER_COMPLETELY", "0") == "1"
+     else None
+ )
+
+ server_id = str(uuid4())
+
+ # Specify specific models to load (some environments have issues loading multiple models)
+ # See AgentWithInfo for JSON format details.
+ models_override = os.environ.get("MODELS_OVERRIDE")
+
+ available_agents = SimulevalAgentDirectory()
+ logger.info("Building and adding agents...")
+ if models_override is not None:
+     logger.info(f"MODELS_OVERRIDE supplied from env vars: {models_override}")
+ available_agents.build_and_add_agents(models_override)
+
+ agents_capabilities_for_json = available_agents.get_agents_capabilities_list_for_json()
+
+
+ def catch_and_log_exceptions_for_sio_event_handlers(func):
+     # wrapper should have the same signature as the original function
+     async def catch_exception_wrapper(*args, **kwargs):
+         try:
+             return await func(*args, **kwargs)
+         except Exception as e:
+             message = f"[app_pubsub] Caught exception in '{func.__name__}' event handler:\n\n{e}"
+             logger.exception(message, stack_info=True)
+
+             try:
+                 exception_data = {
+                     "message": message,
+                     "timeEpochMs": int(time.time() * 1000),
+                 }
+
+                 try:
+                     # Let's try to add as much useful metadata as possible to the server_exception event
+                     sid = args[0]
+                     if isinstance(sid, str) and len(sid) > 0:
+                         session_data = await get_session_data(sid)
+                         if session_data:
+                             client_id = session_data.get("client_id")
+                             member = session_data.get("member_object")
+                             room = session_data.get("room_object")
+
+                             exception_data["room"] = str(room)
+                             exception_data["member"] = str(member)
+                             exception_data["clientID"] = str(client_id)
+                 except Exception as inner_e:
+                     # We expect there will be times when clientID or other values aren't present, so just log this as a warning
+                     logger.warn(
+                         f"[app_pubsub] Caught exception while trying to add additional_data to server_exception:\n\n{inner_e}"
+                     )
+
+                 # For now let's emit this to all clients. We ultimately may want to emit it just to the room it's happening in.
+                 await sio.emit("server_exception", exception_data)
+             except Exception as inner_e:
+                 logger.exception(
+                     f"[app_pubsub] Caught exception while trying to emit server_exception event:\n{inner_e}"
+                 )
+
+             # Re-raise the exception so it's handled normally by the server
+             raise e
+
+     # Set the name of the wrapper to the name of the original function so that the socketio server can associate it with the right event
+     catch_exception_wrapper.__name__ = func.__name__
+     return catch_exception_wrapper
+
+
+ async def emit_room_state_update(room):
+     await sio.emit(
+         "room_state_update",
+         room.to_json(),
+         room=room.room_id,
+     )
+
+
+ async def emit_server_state_update():
+     room_statuses = {
+         room_id: room.get_room_status_dict() for room_id, room in rooms.items()
+     }
+     total_active_connections = sum(
+         [room_status["activeConnections"] for room_status in room_statuses.values()]
+     )
+     total_active_transcoders = sum(
+         [room_status["activeTranscoders"] for room_status in room_statuses.values()]
+     )
+     logger.info(
+         f"[Server Status]: {total_active_connections} active connections (in rooms); {total_active_transcoders} active transcoders"
+     )
+     logger.info(f"[Server Status]: server_lock={server_lock}")
+     server_lock_object_for_js = (
+         {
+             "name": server_lock.get("name"),
+             "clientID": server_lock.get("client_id"),
+             "isActive": server_lock.get("member_object")
+             and server_lock.get("member_object").transcoder is not None,
+         }
+         if server_lock
+         else None
+     )
+     await sio.emit(
+         "server_state_update",
+         {
+             "statusByRoom": room_statuses,
+             "totalActiveConnections": total_active_connections,
+             "totalActiveTranscoders": total_active_transcoders,
+             "agentsCapabilities": agents_capabilities_for_json,
+             "serverLock": server_lock_object_for_js,
+         },
+         room=ALL_ROOM_ID,
+     )
+
+
+ async def get_session_data(sid):
+     session = await sio.get_session(sid)
+     # If the session has not been set, get_session may return None, so provide a fallback empty dictionary here
+     return session or {}
+
+
+ async def set_session_data(
+     sid, client_id, room_id, room_object, member_object, context_obj, transcriber
+ ):
+     await sio.save_session(
+         sid,
+         {
+             "client_id": client_id,
+             "room_id": room_id,
+             "room_object": room_object,
+             "member_object": member_object,
+             "context_obj": context_obj,
+             "transcriber": transcriber,
+         },
+     )
+
+
+ def get_random_room_id():
+     return "".join(random.choices(ROOM_ID_USABLE_CHARACTERS, k=ROOM_ID_LENGTH))
+
+
+ def get_random_unused_room_id():
+     room_id = get_random_room_id()
+     while room_id in rooms:
+         room_id = get_random_room_id()
+     return room_id
+
+
+ ###############################################
+ # Socket.io Basic Event Handlers
+ ###############################################
+
+
+ @sio.on("connect")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def connect(sid, environ):
+     logger.info(f"📥 [event: connected] sid={sid}")
+
+     # TODO: Sanitize/validate query param input
+     query_params = dict(parse.parse_qsl(environ["QUERY_STRING"]))
+     client_id = query_params.get("clientID")
+     token = query_params.get("token")
+
+     if google_auth_check(token) is None:
+         await sio.emit("auth_error", "Not authenticated", to=sid)
+         logger.info("Invalid auth token. Disconnecting...")
+         await sio.disconnect(sid)
+         return
+
+     logger.debug(f"query_params:\n{pformat(query_params)}")
+
+     if client_id is None:
+         logger.info("No clientID provided. Disconnecting...")
+         await sio.disconnect(sid)
+         return
+
+     # On reconnect we need to rejoin rooms and reset session data
+     if member_directory.get(client_id):
+         room = member_directory[client_id].get("room")
+         room_id = room.room_id
+         # Note: We could also get this from room.members[client_id]
+         member = member_directory[client_id].get("member_object")
+         context = member_directory[client_id].get("context_obj")
+         transcriber = member_directory[client_id].get("transcriber")
+         member.connection_status = "connected"
+         member.session_id = sid
+
+         logger.info(
+             f"[event: connect] {member} reconnected. Attempting to re-add them to socketio rooms and reset session data."
+         )
+
+         if room is None or member is None:
+             logger.error(
+                 f"[event: connect] {client_id} is reconnecting, but room or member is None. This should not happen."
+             )
+             await sio.disconnect(sid)
+             return
+
+         sio.enter_room(sid, room_id)
+         sio.enter_room(sid, ALL_ROOM_ID)
+
+         if client_id in room.listeners:
+             sio.enter_room(sid, f"{room_id}{ROOM_LISTENERS_SUFFIX}")
+         if client_id in room.speakers:
+             sio.enter_room(sid, f"{room_id}{ROOM_SPEAKERS_SUFFIX}")
+
+         # Save the room_id to the socketio client session
+         await set_session_data(
+             sid,
+             client_id=client_id,
+             room_id=room.room_id,
+             room_object=room,
+             member_object=member,
+             context_obj=context,
+             transcriber=transcriber,
+         )
+         await emit_room_state_update(room)
+     else:
+         # Save the client id to the socketio client session
+         await set_session_data(
+             sid,
+             client_id=client_id,
+             room_id=None,
+             room_object=None,
+             member_object=None,
+             context_obj=None,
+             transcriber=None,
+         )
+
+     await sio.emit("server_id", server_id, to=sid)
+     await emit_server_state_update()
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def disconnect(sid):
+     global server_lock
+     session_data = await get_session_data(sid)
+
+     client_id = None
+     member = None
+     room = None
+
+     if session_data:
+         client_id = session_data.get("client_id")
+         member = session_data.get("member_object")
+         room = session_data.get("room_object")
+
+     logger.info(
+         f"[event: disconnect][{room or 'NOT_IN_ROOM'}] member: {member or 'NO_MEMBER_OBJECT'} disconnected"
+     )
+
+     # Release the lock if this is the client that holds the current server lock
+     if server_lock and server_lock.get("client_id") == client_id:
+         server_lock = None
+
+     if member:
+         member.connection_status = "disconnected"
+
+         if member.transcoder:
+             member.transcoder.close = True
+             member.transcoder = None
+             member.requested_output_type = None
+
+         if room:
+             logger.info(
+                 f"[event: disconnect] {member} disconnected from room {room.room_id}"
+             )
+             await emit_room_state_update(room)
+         else:
+             logger.info(
+                 f"[event: disconnect] {member} disconnected, but no room object present. This should not happen."
+             )
+     else:
+         logger.info(
+             f"[event: disconnect] client_id {client_id or 'NO_CLIENT_ID'} with sid {sid} in rooms {str(sio.rooms(sid))} disconnected"
+         )
+
+     await emit_server_state_update()
+
+
+ @sio.on("*")
+ async def catch_all(event, sid, data):
+     logger.info(f"[unhandled event: {event}] sid={sid} data={data}")
+
+
+ ###############################################
+ # Socket.io Streaming Event handlers
+ ###############################################
+
+
+ @sio.on("join_room")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def join_room(sid, client_id, room_id_from_client, config_dict):
+     global server_lock
+
+     args = {
+         "sid": sid,
+         "client_id": client_id,
+         "room_id": room_id_from_client,
+         "config_dict": config_dict,
+     }
+     logger.info(f"[event: join_room] {args}")
+     session_data = await get_session_data(sid)
+
+     logger.info(f"session_data: {session_data}")
+
+     room_id = room_id_from_client
+     if room_id is None:
+         room_id = get_random_unused_room_id()
+         logger.info(
+             f"No room_id provided. Generating a random, unused room_id: {room_id}"
+         )
+
+     # Create the room if it doesn't already exist
+     if room_id not in rooms:
+         rooms[room_id] = Room(room_id)
+
+     room = rooms[room_id]
+
+     member = None
+
+     name = "[NO_NAME]"
+
+     context = ContextManager()
+
+     transcriber = Transcriber()
+
+     # If the client is reconnecting use their existing member object. Otherwise create a new one.
+     if client_id in room.members:
+         member = room.members[client_id]
+         logger.info(f"{member} is rejoining room {room_id}.")
+     else:
+         member_number = len(room.members) + 1
+         name = f"Member {member_number}"
+         member = Member(
+             client_id=client_id,
+             session_id=sid,
+             name=name,
+         )
+         allow_user = check_and_lock_single_user(client_id, member)
+         if not allow_user:
+             logger.error(
+                 f"In SINGLE_USER mode we only allow one user at a time. Ignoring request to configure stream from client {client_id}."
+             )
+             return {"status": "error", "message": "max_users"}
+
+         logger.info(f"Created a new Member object: {member}")
+         logger.info(f"Adding {member} to room {room_id}")
+         room.members[client_id] = member
+
+         # Also add them to the member directory
+         member_directory[client_id] = {"room": room, "member_object": member}
+
+     # Join the socketio room, which enables broadcasting to all members of the room
+     sio.enter_room(sid, room_id)
+     # Join the room for all clients
+     sio.enter_room(sid, ALL_ROOM_ID)
+
+     if "listener" in config_dict["roles"]:
+         sio.enter_room(sid, f"{room_id}{ROOM_LISTENERS_SUFFIX}")
+         if client_id not in room.listeners:
+             room.listeners.append(client_id)
+     else:
+         sio.leave_room(sid, f"{room_id}{ROOM_LISTENERS_SUFFIX}")
+         room.listeners = [
+             listener_id for listener_id in room.listeners if listener_id != client_id
+         ]
+
+     if "speaker" in config_dict["roles"]:
+         sio.enter_room(sid, f"{room_id}{ROOM_SPEAKERS_SUFFIX}")
+         if client_id not in room.speakers:
+             room.speakers.append(client_id)
+     else:
+         sio.leave_room(sid, f"{room_id}{ROOM_SPEAKERS_SUFFIX}")
+         # If the person is no longer a speaker they should no longer be able to lock the server
+         if server_lock and server_lock.get("client_id") == client_id:
+             logger.info(
+                 f"🔓 Server is now unlocked from client {server_lock.get('client_id')} with name/info: {server_lock.get('name')}"
+             )
+             server_lock = None
+         if member.transcoder:
+             member.transcoder.close = True
+             member.transcoder = None
+         room.speakers = [
+             speaker_id for speaker_id in room.speakers if speaker_id != client_id
+         ]
+
+     # Only speakers should be able to lock the server
+     if config_dict.get("lockServerName") is not None and "speaker" in config_dict.get(
+         "roles", {}
+     ):
+         # If something goes wrong and the server gets stuck in a locked state the client can
+         # force the server to remove the lock by passing the special name ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME
+         if (
+             server_lock is not None
+             and config_dict.get("lockServerName")
+             == ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME
+             # If we are locking the server completely we don't want someone to be able to unlock it
+             and not os.environ.get("LOCK_SERVER_COMPLETELY", "0") == "1"
+         ):
+             server_lock = None
+             logger.info(
+                 f"🔓 Server lock has been reset by {client_id} using the escape hatch name {ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME}"
+             )
+
+         # If the server is not locked, set a lock. If it's already locked to this client, update the lock object
+         if server_lock is None or server_lock.get("client_id") == client_id:
+             # TODO: Add some sort of timeout as a backstop in case someone leaves the browser tab open after locking the server
+             server_lock = {
+                 "name": config_dict.get("lockServerName"),
+                 "client_id": client_id,
+                 "member_object": member,
+             }
+             logger.info(
+                 f"🔒 Server is now locked to client {server_lock.get('client_id')} with name/info: {server_lock.get('name')}\nThis client will have priority over all others until they disconnect."
+             )
+         # If the server is already locked to someone else, don't allow this client to lock it
+         elif server_lock is not None and server_lock.get("client_id") != client_id:
+             logger.warn(
+                 f"⚠️ Server is already locked to client {server_lock.get('client_id')}. Ignoring request to lock to client {client_id}."
+             )
+             # TODO: Maybe throw an error here?
+
+     # Save the room_id to the socketio client session
+     await set_session_data(
+         sid,
+         client_id=client_id,
+         room_id=room_id,
+         room_object=room,
+         member_object=member,
+         context_obj=context,
+         transcriber=transcriber,
+     )
+
+     await emit_room_state_update(room)
+     await emit_server_state_update()
+
+     return {"roomsJoined": sio.rooms(sid), "roomID": room_id}
+
+
+ def check_and_lock_single_user(client_id, member):
+     global server_lock
+
+     if SINGLE_USER is None:
+         return True
+
+     if server_lock is None:
+         server_lock = {
+             "name": "single_user",
+             "client_id": client_id,
+             "member_object": member,
+         }
+         return True
+
+     return server_lock["client_id"] == client_id
+
+
+ # @sio.on("disconnect")
+ # @catch_and_log_exceptions_for_sio_event_handlers
+ # async def disconnect(sid):
+ #     logger.info(f"📤 [event: disconnected] sid={sid}")
+ #     # Additional code to handle the disconnect event
+
+
+ # TODO: Add code to prevent more than one speaker from connecting/streaming at a time
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def configure_stream(sid, config):
+     session_data = await get_session_data(sid)
+     client_id, member, room, transcriber = itemgetter(
+         "client_id", "member_object", "room_object", "transcriber"
+     )(session_data)
+
+     logger.debug(
+         f"[event: configure_stream][{room}] Received stream config from {member}\n{pformat(config)}"
+     )
+
+     if member is None or room is None:
+         logger.error(
+             f"Received stream config from {member}, but member or room is None. This should not happen."
+         )
+         return {"status": "error", "message": "member_or_room_is_none"}
+
+     # if not allow_speaker(room, client_id):
+     #     logger.error(
+     #         f"In MAX_SPEAKERS mode we only allow one speaker at a time. Ignoring request to configure stream from client {client_id}."
+     #     )
+     #     return {"status": "error", "message": "max_speakers"}
+
+     # If there is a server lock WITH an active transcoder session, prevent other users from configuring and starting a stream
+     # If the server lock client does NOT have an active transcoder session allow this to proceed, knowing that
+     # this stream will be interrupted if the server lock client starts streaming
+     if (
+         server_lock is not None
+         and server_lock.get("client_id") != client_id
+         and server_lock.get("member_object")
+         and server_lock.get("member_object").transcoder is not None
+     ):
+         logger.warn(
+             f"Server is locked to client {server_lock.get('client_id')}. Ignoring request to configure stream from client {client_id}."
+         )
+         return {"status": "error", "message": "server_locked"}
+
+     debug = config.get("debug")
+     async_processing = config.get("async_processing")
+     manual_transcribe = config.get("manual_transcribe")
+     member.manual_transcribe = manual_transcribe
+
+     if manual_transcribe:
+         await transcriber.start()
+     else:
+         # Currently s2s, s2t or s2s&t
+         model_type = config.get("model_type")
+         member.requested_output_type = model_type
+
+         model_name = config.get("model_name")
+
+         try:
+             agent = available_agents.get_agent_or_throw(model_name)
+         except NoAvailableAgentException as e:
+             logger.warn(f"Error while getting agent: {e}")
+             # await sio.emit("error", str(e), to=sid)
+             await sio.disconnect(sid)
+             return {"status": "error", "message": str(e)}
+
+         if member.transcoder:
+             logger.warn(
+                 "Member already has a transcoder configured. Closing it, and overwriting with a new transcoder..."
+             )
+             member.transcoder.close = True
+
+         t0 = time.time()
+         try:
+             member.transcoder = SimulevalTranscoder(
+                 agent,
+                 config["rate"],
+                 debug=debug,
+                 buffer_limit=int(config["buffer_limit"]),
+             )
+         except Exception as e:
+             logger.warn(f"Got exception while initializing agents: {e}")
+             # await sio.emit("error", str(e), to=sid)
+             await sio.disconnect(sid)
+             return {"status": "error", "message": str(e)}
+
+         t1 = time.time()
+         logger.debug(f"Booting up VAD and transcoder took {t1-t0} sec")
+
+         # TODO: if async_processing is false, then we need to run transcoder.process_pipeline_once() whenever we receive audio, or at some other sensible interval
+         if async_processing:
+             member.transcoder.start()
+
+     # We need to emit a room state update here since room state now includes # of active transcoders
+     await emit_room_state_update(room)
+     await emit_server_state_update()
+
+     return {"status": "ok", "message": "server_ready"}
+
+
+ # The config here is a partial config, meaning it may not contain all the config values -- only the ones the user
+ # wants to change
+ @sio.on("set_dynamic_config")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def set_dynamic_config(
+     sid,
+     # partial_config's type is defined in StreamingTypes.ts
+     partial_config,
+ ):
+     session_data = await get_session_data(sid)
+
+     member = None
+     context = None
+     if session_data:
+         member = session_data.get("member_object")
+         context = session_data.get("context_obj")
+
+     if member:
+         new_dynamic_config = {
+             **(member.transcoder_dynamic_config or {}),
+             **partial_config,
+         }
+         logger.info(
+             f"[set_dynamic_config] Setting new dynamic config:\n\n{pformat(new_dynamic_config)}\n"
+         )
+         member.transcoder_dynamic_config = new_dynamic_config
+
+     if context:
+         context.set_language(partial_config["targetLanguage"])
+
+     # TODO set transcriber language
+
+     return {"status": "ok", "message": "dynamic_config_set"}
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def incoming_audio(sid, blob):
+     session_data = await get_session_data(sid)
+
+     client_id = None
+     member = None
+     room = None
+     context = None
+     transcriber = None
+
+     if session_data:
+         client_id = session_data.get("client_id")
+         member = session_data.get("member_object")
+         room = session_data.get("room_object")
+         context = session_data.get("context_obj")
+         transcriber = session_data.get("transcriber")
+
+     logger.debug(f"[event: incoming_audio] from member {member}")
+
+     # If the server is locked by someone else, kill our transcoder and ignore incoming audio
+     # If the server lock client does NOT have an active transcoder session allow this incoming audio pipeline to proceed,
+     # knowing that this stream will be interrupted if the server lock client starts streaming
+     if member and member.manual_transcribe:
+         print(blob)
+         await transcriber.sendAudio(blob)
+         return
+
+     if (
+         server_lock is not None
+         and server_lock.get("client_id") != client_id
+         and server_lock.get("member_object")
+         and server_lock.get("member_object").transcoder is not None
+     ):
+         # TODO: Send an event to the client to let them know their streaming session has been killed
+         if member and member.transcoder:
+             member.transcoder.close = True
+             member.transcoder = None
+             # Update both room state and server state given that the number of active transcoders has changed
+             if room:
+                 await emit_room_state_update(room)
+             await emit_server_state_update()
+         logger.warn(
+             f"[incoming_audio] Server is locked to client {server_lock.get('client_id')}. Ignoring incoming audio from client {client_id}."
+         )
+         return
+
+     if member is None or room is None:
+         logger.error(
+             f"[incoming_audio] Received incoming_audio from {member}, but member or room is None. This should not happen."
+         )
+         return
+
+     if member.manual_transcribe:
+         transcriber.sendAudio(blob)
+     else:
+         # NOTE: bytes and bytearray are very similar, but bytes is immutable, and is what is returned by socketio
+         if not isinstance(blob, bytes):
+             logger.error(
+                 f"[incoming_audio] Received audio from {member}, but it was not of type `bytes`. type(blob) = {type(blob)}"
+             )
+             return
+
+         if member.transcoder is None:
+             logger.error(
+                 f"[incoming_audio] Received audio from {member}, but no transcoder configured to process it (member.transcoder is None). This should not happen."
+             )
+             return
+
+         member.transcoder.process_incoming_bytes(
+             blob, dynamic_config=member.transcoder_dynamic_config
+         )
+
+         # Send back any available model output
+         # NOTE: In theory it would make sense to remove this from the incoming_audio handler and
+         # handle this in a dedicated thread that checks for output and sends it right away,
+         # but in practice for our limited demo use cases this approach didn't add noticeable
+         # latency, so we're keeping it simple for now.
+         events = get_transcoder_output_events(member.transcoder)
+         logger.debug(f"[incoming_audio] transcoder output events: {len(events)}")
+
+         if len(events) == 0:
+             logger.debug("[incoming_audio] No transcoder output to send")
+         else:
+             for e in events:
+                 if e[
+                     "event"
+                 ] == "translation_speech" and member.requested_output_type in [
+                     "s2s",
+                     "s2s&t",
+                 ]:
+                     logger.debug("[incoming_audio] Sending translation_speech event")
+                     await sio.emit(
+                         "translation_speech", e, room=f"{room.room_id}_listeners"
+                     )
+                 elif e[
+                     "event"
+                 ] == "translation_text" and member.requested_output_type in [
+                     "s2t",
+                     "s2s&t",
+                 ]:
+                     logger.debug("[incoming_audio] Sending translation_text event")
+                     await sio.emit(
+                         "translation_text", e, room=f"{room.room_id}_listeners"
+                     )
+                     context.add_text_chunk(e["payload"])
+                 else:
+                     logger.error(
+                         f"[incoming_audio] Unexpected event type: {e['event']}"
+                     )
+
+         new_context = context.get_current_context()
+         if new_context:
+             await sio.emit(
+                 "context",
+                 {"event": "context", "payload": new_context},
+                 room=f"{room.room_id}_listeners",
+             )
+     return
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def stop_stream(sid):
+     session_data = await get_session_data(sid)
+     client_id, member, room = itemgetter("client_id", "member_object", "room_object")(
+         session_data
+     )
+
+     logger.debug(f"[event: stop_stream][{room}] Attempting to stop stream for {member}")
+
+     if member is None or room is None:
+         message = f"Received stop_stream from {member}, but member or room is None. This should not happen."
+         logger.error(message)
+         return {"status": "error", "message": message}
+
+     # In order to stop the stream and end the transcoder thread, set close to True and unset it for the member
+     if member.transcoder:
+         member.transcoder.close = True
+         member.transcoder = None
+     else:
+         message = f"Received stop_stream from {member}, but member.transcoder is None. This should not happen."
+         logger.warn(message)
+
+     # We need to emit a room state update here since room state now includes # of active transcoders
+     await emit_room_state_update(room)
+     # Emit a server state update now that we've changed the number of active transcoders
+     await emit_server_state_update()
+
+     return {"status": "ok", "message": "Stream stopped"}
+
+
+ @sio.on("clear_transcript_for_all")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def clear_transcript_for_all(sid):
+     session_data = await get_session_data(sid)
+
+     room = session_data.get("room_object")
+
+     if room:
+         await sio.emit("clear_transcript", room=f"{room.room_id}")
+     else:
+         logger.error("[clear_transcript] room is None. This should not happen.")
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def set_name(sid, name):
+     logger.info(f"[Event: set_name] name={name}")
+     await sio.save_session(sid, {"name": name})
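For orientation, a sketch of the handshake this pubsub server expects from a client (hypothetical; event names and argument order taken from the handlers above, and the Socket.IO path assumed to follow the /ws mount):

    const { io } = require("socket.io-client");

    const socket = io("http://localhost:7860", {
      path: "/ws/socket.io", // Socket.IO is mounted under /ws above
      query: { clientID: "my-client-id", token: "<google-id-token>" },
    });

    // join_room(client_id, room_id_or_null, config); a null room ID asks the
    // server to generate a random 4-letter room
    socket.emit("join_room", "my-client-id", null, { roles: ["speaker"] }, (resp) => {
      console.log("joined room", resp.roomID);
    });
    socket.on("translation_text", (e) => console.log(e.payload));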
seamless-server/requirements.txt ADDED
@@ -0,0 +1,34 @@
+ # seamless_communication
+ git+https://github.com/facebookresearch/seamless_communication.git
+ # ./whl/seamless_communication-1.0.0-py3-none-any.whl
+ Flask==2.1.3
+ Flask_Sockets==0.2.1
+ g2p_en==2.1.0
+ gevent==22.10.2
+ gevent_websocket==0.10.1
+ librosa==0.9.2
+ numpy==1.24.4
+ openai_whisper==20230124
+ protobuf==4.24.2
+ psola==0.0.1
+ pydub==0.25.1
+ silero==0.4.1
+ soundfile==0.11.0
+ stable_ts==1.4.0
+ # torch # to be installed by user for desired PyTorch version
+ # simuleval # to be installed by seamless_communication
+ Werkzeug==2.0.3
+ whisper==1.1.10
+ colorlog==6.7.0
+ python-socketio==5.9.0
+ uvicorn[standard]==0.23.2
+ parallel-wavegan==0.5.5
+ python-jose[cryptography]==3.3.0
+ starlette==0.32.0.post1
+ hf_transfer==0.1.4
+ huggingface_hub==0.19.*
+ google-auth
+ python-dotenv
+ deepgram-sdk
+ sentencepiece
+ fairseq2
seamless-server/run_docker.sh ADDED
@@ -0,0 +1,5 @@
+ #!/bin/bash
+ if [ -f models/Seamless/pretssel_melhifigan_wm.pt ] ; then
+   export USE_EXPRESSIVE_MODEL=1;
+ fi
+ uvicorn new:app --host 0.0.0.0 --port 7860 --reload
seamless-server/server.py ADDED
@@ -0,0 +1,288 @@
+ from operator import itemgetter
+ import os
+ from urllib import parse
+ from pprint import pformat
+ import socketio
+ import time
+ import logging
+ from starlette.applications import Starlette
+ from starlette.routing import Mount, Route
+ from starlette.staticfiles import StaticFiles
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ from src.auth import google_auth_check
+ from src.client import Client
+ from src.context import ContextManager
+ from src.transcriber import Transcriber
+
+ from src.simuleval_agent_directory import NoAvailableAgentException
+ from src.simuleval_agent_directory import SimulevalAgentDirectory
+ from src.simuleval_transcoder import SimulevalTranscoder
+ from src.transcoder_helpers import get_transcoder_output_events
+ from src.logging import (
+     initialize_logger,
+     catch_and_log_exceptions_for_sio_event_handlers,
+ )
+
+ logger = initialize_logger(__name__, level=logging.WARNING)
+ print("=" * 20 + " ⭐️ Starting Server... ⭐️ " + "=" * 20)
+
+ sio = socketio.AsyncServer(
+     async_mode="asgi",
+     cors_allowed_origins="*",
+     logger=logger,
+     # engineio_logger=logger,
+ )
+ socketio_app = socketio.ASGIApp(sio)
+
+ app_routes = [
+     Mount("/ws", app=socketio_app),
+ ]
+ app = Starlette(debug=True, routes=app_routes)
+
+ # Specify specific models to load (some environments have issues loading multiple models)
+ # See AgentWithInfo for JSON format details.
+ models_override = os.environ.get("MODELS_OVERRIDE")
+
+ available_agents = SimulevalAgentDirectory()
+ logger.info("Building and adding agents...")
+ if models_override is not None:
+     logger.info(f"MODELS_OVERRIDE supplied from env vars: {models_override}")
+ available_agents.build_and_add_agents(models_override)
+
+ agents_capabilities_for_json = available_agents.get_agents_capabilities_list_for_json()
+
+
+ clients = {}
+
+
+ @sio.on("connect")
+ @catch_and_log_exceptions_for_sio_event_handlers(logger, sio)
+ async def connect(sid, environ):
+     logger.info(f"📥 [event: connected] sid={sid}")
+
+     # TODO: Sanitize/validate query param input
+     query_params = dict(parse.parse_qsl(environ["QUERY_STRING"]))
+     client_id = query_params.get("clientID")
+     token = query_params.get("token")
+
+     if google_auth_check(token) is None:
+         await sio.emit("auth_error", "Not authenticated", to=sid)
+         logger.info("Invalid auth token. Disconnecting...")
+         await sio.disconnect(sid)
+         return
+
+     logger.debug(f"query_params:\n{pformat(query_params)}")
+
+     if client_id is None:
+         logger.info("No clientID provided. Disconnecting...")
+         await sio.disconnect(sid)
+         return
+
+     clients[sid] = Client(client_id)
+
+
+ @sio.on("*")
+ async def catch_all(event, sid, data):
+     logger.info(f"[unhandled event: {event}] sid={sid} data={data}")
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers(logger, sio)
+ async def configure_stream(sid, config):
+     client_obj = clients.get(sid)
+     logger.warning(sid)
+
+     if client_obj is None:
+         logger.error(f"No client object for {sid}")
+         await sio.disconnect(sid)
+         return {"status": "error", "message": "member_or_room_is_none"}
+
+     debug = config.get("debug")
+     async_processing = config.get("async_processing")
+     manual_transcribe = config.get("manual_transcribe")
+     client_obj.manual_transcribe = manual_transcribe
+
+     if manual_transcribe:
+         client_obj.transcriber = Transcriber()
+         client_obj.transcriber.start()
+     else:
+         # Currently s2s, s2t or s2s&t
+         model_type = config.get("model_type")
+         client_obj.requested_output_type = model_type
+
+         model_name = config.get("model_name")
+
+         try:
+             agent = available_agents.get_agent_or_throw(model_name)
+         except NoAvailableAgentException as e:
+             logger.warn(f"Error while getting agent: {e}")
+             await sio.disconnect(sid)
+             return {"status": "error", "message": str(e)}
+
+         if client_obj.transcoder:
+             logger.warn(
+                 "Member already has a transcoder configured. Closing it, and overwriting with a new transcoder..."
+             )
+             client_obj.transcoder.close = True
+
+         t0 = time.time()
+         try:
+             client_obj.transcoder = SimulevalTranscoder(
+                 agent,
+                 config["rate"],
+                 debug=debug,
+                 buffer_limit=int(config["buffer_limit"]),
+             )
+         except Exception as e:
+             logger.warn(f"Got exception while initializing agents: {e}")
+             await sio.disconnect(sid)
+             return {"status": "error", "message": str(e)}
+
+         t1 = time.time()
+         logger.debug(f"Booting up VAD and transcoder took {t1-t0} sec")
+
+         # TODO: if async_processing is false, then we need to run transcoder.process_pipeline_once() whenever we receive audio, or at some other sensible interval
+         if async_processing:
+             client_obj.transcoder.start()
+
+     client_obj.context = ContextManager()
+     return {"status": "ok", "message": "server_ready"}
+
+
+ @sio.on("set_dynamic_config")
+ @catch_and_log_exceptions_for_sio_event_handlers(logger, sio)
+ async def set_dynamic_config(
+     sid,
+     partial_config,
+ ):
+     client_obj = clients.get(sid)
+
+     if client_obj is None:
+         logger.error(f"No client object for {sid}")
+         await sio.disconnect(sid)
+         return {"status": "error", "message": "member_or_room_is_none"}
+
+     new_dynamic_config = {
+         **(client_obj.transcoder_dynamic_config or {}),
+         **partial_config,
+     }
+     logger.info(
+         f"[set_dynamic_config] Setting new dynamic config:\n\n{pformat(new_dynamic_config)}\n"
+     )
+
+     client_obj.transcoder_dynamic_config = new_dynamic_config
+
+     if client_obj.context:
+         client_obj.context.set_language(partial_config["targetLanguage"])
+
+     # TODO set transcriber language
+
+     return {"status": "ok", "message": "dynamic_config_set"}
+
+
+ @sio.event
+ async def incoming_audio(sid, blob):
+     client_obj = clients.get(sid)
+
+     if client_obj is None:
+         logger.error(f"No client object for {sid}")
+         await sio.disconnect(sid)
+         return {"status": "error", "message": "member_or_room_is_none"}
+
+     if client_obj.manual_transcribe:
+         client_obj.transcriber.send_audio(blob)
+     else:
+         # NOTE: bytes and bytearray are very similar, but bytes is immutable, and is what is returned by socketio
+         if not isinstance(blob, bytes):
+             logger.error(
+                 f"[incoming_audio] Received audio from {sid}, but it was not of type `bytes`. type(blob) = {type(blob)}"
+             )
+             return
+
+         if client_obj.transcoder is None:
+             logger.error(
+                 f"[incoming_audio] Received audio from {sid}, but no transcoder configured to process it (member.transcoder is None). This should not happen."
+             )
+             return
+
+         client_obj.transcoder.process_incoming_bytes(
+             blob, dynamic_config=client_obj.transcoder_dynamic_config
+         )
+
+         # Send back any available model output
+         # NOTE: In theory it would make sense to remove this from the incoming_audio handler and
+         # handle this in a dedicated thread that checks for output and sends it right away,
+         # but in practice for our limited demo use cases this approach didn't add noticeable
+         # latency, so we're keeping it simple for now.
+         events = get_transcoder_output_events(client_obj.transcoder)
+         logger.debug(f"[incoming_audio] transcoder output events: {len(events)}")
+
+         if len(events) == 0:
+             logger.debug("[incoming_audio] No transcoder output to send")
+         else:
+             for e in events:
+                 if e[
+                     "event"
+                 ] == "translation_speech" and client_obj.requested_output_type in [
+                     "s2s",
+                     "s2s&t",
+                 ]:
+                     logger.debug("[incoming_audio] Sending translation_speech event")
+                     await sio.emit("translation_speech", e, room=sid)
+                 elif e[
+                     "event"
+                 ] == "translation_text" and client_obj.requested_output_type in [
+                     "s2t",
+                     "s2s&t",
241
+ logger.debug("[incoming_audio] Sending translation_text event")
242
+ await sio.emit("translation_text", e, room=sid)
243
+ client_obj.context.add_text_chunk(e["payload"])
244
+ else:
245
+ logger.error(
246
+ f"[incoming_audio] Unexpected event type: {e['event']}"
247
+ )
248
+ new_context = client_obj.context.get_current_context()
249
+ if new_context:
250
+ await sio.emit(
251
+ "context",
252
+ {"event": "context", "payload": new_context},
253
+ room=sid,
254
+ )
255
+ return
256
+
257
+
258
+ @sio.event
259
+ async def stop_stream(sid):
260
+ client_obj = clients[sid]
261
+
262
+ if client_obj is None:
263
+ logger.error(f"No client object for {sid}")
264
+ await sio.disconnect(sid)
265
+ return {"status": "error", "message": "member_or_room_is_none"}
266
+
267
+ if client_obj.transcoder:
268
+ client_obj.transcoder.close = True
269
+ client_obj.transcoder = None
270
+
271
+ if client_obj.transcriber:
272
+ client_obj.transcriber.close_connection()
273
+
274
+
275
+ @sio.event
276
+ async def disconnect(sid):
277
+ client_obj = clients[sid]
278
+ if client_obj is None:
279
+ return
280
+
281
+ if client_obj.transcriber:
282
+ client_obj.transcriber.stop()
283
+
284
+ if client_obj.transcoder:
285
+ client_obj.transcoder.close = True
286
+ client_obj.transcoder = None
287
+
288
+ del clients[sid]
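A hedged sketch (illustrative, not part of this commit) of a client driving the event flow above with python-socketio. The host/port, the `/ws/socket.io` path (derived from the `Mount("/ws", ...)` above), and the audio file are assumptions:

    import asyncio
    import socketio

    async def main():
        sio = socketio.AsyncClient()

        @sio.on("translation_text")
        async def on_text(event):
            print("text:", event["payload"], "eos:", event["eos"])

        # assumption: server reachable locally; token must pass google_auth_check
        await sio.connect(
            "http://localhost:7860?clientID=demo-client&token=GOOGLE_OAUTH_TOKEN",
            socketio_path="/ws/socket.io",
            transports=["websocket"],
        )
        ack = await sio.call("configure_stream", {
            "debug": False,
            "async_processing": True,
            "manual_transcribe": False,
            "model_type": "s2t",
            "model_name": "SeamlessStreaming",
            "rate": 48000,
            "buffer_limit": 1,
        })
        assert ack["status"] == "ok"
        await sio.emit("set_dynamic_config", {"targetLanguage": "spa"})
        with open("chunk.pcm", "rb") as f:  # assumption: raw 16-bit mono PCM at 48 kHz
            await sio.emit("incoming_audio", f.read())
        await asyncio.sleep(2)  # give the transcoder thread time to emit output
        await sio.emit("stop_stream")
        await sio.disconnect()

    asyncio.run(main())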
seamless-server/src/auth.py ADDED
@@ -0,0 +1,20 @@
+ from src.logging import initialize_logger
+ import requests
+
+ logger = initialize_logger(__name__)
+
+
+ def google_auth_check(token):
+     try:
+         response = requests.get(
+             "https://www.googleapis.com/oauth2/v3/tokeninfo",
+             params={"access_token": token},
+         )
+         if response.status_code == 200:
+             token_info = response.json()
+             return token_info
+         else:
+             return None
+     except Exception as e:
+         logger.info(e)
+         return None
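Note that the check above accepts any valid Google access token; it does not pin an audience claim. A quick manual probe of the same endpoint (sketch; ACCESS_TOKEN is a placeholder):

    import requests

    resp = requests.get(
        "https://www.googleapis.com/oauth2/v3/tokeninfo",
        params={"access_token": "ACCESS_TOKEN"},  # placeholder: substitute a real token
    )
    # 200 -> claims such as "aud", "scope", "expires_in"; anything else -> unauthenticated
    print(resp.status_code, resp.json() if resp.status_code == 200 else None)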
seamless-server/src/client.py ADDED
@@ -0,0 +1,23 @@
+ class Client:
+     def __init__(
+         self,
+         client_id,
+     ) -> None:
+         self.client_id = client_id
+         self.connection_status = "connected"
+         self.transcoder = None
+         self.transcriber = None
+         self.context = None
+         self.requested_output_type = None
+         self.transcoder_dynamic_config = None
+         self.manual_transcribe = None
+
+     def __str__(self) -> str:
+         # NOTE: Client has no `name` attribute (unlike Member), so only show the ID
+         return f"Client (id: {self.client_id[:4]}...) ({self.connection_status})"
+
+     def to_json(self):
+         self_vars = vars(self)
+         return {
+             **self_vars,
+             "transcoder": self.transcoder is not None,
+         }
seamless-server/src/context.py ADDED
@@ -0,0 +1,83 @@
+ import requests
+ import json
+ from threading import Thread
+ from src.logging import initialize_logger
+ import os
+
+ # TODO: get language key
+ prompt = """
+ Transcription: "[TRANSCRIPT]"
+ Task: Give a concise, 1-sentence summary of what the speaker is talking about.
+ IMPORTANT: The summary must be in the language: [LANGUAGE].
+ Return the response in JSON format with the following attribute: summary
+ Response in JSON Format:
+ """
+
+
+ logger = initialize_logger(__name__)
+
+
+ class ContextManager:
+     def __init__(self):
+         self.text_buffer = ""
+         self.amt = 0
+         self.max_char_memory = 300
+         self.char_between_release = 200
+         self.language = None
+         self.current_context = {}
+
+     def get_current_context(self):
+         if self.current_context and self.current_context["read"] is False:
+             self.current_context["read"] = True
+             return self.current_context["text"]
+         return None
+
+     def summarize(self, text):
+         if self.language is None:
+             return
+         try:
+             url = "https://voice-llm.openai.azure.com/openai/deployments/voice-LLM/chat/completions?api-version=2023-12-01-preview"
+             headers = {
+                 "Content-Type": "application/json",
+                 "api-key": os.getenv("AZURE_API_KEY"),
+             }
+
+             body = {
+                 "model": "gpt-35-turbo",
+                 "messages": [
+                     {
+                         "role": "user",
+                         "content": prompt.replace("[TRANSCRIPT]", text).replace(
+                             "[LANGUAGE]", self.language
+                         ),
+                     }
+                 ],
+             }
+
+             response = requests.post(url, headers=headers, json=body)
+             response_data = response.json()
+             parsed = json.loads(response_data["choices"][0]["message"]["content"])[
+                 "summary"
+             ]
+             self.current_context = {"text": parsed, "read": False}
+         except Exception as e:
+             logger.warning(e)
+
+     def add_text_chunk(self, text):
+         self.text_buffer += " " + text
+         cur_len = len(self.text_buffer)
+
+         # continuously trim context to save memory
+         if len(self.text_buffer) > self.max_char_memory:
+             self.text_buffer = self.text_buffer[cur_len - self.max_char_memory :]
+
+         self.amt += len(text)
+         if self.amt > self.char_between_release:
+             self.amt = 0
+             thread = Thread(target=self.summarize, args=(self.text_buffer,))
+             thread.start()
+
+     def set_language(self, lang):
+         self.language = lang
+         self.text_buffer = ""
+         self.amt = 0
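A small sketch of how the two thresholds above interact: roughly one summarize() thread fires per char_between_release (200) characters of new text, over at most max_char_memory (300) characters of recent buffer. Assumes src.context is importable; without AZURE_API_KEY the summarize call fails and is only logged:

    from src.context import ContextManager

    ctx = ContextManager()
    ctx.set_language("eng")
    for _ in range(30):
        ctx.add_text_chunk("another short transcript chunk")  # ~31 chars each
        # after every ~200 buffered chars a background summarize() is kicked off
    print(ctx.get_current_context())  # None until a summary exists and hasn't been read yet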
seamless-server/src/logging.py ADDED
@@ -0,0 +1,58 @@
+ import logging
+ import colorlog
+ import sys
+ import time
+
+
+ def initialize_logger(name, level=logging.WARNING):
+     logger = logging.getLogger(name)
+     logger.propagate = False
+     handler = colorlog.StreamHandler(stream=sys.stdout)
+     formatter = colorlog.ColoredFormatter(
+         "%(log_color)s[%(asctime)s][%(levelname)s][%(module)s]:%(reset)s %(message)s",
+         reset=True,
+         log_colors={
+             "DEBUG": "cyan",
+             "INFO": "green",
+             "WARNING": "yellow",
+             "ERROR": "red",
+             "CRITICAL": "red,bg_white",
+         },
+     )
+     handler.setFormatter(formatter)
+     logger.addHandler(handler)
+     logger.setLevel(level)
+     return logger
+
+
+ def catch_and_log_exceptions_for_sio_event_handlers(logger, sio):
+     # NOTE: the argument order (logger, sio) matches the call sites in server.py.
+     # The wrapper has the same signature as the original function.
+     def decorator(func):
+         async def catch_exception_wrapper(*args, **kwargs):
+             try:
+                 return await func(*args, **kwargs)
+             except Exception as e:
+                 message = f"[app_pubsub] Caught exception in '{func.__name__}' event handler:\n\n{e}"
+                 logger.exception(message, stack_info=True)
+
+                 try:
+                     exception_data = {
+                         "message": message,
+                         "timeEpochMs": int(time.time() * 1000),
+                     }
+
+                     # For now let's emit this to all clients. We ultimately may want to emit it just to the room it's happening in.
+                     await sio.emit("server_exception", exception_data)
+                 except Exception as inner_e:
+                     logger.exception(
+                         f"[app_pubsub] Caught exception while trying to emit server_exception event:\n{inner_e}"
+                     )
+
+                 # Re-raise the exception so it's handled normally by the server
+                 raise e
+
+         # Set the name of the wrapper to the name of the original function so that the socketio server can associate it with the right event
+         catch_exception_wrapper.__name__ = func.__name__
+         return catch_exception_wrapper
+
+     return decorator
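Usage sketch for the decorator above (illustrative; the handler name is hypothetical):

    import socketio
    from src.logging import initialize_logger, catch_and_log_exceptions_for_sio_event_handlers

    logger = initialize_logger(__name__)
    sio = socketio.AsyncServer(async_mode="asgi")

    @sio.event
    @catch_and_log_exceptions_for_sio_event_handlers(logger, sio)
    async def ping(sid):
        raise RuntimeError("boom")  # logged, emitted as "server_exception", then re-raised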
seamless-server/src/room.py ADDED
@@ -0,0 +1,65 @@
+ # import json
+ import uuid
+
+
+ class Room:
+     def __init__(self, room_id) -> None:
+         self.room_id = room_id
+         # members is a dict from client_id to Member
+         self.members = {}
+
+         # listeners and speakers are lists of client_id's
+         self.listeners = []
+         self.speakers = []
+
+     def __str__(self) -> str:
+         return f"Room {self.room_id} ({len(self.members)} member{'' if len(self.members) == 1 else 's'})"
+
+     def to_json(self):
+         vars_result = vars(self)
+         # Remember: vars() returns just a shallow copy, so vars_result["members"] is self.members.
+         # Because of that, we need to jsonify self.members without writing over vars_result["members"],
+         # which we do here via dictionary unpacking (the ** operator)
+         result = {
+             **vars_result,
+             "members": {key: value.to_json() for (key, value) in self.members.items()},
+             "activeTranscoders": self.get_active_transcoders(),
+         }
+
+         return result
+
+     def get_active_connections(self):
+         return len(
+             [m for m in self.members.values() if m.connection_status == "connected"]
+         )
+
+     def get_active_transcoders(self):
+         return len([m for m in self.members.values() if m.transcoder is not None])
+
+     def get_room_status_dict(self):
+         return {
+             "activeConnections": self.get_active_connections(),
+             "activeTranscoders": self.get_active_transcoders(),
+         }
+
+
+ class Member:
+     def __init__(self, client_id, session_id, name) -> None:
+         self.client_id = client_id
+         self.session_id = session_id
+         self.name = name
+         self.connection_status = "connected"
+         self.transcoder = None
+         self.requested_output_type = None
+         self.transcoder_dynamic_config = None
+         self.manual_transcribe = None
+
+     def __str__(self) -> str:
+         return f"{self.name} (id: {self.client_id[:4]}...) ({self.connection_status})"
+
+     def to_json(self):
+         self_vars = vars(self)
+         return {
+             **self_vars,
+             "transcoder": self.transcoder is not None,
+         }
seamless-server/src/simuleval_agent_directory.py ADDED
@@ -0,0 +1,171 @@
+ # Creates a directory in which to look up available agents
+
+ import os
+ from typing import List, Optional
+ from src.simuleval_transcoder import SimulevalTranscoder
+ import json
+ import logging
+
+ logger = logging.getLogger("socketio_server_pubsub")
+
+ # fmt: off
+ M4T_P0_LANGS = [
+     "eng",
+     "arb", "ben", "cat", "ces", "cmn", "cym", "dan",
+     "deu", "est", "fin", "fra", "hin", "ind", "ita",
+     "jpn", "kor", "mlt", "nld", "pes", "pol", "por",
+     "ron", "rus", "slk", "spa", "swe", "swh", "tel",
+     "tgl", "tha", "tur", "ukr", "urd", "uzn", "vie",
+ ]
+ # fmt: on
+
+
+ class NoAvailableAgentException(Exception):
+     pass
+
+
+ class AgentWithInfo:
+     def __init__(
+         self,
+         agent,
+         name: str,
+         modalities: List[str],
+         target_langs: List[str],
+         # Supported dynamic params are defined in StreamingTypes.ts
+         dynamic_params: Optional[List[str]] = None,
+         description="",
+         has_expressive: Optional[bool] = None,
+     ):
+         self.agent = agent
+         self.has_expressive = has_expressive
+         self.name = name
+         self.description = description
+         self.modalities = modalities
+         self.target_langs = target_langs
+         # Default to a fresh list here to avoid a shared mutable default argument
+         self.dynamic_params = dynamic_params if dynamic_params is not None else []
+
+     def get_capabilities_for_json(self):
+         return {
+             "name": self.name,
+             "description": self.description,
+             "modalities": self.modalities,
+             "targetLangs": self.target_langs,
+             "dynamicParams": self.dynamic_params,
+         }
+
+     @classmethod
+     def load_from_json(cls, config: str):
+         """
+         Takes in a JSON array of models to load, e.g.
+         [{"name": "s2s_m4t_emma-unity2_multidomain_v0.1", "description": "M4T model that supports simultaneous S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]},
+          {"name": "s2s_m4t_expr-emma_v0.1", "description": "ES-EN expressive model that supports S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]}]
+         """
+         configs = json.loads(config)
+         agents = []
+         for config in configs:
+             agent = SimulevalTranscoder.build_agent(config["name"])
+             agents.append(
+                 AgentWithInfo(
+                     agent=agent,
+                     name=config["name"],
+                     modalities=config["modalities"],
+                     target_langs=config["targetLangs"],
+                 )
+             )
+         return agents
+
+
+ class SimulevalAgentDirectory:
+     # Available models. These are the directories where the models can be found, and also serve as an ID for the model.
+     seamless_streaming_agent = "SeamlessStreaming"
+     seamless_agent = "Seamless"
+
+     def __init__(self):
+         self.agents = []
+         self.did_build_and_add_agents = False
+
+     def add_agent(self, agent: AgentWithInfo):
+         self.agents.append(agent)
+
+     def build_agent_if_available(self, model_id, config_name=None):
+         agent = None
+         try:
+             if config_name is not None:
+                 agent = SimulevalTranscoder.build_agent(
+                     model_id,
+                     config_name=config_name,
+                 )
+             else:
+                 agent = SimulevalTranscoder.build_agent(
+                     model_id,
+                 )
+         except Exception as e:
+             from fairseq2.assets.error import AssetError
+
+             logger.warning("Failed to build agent %s: %s" % (model_id, e))
+             if isinstance(e, AssetError):
+                 logger.warning(
+                     "Please download gated assets and set `gated_model_dir` in the config"
+                 )
+             raise e
+
+         return agent
+
+     def build_and_add_agents(self, models_override=None):
+         if self.did_build_and_add_agents:
+             return
+
+         if models_override is not None:
+             agent_infos = AgentWithInfo.load_from_json(models_override)
+             for agent_info in agent_infos:
+                 self.add_agent(agent_info)
+         else:
+             s2s_agent = None
+             if os.environ.get("USE_EXPRESSIVE_MODEL", "0") == "1":
+                 logger.info("Building expressive model...")
+                 s2s_agent = self.build_agent_if_available(
+                     SimulevalAgentDirectory.seamless_agent,
+                     config_name="vad_s2st_sc_24khz_main.yaml",
+                 )
+                 has_expressive = True
+             else:
+                 logger.info("Building non-expressive model...")
+                 s2s_agent = self.build_agent_if_available(
+                     SimulevalAgentDirectory.seamless_streaming_agent,
+                     config_name="vad_s2st_sc_main.yaml",
+                 )
+                 has_expressive = False
+
+             if s2s_agent:
+                 self.add_agent(
+                     AgentWithInfo(
+                         agent=s2s_agent,
+                         name=SimulevalAgentDirectory.seamless_streaming_agent,
+                         modalities=["s2t", "s2s"],
+                         target_langs=M4T_P0_LANGS,
+                         dynamic_params=["expressive"],
+                         description="multilingual expressive model that supports S2S and S2T",
+                         has_expressive=has_expressive,
+                     )
+                 )
+
+         if len(self.agents) == 0:
+             logger.error(
+                 "No agents were loaded. This likely means you are missing the actual model files specified in simuleval_agent_directory."
+             )
+
+         self.did_build_and_add_agents = True
+
+     def get_agent(self, name):
+         for agent in self.agents:
+             if agent.name == name:
+                 return agent
+         return None
+
+     def get_agent_or_throw(self, name):
+         agent = self.get_agent(name)
+         if agent is None:
+             raise NoAvailableAgentException(f"No agent found with name={name}")
+         return agent
+
+     def get_agents_capabilities_list_for_json(self):
+         return [agent.get_capabilities_for_json() for agent in self.agents]
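Sketch of supplying the MODELS_OVERRIDE environment variable consumed in server.py, matching the JSON shape documented in AgentWithInfo.load_from_json (the model name here is an example; it must correspond to a directory under seamless-server/models/):

    import json
    import os

    os.environ["MODELS_OVERRIDE"] = json.dumps([
        {
            "name": "SeamlessStreaming",  # example: a model directory that exists locally
            "description": "streaming S2S/S2T model",
            "modalities": ["s2t", "s2s"],
            "targetLangs": ["eng", "spa", "fra"],
        }
    ])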
seamless-server/src/simuleval_transcoder.py ADDED
@@ -0,0 +1,409 @@
+ from simuleval.utils.agent import build_system_from_dir
+ from typing import Any, List, Optional, Tuple, Union
+ import numpy as np
+ import soundfile
+ import io
+ import queue
+ from simuleval.agents.pipeline import TreeAgentPipeline
+ from simuleval.agents.states import AgentStates
+ from simuleval.data.segments import Segment, EmptySegment, SpeechSegment
+ import threading
+ from pathlib import Path
+ import time
+ from g2p_en import G2p
+ import torch
+ import traceback
+ import random
+ from src.logging import initialize_logger
+ from .speech_and_text_output import SpeechAndTextOutput
+
+ MODEL_SAMPLE_RATE = 16_000
+
+ logger = initialize_logger(__name__)
+
+
+ class OutputSegments:
+     def __init__(self, segments: Union[List[Segment], Segment]):
+         if isinstance(segments, Segment):
+             segments = [segments]
+         self.segments: List[Segment] = [s for s in segments]
+
+     @property
+     def is_empty(self):
+         return all(segment.is_empty for segment in self.segments)
+
+     @property
+     def finished(self):
+         return all(segment.finished for segment in self.segments)
+
+     def compute_length(self, g2p):
+         lengths = []
+         for segment in self.segments:
+             if segment.data_type == "text":
+                 lengths.append(len([x for x in g2p(segment.content) if x != " "]))
+             elif segment.data_type == "speech":
+                 lengths.append(len(segment.content) / MODEL_SAMPLE_RATE)
+             elif isinstance(segment, EmptySegment):
+                 continue
+             else:
+                 logger.warning(
+                     f"Unexpected data_type: {segment.data_type} not in 'speech', 'text'"
+                 )
+         # default=0 guards against a list containing only empty segments
+         return max(lengths, default=0)
+
+     @classmethod
+     def join_output_buffer(
+         cls, buffer: List[List[Segment]], output: SpeechAndTextOutput
+     ):
+         num_segments = len(buffer[0])
+         for i in range(num_segments):
+             segment_list = [
+                 buffer[j][i]
+                 for j in range(len(buffer))
+                 if buffer[j][i].data_type is not None
+             ]
+             if len(segment_list) == 0:
+                 continue
+             if len(set(segment.data_type for segment in segment_list)) != 1:
+                 logger.warning(
+                     f"Data type mismatch at {i}: {set(segment.data_type for segment in segment_list)}"
+                 )
+                 continue
+             data_type = segment_list[0].data_type
+             if data_type == "text":
+                 if output.text is not None:
+                     logger.warning("Multiple text outputs, overwriting!")
+                 output.text = " ".join([segment.content for segment in segment_list])
+             elif data_type == "speech":
+                 if output.speech_samples is not None:
+                     logger.warning("Multiple speech outputs, overwriting!")
+                 speech_out = []
+                 for segment in segment_list:
+                     speech_out += segment.content
+                 output.speech_samples = speech_out
+                 output.speech_sample_rate = segment_list[0].sample_rate
+             elif isinstance(segment_list[0], EmptySegment):
+                 continue
+             else:
+                 logger.warning(
+                     f"Invalid output buffer data type: {data_type}, expected 'speech' or 'text'"
+                 )
+
+         return output
+
+     def __repr__(self) -> str:
+         repr_str = str(self.segments)
+         return f"{self.__class__.__name__}(\n\t{repr_str}\n)"
+
+
+ class SimulevalTranscoder:
+     def __init__(self, agent, sample_rate, debug, buffer_limit):
+         self.agent = agent.agent
+         self.has_expressive = agent.has_expressive
+         # These queues are shared between the socket thread and the transcoder
+         # thread, so use the thread-safe queue.Queue (not asyncio.Queue)
+         self.input_queue = queue.Queue()
+         self.output_queue = queue.Queue()
+         self.states = self.agent.build_states()
+         if debug:
+             self.get_states_root().debug = True
+         self.incoming_sample_rate = sample_rate
+         self.close = False
+         self.g2p = G2p()
+
+         # buffer all outgoing translations within this amount of time
+         self.output_buffer_idle_ms = 5000
+         self.output_buffer_size_limit = (
+             buffer_limit  # phonemes for text, seconds for speech
+         )
+         self.output_buffer_cur_size = 0
+         self.output_buffer: List[List[Segment]] = []
+         self.speech_output_sample_rate = None
+
+         self.last_output_ts = time.time() * 1000
+         self.timeout_ms = (
+             30000  # close the transcoder thread after this amount of silence
+         )
+         self.first_input_ts = None
+         self.first_output_ts = None
+         self.debug = debug
+         self.debug_ts = f"{time.time()}_{random.randint(1000, 9999)}"
+         if self.debug:
+             debug_folder = Path(__file__).resolve().parent.parent / "debug"
+             self.test_incoming_wav = soundfile.SoundFile(
+                 debug_folder / f"{self.debug_ts}_test_incoming.wav",
+                 mode="w+",
+                 format="WAV",
+                 subtype="PCM_16",
+                 samplerate=self.incoming_sample_rate,
+                 channels=1,
+             )
+             self.get_states_root().test_input_segments_wav = soundfile.SoundFile(
+                 debug_folder / f"{self.debug_ts}_test_input_segments.wav",
+                 mode="w+",
+                 format="WAV",
+                 samplerate=MODEL_SAMPLE_RATE,
+                 channels=1,
+             )
+
+     def get_states_root(self) -> AgentStates:
+         if isinstance(self.agent, TreeAgentPipeline):
+             # self.states is a dict
+             return self.states[self.agent.source_module]
+         else:
+             # self.states is a list
+             return self.states[0]
+
+     def reset_states(self):
+         if isinstance(self.agent, TreeAgentPipeline):
+             states_iter = self.states.values()
+         else:
+             states_iter = self.states
+         for state in states_iter:
+             state.reset()
+
+     def debug_log(self, *args):
+         if self.debug:
+             logger.info(*args)
+
+     @classmethod
+     def build_agent(cls, model_path, config_name=None):
+         logger.info(f"Building simuleval agent: {model_path}, {config_name}")
+         model_dir = Path(__file__).resolve().parent.parent / f"models/{model_path}"
+         if config_name is not None:
+             agent = build_system_from_dir(model_dir, config_name=config_name)
+         else:
+             # fall back to simuleval's default config for the model directory
+             agent = build_system_from_dir(model_dir)
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         logger.warning(f"agent built on {device}")
+         agent.to(device, fp16=True)
+         logger.info(
+             f"Successfully built simuleval agent {model_path} on device {device}"
+         )
+
+         return agent
+
+     def process_incoming_bytes(self, incoming_bytes, dynamic_config):
+         # TODO: We probably want to do some validation on dynamic_config to ensure it has what we need
+         segment, sr = self._preprocess_wav(incoming_bytes)
+         segment = SpeechSegment(
+             content=segment,
+             sample_rate=sr,
+             tgt_lang=dynamic_config.get("targetLanguage"),
+             config=dynamic_config,
+         )
+         if dynamic_config.get("expressive") is True and self.has_expressive is False:
+             logger.warning(
+                 "Passing 'expressive' but the agent does not support expressive output!"
+             )
+         # segment content is e.g. array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
+         self.input_queue.put_nowait(segment)
+
+     def get_input_segment(self):
+         if self.input_queue.empty():
+             return None
+         chunk = self.input_queue.get_nowait()
+         self.input_queue.task_done()
+         return chunk
+
+     def convert_waveform(
+         self,
+         waveform: Union[np.ndarray, torch.Tensor],
+         sample_rate: int,
+         normalize_volume: bool = False,
+         to_mono: bool = False,
+         to_sample_rate: Optional[int] = None,
+     ) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
+         """convert a waveform:
+         - to a target sample rate
+         - from multi-channel to mono channel
+         - volume normalization
+
+         Args:
+             waveform (numpy.ndarray or torch.Tensor): 2D original waveform
+                 (channels x length)
+             sample_rate (int): original sample rate
+             normalize_volume (bool): perform volume normalization
+             to_mono (bool): convert to mono channel if having multiple channels
+             to_sample_rate (Optional[int]): target sample rate
+         Returns:
+             waveform (numpy.ndarray): converted 2D waveform (channels x length)
+             sample_rate (float): target sample rate
+         """
+         try:
+             import torchaudio.sox_effects as ta_sox
+         except ImportError:
+             raise ImportError("Please install torchaudio: pip install torchaudio")
+
+         effects = []
+         if normalize_volume:
+             effects.append(["gain", "-n"])
+         if to_sample_rate is not None and to_sample_rate != sample_rate:
+             effects.append(["rate", f"{to_sample_rate}"])
+         if to_mono and waveform.shape[0] > 1:
+             effects.append(["channels", "1"])
+         if len(effects) > 0:
+             is_np_input = isinstance(waveform, np.ndarray)
+             _waveform = torch.from_numpy(waveform) if is_np_input else waveform
+             converted, converted_sample_rate = ta_sox.apply_effects_tensor(
+                 _waveform, sample_rate, effects
+             )
+             if is_np_input:
+                 converted = converted.numpy()
+             return converted, converted_sample_rate
+         return waveform, sample_rate
+
+     def _preprocess_wav(self, data: Any) -> Tuple[np.ndarray, int]:
+         segment, sample_rate = soundfile.read(
+             io.BytesIO(data),
+             dtype="float32",
+             always_2d=True,
+             frames=-1,
+             start=0,
+             format="RAW",
+             subtype="PCM_16",
+             samplerate=self.incoming_sample_rate,
+             channels=1,
+         )
+         if self.debug:
+             self.test_incoming_wav.seek(0, soundfile.SEEK_END)
+             self.test_incoming_wav.write(segment)
+
+         segment = segment.T
+         segment, new_sample_rate = self.convert_waveform(
+             segment,
+             sample_rate,
+             normalize_volume=False,
+             to_mono=True,
+             to_sample_rate=MODEL_SAMPLE_RATE,
+         )
+
+         assert MODEL_SAMPLE_RATE == new_sample_rate
+         segment = segment.squeeze(axis=0)
+         return segment, new_sample_rate
+
+     def process_pipeline_impl(self, input_segment):
+         try:
+             with torch.no_grad():
+                 output_segment = OutputSegments(
+                     self.agent.pushpop(input_segment, self.states)
+                 )
+             if (
+                 self.get_states_root().first_input_ts is not None
+                 and self.first_input_ts is None
+             ):
+                 # TODO: this is hacky
+                 self.first_input_ts = self.get_states_root().first_input_ts
+
+             if not output_segment.is_empty:
+                 self.output_queue.put_nowait(output_segment)
+
+             if output_segment.finished:
+                 self.debug_log("OUTPUT SEGMENT IS FINISHED. Resetting states.")
+
+                 self.reset_states()
+
+                 if self.debug:
+                     # when we rebuild states, this value is reset to whatever
+                     # is in the system dir config, which defaults debug=False.
+                     self.get_states_root().debug = True
+         except Exception as e:
+             logger.error(f"Got exception while processing pipeline: {e}")
+             traceback.print_exc()
+         return input_segment
+
+     def process_pipeline_loop(self):
+         if self.close:
+             return  # closes the thread
+
+         self.debug_log("processing_pipeline")
+         while not self.close:
+             input_segment = self.get_input_segment()
+             if input_segment is None:
+                 if self.get_states_root().is_fresh_state:  # TODO: this is hacky
+                     time.sleep(0.3)
+                 else:
+                     time.sleep(0.03)
+                 continue
+             self.process_pipeline_impl(input_segment)
+         self.debug_log("finished processing_pipeline")
+
+     def process_pipeline_once(self):
+         if self.close:
+             return
+
+         self.debug_log("processing pipeline once")
+         input_segment = self.get_input_segment()
+         if input_segment is None:
+             return
+         self.process_pipeline_impl(input_segment)
+         self.debug_log("finished processing_pipeline_once")
+
+     def get_output_segment(self):
+         if self.output_queue.empty():
+             return None
+
+         output_chunk = self.output_queue.get_nowait()
+         self.output_queue.task_done()
+         return output_chunk
+
+     def start(self):
+         self.debug_log("starting transcoder in a thread")
+         threading.Thread(target=self.process_pipeline_loop).start()
+
+     def first_translation_time(self):
+         return round((self.first_output_ts - self.first_input_ts) / 1000, 2)
+
+     def get_buffered_output(self) -> SpeechAndTextOutput:
+         now = time.time() * 1000
+         self.debug_log(f"get_buffered_output queue size: {self.output_queue.qsize()}")
+         while not self.output_queue.empty():
+             tmp_out = self.get_output_segment()
+             if tmp_out and tmp_out.compute_length(self.g2p) > 0:
+                 if len(self.output_buffer) == 0:
+                     self.last_output_ts = now
+                 self._populate_output_buffer(tmp_out)
+                 self._increment_output_buffer_size(tmp_out)
+
+                 if tmp_out.finished:
+                     self.debug_log("tmp_out.finished")
+                     res = self._gather_output_buffer_data(final=True)
+                     self.debug_log(f"gathered output data: {res}")
+                     self.output_buffer = []
+                     self.output_buffer_cur_size = 0
+                     self.last_output_ts = now
+                     self.first_output_ts = now
+                     return res
+             else:
+                 self.debug_log("tmp_out.compute_length is not > 0")
+
+         if len(self.output_buffer) > 0 and (
+             now - self.last_output_ts >= self.output_buffer_idle_ms
+             or self.output_buffer_cur_size >= self.output_buffer_size_limit
+         ):
+             self.debug_log(
+                 "[get_buffered_output] output_buffer is not empty. getting res to return."
+             )
+             self.last_output_ts = now
+             res = self._gather_output_buffer_data(final=False)
+             self.debug_log(f"gathered output data: {res}")
+             self.output_buffer = []
+             self.output_buffer_cur_size = 0
+             self.first_output_ts = now
+             return res
+         else:
+             self.debug_log("[get_buffered_output] output_buffer is empty...")
+             return None
+
+     def _gather_output_buffer_data(self, final):
+         output = SpeechAndTextOutput()
+         output.final = final
+         output = OutputSegments.join_output_buffer(self.output_buffer, output)
+         return output
+
+     def _increment_output_buffer_size(self, segment: OutputSegments):
+         self.output_buffer_cur_size += segment.compute_length(self.g2p)
+
+     def _populate_output_buffer(self, segment: OutputSegments):
+         self.output_buffer.append(segment.segments)
+
+     def _compute_phoneme_count(self, string: str) -> int:
+         return len([x for x in self.g2p(string) if x != " "])
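A synchronous usage sketch of the class above (no background thread), assuming `agent_info` came from SimulevalAgentDirectory.get_agent_or_throw and `pcm16_bytes` holds raw 16-bit mono PCM at 48 kHz:

    transcoder = SimulevalTranscoder(agent_info, sample_rate=48000, debug=False, buffer_limit=1)
    transcoder.process_incoming_bytes(
        pcm16_bytes, dynamic_config={"targetLanguage": "spa", "expressive": False}
    )
    transcoder.process_pipeline_once()  # instead of transcoder.start() plus polling
    out = transcoder.get_buffered_output()
    if out is not None:
        print(out.text, out.speech_sample_rate, out.final)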
seamless-server/src/speech_and_text_output.py ADDED
@@ -0,0 +1,15 @@
+ # Provides a container to return both speech and text output from our model at the same time
+
+ from typing import Optional
+
+
+ class SpeechAndTextOutput:
+     def __init__(
+         self,
+         text: Optional[str] = None,
+         speech_samples: Optional[list] = None,
+         speech_sample_rate: Optional[float] = None,
+         final: bool = False,
+     ):
+         self.text = text
+         self.speech_samples = speech_samples
+         self.speech_sample_rate = speech_sample_rate
+         self.final = final
seamless-server/src/transcoder_helpers.py ADDED
@@ -0,0 +1,44 @@
+ import logging
+
+ logger = logging.getLogger("socketio_server_pubsub")
+
+
+ def get_transcoder_output_events(transcoder) -> list:
+     speech_and_text_output = transcoder.get_buffered_output()
+     if speech_and_text_output is None:
+         logger.debug("No output from transcoder.get_buffered_output()")
+         return []
+
+     logger.debug(f"We DID get output from the transcoder! {speech_and_text_output}")
+
+     events = []
+
+     if speech_and_text_output.speech_samples:
+         events.append(
+             {
+                 "event": "translation_speech",
+                 "payload": speech_and_text_output.speech_samples,
+                 "sample_rate": speech_and_text_output.speech_sample_rate,
+             }
+         )
+
+     if speech_and_text_output.text:
+         events.append(
+             {
+                 "event": "translation_text",
+                 "payload": speech_and_text_output.text,
+             }
+         )
+
+     for e in events:
+         e["eos"] = speech_and_text_output.final
+
+     # TODO: send latency once per stream, e.g.:
+     # if not latency_sent:
+     #     lat = transcoder.first_translation_time()
+     #     latency_sent = True
+     #     to_send["latency"] = lat
+
+     return events
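The events returned above have this shape (sketch; `transcoder` is assumed to be a live SimulevalTranscoder):

    for e in get_transcoder_output_events(transcoder):
        if e["event"] == "translation_text":
            print(e["payload"], e["eos"])
        elif e["event"] == "translation_speech":
            print(len(e["payload"]), "samples at", e["sample_rate"], "Hz")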
seamless-server/src/transcriber.py ADDED
@@ -0,0 +1,128 @@
+ from deepgram import DeepgramClient, LiveTranscriptionEvents, LiveOptions
+ import os
+ from src.logging import initialize_logger
+ import logging
+ import queue
+ import threading
+ import time
+
+ logger = initialize_logger("transcriber", level=logging.INFO)
+
+ options = LiveOptions(
+     model="nova-2",
+     language="en-US",
+     smart_format=True,
+     punctuate=True,
+     sample_rate=48000,
+     interim_results=True,
+ )
+
+
+ class Transcriber:
+     def __init__(
+         self,
+     ):
+         self.deepgram_api_key = os.getenv("DEEPGRAM_API_KEY")
+         self.deepgram = None
+         self.dg_connection = None
+         # thread-safe queue: audio is enqueued from the socket handler and
+         # drained on the Deepgram thread
+         self.audio_queue = queue.Queue()
+         self.stop_event = threading.Event()
+
+     def process_audio(self):
+         while not self.stop_event.is_set():
+             try:
+                 if self.dg_connection is None:
+                     logger.info("returned from process")
+                     return
+
+                 if self.audio_queue.empty():
+                     time.sleep(0.1)
+                     continue
+
+                 data = self.audio_queue.get_nowait()
+                 self.dg_connection.send(data)
+                 self.audio_queue.task_done()
+                 logger.info("sent data to deepgram")
+             except Exception as e:
+                 logger.warning(f"Error while sending data: {e}")
+                 break
+
+         logger.info("Audio processing thread is stopping")
+
+     def on_transcript(self, result, *args, **kwargs):
+         try:
+             sentence = result.channel.alternatives[0].transcript
+             logger.info(f"Transcription: {sentence}")
+         except Exception as e:
+             logger.warning(e)
+
+     def close_connection(self):
+         if self.dg_connection:
+             self.dg_connection.finish()
+             self.dg_connection = None
+             logger.info("finished deepgram connection")
+
+     def stop(self):
+         self.stop_event.set()
+         self.close_connection()
+         logger.info("Requested to stop the audio processing thread")
+
+     def on_close(self, *args, **kwargs):
+         logger.info("Deepgram connection closed")
+         self.dg_connection = None
+
+     def on_utterance_end(self, utterance_end, *args, **kwargs):
+         logger.info(f"\n\n{utterance_end}\n\n")
+
+     def on_error(self, e, *args, **kwargs):
+         logger.warning(f"Deepgram error received {e}")
+         self.dg_connection = None
+
+     def start_deepgram(self):
+         try:
+             self.deepgram = DeepgramClient(self.deepgram_api_key)
+             dg_connection = self.deepgram.listen.live.v("1")
+         except Exception as e:
+             logger.warning(f"Could not open socket: {e}")
+             return
+
+         def on_message(self, result, **kwargs):
+             sentence = result.channel.alternatives[0].transcript
+             if len(sentence) == 0:
+                 return
+             logger.info(f"speaker: {sentence}")
+
+         def on_metadata(self, metadata, **kwargs):
+             logger.info(f"\n\n{metadata}\n\n")
+
+         def on_utterance_end(self, utterance_end, **kwargs):
+             logger.info(f"\n\n{utterance_end}\n\n")
+
+         def on_error(self, error, **kwargs):
+             logger.info(f"\n\n{error}\n\n")
+
+         def on_close(self, **kwargs):
+             logger.info("\n\nclosed\n\n")
+
+         dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
+         dg_connection.on(LiveTranscriptionEvents.Metadata, on_metadata)
+         # dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
+         dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
+         dg_connection.on(LiveTranscriptionEvents.Error, on_error)
+         dg_connection.on(LiveTranscriptionEvents.Close, on_close)
+
+         dg_connection.start(options)
+         self.dg_connection = dg_connection
+
+         logger.info("deepgram connection opened")
+         self.process_audio()
+
+     def start(self):
+         threading.Thread(target=self.start_deepgram).start()
+
+     def send_audio(self, data):
+         try:
+             self.audio_queue.put_nowait(data)
+         except Exception as e:
+             logger.warning(e)
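Hedged usage sketch, assuming DEEPGRAM_API_KEY is set and `pcm_chunk` holds 48 kHz PCM bytes (matching the LiveOptions above):

    t = Transcriber()
    t.start()                # opens the Deepgram socket on a background thread
    t.send_audio(pcm_chunk)  # enqueue raw audio; transcripts are logged by on_message
    t.stop()                 # stop the sender loop and close the connection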
seamless-server/src/translate.py ADDED
@@ -0,0 +1,21 @@
+ import torch
+ from seamless_communication.inference import Translator
+
+
+ # Initialize a Translator object with a multitask model and vocoder on the GPU.
+ translator = Translator(
+     "seamlessM4T_v2_large", "vocoder_v2", torch.device("cuda:0"), torch.float16
+ )
+
+
+ def translate_text(text):
+     print("test")  # placeholder until the call below is enabled
+     # text_output, speech_output = translator.predict(
+     #     input=text,
+     #     task_str="T2ST",
+     #     tgt_lang="spa",
+     #     src_lang="eng",
+     #     text_generation_opts=None,
+     #     unit_generation_opts=None,
+     # )
+     # print(text_output)
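If the commented-out call above is enabled, usage would look roughly like this (keyword names mirror the commented code; the exact return handling is an assumption):

    text_output, speech_output = translator.predict(
        input="Hello, world.",
        task_str="T2ST",
        tgt_lang="spa",
        src_lang="eng",
        text_generation_opts=None,
        unit_generation_opts=None,
    )
    print(text_output)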
seamless-server/whl/seamless_communication-1.0.0-py3-none-any.whl ADDED
Binary file (204 kB).
 
streaming-test-app/.eslintrc.cjs ADDED
@@ -0,0 +1,18 @@
+ module.exports = {
+   root: true,
+   env: {browser: true, es2020: true},
+   extends: [
+     'eslint:recommended',
+     'plugin:@typescript-eslint/recommended',
+     'plugin:react-hooks/recommended',
+   ],
+   ignorePatterns: ['dist', '.eslintrc.cjs'],
+   parser: '@typescript-eslint/parser',
+   plugins: ['react-refresh'],
+   rules: {
+     'react-refresh/only-export-components': [
+       'warn',
+       {allowConstantExport: true},
+     ],
+   },
+ };
streaming-test-app/.gitignore ADDED
@@ -0,0 +1,24 @@
+ # Logs
+ logs
+ *.log
+ npm-debug.log*
+ yarn-debug.log*
+ yarn-error.log*
+ pnpm-debug.log*
+ lerna-debug.log*
+
+ node_modules
+ dist
+ dist-ssr
+ *.local
+
+ # Editor directories and files
+ .vscode/*
+ !.vscode/extensions.json
+ .idea
+ .DS_Store
+ *.suo
+ *.ntvs*
+ *.njsproj
+ *.sln
+ *.sw?
streaming-test-app/index.html ADDED
@@ -0,0 +1,13 @@
+ <!DOCTYPE html>
+ <html lang="en">
+   <head>
+     <meta charset="UTF-8" />
+     <link rel="icon" type="image/svg+xml" href="/src/assets/seamless.svg" />
+     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+     <title>Seamless Translation</title>
+   </head>
+   <body>
+     <div id="root"></div>
+     <script type="module" src="/src/main.tsx"></script>
+   </body>
+ </html>
streaming-test-app/package-lock.json ADDED
The diff for this file is too large to render.
 
streaming-test-app/package.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "name": "streaming-test-app",
+   "private": true,
+   "version": "0.0.14",
+   "type": "module",
+   "scripts": {
+     "dev": "vite --host --strictPort",
+     "build": "vite build",
+     "preview": "vite preview",
+     "clean:node-modules": "rm -rf node_modules/",
+     "ts-check": "tsc --noEmit",
+     "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0",
+     "prettier-check": "cd ../ && yarn run prettier-base --check streaming-test-app",
+     "signal": "concurrently --names \"TS,LINT,PRETTIER\" -c \"bgBlack.bold,bgRed.bold,bgCyan.bold\" \"yarn run ts-check\" \"yarn run lint\" \"yarn run prettier-check\""
+   },
+   "dependencies": {
+     "@emotion/react": "11.11.1",
+     "@emotion/styled": "11.11.0",
+     "@mui/icons-material": "5.14.3",
+     "@mui/material": "5.14.5",
+     "@react-three/drei": "^9.83.9",
+     "@react-three/fiber": "^8.14.1",
+     "@react-three/xr": "^5.7.1",
+     "amazon-cognito-identity-js": "^6.3.6",
+     "audiobuffer-to-wav": "^1.0.0",
+     "aws-sdk": "^2.1472.0",
+     "js-cookie": "^3.0.5",
+     "lodash": "4.17.21",
+     "react": "^18.2.0",
+     "react-dom": "^18.2.0",
+     "react-google-charts": "^4.0.1",
+     "socket.io-client": "^4.7.2",
+     "three": "^0.156.1",
+     "three-mesh-ui": "^6.5.4",
+     "uuid": "^9.0.0",
+     "zustand": "^4.4.3"
+   },
+   "devDependencies": {
+     "@types/node": "^20.5.3",
+     "@types/react": "^18.2.15",
+     "@types/react-dom": "^18.2.7",
+     "@types/uuid": "^9.0.2",
+     "@typescript-eslint/eslint-plugin": "^6.0.0",
+     "@typescript-eslint/parser": "^6.0.0",
+     "@vitejs/plugin-react": "^4.0.3",
+     "concurrently": "8.2.1",
+     "eslint": "^8.45.0",
+     "eslint-plugin-react-hooks": "^4.6.0",
+     "eslint-plugin-react-refresh": "^0.4.3",
+     "typescript": "5.1.6",
+     "vite": "^4.4.5"
+   }
+ }
streaming-test-app/src/App.tsx ADDED
@@ -0,0 +1,57 @@
+ import SocketWrapper from './SocketWrapper';
+ import {ThemeProvider} from '@mui/material/styles';
+ import theme from './theme';
+ import StreamingInterface from './StreamingInterface';
+ import CssBaseline from '@mui/material/CssBaseline';
+ import {createContext, useCallback, useState} from 'react';
+ import packageJson from '../package.json';
+
+ console.log(`Streaming React App version: ${packageJson?.version}`);
+
+ // Roboto font for the mui UI library
+ // import '@fontsource/roboto/300.css';
+ // import '@fontsource/roboto/400.css';
+ // import '@fontsource/roboto/500.css';
+ // import '@fontsource/roboto/700.css';
+
+ export const AppResetKeyContext = createContext<(newKey: string) => void>(
+   () => {
+     throw new Error('AppResetKeyContext not initialized');
+   },
+ );
+
+ function App() {
+   return (
+     <ThemeProvider theme={theme}>
+       <CssBaseline />
+       <SocketWrapper>
+         <StreamingInterface />
+       </SocketWrapper>
+     </ThemeProvider>
+   );
+ }
+
+ function AppWrapper() {
+   const [appResetKey, setAppResetKey] = useState<string>('[initial value]');
+   const setAppResetKeyHandler = useCallback((newKey: string) => {
+     setAppResetKey((prev) => {
+       console.warn(
+         `Resetting the app with appResetKey: ${newKey}; prevKey: ${prev}`,
+       );
+       if (prev === newKey) {
+         console.error(
+           `The appResetKey was the same as the previous key, so the app will not reset.`,
+         );
+       }
+       return newKey;
+     });
+   }, []);
+
+   return (
+     <AppResetKeyContext.Provider value={setAppResetKeyHandler}>
+       <App key={appResetKey} />
+     </AppResetKeyContext.Provider>
+   );
+ }
+
+ export default AppWrapper;
streaming-test-app/src/Blink.tsx ADDED
@@ -0,0 +1,41 @@
+ import Box from '@mui/material/Box';
+ import {useEffect, useState} from 'react';
+
+ type Props = {
+   intervalMs: number;
+   children: React.ReactNode;
+   shouldBlink: boolean;
+   // display?: 'block' | 'inline' | 'inline-block';
+ };
+
+ export default function Blink({
+   // display = 'inline-block',
+   shouldBlink,
+   intervalMs,
+   children,
+ }: Props): React.ReactElement {
+   const [cursorBlinkOn, setCursorBlinkOn] = useState(false);
+
+   useEffect(() => {
+     if (shouldBlink) {
+       const interval = setInterval(() => {
+         setCursorBlinkOn((prev) => !prev);
+       }, intervalMs);
+
+       return () => clearInterval(interval);
+     } else {
+       setCursorBlinkOn(false);
+     }
+   }, [intervalMs, shouldBlink]);
+
+   return (
+     <Box
+       component="span"
+       sx={{
+         display: 'inline-block',
+         visibility: cursorBlinkOn ? 'visible' : 'hidden',
+       }}>
+       {children}
+     </Box>
+   );
+ }
streaming-test-app/src/DebugSection.tsx ADDED
@@ -0,0 +1,62 @@
+ import {Chart} from 'react-google-charts';
+ import debug from './debug';
+ import {
+   Accordion,
+   AccordionDetails,
+   AccordionSummary,
+   Button,
+   Typography,
+ } from '@mui/material';
+ import {useState} from 'react';
+ import ArrowDropDownIcon from '@mui/icons-material/ArrowDropDown';
+
+ export default function DebugChart() {
+   const [showDebugTimings, setShowDebugTimings] = useState<boolean>(false);
+
+   const data = debug()?.getChartData();
+   const options = {
+     timeline: {
+       groupByRowLabel: true,
+     },
+   };
+
+   return (
+     <div className="horizontal-padding-sra text-chunk-sra">
+       <Accordion
+         expanded={showDebugTimings}
+         onChange={() => setShowDebugTimings(!showDebugTimings)}
+         elevation={0}
+         sx={{border: 1, borderColor: 'rgba(0, 0, 0, 0.3)'}}>
+         <AccordionSummary
+           expandIcon={<ArrowDropDownIcon />}
+           className="debug-section">
+           Debug Info
+         </AccordionSummary>
+         <AccordionDetails>
+           {data && data.length > 1 ? (
+             <>
+               <Chart
+                 chartType="Timeline"
+                 data={data}
+                 width="100%"
+                 height="400px"
+                 options={options}
+               />
+               <Button
+                 variant="contained"
+                 sx={{marginBottom: 1}}
+                 onClick={() => {
+                   debug()?.downloadInputAudio();
+                   debug()?.downloadOutputAudio();
+                 }}>
+                 Download Input / Output Audio
+               </Button>
+             </>
+           ) : (
+             <Typography>No input / output detected</Typography>
+           )}
+         </AccordionDetails>
+       </Accordion>
+     </div>
+   );
+ }
streaming-test-app/src/RoomConfig.tsx ADDED
@@ -0,0 +1,271 @@
+ import Stack from '@mui/material/Stack';
+ import TextField from '@mui/material/TextField';
+ import {isValidRoomID, isValidPartialRoomID} from './generateNewRoomID';
+ import {useCallback, useEffect, useState} from 'react';
+ import Button from '@mui/material/Button';
+ import {useSocket} from './useSocket';
+ import FormGroup from '@mui/material/FormGroup';
+ import FormControlLabel from '@mui/material/FormControlLabel';
+ import Checkbox from '@mui/material/Checkbox';
+ import {RoomState} from './types/RoomState';
+ import setURLParam from './setURLParam';
+ import {getURLParams} from './URLParams';
+ import {
+   JoinRoomConfig,
+   Roles,
+   ServerState,
+   StreamingStatus,
+ } from './types/StreamingTypes';
+ import Alert from '@mui/material/Alert';
+
+ function capitalize(str: string): string {
+   return str.charAt(0).toUpperCase() + str.slice(1);
+ }
+
+ type Props = {
+   roomState: RoomState | null;
+   serverState: ServerState | null;
+   onJoinRoomOrUpdateRoles?: () => void;
+   streamingStatus: StreamingStatus;
+   setHasMaxUsers: (hasMaxUsers: boolean) => void;
+ };
+
+ export default function RoomConfig({
+   roomState,
+   serverState,
+   onJoinRoomOrUpdateRoles,
+   setHasMaxUsers,
+   streamingStatus,
+ }: Props) {
+   const {socket, clientID} = useSocket();
+
+   const urlParams = getURLParams();
+   const roomIDParam = urlParams.roomID;
+   const autoJoinRoom = urlParams.autoJoin;
+
+   const [roomID, setRoomID] = useState<string>(
+     (roomIDParam ?? '').toUpperCase(),
+   );
+   const [roomIDError, setRoomIDError] = useState<boolean>(false);
+   const [roles, setRoles] = useState<{speaker: boolean; listener: boolean}>({
+     speaker: true,
+     listener: true,
+   });
+   const [lockServer, setLockServer] = useState<boolean>(false);
+   const [lockServerName, setLockServerName] = useState<string>('');
+
+   const [joinInProgress, setJoinInProgress] = useState<boolean>(false);
+   const [didAttemptAutoJoin, setDidAttemptAutoJoin] = useState<boolean>(false);
+
+   const isValidServerLock =
+     lockServer === false ||
+     (lockServerName != null && lockServerName.length > 0);
+   const isValidRoles = Object.values(roles).filter(Boolean).length > 0;
+   const isValidAllInputs =
+     isValidRoomID(roomID) && isValidRoles && isValidServerLock;
+   const roomIDFromServer = roomState?.room_id ?? null;
+
+   const onJoinRoom = useCallback(
+     (createNewRoom: boolean) => {
+       if (socket == null) {
+         console.error('Socket is null, cannot join room');
+         return;
+       }
+       console.debug(`Attempting to join roomID ${roomID}...`);
+
+       const lockServerValidated: string | null =
+         lockServer && roles['speaker'] ? lockServerName : null;
+
+       setJoinInProgress(true);
+
+       const configObject: JoinRoomConfig = {
+         roles: (Object.keys(roles) as Array<Roles>).filter(
+           (role) => roles[role] === true,
+         ),
+         lockServerName: lockServerValidated,
+       };
+
+       socket.emit(
+         'join_room',
+         clientID,
+         createNewRoom ? null : roomID,
+         configObject,
+         (result) => {
+           console.log('join_room result:', result);
+           if (result.message === 'max_users') {
+             setHasMaxUsers(true);
+             setJoinInProgress(false);
+             return;
+           } else {
+             setHasMaxUsers(false);
+           }
+           if (createNewRoom) {
+             setRoomID(result.roomID);
+           }
+           if (onJoinRoomOrUpdateRoles != null) {
+             onJoinRoomOrUpdateRoles();
+           }
+           setURLParam('roomID', result.roomID);
+           setJoinInProgress(false);
+         },
+       );
+     },
+     [
+       clientID,
+       lockServer,
+       lockServerName,
+       onJoinRoomOrUpdateRoles,
+       roles,
+       roomID,
+       socket,
+     ],
+   );
+
+   useEffect(() => {
+     if (
+       autoJoinRoom === true &&
+       didAttemptAutoJoin === false &&
+       socket != null
+     ) {
+       // We want to consider this an attempt whether or not we actually try to join, because
+       // we only want auto-join to happen on initial load
+       setDidAttemptAutoJoin(true);
+       if (
+         isValidAllInputs &&
+         joinInProgress === false &&
+         roomIDFromServer == null
+       ) {
+         console.debug('Attempting to auto-join room...');
+
+         onJoinRoom(false);
+       } else {
+         console.debug('Unable to auto-join room', {
+           isValidAllInputs,
+           joinInProgress,
+           roomIDFromServer,
+         });
+       }
+     }
+   }, [
+     autoJoinRoom,
+     didAttemptAutoJoin,
+     isValidAllInputs,
+     joinInProgress,
+     onJoinRoom,
+     roomIDFromServer,
+     socket,
+   ]);
+
+   return (
+     <Stack direction="column" spacing="12px">
+       <Stack direction="row" spacing="12px" sx={{alignItems: 'center'}}>
+         <TextField
+           size="small"
+           label="Room Code"
+           variant="outlined"
+           disabled={roomState?.room_id != null}
+           value={roomID}
+           error={roomIDError}
+           onChange={(e) => {
+             const id = e.target.value.toUpperCase();
+             if (isValidPartialRoomID(id)) {
+               setRoomIDError(false);
+               setRoomID(id);
+             } else {
+               setRoomIDError(true);
+             }
+           }}
+           sx={{width: '8em'}}
+         />
+
+         <div>
+           <Button
+             variant="contained"
+             disabled={
+               isValidAllInputs === false ||
+               joinInProgress ||
+               streamingStatus !== 'stopped'
+             }
+             onClick={() => onJoinRoom(false)}>
+             {roomState?.room_id != null ? 'Update Roles' : 'Join Room'}
+           </Button>
+         </div>
+
+         {roomState?.room_id == null && (
+           <div>
+             <Button
+               variant="contained"
+               disabled={
+                 roomState?.room_id != null ||
+                 joinInProgress ||
+                 streamingStatus !== 'stopped'
+               }
+               onClick={() => onJoinRoom(true)}>
+               {'Create New Room'}
+             </Button>
+           </div>
+         )}
+       </Stack>
+
+       <FormGroup>
+         {Object.keys(roles).map((role) => {
+           return (
+             <FormControlLabel
+               disabled={streamingStatus !== 'stopped'}
+               key={role}
+               control={
+                 <Checkbox
+                   checked={roles[role]}
+                   onChange={(event: React.ChangeEvent<HTMLInputElement>) => {
+                     setRoles((prevRoles) => ({
+                       ...prevRoles,
+                       [role]: event.target.checked,
+                     }));
+                   }}
+                 />
+               }
+               label={capitalize(role)}
+             />
+           );
+         })}
+
+         {urlParams.enableServerLock && roles['speaker'] === true && (
+           <>
+             <FormControlLabel
+               disabled={streamingStatus !== 'stopped'}
+               control={
+                 <Checkbox
+                   checked={lockServer}
+                   onChange={(event: React.ChangeEvent<HTMLInputElement>) => {
+                     setLockServer(event.target.checked);
+                   }}
+                 />
+               }
+               label="Lock Server (prevent other users from streaming)"
+             />
+           </>
+         )}
+       </FormGroup>
+
+       {urlParams.enableServerLock &&
+         roles['speaker'] === true &&
+         lockServer && (
+           <TextField
+             disabled={streamingStatus !== 'stopped'}
+             label="Enter Your Name + Expected Lock End Time"
+             variant="outlined"
+             value={lockServerName}
+             onChange={(event: React.ChangeEvent<HTMLInputElement>) => {
+               setLockServerName(event.target.value);
+             }}
+             helperText="Locking the server will prevent anyone else from using it until you close the page, in order to maximize server performance. Please only use this for live demos."
+           />
+         )}
+
+       {serverState?.serverLock != null &&
+         serverState.serverLock.clientID === clientID && (
+           <Alert severity="success">{`The server is now locked for your use (${serverState?.serverLock?.name}). Close this window to release the lock so that others may use the server.`}</Alert>
+         )}
+     </Stack>
+   );
+ }
streaming-test-app/src/SocketWrapper.tsx ADDED
@@ -0,0 +1,218 @@
1
+ import {ReactNode, useContext, useEffect, useMemo, useRef, useState} from 'react';
2
+ import socketIOClient, {Socket} from 'socket.io-client';
3
+ import useStable from './useStable';
4
+ import {v4 as uuidv4} from 'uuid';
5
+ import {SocketContext} from './useSocket';
6
+ import {AppResetKeyContext} from './App';
7
+ import Backdrop from '@mui/material/Backdrop';
8
+ import CircularProgress from '@mui/material/CircularProgress';
9
+ import Typography from '@mui/material/Typography';
10
+ import {getURLParams} from './URLParams';
11
+
12
+ // The time to wait before showing a "disconnected" screen upon initial app load
13
+ const INITIAL_DISCONNECT_SCREEN_DELAY = 2000;
14
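+ // Default to the page's own host, matching ws/wss to the page's http/https protocol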
+ const SERVER_URL_DEFAULT = `${window.location.protocol === 'https:' ? 'wss' : 'ws'}://${window.location.host}`;
16
+
17
+ export default function SocketWrapper({children}: {children: ReactNode}) {
18
+ const [socket, setSocket] = useState<Socket | null>(null);
19
+ const [connected, setConnected] = useState<boolean | null>(null);
20
+ // Default to true:
21
+ const [willAttemptReconnect] = useState<boolean>(true);
22
+ const serverIDRef = useRef<string | null>(null);
23
+
24
+ const setAppResetKey = useContext(AppResetKeyContext);
25
+
26
+ /**
27
+ * Previously we had stored the clientID in local storage, but in that case
28
+ * if a user refreshes their page they'll still have the same clientID, and
29
+ * will be put back into the same room, which may be confusing if they're trying
30
+ * to join a new room or reset the app interface. So now a clientID persists only
+ * for the full lifecycle of the React app.
+ */
+ const clientID = useStable<string>(() => {
+ // Intentionally NOT persisted to local/session storage (see the note above), so a
+ // fresh ID is generated on every full page load
+ return uuidv4();
+ });
39
+
40
+ const socketObject = useMemo(
41
+ () => ({socket, clientID, connected: connected ?? false}),
42
+ [socket, clientID, connected],
43
+ );
44
+
45
+ useEffect(() => {
46
+ const queryParams = {
47
+ clientID: clientID,
48
+ };
49
+
50
+ const serverURLFromParams = getURLParams().serverURL;
51
+ const serverURL = serverURLFromParams ?? SERVER_URL_DEFAULT;
52
+
53
+ console.log(
54
+ `Opening socket connection to ${
55
+ serverURL?.length === 0 ? 'window.location.host' : serverURL
56
+ } with query params:`,
57
+ queryParams,
58
+ );
59
+
60
+ const newSocket: Socket = socketIOClient(serverURL, {
61
+ query: queryParams,
62
+ // Normally socket.io will fallback to http polling, but we basically never
63
+ // want that because that'd mean awful performance. It'd be better for the app
64
+ // to simply break in that case and not connect.
65
+ transports: ['websocket'],
66
+ path: '/ws/socket.io'
67
+ });
68
+
69
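+ // A changed server ID most likely means the server restarted (or we were routed to a
+ // different replica), so any room/stream state held client-side is stale; bumping the
+ // app reset key remounts the whole tree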
+ const onServerID = (serverID: string) => {
70
+ console.debug('Received server ID:', serverID);
71
+ if (serverIDRef.current != null) {
72
+ if (serverIDRef.current !== serverID) {
73
+ console.error(
74
+ 'Server ID changed. Resetting the app using the app key',
75
+ );
76
+ setAppResetKey(serverID);
77
+ }
78
+ }
79
+ serverIDRef.current = serverID;
80
+ };
81
+
82
+ newSocket.on('server_id', onServerID);
83
+
84
+ setSocket(newSocket);
85
+
86
+ return () => {
87
+ newSocket.off('server_id', onServerID);
88
+ console.log(
89
+ 'Closing socket connection in the useEffect cleanup function...',
90
+ );
91
+ newSocket.disconnect();
92
+ setSocket(null);
93
+ };
94
+ }, [clientID, setAppResetKey]);
95
+
96
+ useEffect(() => {
97
+ if (socket != null) {
98
+ const onAny = (eventName: string, ...args) => {
99
+ console.debug(`[event: ${eventName}] args:`, ...args);
100
+ };
101
+
102
+ socket.onAny(onAny);
103
+
104
+ return () => {
105
+ socket.offAny(onAny);
106
+ };
107
+ }
108
+ return () => {};
109
+ }, [socket]);
110
+
111
+ useEffect(() => {
112
+ if (socket != null) {
113
+ const onConnect = (...args) => {
114
+ console.debug('Connected to server with args:', ...args);
115
+ setConnected(true);
116
+ };
117
+
118
+ const onConnectError = (err) => {
119
+ console.error(`Connection error due to ${err.message}`);
120
+ };
121
+
122
+ const onDisconnect = (reason) => {
123
+ setConnected(false);
124
+ console.log(`Disconnected due to ${reason}`);
125
+ };
126
+
127
+ socket.on('connect', onConnect);
128
+ socket.on('connect_error', onConnectError);
129
+ socket.on('disconnect', onDisconnect);
130
+
131
+ return () => {
132
+ socket.off('connect', onConnect);
133
+ socket.off('connect_error', onConnectError);
134
+ socket.off('disconnect', onDisconnect);
135
+ };
136
+ }
137
+ }, [socket]);
138
+
139
+ useEffect(() => {
140
+ if (socket != null) {
141
+ const onReconnectError = (err) => {
142
+ console.log(`Reconnect error due to ${err.message}`);
143
+ };
144
+
145
+ socket.io.on('reconnect_error', onReconnectError);
146
+
147
+ const onError = (err) => {
148
+ console.log(`General socket error with message ${err.message}`);
149
+ };
150
+ socket.io.on('error', onError);
151
+
152
+ const onReconnect = (attempt) => {
153
+ console.log(`Reconnected after ${attempt} attempt(s)`);
154
+ };
155
+ socket.io.on('reconnect', onReconnect);
156
+
157
+ const disconnectOnBeforeUnload = () => {
158
+ console.log('Disconnecting due to beforeunload event...');
159
+ socket.disconnect();
160
+ setSocket(null);
161
+ };
162
+ window.addEventListener('beforeunload', disconnectOnBeforeUnload);
163
+
164
+ return () => {
165
+ socket.io.off('reconnect_error', onReconnectError);
166
+ socket.io.off('error', onError);
167
+ socket.io.off('reconnect', onReconnect);
168
+ window.removeEventListener('beforeunload', disconnectOnBeforeUnload);
169
+ };
170
+ }
171
+ }, [clientID, setAppResetKey, socket]);
172
+
173
+ /**
174
+ * Wait to show the disconnected screen on initial app load
175
+ */
176
+ useEffect(() => {
177
+ window.setTimeout(() => {
178
+ setConnected((prev) => {
179
+ if (prev === null) {
180
+ return false;
181
+ }
182
+ return prev;
183
+ });
184
+ }, INITIAL_DISCONNECT_SCREEN_DELAY);
185
+ }, []);
186
+
187
+ return (
188
+ <SocketContext.Provider value={socketObject}>
189
+ {children}
190
+
191
+ <Backdrop
192
+ open={connected === false && willAttemptReconnect === true}
193
+ sx={{
194
+ color: '#fff',
195
+ zIndex: (theme) => theme.zIndex.drawer + 1,
196
+ }}>
197
+ <div
198
+ style={{
199
+ alignItems: 'center',
200
+ flexDirection: 'column',
201
+ textAlign: 'center',
202
+ }}>
203
+ <CircularProgress color="inherit" />
204
+ <Typography
205
+ align="center"
206
+ fontSize={{sm: 18, xs: 16}}
207
+ sx={{
208
+ fontFamily:
209
+ 'ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace',
210
+ fontWeight: 'bold',
211
+ }}>
212
+ {'Disconnected. Attempting to reconnect...'}
213
+ </Typography>
214
+ </div>
215
+ </Backdrop>
216
+ </SocketContext.Provider>
217
+ );
218
+ }
streaming-test-app/src/StreamingInterface.css ADDED
@@ -0,0 +1,56 @@
1
+ .app-wrapper-sra {
2
+ display: flex;
3
+ flex-direction: column;
4
+ justify-content: center;
5
+ align-items: center;
6
+ }
7
+
8
+ .main-container-sra {
9
+ background-color: white;
10
+ display: flex;
11
+ flex-direction: column;
12
+ justify-content: flex-start;
13
+ text-align: left;
14
+ margin: 16px;
15
+ margin-bottom: 36px;
16
+ border-radius: 8px;
17
+ box-shadow: 0px 24px 30px rgba(0, 0, 0, 0.3);
18
+ border: 1px solid rgba(0, 0, 0, 0.05);
19
+ overflow: hidden;
20
+ }
21
+
22
+ .top-section-sra {
23
+ padding-top: 24px;
24
+ margin-bottom: 24px;
25
+ display: flex;
26
+ flex-direction: column;
27
+ justify-content: flex-start;
28
+ }
29
+
30
+ .horizontal-padding-sra {
31
+ padding-left: 20px;
32
+ padding-right: 20px;
33
+ }
34
+
35
+ .header-container-sra {
36
+ display: flex;
37
+ flex-direction: row;
38
+ justify-content: flex-start;
39
+ align-items: center;
40
+ margin-bottom: 24px;
41
+ }
42
+
43
+ .header-icon-sra {
44
+ display: block;
45
+ margin-right: 12px;
46
+ }
47
+
48
+ .translation-text-container-sra {
49
+ background-color: #f8f8f8;
50
+ padding-top: 12px;
51
+ padding-bottom: 4px;
52
+ }
53
+
54
+ .text-chunk-sra {
55
+ margin-bottom: 12px;
56
+ }
streaming-test-app/src/StreamingInterface.tsx ADDED
@@ -0,0 +1,1219 @@
1
+ import {useCallback, useEffect, useLayoutEffect, useRef, useState} from 'react';
2
+ import Button from '@mui/material/Button';
3
+ import Typography from '@mui/material/Typography';
4
+ import InputLabel from '@mui/material/InputLabel';
5
+ import FormControl from '@mui/material/FormControl';
6
+ import Select, {SelectChangeEvent} from '@mui/material/Select';
7
+ import MenuItem from '@mui/material/MenuItem';
8
+ import Stack from '@mui/material/Stack';
9
+ import seamlessLogoUrl from './assets/seamless.svg';
10
+ import {
11
+ AgentCapabilities,
12
+ BaseResponse,
13
+ BrowserAudioStreamConfig,
14
+ DynamicConfig,
15
+ PartialDynamicConfig,
16
+ SUPPORTED_INPUT_SOURCES,
17
+ SUPPORTED_OUTPUT_MODES,
18
+ ServerExceptionData,
19
+ ServerSpeechData,
20
+ ServerState,
21
+ ServerTextData,
22
+ StartStreamEventConfig,
23
+ StreamingStatus,
24
+ SupportedInputSource,
25
+ SupportedOutputMode,
26
+ TranslationSentences,
27
+ } from './types/StreamingTypes';
28
+ import FormLabel from '@mui/material/FormLabel';
29
+ import RadioGroup from '@mui/material/RadioGroup';
30
+ import FormControlLabel from '@mui/material/FormControlLabel';
31
+ import Radio from '@mui/material/Radio';
32
+ import './StreamingInterface.css';
33
+ import RoomConfig from './RoomConfig';
34
+ import Divider from '@mui/material/Divider';
35
+ import {useSocket} from './useSocket';
36
+ import {RoomState} from './types/RoomState';
37
+ import useStable from './useStable';
38
+ import float32To16BitPCM from './float32To16BitPCM';
39
+ import createBufferedSpeechPlayer from './createBufferedSpeechPlayer';
40
+ import Checkbox from '@mui/material/Checkbox';
41
+ import Alert from '@mui/material/Alert';
42
+ import isScrolledToDocumentBottom from './isScrolledToDocumentBottom';
43
+ import Box from '@mui/material/Box';
44
+ import Slider from '@mui/material/Slider';
45
+ import VolumeDown from '@mui/icons-material/VolumeDown';
46
+ import VolumeUp from '@mui/icons-material/VolumeUp';
47
+ import Mic from '@mui/icons-material/Mic';
48
+ import MicOff from '@mui/icons-material/MicOff';
49
+ import XRDialog from './react-xr/XRDialog';
50
+ import getTranslationSentencesFromReceivedData from './getTranslationSentencesFromReceivedData';
51
+ import {
52
+ sliceTranslationSentencesUpToIndex,
53
+ getTotalSentencesLength,
54
+ } from './sliceTranslationSentencesUtils';
55
+ import Blink from './Blink';
56
+ import {CURSOR_BLINK_INTERVAL_MS} from './cursorBlinkInterval';
57
+ import {getURLParams} from './URLParams';
58
+ import debug from './debug';
59
+ import DebugSection from './DebugSection';
60
+ import Switch from '@mui/material/Switch';
61
+ import Grid from '@mui/material/Grid';
62
+ import {getLanguageFromThreeLetterCode} from './languageLookup';
63
+ import HeadphonesIcon from '@mui/icons-material/Headphones';
64
+
65
+ const AUDIO_STREAM_DEFAULTS = {
66
+ userMedia: {
67
+ echoCancellation: false,
68
+ noiseSuppression: true,
69
+ },
70
+ displayMedia: {
71
+ echoCancellation: false,
72
+ noiseSuppression: false,
73
+ },
74
+ } as const;
75
+
76
+ async function requestUserMediaAudioStream(
77
+ config: BrowserAudioStreamConfig = AUDIO_STREAM_DEFAULTS['userMedia'],
78
+ ) {
79
+ const stream = await navigator.mediaDevices.getUserMedia({
80
+ audio: {...config, channelCount: 1},
81
+ });
82
+ console.debug(
83
+ '[requestUserMediaAudioStream] stream created with settings:',
84
+ stream.getAudioTracks()?.[0]?.getSettings(),
85
+ );
86
+ return stream;
87
+ }
88
+
89
+ async function requestDisplayMediaAudioStream(
90
+ config: BrowserAudioStreamConfig = AUDIO_STREAM_DEFAULTS['displayMedia'],
91
+ ) {
92
+ const stream = await navigator.mediaDevices.getDisplayMedia({
93
+ audio: {...config, channelCount: 1},
94
+ });
95
+ console.debug(
96
+ '[requestDisplayMediaAudioStream] stream created with settings:',
97
+ stream.getAudioTracks()?.[0]?.getSettings(),
98
+ );
99
+ return stream;
100
+ }
101
+
102
+ const buttonLabelMap: {[key in StreamingStatus]: string} = {
103
+ stopped: 'Start Streaming',
104
+ running: 'Stop Streaming',
105
+ starting: 'Starting...',
106
+ };
107
+
108
+ const BUFFER_LIMIT = 1;
109
+
110
+ const SCROLLED_TO_BOTTOM_THRESHOLD_PX = 36;
111
+
112
+ const GAIN_MULTIPLIER_OVER_1 = 3;
113
+
114
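+ // Piecewise gain curve: slider values <= 1 map directly (0-100%), while values above 1
+ // are amplified by GAIN_MULTIPLIER_OVER_1, so slider value 2 -> gain 4 (400%) and
+ // slider value 3 -> gain 7 (700%), matching the volume slider marks below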
+ const getGainScaledValue = (value: number) =>
115
+ value > 1 ? (value - 1) * GAIN_MULTIPLIER_OVER_1 + 1 : value;
116
+
117
+ const TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD = 2;
118
+
119
+ const MAX_SERVER_EXCEPTIONS_TRACKED = 500;
120
+
121
+ export const TYPING_ANIMATION_DELAY_MS = 6;
122
+
123
+ export default function StreamingInterface() {
124
+ const urlParams = getURLParams();
125
+ const debugParam = urlParams.debug;
126
+ const [animateTextDisplay, setAnimateTextDisplay] = useState<boolean>(
127
+ urlParams.animateTextDisplay,
128
+ );
129
+
130
+ const socketObject = useSocket();
131
+ const {socket, clientID} = socketObject;
132
+
133
+ const [serverState, setServerState] = useState<ServerState | null>(null);
134
+ const [agent, setAgent] = useState<AgentCapabilities | null>(null);
135
+ const model = agent?.name ?? null;
136
+ const agentsCapabilities: Array<AgentCapabilities> =
137
+ serverState?.agentsCapabilities ?? [];
138
+ const currentAgent: AgentCapabilities | null =
139
+ agentsCapabilities.find((agent) => agent.name === model) ?? null;
140
+
141
+ const [serverExceptions, setServerExceptions] = useState<
142
+ Array<ServerExceptionData>
143
+ >([]);
144
+ const [roomState, setRoomState] = useState<RoomState | null>(null);
145
+ const roomID = roomState?.room_id ?? null;
146
+ const isSpeaker =
147
+ (clientID != null && roomState?.speakers.includes(clientID)) ?? false;
148
+ const isListener =
149
+ (clientID != null && roomState?.listeners.includes(clientID)) ?? false;
150
+
151
+ const [streamingStatus, setStreamingStatus] =
152
+ useState<StreamingStatus>('stopped');
153
+
154
+ const isStreamConfiguredRef = useRef<boolean>(false);
155
+ const [hasMaxUsers, setHasMaxUsers] = useState<boolean>(false);
156
+
157
+ const [outputMode, setOutputMode] = useState<SupportedOutputMode>('s2s&t');
158
+ const [inputSource, setInputSource] =
159
+ useState<SupportedInputSource>('userMedia');
160
+ const [enableNoiseSuppression, setEnableNoiseSuppression] = useState<
161
+ boolean | null
162
+ >(null);
163
+ const [enableEchoCancellation, setEnableEchoCancellation] = useState<
164
+ boolean | null
165
+ >(null);
166
+
167
+ // Dynamic Params:
168
+ const [targetLang, setTargetLang] = useState<string | null>(null);
169
+ const [enableExpressive, setEnableExpressive] = useState<boolean | null>(
170
+ null,
171
+ );
172
+
173
+ const [serverDebugFlag, setServerDebugFlag] = useState<boolean>(
174
+ debugParam ?? false,
175
+ );
176
+
177
+ const [receivedData, setReceivedData] = useState<Array<ServerTextData>>([]);
178
+ const [
179
+ translationSentencesAnimatedIndex,
180
+ setTranslationSentencesAnimatedIndex,
181
+ ] = useState<number>(0);
182
+
183
+ const lastTranslationResultRef = useRef<HTMLDivElement | null>(null);
184
+
185
+ const [inputStream, setInputStream] = useState<MediaStream | null>(null);
186
+ const [inputStreamSource, setInputStreamSource] =
187
+ useState<MediaStreamAudioSourceNode | null>(null);
188
+ const audioContext = useStable<AudioContext>(() => new AudioContext());
189
+ const [scriptNodeProcessor, setScriptNodeProcessor] =
190
+ useState<ScriptProcessorNode | null>(null);
191
+
192
+ const [muted, setMuted] = useState<boolean>(false);
193
+ // The onaudioprocess script needs an up-to-date reference to the muted state, so
194
+ // we use a ref here and keep it in sync via useEffect
195
+ const mutedRef = useRef<boolean>(muted);
196
+ useEffect(() => {
197
+ mutedRef.current = muted;
198
+ }, [muted]);
199
+
200
+ const [gain, setGain] = useState<number>(1);
201
+
202
+ const isScrolledToBottomRef = useRef<boolean>(isScrolledToDocumentBottom());
203
+
204
+ // Some config options must be set when starting streaming and cannot be changed dynamically.
205
+ // This controls whether they are disabled or not
206
+ const streamFixedConfigOptionsDisabled =
207
+ streamingStatus !== 'stopped' || roomID == null;
208
+
209
+ const bufferedSpeechPlayer = useStable(() => {
210
+ const player = createBufferedSpeechPlayer({
211
+ onStarted: () => {
212
+ console.debug('📢 PLAYBACK STARTED 📢');
213
+ },
214
+ onEnded: () => {
215
+ console.debug('🛑 PLAYBACK ENDED 🛑');
216
+ },
217
+ });
218
+
219
+ // Start the player now so it eagerly plays audio when it arrives
220
+ player.start();
221
+ return player;
222
+ });
223
+
224
+ const translationSentencesBase: TranslationSentences =
225
+ getTranslationSentencesFromReceivedData(receivedData);
226
+
227
+ const translationSentencesBaseTotalLength = getTotalSentencesLength(
228
+ translationSentencesBase,
229
+ );
230
+
231
+ const translationSentences: TranslationSentences = animateTextDisplay
232
+ ? sliceTranslationSentencesUpToIndex(
233
+ translationSentencesBase,
234
+ translationSentencesAnimatedIndex,
235
+ )
236
+ : translationSentencesBase;
237
+
238
+ // We want the blinking cursor to show before any text has arrived, so let's add an empty string so that the cursor shows up
239
+ const translationSentencesWithEmptyStartingString =
240
+ streamingStatus === 'running' && translationSentences.length === 0
241
+ ? ['']
242
+ : translationSentences;
243
+
244
+ /******************************************
245
+ * Event Handlers
246
+ ******************************************/
247
+
248
+ const setAgentAndUpdateParams = useCallback(
249
+ (newAgent: AgentCapabilities | null) => {
250
+ setAgent((prevAgent) => {
251
+ if (prevAgent?.name !== newAgent?.name) {
252
+ setTargetLang(newAgent?.targetLangs[0] ?? null);
253
+ setEnableExpressive(null);
254
+ }
255
+ return newAgent;
256
+ });
257
+ },
258
+ [],
259
+ );
260
+
261
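+ // Wraps socket.io's emit-with-acknowledgement in a Promise so callers can await a
+ // dynamic config change and react to the server's ok/error status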
+ const onSetDynamicConfig = useCallback(
262
+ async (partialConfig: PartialDynamicConfig) => {
263
+ return new Promise<void>((resolve, reject) => {
264
+ if (socket == null) {
265
+ reject(new Error('[onSetDynamicConfig] socket is null '));
266
+ return;
267
+ }
268
+
269
+ socket.emit(
270
+ 'set_dynamic_config',
271
+ partialConfig,
272
+ (result: BaseResponse) => {
273
+ console.log('[emit result: set_dynamic_config]', result);
274
+ if (result.status === 'ok') {
275
+ resolve();
276
+ } else {
277
+ reject();
278
+ }
279
+ },
280
+ );
281
+ });
282
+ },
283
+ [socket],
284
+ );
285
+
286
+ const configureStreamAsync = ({sampleRate}: {sampleRate: number}) => {
287
+ return new Promise<void>((resolve, reject) => {
288
+ if (socket == null) {
289
+ reject(new Error('[configureStreamAsync] socket is null '));
290
+ return;
291
+ }
292
+ const modelName = agent?.name ?? null;
293
+ if (modelName == null) {
294
+ reject(new Error('[configureStreamAsync] modelName is null '));
295
+ return;
296
+ }
297
+
298
+ const config: StartStreamEventConfig = {
299
+ event: 'config',
300
+ rate: sampleRate,
301
+ model_name: modelName,
302
+ debug: serverDebugFlag,
303
+ // synchronous processing isn't implemented on the v2 pubsub server, so hardcode this to true
304
+ async_processing: true,
305
+ buffer_limit: BUFFER_LIMIT,
306
+ model_type: outputMode,
307
+ };
308
+
309
+ console.log('[configureStreamAsync] sending config', config);
310
+
311
+ socket.emit('configure_stream', config, (statusObject) => {
312
+ if (statusObject.status === 'ok') {
313
+ isStreamConfiguredRef.current = true;
314
+ console.debug(
315
+ '[configureStreamAsync] stream configured!',
316
+ statusObject,
317
+ );
318
+ resolve();
319
+ } else {
320
+ isStreamConfiguredRef.current = false;
321
+ reject(
322
+ new Error(
323
+ `[configureStreamAsync] configure_stream returned status: ${statusObject.status}`,
324
+ ),
325
+ );
326
+ return;
327
+ }
328
+ });
329
+ });
330
+ };
331
+
332
+ const startStreaming = async () => {
333
+ if (streamingStatus !== 'stopped') {
334
+ console.warn(
335
+ `Attempting to start stream when status is ${streamingStatus}`,
336
+ );
337
+ return;
338
+ }
339
+
340
+ setStreamingStatus('starting');
341
+
342
+ if (audioContext.state === 'suspended') {
343
+ console.warn('audioContext was suspended! resuming...');
344
+ await audioContext.resume();
345
+ }
346
+
347
+ let stream: MediaStream | null = null;
348
+
349
+ try {
350
+ if (inputSource === 'userMedia') {
351
+ stream = await requestUserMediaAudioStream({
352
+ noiseSuppression:
353
+ enableNoiseSuppression ??
354
+ AUDIO_STREAM_DEFAULTS['userMedia'].noiseSuppression,
355
+ echoCancellation:
356
+ enableEchoCancellation ??
357
+ AUDIO_STREAM_DEFAULTS['userMedia'].echoCancellation,
358
+ });
359
+ } else if (inputSource === 'displayMedia') {
360
+ stream = await requestDisplayMediaAudioStream({
361
+ noiseSuppression:
362
+ enableNoiseSuppression ??
363
+ AUDIO_STREAM_DEFAULTS['displayMedia'].noiseSuppression,
364
+ echoCancellation:
365
+ enableEchoCancellation ??
366
+ AUDIO_STREAM_DEFAULTS['displayMedia'].echoCancellation,
367
+ });
368
+ } else {
369
+ throw new Error(`Unsupported input source requested: ${inputSource}`);
370
+ }
371
+ setInputStream(stream);
372
+ } catch (e) {
373
+ console.error('[startStreaming] media stream request failed:', e);
374
+ setStreamingStatus('stopped');
375
+ return;
376
+ }
377
+
378
+ const mediaStreamSource = audioContext.createMediaStreamSource(stream);
379
+ setInputStreamSource(mediaStreamSource);
380
+ /**
381
+ * NOTE: This currently uses a deprecated way of processing the audio (createScriptProcessor), but
382
+ * which is easy and convenient for our purposes.
383
+ *
384
+ * Documentation for the deprecated way of doing it is here: https://developer.mozilla.org/en-US/docs/Web/API/BaseAudioContext/createScriptProcessor
385
+ *
386
+ * In an ideal world this would be migrated to something like this SO answer: https://stackoverflow.com/a/65448287
387
+ */
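+ //
+ // A minimal sketch of the AudioWorklet-based approach alluded to above (assuming a
+ // separate processor module; the names 'pcm-forwarder' and
+ // 'pcm-forwarder-processor.js' are hypothetical):
+ //
+ // await audioContext.audioWorklet.addModule('pcm-forwarder-processor.js');
+ // const workletNode = new AudioWorkletNode(audioContext, 'pcm-forwarder');
+ // workletNode.port.onmessage = (e) => socket.emit('incoming_audio', e.data);
+ // mediaStreamSource.connect(workletNode);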
388
+ const scriptProcessor = audioContext.createScriptProcessor(16384, 1, 1);
389
+ setScriptNodeProcessor(scriptProcessor);
390
+
391
+ scriptProcessor.onaudioprocess = (event) => {
392
+ if (isStreamConfiguredRef.current === false) {
393
+ console.debug('[onaudioprocess] stream is not configured yet!');
394
+ return;
395
+ }
396
+ if (socket == null) {
397
+ console.warn('[onaudioprocess] socket is null in onaudioprocess');
398
+ return;
399
+ }
400
+
401
+ if (mutedRef.current) {
402
+ // We still want to send audio to the server when we're muted to ensure we
403
+ // get any remaining audio back from the server, so let's pass an array of length 1 with a value of 0
404
+ const mostlyEmptyInt16Array = new Int16Array(1);
405
+ socket.emit('incoming_audio', mostlyEmptyInt16Array);
406
+ } else {
407
+ const float32Audio = event.inputBuffer.getChannelData(0);
408
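+ // Convert the Web Audio float samples (range [-1, 1]) into 16-bit PCM for the server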
+ const pcm16Audio = float32To16BitPCM(float32Audio);
409
+ socket.emit('incoming_audio', pcm16Audio);
410
+ }
411
+
412
+ debug()?.sentAudio(event);
413
+ };
414
+
415
+ mediaStreamSource.connect(scriptProcessor);
416
+ scriptProcessor.connect(audioContext.destination);
417
+
418
+ bufferedSpeechPlayer.start();
419
+
420
+ try {
421
+ if (targetLang == null) {
422
+ throw new Error('[startStreaming] targetLang cannot be nullish');
423
+ }
424
+
425
+ // When we are starting the stream we want to pass all the dynamic config values
426
+ // available before actually configuring and starting the stream
427
+ const fullDynamicConfig: DynamicConfig = {
428
+ targetLanguage: targetLang,
429
+ expressive: enableExpressive,
430
+ };
431
+
432
+ await onSetDynamicConfig(fullDynamicConfig);
433
+
434
+ // NOTE: this needs to be the *audioContext* sample rate, not the sample rate of the input stream. Not entirely sure why.
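+ // (Likely because the ScriptProcessorNode delivers samples at the AudioContext's own
+ // rate: the browser resamples the media stream into the context, so the context rate
+ // is what the server actually receives.)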
435
+ await configureStreamAsync({
436
+ sampleRate: audioContext.sampleRate,
437
+ });
438
+ } catch (e) {
439
+ console.error('configureStreamAsync failed', e);
440
+ setStreamingStatus('stopped');
441
+ return;
442
+ }
443
+
444
+ setStreamingStatus('running');
445
+ };
446
+
447
+ const stopStreaming = useCallback(async () => {
448
+ if (streamingStatus === 'stopped') {
449
+ console.warn(
450
+ `Attempting to stop stream when status is ${streamingStatus}`,
451
+ );
452
+ return;
453
+ }
454
+
455
+ // Stop the speech playback right away
456
+ bufferedSpeechPlayer.stop();
457
+
458
+ if (inputStreamSource == null || scriptNodeProcessor == null) {
459
+ console.error(
460
+ 'inputStreamSource || scriptNodeProcessor is null in stopStreaming',
461
+ );
462
+ } else {
463
+ inputStreamSource.disconnect(scriptNodeProcessor);
464
+ scriptNodeProcessor.disconnect(audioContext.destination);
465
+
466
+ // Release the mic input so we stop showing the red recording icon in the browser
467
+ inputStream?.getTracks().forEach((track) => track.stop());
468
+ }
469
+
470
+ if (socket == null) {
471
+ console.warn('Unable to emit stop_stream because socket is null');
472
+ } else {
473
+ socket.emit('stop_stream', (result) => {
474
+ console.debug('[emit result: stop_stream]', result);
475
+ });
476
+ }
477
+
478
+ setStreamingStatus('stopped');
479
+ }, [
480
+ audioContext.destination,
481
+ bufferedSpeechPlayer,
482
+ inputStream,
483
+ inputStreamSource,
484
+ scriptNodeProcessor,
485
+ socket,
486
+ streamingStatus,
487
+ ]);
488
+
489
+ const onClearTranscriptForAll = useCallback(() => {
490
+ if (socket != null) {
491
+ socket.emit('clear_transcript_for_all');
492
+ }
493
+ }, [socket]);
494
+
495
+ /******************************************
496
+ * Effects
497
+ ******************************************/
498
+
499
+ useEffect(() => {
500
+ if (socket == null) {
501
+ return;
502
+ }
503
+
504
+ const onRoomStateUpdate = (roomState: RoomState) => {
505
+ setRoomState(roomState);
506
+ };
507
+
508
+ socket.on('room_state_update', onRoomStateUpdate);
509
+
510
+ return () => {
511
+ socket.off('room_state_update', onRoomStateUpdate);
512
+ };
513
+ }, [socket]);
514
+
515
+ useEffect(() => {
516
+ if (socket != null) {
517
+ const onTranslationText = (data: ServerTextData) => {
518
+ setReceivedData((prev) => [...prev, data]);
519
+ debug()?.receivedText(data.payload);
520
+ };
521
+
522
+ const onTranslationSpeech = (data: ServerSpeechData) => {
523
+ bufferedSpeechPlayer.addAudioToBuffer(data.payload, data.sample_rate);
524
+ };
525
+
526
+ socket.on('translation_text', onTranslationText);
527
+ socket.on('translation_speech', onTranslationSpeech);
528
+
529
+ return () => {
530
+ socket.off('translation_text', onTranslationText);
531
+ socket.off('translation_speech', onTranslationSpeech);
532
+ };
533
+ }
534
+ }, [bufferedSpeechPlayer, socket]);
535
+
536
+ useEffect(() => {
537
+ if (socket != null) {
538
+ const onServerStateUpdate = (newServerState: ServerState) => {
539
+ setServerState(newServerState);
540
+
541
+ // If a client creates a server lock, we want to stop streaming if we're not them
542
+ if (
543
+ newServerState.serverLock?.isActive === true &&
544
+ newServerState.serverLock?.clientID !== clientID &&
545
+ streamingStatus === 'running'
546
+ ) {
547
+ stopStreaming();
548
+ }
549
+
550
+ const firstAgentNullable = newServerState.agentsCapabilities[0];
551
+ if (agent == null && firstAgentNullable != null) {
552
+ setAgentAndUpdateParams(firstAgentNullable);
553
+ }
554
+ };
555
+
556
+ socket.on('server_state_update', onServerStateUpdate);
557
+
558
+ return () => {
559
+ socket.off('server_state_update', onServerStateUpdate);
560
+ };
561
+ }
562
+ }, [
563
+ agent,
564
+ clientID,
565
+ setAgentAndUpdateParams,
566
+ socket,
567
+ stopStreaming,
568
+ streamingStatus,
569
+ ]);
570
+
571
+ useEffect(() => {
572
+ if (socket != null) {
573
+ const onServerException = (
574
+ exceptionDataWithoutClientTime: ServerExceptionData,
575
+ ) => {
576
+ const exceptionData = {
577
+ ...exceptionDataWithoutClientTime,
578
+ timeStringClient: new Date(
579
+ exceptionDataWithoutClientTime['timeEpochMs'],
580
+ ).toLocaleString(),
581
+ };
582
+
583
+ setServerExceptions((prev) =>
584
+ [exceptionData, ...prev].slice(0, MAX_SERVER_EXCEPTIONS_TRACKED),
585
+ );
586
+ console.error(
587
+ `[server_exception] The server encountered an exception: ${exceptionData['message']}`,
588
+ exceptionData,
589
+ );
590
+ };
591
+
592
+ socket.on('server_exception', onServerException);
593
+
594
+ return () => {
595
+ socket.off('server_exception', onServerException);
596
+ };
597
+ }
598
+ }, [socket]);
599
+
600
+ useEffect(() => {
601
+ if (socket != null) {
602
+ const onClearTranscript = () => {
603
+ setReceivedData([]);
604
+ setTranslationSentencesAnimatedIndex(0);
605
+ };
606
+
607
+ socket.on('clear_transcript', onClearTranscript);
608
+
609
+ return () => {
610
+ socket.off('clear_transcript', onClearTranscript);
611
+ };
612
+ }
613
+ }, [socket]);
614
+
615
+ useEffect(() => {
616
+ const onScroll = () => {
617
+ if (isScrolledToDocumentBottom(SCROLLED_TO_BOTTOM_THRESHOLD_PX)) {
618
+ isScrolledToBottomRef.current = true;
619
+ return;
620
+ }
621
+ isScrolledToBottomRef.current = false;
622
+ return;
623
+ };
624
+
625
+ document.addEventListener('scroll', onScroll);
626
+
627
+ return () => {
628
+ document.removeEventListener('scroll', onScroll);
629
+ };
630
+ }, []);
631
+
632
+ useLayoutEffect(() => {
633
+ if (
634
+ lastTranslationResultRef.current != null &&
635
+ isScrolledToBottomRef.current
636
+ ) {
637
+ // Scroll the div to the most recent entry
638
+ lastTranslationResultRef.current.scrollIntoView();
639
+ }
640
+ // Run the effect every time data is received, so that
641
+ // we scroll to the bottom even if we're just adding text to
642
+ // a pre-existing chunk
643
+ }, [receivedData]);
644
+
645
+ useEffect(() => {
646
+ if (!animateTextDisplay) {
647
+ return;
648
+ }
649
+
650
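+ // Advance the animated index one character per tick (TYPING_ANIMATION_DELAY_MS) so the
+ // transcript "types out"; once caught up to the full text, report render end to debug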
+ if (
651
+ translationSentencesAnimatedIndex < translationSentencesBaseTotalLength
652
+ ) {
653
+ const timeout = setTimeout(() => {
654
+ setTranslationSentencesAnimatedIndex((prev) => prev + 1);
655
+ debug()?.startRenderText();
656
+ }, TYPING_ANIMATION_DELAY_MS);
657
+
658
+ return () => clearTimeout(timeout);
659
+ } else {
660
+ debug()?.endRenderText();
661
+ }
662
+ }, [
663
+ animateTextDisplay,
664
+ translationSentencesAnimatedIndex,
665
+ translationSentencesBaseTotalLength,
666
+ ]);
667
+
668
+ /******************************************
669
+ * Sub-components
670
+ ******************************************/
671
+
672
+ const volumeSliderNode = (
673
+ <Stack
674
+ spacing={2}
675
+ direction="row"
676
+ sx={{mb: 1, width: '100%'}}
677
+ alignItems="center">
678
+ <VolumeDown color="primary" />
679
+ <Slider
680
+ aria-label="Volume"
681
+ defaultValue={1}
682
+ scale={getGainScaledValue}
683
+ min={0}
684
+ max={3}
685
+ step={0.1}
686
+ marks={[
687
+ {value: 0, label: '0%'},
688
+ {value: 1, label: '100%'},
689
+ {value: 2, label: '400%'},
690
+ {value: 3, label: '700%'},
691
+ ]}
692
+ valueLabelFormat={(value) => `${(value * 100).toFixed(0)}%`}
693
+ valueLabelDisplay="auto"
694
+ value={gain}
695
+ onChange={(_event: Event, newValue: number | number[]) => {
696
+ if (typeof newValue === 'number') {
697
+ const scaledGain = getGainScaledValue(newValue);
698
+ // We want the actual gain node to use the scaled value
699
+ bufferedSpeechPlayer.setGain(scaledGain);
700
+ // But we want react state to keep track of the non-scaled value
701
+ setGain(newValue);
702
+ } else {
703
+ console.error(
704
+ `[volume slider] Unexpected non-number value: ${newValue}`,
705
+ );
706
+ }
707
+ }}
708
+ />
709
+ <VolumeUp color="primary" />
710
+ </Stack>
711
+ );
712
+
713
+ const xrDialogComponent = (
714
+ <XRDialog
715
+ animateTextDisplay={
716
+ animateTextDisplay &&
717
+ translationSentencesAnimatedIndex === translationSentencesBaseTotalLength
718
+ }
719
+ bufferedSpeechPlayer={bufferedSpeechPlayer}
720
+ translationSentences={translationSentences}
721
+ roomState={roomState}
722
+ roomID={roomID}
723
+ startStreaming={startStreaming}
724
+ stopStreaming={stopStreaming}
725
+ debugParam={debugParam}
726
+ onARHidden={() => {
727
+ setAnimateTextDisplay(urlParams.animateTextDisplay);
728
+ }}
729
+ onARVisible={() => setAnimateTextDisplay(false)}
730
+ />
731
+ );
732
+
733
+ return (
734
+ <div className="app-wrapper-sra">
735
+ <Box
736
+ // eslint-disable-next-line @typescript-eslint/ban-ts-comment
737
+ // @ts-ignore Not sure why it's complaining about complexity here
738
+ sx={{width: '100%', maxWidth: '660px', minWidth: '320px'}}>
739
+ <div className="main-container-sra">
740
+ <div className="top-section-sra horizontal-padding-sra">
741
+ <div className="header-container-sra">
742
+ <img
743
+ src={seamlessLogoUrl}
744
+ className="header-icon-sra"
745
+ alt="Seamless Translation Logo"
746
+ height={24}
747
+ width={24}
748
+ />
749
+
750
+ <div>
751
+ <Typography variant="h1" sx={{color: '#65676B'}}>
752
+ Seamless Translation
753
+ </Typography>
754
+ </div>
755
+ </div>
756
+ <div className="header-container-sra">
757
+ <div>
758
+ <Typography variant="body2" sx={{color: '#65676B'}}>
759
+ Welcome! This space is limited to one user at a time.
760
+ If using the live HF space, sharing room code to listeners on another
761
+ IP address may not work because it's running on different replicas.
762
+ Use headphones if you are both speaker and listener to prevent feedback.
763
+ <br/>
764
+ If max users reached, please duplicate the space <a target="_blank" rel="noopener noreferrer" href="https://huggingface.co/spaces/facebook/seamless-streaming?duplicate=true">here</a>.
765
+ In your duplicated space, join a room as speaker or listener (or both),
766
+ and share the room code to invite listeners.
767
+ <br/>
768
+ Check out the seamless_communication <a target="_blank" rel="noopener noreferrer" href="https://github.com/facebookresearch/seamless_communication/tree/main">README</a> for more information.
769
+ <br/>
770
+ SeamlessStreaming model is a research model and is not released
771
+ for production deployment. It is important to use a microphone with
772
+ noise cancellation (for e.g. a smartphone), otherwise you may see model hallucination on noises.
773
+ It works best if you pause every couple of sentences, or you may wish adjust the VAD threshold
774
+ in the model config. The real-time performance will degrade
775
+ if you try streaming multiple speakers at the same time.
776
+ </Typography>
777
+ </div>
778
+ </div>
779
+ <Stack spacing="22px" direction="column">
780
+ <Box>
781
+ <RoomConfig
782
+ roomState={roomState}
783
+ serverState={serverState}
784
+ streamingStatus={streamingStatus}
785
+ setHasMaxUsers={setHasMaxUsers}
786
+ onJoinRoomOrUpdateRoles={() => {
787
+ // If the user has switched from speaker to listener we need to tell the
788
+ // player to play eagerly, since currently the listener doesn't have any stop/start controls
789
+ bufferedSpeechPlayer.start();
790
+ }}
791
+ />
792
+
793
+ {isListener && !isSpeaker && (
794
+ <Box
795
+ sx={{
796
+ paddingX: 6,
797
+ paddingBottom: 2,
798
+ marginY: 2,
799
+ display: 'flex',
800
+ flexDirection: 'column',
801
+ alignItems: 'center',
802
+ }}>
803
+ {volumeSliderNode}
804
+ </Box>
805
+ )}
806
+ </Box>
807
+
808
+ {isSpeaker && (
809
+ <>
810
+ <Divider />
811
+
812
+ <Stack spacing="12px" direction="column">
813
+ <FormLabel id="output-modes-radio-group-label">
814
+ Model
815
+ </FormLabel>
816
+ <FormControl
817
+ disabled={
818
+ streamFixedConfigOptionsDisabled ||
819
+ agentsCapabilities.length === 0
820
+ }
821
+ fullWidth
822
+ sx={{minWidth: '14em'}}>
823
+ <InputLabel id="model-selector-input-label">
824
+ Model
825
+ </InputLabel>
826
+ <Select
827
+ labelId="model-selector-input-label"
828
+ label="Model"
829
+ onChange={(e: SelectChangeEvent) => {
830
+ const newAgent =
831
+ agentsCapabilities.find(
832
+ (agent) => e.target.value === agent.name,
833
+ ) ?? null;
834
+ if (newAgent == null) {
835
+ console.error(
836
+ 'Unable to find agent with name',
837
+ e.target.value,
838
+ );
839
+ }
840
+ setAgentAndUpdateParams(newAgent);
841
+ }}
842
+ value={model ?? ''}>
843
+ {agentsCapabilities.map((agent) => (
844
+ <MenuItem value={agent.name} key={agent.name}>
845
+ {agent.name}
846
+ </MenuItem>
847
+ ))}
848
+ </Select>
849
+ </FormControl>
850
+
851
+ </Stack>
852
+
853
+ <Stack spacing={0.5}>
854
+ <FormLabel id="output-modes-radio-group-label">
855
+ Output
856
+ </FormLabel>
857
+
858
+ <Box sx={{paddingTop: 2, paddingBottom: 1}}>
859
+ <FormControl fullWidth sx={{minWidth: '14em'}}>
860
+ <InputLabel id="target-selector-input-label">
861
+ Target Language
862
+ </InputLabel>
863
+ <Select
864
+ labelId="target-selector-input-label"
865
+ label="Target Language"
866
+ onChange={(e: SelectChangeEvent) => {
867
+ setTargetLang(e.target.value);
868
+ onSetDynamicConfig({
869
+ targetLanguage: e.target.value,
870
+ });
871
+ }}
872
+ value={targetLang ?? ''}>
873
+ {currentAgent?.targetLangs.map((langCode) => (
874
+ <MenuItem value={langCode} key={langCode}>
875
+ {getLanguageFromThreeLetterCode(langCode) != null
876
+ ? `${getLanguageFromThreeLetterCode(
877
+ langCode,
878
+ )} (${langCode})`
879
+ : langCode}
880
+ </MenuItem>
881
+ ))}
882
+ </Select>
883
+ </FormControl>
884
+ </Box>
885
+
886
+ <Grid container>
887
+ <Grid item xs={12} sm={4}>
888
+ <FormControl
889
+ disabled={streamFixedConfigOptionsDisabled}>
890
+ <RadioGroup
891
+ aria-labelledby="output-modes-radio-group-label"
892
+ value={outputMode}
893
+ onChange={(e) =>
894
+ setOutputMode(
895
+ e.target.value as SupportedOutputMode,
896
+ )
897
+ }
898
+ name="output-modes-radio-buttons-group">
899
+ {
900
+ // TODO: Use supported modalities from agentCapabilities
901
+ SUPPORTED_OUTPUT_MODES.map(({value, label}) => (
902
+ <FormControlLabel
903
+ key={value}
904
+ value={value}
905
+ control={<Radio />}
906
+ label={label}
907
+ />
908
+ ))
909
+ }
910
+ </RadioGroup>
911
+ </FormControl>
912
+ </Grid>
913
+
914
+ <Grid item xs={12} sm={8}>
915
+ <Stack
916
+ direction="column"
917
+ spacing={1}
918
+ alignItems="flex-start"
919
+ sx={{flexGrow: 1}}>
920
+ {currentAgent?.dynamicParams?.includes(
921
+ 'expressive',
922
+ ) && (
923
+ <FormControlLabel
924
+ control={
925
+ <Switch
926
+ checked={enableExpressive ?? false}
927
+ onChange={(
928
+ event: React.ChangeEvent<HTMLInputElement>,
929
+ ) => {
930
+ const newValue = event.target.checked;
931
+ setEnableExpressive(newValue);
932
+ onSetDynamicConfig({
933
+ expressive: newValue,
934
+ });
935
+ }}
936
+ />
937
+ }
938
+ label="Expressive"
939
+ />
940
+ )}
941
+
942
+ {isListener && (
943
+ <Box
944
+ sx={{
945
+ flexGrow: 1,
946
+ paddingX: 1.5,
947
+ paddingY: 1.5,
948
+ width: '100%',
949
+ }}>
950
+ {volumeSliderNode}
951
+ </Box>
952
+ )}
953
+ </Stack>
954
+ </Grid>
955
+ </Grid>
956
+ </Stack>
957
+
958
+ <Stack
959
+ direction="row"
960
+ spacing={2}
961
+ justifyContent="space-between">
962
+ <Box sx={{flex: 1}}>
963
+ <FormControl disabled={streamFixedConfigOptionsDisabled}>
964
+ <FormLabel id="input-source-radio-group-label">
965
+ Input Source
966
+ </FormLabel>
967
+ <RadioGroup
968
+ aria-labelledby="input-source-radio-group-label"
969
+ value={inputSource}
970
+ onChange={(e: React.ChangeEvent<HTMLInputElement>) =>
971
+ setInputSource(
972
+ e.target.value as SupportedInputSource,
973
+ )
974
+ }
975
+ name="input-source-radio-buttons-group">
976
+ {SUPPORTED_INPUT_SOURCES.map(({label, value}) => (
977
+ <FormControlLabel
978
+ key={value}
979
+ value={value}
980
+ control={<Radio />}
981
+ label={label}
982
+ />
983
+ ))}
984
+ </RadioGroup>
985
+ </FormControl>
986
+ </Box>
987
+
988
+ <Box sx={{flex: 1, flexGrow: 2}}>
989
+ <FormControl disabled={streamFixedConfigOptionsDisabled}>
990
+ <FormLabel>Options</FormLabel>
991
+ <FormControlLabel
992
+ control={
993
+ <Checkbox
994
+ checked={
995
+ enableNoiseSuppression ??
996
+ AUDIO_STREAM_DEFAULTS[inputSource]
997
+ .noiseSuppression
998
+ }
999
+ onChange={(
1000
+ event: React.ChangeEvent<HTMLInputElement>,
1001
+ ) =>
1002
+ setEnableNoiseSuppression(event.target.checked)
1003
+ }
1004
+ />
1005
+ }
1006
+ label="Noise Suppression"
1007
+ />
1008
+ <FormControlLabel
1009
+ control={
1010
+ <Checkbox
1011
+ checked={
1012
+ enableEchoCancellation ??
1013
+ AUDIO_STREAM_DEFAULTS[inputSource]
1014
+ .echoCancellation
1015
+ }
1016
+ onChange={(
1017
+ event: React.ChangeEvent<HTMLInputElement>,
1018
+ ) =>
1019
+ setEnableEchoCancellation(event.target.checked)
1020
+ }
1021
+ />
1022
+ }
1023
+ label="Echo Cancellation (not recommended)"
1024
+ />
1025
+ <FormControlLabel
1026
+ control={
1027
+ <Checkbox
1028
+ checked={serverDebugFlag}
1029
+ onChange={(
1030
+ event: React.ChangeEvent<HTMLInputElement>,
1031
+ ) => setServerDebugFlag(event.target.checked)}
1032
+ />
1033
+ }
1034
+ label="Enable Server Debugging"
1035
+ />
1036
+ </FormControl>
1037
+ </Box>
1038
+ </Stack>
1039
+
1040
+ {isSpeaker &&
1041
+ isListener &&
1042
+ inputSource === 'userMedia' &&
1043
+ !enableEchoCancellation &&
1044
+ gain !== 0 && (
1045
+ <div>
1046
+ <Alert severity="warning" icon={<HeadphonesIcon />}>
1047
+ Headphones required to prevent feedback.
1048
+ </Alert>
1049
+ </div>
1050
+ )}
1051
+
1052
+ {isSpeaker && enableEchoCancellation && (
1053
+ <div>
1054
+ <Alert severity="warning">
1055
+ We don't recommend using echo cancellation as it may
1056
+ distort the input audio. If possible, use headphones and
1057
+ disable echo cancellation instead.
1058
+ </Alert>
1059
+ </div>
1060
+ )}
1061
+
1062
+ <Stack direction="row" spacing={2}>
1063
+ {streamingStatus === 'stopped' ? (
1064
+ <Button
1065
+ variant="contained"
1066
+ onClick={startStreaming}
1067
+ disabled={
1068
+ roomID == null ||
1069
+ // Prevent users from starting streaming if there is a server lock with an active session
1070
+ (serverState?.serverLock?.isActive === true &&
1071
+ serverState.serverLock.clientID !== clientID)
1072
+ }>
1073
+ {buttonLabelMap[streamingStatus]}
1074
+ </Button>
1075
+ ) : (
1076
+ <Button
1077
+ variant="contained"
1078
+ color={
1079
+ streamingStatus === 'running' ? 'error' : 'primary'
1080
+ }
1081
+ disabled={
1082
+ streamingStatus === 'starting' || roomID == null
1083
+ }
1084
+ onClick={stopStreaming}>
1085
+ {buttonLabelMap[streamingStatus]}
1086
+ </Button>
1087
+ )}
1088
+
1089
+ <Box>
1090
+ <Button
1091
+ variant="contained"
1092
+ aria-label={muted ? 'Unmute' : 'Mute'}
1093
+ color={muted ? 'info' : 'primary'}
1094
+ onClick={() => setMuted((prev) => !prev)}
1095
+ sx={{
1096
+ borderRadius: 100,
1097
+ paddingX: 0,
1098
+ minWidth: '36px',
1099
+ }}>
1100
+ {muted ? <MicOff /> : <Mic />}
1101
+ </Button>
1102
+ </Box>
1103
+
1104
+ {roomID == null ? null : (
1105
+ <Box
1106
+ sx={{
1107
+ flexGrow: 1,
1108
+ display: 'flex',
1109
+ justifyContent: 'flex-end',
1110
+ }}>
1111
+ {xrDialogComponent}
1112
+ </Box>
1113
+ )}
1114
+ </Stack>
1115
+
1116
+ {serverExceptions.length > 0 && (
1117
+ <div>
1118
+ <Alert severity="error">
1119
+ {`The server encountered an exception. See the browser console for details. You may need to refresh the page to continue using the app.`}
1120
+ </Alert>
1121
+ </div>
1122
+ )}
1123
+ {serverState != null &&
1124
+ serverState.totalActiveTranscoders >=
1125
+ TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD && (
1126
+ <div>
1127
+ <Alert severity="warning">
1128
+ {`The server currently has ${serverState?.totalActiveTranscoders} active streaming sessions. Performance may be degraded.`}
1129
+ </Alert>
1130
+ </div>
1131
+ )}
1132
+
1133
+ {serverState?.serverLock != null &&
1134
+ serverState.serverLock.clientID !== clientID && (
1135
+ <div>
1136
+ <Alert severity="warning">
1137
+ {`The server is currently locked. Priority will be given to that client when they are streaming, and your streaming session may be halted abruptly.`}
1138
+ </Alert>
1139
+ </div>
1140
+ )}
1141
+ </>
1142
+ )}
1143
+ </Stack>
1144
+
1145
+ {isListener && !isSpeaker && (
1146
+ <Box sx={{marginBottom: 1, marginTop: 2}}>
1147
+ {xrDialogComponent}
1148
+ </Box>
1149
+ )}
1150
+ </div>
1151
+
1152
+ {hasMaxUsers && (
1153
+ <div>
1154
+ <Alert severity="error">
1155
+ {`Maximum number of users reached. Please try again at a later time.`}
1156
+ </Alert>
1157
+ </div>
1158
+ )}
1159
+ {debugParam && roomID != null && <DebugSection />}
1160
+
1161
+ <div className="translation-text-container-sra horizontal-padding-sra">
1162
+ <Stack
1163
+ direction="row"
1164
+ spacing={2}
1165
+ sx={{mb: '16px', alignItems: 'center'}}>
1166
+ <Typography variant="h1" sx={{fontWeight: 700, flexGrow: 1}}>
1167
+ Transcript
1168
+ </Typography>
1169
+ {isSpeaker && (
1170
+ <Button
1171
+ variant="text"
1172
+ size="small"
1173
+ onClick={onClearTranscriptForAll}>
1174
+ Clear Transcript for All
1175
+ </Button>
1176
+ )}
1177
+ </Stack>
1178
+ <Stack direction="row">
1179
+ <div className="translation-text-sra">
1180
+ {translationSentencesWithEmptyStartingString.map(
1181
+ (sentence, index, arr) => {
1182
+ const isLast = index === arr.length - 1;
1183
+ const maybeRef = isLast
1184
+ ? {ref: lastTranslationResultRef}
1185
+ : {};
1186
+ return (
1187
+ <div className="text-chunk-sra" key={index} {...maybeRef}>
1188
+ <Typography variant="body1">
1189
+ {sentence}
1190
+ {animateTextDisplay && isLast && (
1191
+ <Blink
1192
+ intervalMs={CURSOR_BLINK_INTERVAL_MS}
1193
+ shouldBlink={
1194
+ (roomState?.activeTranscoders ?? 0) > 0
1195
+ }>
1196
+ <Typography
1197
+ component="span"
1198
+ variant="body1"
1199
+ sx={{
1200
+ display: 'inline-block',
1201
+ transform: 'scaleY(1.25) translateY(-1px)',
1202
+ }}>
1203
+ {'|'}
1204
+ </Typography>
1205
+ </Blink>
1206
+ )}
1207
+ </Typography>
1208
+ </div>
1209
+ );
1210
+ },
1211
+ )}
1212
+ </div>
1213
+ </Stack>
1214
+ </div>
1215
+ </div>
1216
+ </Box>
1217
+ </div>
1218
+ );
1219
+ }
streaming-test-app/src/URLParams.ts ADDED
@@ -0,0 +1,50 @@
1
+ import { getBooleanParamFlag, getStringParamFlag } from './getParamFlag';
2
+ import { URLParamsObject } from './types/URLParamsTypes';
3
+
4
+ /**
5
+ * These are the URL parameters you can provide to the app to change its behavior.
6
+ *
7
+ * Boolean flags can be set by just providing the flag name (`?autoJoin`), or by
8
+ * explicitly setting it to 1 (true) or 0 (false): `?autoJoin=1` or `?autoJoin=0`
9
+ *
10
+ * String flags require an explicit value: `?roomID=ABCD`
11
+ *
12
+ * Examples:
13
+ *
14
+ * - `http://localhost:5173/?roomID=BBCD&autoJoin&debug`
15
+ * - `http://localhost:5173/?serverURL=localhost:8000`
16
+ *
17
+ * @returns
18
+ */
19
+
20
+ export function getURLParams(): URLParamsObject {
21
+ return {
22
+ // animate the translation text when it arrives, typing it out one letter at a time
23
+ animateTextDisplay: getBooleanParamFlag('animateTextDisplay', true), // default to true
24
+
25
+ // automatically join the room when the app loads. requires roomID to be set via url param as well
26
+ autoJoin: getBooleanParamFlag('autoJoin', false),
27
+
28
+ // automatically check the server debug flag as true
29
+ debug: getBooleanParamFlag('debug', false),
30
+
31
+ // Enable UI on the client that allows locking out other users of the server when it's being used for high profile demos
32
+ // NOTE: There is an escape hatch for disabling a server lock by setting the name field to remove_server_lock
33
+ enableServerLock: getBooleanParamFlag('enableServerLock', false),
34
+
35
+ // Pre-populate the Room Code field with the provided roomID. Can be used in conjunction with autoJoin to jump straight into the room
36
+ roomID: getStringParamFlag('roomID'),
37
+
38
+ // Use an alternate server URL as the streaming server (useful for pointing to dev servers: http://localhost:5173/?serverURL=localhost:8000)
39
+ serverURL: getStringParamFlag('serverURL'),
40
+
41
+ // Skip the popup dialog that displays within VR, which is mostly redundant with the web based dialog
42
+ skipARIntro: getBooleanParamFlag('skipARIntro', true), // default to true
43
+
44
+ // Shows the translation text in AR in front of an opaque panel covering all the text area
45
+ // single_block = original single text block with background
46
+ // lines = each line is a separate block and animates
47
+ // lines_with_background = adds a panel behind lines
48
+ ARTranscriptionType: getStringParamFlag('ARTranscriptionType') || 'lines',
49
+ };
50
+ }
streaming-test-app/src/assets/Roboto-msdf.json ADDED
The diff for this file is too large to render. See raw diff
 
streaming-test-app/src/assets/Roboto-msdf.png ADDED
streaming-test-app/src/assets/RobotoMono-Regular-msdf.json ADDED
The diff for this file is too large to render. See raw diff
 
streaming-test-app/src/assets/RobotoMono-Regular.png ADDED
streaming-test-app/src/assets/seamless.svg ADDED
streaming-test-app/src/createBufferedSpeechPlayer.ts ADDED
@@ -0,0 +1,173 @@
1
+ import debug from './debug';
2
+
3
+ type AddAudioToBufferFunction = (
4
+ samples: Array<number>,
5
+ sampleRate: number,
6
+ ) => void;
7
+
8
+ export type BufferedSpeechPlayer = {
9
+ addAudioToBuffer: AddAudioToBufferFunction;
10
+ setGain: (gain: number) => void;
11
+ start: () => void;
12
+ stop: () => void;
13
+ };
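+ //
+ // Typical usage (a sketch based on how StreamingInterface consumes this module):
+ //
+ // const player = createBufferedSpeechPlayer({onEnded: () => console.log('done')});
+ // player.start(); // play eagerly as soon as audio arrives
+ // socket.on('translation_speech', (d) => player.addAudioToBuffer(d.payload, d.sample_rate));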
14
+
15
+ type Options = {
16
+ onEnded?: () => void;
17
+ onStarted?: () => void;
18
+ };
19
+
20
+ export default function createBufferedSpeechPlayer({
21
+ onStarted,
22
+ onEnded,
23
+ }: Options): BufferedSpeechPlayer {
24
+ const audioContext = new AudioContext();
+ const gainNode = audioContext.createGain();
+ gainNode.connect(audioContext.destination);
+
+ let unplayedAudioBuffers: Array<AudioBuffer> = [];
+
+ let currentPlayingBufferSource: AudioBufferSourceNode | null = null;
+
+ let isPlaying = false;
+
+ // The player starts in the 'stopped' state: player.start() must be called before queued audio plays
+ let shouldPlayWhenAudioAvailable = false;
+
+ const setGain = (gain: number) => {
+   gainNode.gain.setValueAtTime(gain, audioContext.currentTime);
+ };
+
+ const start = () => {
+   shouldPlayWhenAudioAvailable = true;
+   debug()?.start();
+   playNextBufferIfNotAlreadyPlaying();
+ };
+
+ // stop() halts the audio and clears the buffers
+ const stop = () => {
+   shouldPlayWhenAudioAvailable = false;
+
+   // Stop the currently playing buffer source
+   currentPlayingBufferSource?.stop();
+   currentPlayingBufferSource = null;
+
+   unplayedAudioBuffers = [];
+
+   onEnded != null && onEnded();
+   isPlaying = false;
+ };
+
+ const playNextBufferIfNotAlreadyPlaying = () => {
+   if (!isPlaying) {
+     playNextBuffer();
+   }
+ };
+
+ const playNextBuffer = () => {
+   if (shouldPlayWhenAudioAvailable === false) {
+     console.debug(
+       '[BufferedSpeechPlayer][playNextBuffer] Not playing any more audio because shouldPlayWhenAudioAvailable is false.',
+     );
+     // NOTE: no need to set isPlaying = false or call onEnded here; stop() handles both
+     return;
+   }
+   if (unplayedAudioBuffers.length === 0) {
+     console.debug(
+       '[BufferedSpeechPlayer][playNextBuffer] No buffers to play.',
+     );
+     if (isPlaying) {
+       isPlaying = false;
+       onEnded != null && onEnded();
+     }
+     return;
+   }
+
+   // If isPlaying is false we are starting playback fresh rather than continuing it, so call onStarted
+   if (isPlaying === false) {
+     isPlaying = true;
+     onStarted != null && onStarted();
+   }
+
+   const source = audioContext.createBufferSource();
+
+   // Take the first unplayed buffer off the front of the queue
+   const buffer = unplayedAudioBuffers.shift() ?? null;
+   source.buffer = buffer;
+   console.debug(
+     `[BufferedSpeechPlayer] Playing buffer with ${source.buffer?.length} samples`,
+   );
+
+   source.connect(gainNode);
+
+   const startTime = new Date().getTime();
+   source.start();
+   currentPlayingBufferSource = source;
+   // This is probably not necessary, but it doesn't hurt
+   isPlaying = true;
+
+   // TODO: consider changing this to a while loop to avoid deep recursion
+   const onThisBufferPlaybackEnded = () => {
+     console.debug(
+       `[BufferedSpeechPlayer] Buffer with ${source.buffer?.length} samples ended.`,
+     );
+     source.removeEventListener('ended', onThisBufferPlaybackEnded);
+     const endTime = new Date().getTime();
+     debug()?.playedAudio(startTime, endTime, buffer);
+     currentPlayingBufferSource = null;
+
+     // Don't set isPlaying = false here: playback is continuing. It is set to false once no buffers remain
+     playNextBuffer();
+   };
+
+   source.addEventListener('ended', onThisBufferPlaybackEnded);
+ };
+
+ const addAudioToBuffer: AddAudioToBufferFunction = (samples, sampleRate) => {
+   const incomingArrayBufferChunk = audioContext.createBuffer(
+     1, // 1 channel
+     samples.length,
+     sampleRate,
+   );
+
+   incomingArrayBufferChunk.copyToChannel(
+     new Float32Array(samples),
+     0, // first channel
+   );
+
+   console.debug(
+     `[addAudioToBufferAndPlay] Adding buffer with ${incomingArrayBufferChunk.length} samples to queue.`,
+   );
+
+   unplayedAudioBuffers.push(incomingArrayBufferChunk);
+   debug()?.receivedAudio(
+     incomingArrayBufferChunk.length / incomingArrayBufferChunk.sampleRate,
+   );
+   const audioBuffersTableInfo = unplayedAudioBuffers.map((buffer, i) => {
+     return {
+       index: i,
+       duration: buffer.length / buffer.sampleRate,
+       samples: buffer.length,
+     };
+   });
+   const totalUnplayedDuration = unplayedAudioBuffers.reduce((acc, buffer) => {
+     return acc + buffer.length / buffer.sampleRate;
+   }, 0);
+
+   console.debug(
+     `[addAudioToBufferAndPlay] Current state of incoming audio buffers (${totalUnplayedDuration.toFixed(
+       1,
+     )}s unplayed):`,
+   );
+   console.table(audioBuffersTableInfo);
+
+   if (shouldPlayWhenAudioAvailable) {
+     playNextBufferIfNotAlreadyPlaying();
+   }
+ };
+
+ return {addAudioToBuffer, setGain, stop, start};
+ }
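A minimal usage sketch for the player above (not part of the commit): it assumes the file's default export accepts the onStarted/onEnded callbacks referenced in the closure, that AddAudioToBufferFunction takes a Float32Array plus a sample rate, and the chunk-delivery hook is a hypothetical stand-in.

    import createBufferedSpeechPlayer from './createBufferedSpeechPlayer';

    const player = createBufferedSpeechPlayer({
      onStarted: () => console.debug('playback started'),
      onEnded: () => console.debug('playback queue drained'),
    });

    player.start(); // the player starts stopped; nothing plays until start() is called
    player.setGain(0.8);

    // Hypothetical receive path: queue each translated speech chunk as it arrives
    function onTranslatedAudioChunk(samples: Float32Array, sampleRate: number) {
      player.addAudioToBuffer(samples, sampleRate);
    }

    // stop() halts the current source and discards any queued buffers
    // player.stop();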
streaming-test-app/src/cursorBlinkInterval.ts ADDED
@@ -0,0 +1 @@
+ export const CURSOR_BLINK_INTERVAL_MS = 500;
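A sketch of how this constant might drive a blinking cursor in the app's React UI (illustrative; this hook is not part of the commit):

    import {useEffect, useState} from 'react';
    import {CURSOR_BLINK_INTERVAL_MS} from './cursorBlinkInterval';

    // Returns true/false on a fixed cadence; render the cursor only while true
    export function useBlinkingCursor(): boolean {
      const [visible, setVisible] = useState(true);
      useEffect(() => {
        const id = setInterval(
          () => setVisible((v) => !v),
          CURSOR_BLINK_INTERVAL_MS,
        );
        return () => clearInterval(id);
      }, []);
      return visible;
    }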
streaming-test-app/src/debug.ts ADDED
@@ -0,0 +1,257 @@
+ import {TYPING_ANIMATION_DELAY_MS} from './StreamingInterface';
+ import {getURLParams} from './URLParams';
+ import audioBuffertoWav from 'audiobuffer-to-wav';
+ import './StreamingInterface.css';
+
+ type StartEndTime = {
+   start: number;
+   end: number;
+ };
+
+ type StartEndTimeWithAudio = StartEndTime & {
+   float32Audio: Float32Array;
+ };
+
+ type Text = {
+   time: number;
+   chars: number;
+ };
+
+ type DebugTimings = {
+   receivedAudio: StartEndTime[];
+   playedAudio: StartEndTimeWithAudio[];
+   receivedText: Text[];
+   renderedText: StartEndTime[];
+   sentAudio: StartEndTimeWithAudio[];
+   startRenderTextTime: number | null;
+   startRecordingTime: number | null;
+   receivedAudioSampleRate: number | null;
+ };
+
+ function getInitialTimings(): DebugTimings {
+   return {
+     receivedAudio: [],
+     playedAudio: [],
+     receivedText: [],
+     renderedText: [],
+     sentAudio: [],
+     startRenderTextTime: null,
+     startRecordingTime: null,
+     receivedAudioSampleRate: null,
+   };
+ }
+
+ function downloadAudioBuffer(audioBuffer: AudioBuffer, fileName: string): void {
+   const wav = audioBuffertoWav(audioBuffer);
+   const wavBlob = new Blob([new DataView(wav)], {
+     type: 'audio/wav',
+   });
+   const url = URL.createObjectURL(wavBlob);
+   const anchor = document.createElement('a');
+   anchor.href = url;
+   anchor.target = '_blank';
+   anchor.download = fileName;
+   anchor.click();
+ }
+
+ // Uncomment for debugging without download
+ // function playAudioBuffer(audioBuffer: AudioBuffer): void {
+ //   const audioContext = new AudioContext();
+ //   const source = audioContext.createBufferSource();
+
+ //   source.buffer = audioBuffer;
+ //   source.connect(audioContext.destination);
+ //   source.start();
+ // }
+
+ // Accumulates timings and audio / text translation samples for debugging and exporting
+ class DebugTimingsManager {
+   timings: DebugTimings = getInitialTimings();
+
+   start(): void {
+     this.timings = getInitialTimings();
+     this.timings.startRecordingTime = new Date().getTime();
+   }
+
+   sentAudio(event: AudioProcessingEvent): void {
+     const end = new Date().getTime();
+     const start = end - event.inputBuffer.duration * 1000;
+     // Copy, because the underlying buffer appears to be re-used
+     const float32Audio = new Float32Array(event.inputBuffer.getChannelData(0));
+     this.timings.sentAudio.push({
+       start,
+       end,
+       float32Audio,
+     });
+   }
+
+   receivedText(text: string): void {
+     this.timings.receivedText.push({
+       time: new Date().getTime(),
+       chars: text.length,
+     });
+   }
+
+   startRenderText(): void {
+     if (this.timings.startRenderTextTime == null) {
+       this.timings.startRenderTextTime = new Date().getTime();
+     }
+   }
+
+   endRenderText(): void {
+     if (this.timings.startRenderTextTime == null) {
+       console.warn(
+         'Mismatched start/end text-render timings: startRenderTextTime is null',
+       );
+       return;
+     }
+
+     this.timings.renderedText.push({
+       start: this.timings.startRenderTextTime as number,
+       end: new Date().getTime(),
+     });
+     this.timings.startRenderTextTime = null;
+   }
+
+   receivedAudio(duration: number): void {
+     const start = new Date().getTime();
+     this.timings.receivedAudio.push({
+       start,
+       end: start + duration * 1000,
+     });
+   }
+
+   playedAudio(start: number, end: number, buffer: AudioBuffer | null): void {
+     if (buffer != null) {
+       if (this.timings.receivedAudioSampleRate == null) {
+         this.timings.receivedAudioSampleRate = buffer.sampleRate;
+       }
+       if (this.timings.receivedAudioSampleRate !== buffer.sampleRate) {
+         console.error(
+           'Sample rates of received audio are unequal; reconstructing debug audio will fail',
+           this.timings.receivedAudioSampleRate,
+           buffer.sampleRate,
+         );
+       }
+     }
+     this.timings.playedAudio.push({
+       start,
+       end,
+       float32Audio:
+         buffer == null
+           ? new Float32Array()
+           : new Float32Array(buffer.getChannelData(0)),
+     });
+   }
+
+   getChartData() {
+     const columns = [
+       {type: 'string', id: 'Series'},
+       {type: 'date', id: 'Start'},
+       {type: 'date', id: 'End'},
+     ];
+     return [
+       columns,
+       ...this.timings.sentAudio.map((sentAudio) => [
+         'Sent Audio',
+         new Date(sentAudio.start),
+         new Date(sentAudio.end),
+       ]),
+       ...this.timings.receivedAudio.map((receivedAudio) => [
+         'Received Audio',
+         new Date(receivedAudio.start),
+         new Date(receivedAudio.end),
+       ]),
+       ...this.timings.playedAudio.map((playedAudio) => [
+         'Played Audio',
+         new Date(playedAudio.start),
+         new Date(playedAudio.end),
+       ]),
+       // Best-effort estimate: duration = character count times the per-letter animation delay
+       ...this.timings.receivedText.map((receivedText) => [
+         'Received Text',
+         new Date(receivedText.time),
+         new Date(
+           receivedText.time + receivedText.chars * TYPING_ANIMATION_DELAY_MS,
+         ),
+       ]),
+       ...this.timings.renderedText.map((renderedText) => [
+         'Rendered Text',
+         new Date(renderedText.start),
+         new Date(renderedText.end),
+       ]),
+     ];
+   }
+
+   downloadInputAudio() {
+     const audioContext = new AudioContext();
+     // Parenthesize the ?? fallback: without parentheses, a chunk with missing
+     // audio would add undefined into the sum and turn the total length into NaN
+     const totalLength = this.timings.sentAudio.reduce((acc, cur) => {
+       return acc + (cur?.float32Audio?.length ?? 0);
+     }, 0);
+     if (totalLength === 0) {
+       return;
+     }
+
+     // Assumes capture ran at the AudioContext's default sample rate
+     const incomingArrayBuffer = audioContext.createBuffer(
+       1, // 1 channel
+       totalLength,
+       audioContext.sampleRate,
+     );
+
+     const buffer = incomingArrayBuffer.getChannelData(0);
+     let i = 0;
+     this.timings.sentAudio.forEach((sentAudio) => {
+       sentAudio.float32Audio.forEach((sample) => {
+         buffer[i++] = sample;
+       });
+     });
+
+     // Play for debugging
+     // playAudioBuffer(incomingArrayBuffer);
+     downloadAudioBuffer(incomingArrayBuffer, `input_audio.wav`);
+   }
+
+   downloadOutputAudio() {
+     const playedAudio = this.timings.playedAudio;
+     const sampleRate = this.timings.receivedAudioSampleRate;
+     if (
+       playedAudio.length === 0 ||
+       this.timings.startRecordingTime == null ||
+       sampleRate == null
+     ) {
+       return null;
+     }
+
+     // Reconstruct the output on one timeline, padding gaps between played buffers with silence
+     let previousEndTime = this.timings.startRecordingTime;
+     const audioArray: number[] = [];
+     playedAudio.forEach((audio) => {
+       const delta = (audio.start - previousEndTime) / 1000;
+       for (let i = 0; i < delta * sampleRate; i++) {
+         audioArray.push(0.0);
+       }
+       audio.float32Audio.forEach((sample) => audioArray.push(sample));
+       previousEndTime = audio.end;
+     });
+     const audioContext = new AudioContext();
+     const incomingArrayBuffer = audioContext.createBuffer(
+       1, // 1 channel
+       audioArray.length,
+       sampleRate,
+     );
+
+     incomingArrayBuffer.copyToChannel(
+       new Float32Array(audioArray),
+       0, // first channel
+     );
+
+     // Play for debugging
+     // playAudioBuffer(incomingArrayBuffer);
+     downloadAudioBuffer(incomingArrayBuffer, 'output_audio.wav');
+   }
+ }
+
+ const debugSingleton = new DebugTimingsManager();
+ export default function debug(): DebugTimingsManager | null {
+   const debugParam = getURLParams().debug;
+   return debugParam ? debugSingleton : null;
+ }
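debug() returns the singleton only when the debug URL param is set, so call sites can use optional chaining unconditionally. Below is a sketch of typical wiring, plus one way to render getChartData() as a timeline; the react-google-charts dependency and the hook-up points are assumptions, not taken from this commit.

    import debug from './debug';
    import {Chart} from 'react-google-charts';

    // Hypothetical call sites:
    //   debug()?.start() when recording begins
    //   debug()?.sentAudio(event) inside a ScriptProcessorNode's onaudioprocess handler
    //   debug()?.receivedText(text) / startRenderText() / endRenderText() around text updates
    //   debug()?.downloadInputAudio() / downloadOutputAudio() from debug-panel buttons

    // getChartData() emits a column-definition row followed by [series, start, end] rows,
    // which matches the Google Charts Timeline data format:
    export function DebugTimeline() {
      const data = debug()?.getChartData();
      return data == null ? null : (
        <Chart chartType="Timeline" data={data} width="100%" height="400px" />
      );
    }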