Spaces:

mingyang91
/

polyhedron

Sleeping

App Files Files Community

mingyang91 committed on Oct 23, 2023

Commit

bb6818c

verified ·

1 Parent(s): a2ebe0b

add lip-sync

Browse files

Files changed (6) hide show

Cargo.lock +1 -0
Cargo.toml +1 -0
src/lesson.rs +67 -13
src/main.rs +32 -7
static/client.js +11 -3
static/index.html +7 -1

Cargo.lock CHANGED Viewed

@@ -1333,6 +1333,7 @@ dependencies = [
  "hound",
  "poem",
  "serde",
  "symphonia",
  "tokio",
  "tokio-stream",

  "hound",
  "poem",
  "serde",
+ "serde_json",
  "symphonia",
  "tokio",
  "tokio-stream",

Cargo.toml CHANGED Viewed

@@ -22,6 +22,7 @@ futures-util = "0.3.28"
 #symphonia-format-mkv = "0.5.3"
 symphonia = { version = "0.5.3", features = ["mkv", "pcm"] }
 serde = { version = "1.0.189", features = ["derive"] }
 [dependencies.poem]
 version = "1.3.58"

 #symphonia-format-mkv = "0.5.3"
 symphonia = { version = "0.5.3", features = ["mkv", "pcm"] }
 serde = { version = "1.0.189", features = ["derive"] }
+serde_json = { version = "1.0.107", features = [] }
 [dependencies.poem]
 version = "1.3.58"

src/lesson.rs CHANGED Viewed

@@ -1,15 +1,20 @@
 use std::sync::{Arc, Weak};
 use tokio::sync::RwLock;
 use std::collections::BTreeMap;
 use async_stream::stream;
 use aws_config::SdkConfig;
-use aws_sdk_polly::types::VoiceId;
 use aws_sdk_transcribestreaming::operation::start_stream_transcription::StartStreamTranscriptionOutput;
 use aws_sdk_transcribestreaming::primitives::Blob;
 use aws_sdk_transcribestreaming::types::{AudioEvent, AudioStream, LanguageCode, MediaEncoding, TranscriptResultStream};
 use futures_util::{Stream, StreamExt, TryStreamExt};
-use tokio::select;
 use crate::StreamTranscriptionError;
 #[derive(Clone, Debug)]
@@ -297,6 +302,10 @@ impl VoiceLesson {
     pub(crate) fn voice_channel(&self) -> tokio::sync::broadcast::Receiver<Vec<u8>> {
         self.inner.voice_lesson.subscribe()
     }
 }
 impl From<InnerVoiceLesson> for VoiceLesson {
@@ -318,6 +327,7 @@ impl From<Arc<InnerVoiceLesson>> for VoiceLesson {
 struct InnerVoiceLesson {
     parent: LangLesson,
     voice: VoiceId,
     voice_lesson: tokio::sync::broadcast::Sender<Vec<u8>>,
     drop_handler: Option<tokio::sync::oneshot::Sender<Signal>>,
 }
@@ -337,22 +347,18 @@ impl InnerVoiceLesson {
         let mut translate_rx = parent.inner.translated_tx.subscribe();
         let (voice_lesson, _) = tokio::sync::broadcast::channel::<Vec<u8>>(128);
         let shared_voice_lesson = voice_lesson.clone();
         let client = parent.inner.parent.inner.parent.polly_client.clone();
         // let lang: LanguageCode = parent.inner.lang.clone().parse().expect("Invalid language code");
         tokio::spawn(async move {
             let fut = async {
                 while let Ok(translated) = translate_rx.recv().await {
-                    let res = client.synthesize_speech()
-                        .set_text(Some(translated))
-                        .voice_id(shared_voice_id.clone())
-                        .output_format("pcm".into())
-                        // .language_code(lang)
-                        // .language_code("cmn-CN".into())
-                        .send()
-                        .await;
                     match res {
-                        Ok(mut synthesized) => {
-                            while let Some(Ok(bytes)) = synthesized.audio_stream.next().await {
                                 let _ = &shared_voice_lesson.send(bytes.to_vec());
                             }
                         },
@@ -364,7 +370,12 @@ impl InnerVoiceLesson {
                 Ok(())
             };
             select! {
-                _ = fut => {}
                 _ = rx => {}
             }
         });
@@ -372,6 +383,7 @@ impl InnerVoiceLesson {
         InnerVoiceLesson {
             parent,
             voice,
             voice_lesson,
             drop_handler: Some(tx),
         }
@@ -411,3 +423,45 @@ fn to_stream(mut output: StartStreamTranscriptionOutput) -> impl Stream<Item=Res
     }
 }

 use std::sync::{Arc, Weak};
 use tokio::sync::RwLock;
 use std::collections::BTreeMap;
+use std::io::BufRead;
 use async_stream::stream;
 use aws_config::SdkConfig;
+use aws_sdk_polly::primitives::ByteStream;
+use aws_sdk_polly::types::{Engine, OutputFormat, SpeechMarkType, VoiceId};
 use aws_sdk_transcribestreaming::operation::start_stream_transcription::StartStreamTranscriptionOutput;
 use aws_sdk_transcribestreaming::primitives::Blob;
 use aws_sdk_transcribestreaming::types::{AudioEvent, AudioStream, LanguageCode, MediaEncoding, TranscriptResultStream};
+use clap::builder::TypedValueParser;
 use futures_util::{Stream, StreamExt, TryStreamExt};
+use futures_util::future::try_join;
+use serde::{Deserialize, Serialize};
+use tokio::{select, try_join};
 use crate::StreamTranscriptionError;
 #[derive(Clone, Debug)]
     pub(crate) fn voice_channel(&self) -> tokio::sync::broadcast::Receiver<Vec<u8>> {
         self.inner.voice_lesson.subscribe()
     }
+    pub(crate) fn lip_sync_channel(&self) -> tokio::sync::broadcast::Receiver<Vec<Viseme>> {
+        self.inner.lip_sync_tx.subscribe()
+    }
 }
 impl From<InnerVoiceLesson> for VoiceLesson {
 struct InnerVoiceLesson {
     parent: LangLesson,
     voice: VoiceId,
+    lip_sync_tx: tokio::sync::broadcast::Sender<Vec<Viseme>>,
     voice_lesson: tokio::sync::broadcast::Sender<Vec<u8>>,
     drop_handler: Option<tokio::sync::oneshot::Sender<Signal>>,
 }
         let mut translate_rx = parent.inner.translated_tx.subscribe();
         let (voice_lesson, _) = tokio::sync::broadcast::channel::<Vec<u8>>(128);
         let shared_voice_lesson = voice_lesson.clone();
+        let (lip_sync_tx, _) = tokio::sync::broadcast::channel::<Vec<Viseme>>(128);
+        let shared_lip_sync_tx = lip_sync_tx.clone();
         let client = parent.inner.parent.inner.parent.polly_client.clone();
         // let lang: LanguageCode = parent.inner.lang.clone().parse().expect("Invalid language code");
         tokio::spawn(async move {
             let fut = async {
                 while let Ok(translated) = translate_rx.recv().await {
+                    let res = synthesize_speech(&client, translated, shared_voice_id.clone()).await;
                     match res {
+                        Ok((vec, mut audio_stream)) => {
+                            let _ = shared_lip_sync_tx.send(vec);
+                            while let Some(Ok(bytes)) = audio_stream.next().await {
                                 let _ = &shared_voice_lesson.send(bytes.to_vec());
                             }
                         },
                 Ok(())
             };
             select! {
+                res = fut => match res {
+                    Ok(_) => {}
+                    Err(e) => {
+                        println!("Error: {:?}", e);
+                    }
+                },
                 _ = rx => {}
             }
         });
         InnerVoiceLesson {
             parent,
             voice,
+            lip_sync_tx,
             voice_lesson,
             drop_handler: Some(tx),
         }
     }
 }
+/// One viseme speech mark as emitted by Amazon Polly, one JSON object per line,
+/// e.g. `{"time":180,"type":"viseme","value":"r"}`.
+///
+/// Keys of the source object that are not declared here (such as `type`) are
+/// ignored when deserializing, since unknown fields are not denied.
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub(crate) struct Viseme {
+    /// The mark's `time` value (`180` in the sample above).
+    /// NOTE(review): presumably milliseconds from the start of the audio — confirm against the Polly docs.
+    time: u32,
+    /// The mark's `value`, i.e. the viseme symbol (`"r"` in the sample above).
+    value: String,
+}
+/// Failure modes of `synthesize_speech`.
+#[derive(Debug)]
+enum SynthesizeError {
+    /// One of the Polly `synthesize_speech` requests failed.
+    Polly(aws_sdk_polly::Error),
+    /// Reading the speech-mark response body to completion failed.
+    Transmitting(aws_sdk_polly::error::BoxError),
+}
+/// Synthesizes `text` with Amazon Polly and returns the viseme speech marks
+/// together with the PCM audio stream.
+///
+/// Two requests are sent concurrently, both using the neural engine and the same
+/// text and voice: one asks for PCM audio, the other for JSON speech marks limited
+/// to visemes. The speech-mark body is collected in full and parsed as one JSON
+/// object per line; the audio body is handed back unconsumed so the caller can
+/// stream it.
+///
+/// # Errors
+///
+/// * `SynthesizeError::Polly` if either request fails.
+/// * `SynthesizeError::Transmitting` if the speech-mark body cannot be read.
+///
+/// A speech-mark line that does not deserialize into a `Viseme` does not fail the
+/// call: it is reported on stdout and skipped, so audio is still delivered.
+async fn synthesize_speech(client: &aws_sdk_polly::Client,
+                           text: String,
+                           voice_id: VoiceId) -> Result<(Vec<Viseme>, ByteStream), SynthesizeError> {
+    let audio_fut = client.synthesize_speech()
+        .engine(Engine::Neural)
+        .set_text(Some(text.clone()))
+        .voice_id(voice_id.clone())
+        .output_format(OutputFormat::Pcm)
+        .send();
+    let visemes_fut = client.synthesize_speech()
+        .engine(Engine::Neural)
+        .set_text(Some(text))
+        .voice_id(voice_id)
+        .speech_mark_types(SpeechMarkType::Viseme)
+        .output_format(OutputFormat::Json)
+        .send();
+    // Fail fast: if either request errors, the other is dropped.
+    let (audio, visemes) = try_join(audio_fut, visemes_fut)
+        .await
+        .map_err(|e| SynthesizeError::Polly(e.into()))?;
+    let marks = visemes.audio_stream.collect().await
+        .map_err(|e| SynthesizeError::Transmitting(e.into()))?.to_vec();
+    let parsed: Vec<Viseme> = marks
+        .split(|byte| *byte == b'\n')
+        // The body ends with a newline, so blank trailing segments are expected, not errors.
+        .filter(|line| !line.iter().all(u8::is_ascii_whitespace))
+        .filter_map(|line| match serde_json::from_slice::<Viseme>(line) {
+            Ok(viseme) => Some(viseme),
+            Err(e) => {
+                // Surface schema or encoding mismatches instead of dropping them silently.
+                println!("Error: skipping malformed viseme mark: {:?}", e);
+                None
+            }
+        })
+        .collect();
+    Ok((parsed, audio.audio_stream))
+}

src/main.rs CHANGED Viewed

@@ -28,8 +28,9 @@ use poem::web::{Data, Query};
 use tokio::select;
 use tokio::sync::mpsc::{Receiver, Sender};
 use tokio_stream::Stream;
-use serde::Deserialize;
 use lesson::{LessonsManager};
 mod lesson;
@@ -149,7 +150,10 @@ async fn stream_speaker(ctx: Data<&Context>, query: Query<LessonSpeakerQuery>, w
                 msg = socket.next() => {
                     match msg.as_ref() {
                         Some(Ok(Message::Binary(bin))) => {
-                            origin_tx.send(bin.to_vec()).await.expect("failed to send");
                         },
                         Some(Ok(_)) => {
                             println!("Other: {:?}", msg);
@@ -158,7 +162,7 @@ async fn stream_speaker(ctx: Data<&Context>, query: Query<LessonSpeakerQuery>, w
                             println!("Error: {:?}", e);
                         },
                         None => {
-                            socket.close().await.expect("failed to close");
                             println!("Other: {:?}", msg);
                             break;
                         }
@@ -183,6 +187,14 @@ pub struct LessonListenerQuery {
     voice: String,
 }
 #[handler]
 async fn stream_listener(ctx: Data<&Context>, query: Query<LessonListenerQuery>, ws: WebSocket) -> impl IntoResponse {
     let lesson_opt = ctx.lessons_manager.get_lesson(query.id).await;
@@ -199,19 +211,24 @@ async fn stream_listener(ctx: Data<&Context>, query: Query<LessonListenerQuery>,
         let mut translate_rx = lang_lesson.translated_channel();
         let mut voice_lesson = lang_lesson.get_or_init(voice_id).await;
         let mut voice_rx = voice_lesson.voice_channel();
         loop {
             select! {
                 transcript = transcript_rx.recv() => {
                     if let Ok(transcript) = transcript {
-                        println!("Transcribed: {}", transcript);
-                        let _ = socket.send(Message::Text(transcript)).await;
                     }
                 },
                 translated = translate_rx.recv() => {
                     if let Ok(translated) = translated {
-                        println!("Translated: {}", translated);
-                        let _ = socket.send(Message::Text(translated)).await;
                     }
                 },
                 voice = voice_rx.recv() => {
@@ -220,6 +237,14 @@ async fn stream_listener(ctx: Data<&Context>, query: Query<LessonListenerQuery>,
                         let _ = socket.send(Message::Binary(voice)).await;
                     }
                 },
             }
         }
     })

 use tokio::select;
 use tokio::sync::mpsc::{Receiver, Sender};
 use tokio_stream::Stream;
+use serde::{Deserialize, Serialize};
 use lesson::{LessonsManager};
+use crate::lesson::Viseme;
 mod lesson;
                 msg = socket.next() => {
                     match msg.as_ref() {
                         Some(Ok(Message::Binary(bin))) => {
+                            if origin_tx.send(bin.to_vec()).await.is_err() {
+                                println!("tx closed");
+                                break;
+                            }
                         },
                         Some(Ok(_)) => {
                             println!("Other: {:?}", msg);
                             println!("Error: {:?}", e);
                         },
                         None => {
+                            let _ = socket.close().await;
                             println!("Other: {:?}", msg);
                             break;
                         }
     voice: String,
 }
+/// JSON event sent to a lesson listener as a WebSocket text message.
+///
+/// Serialized as an internally tagged object: the variant name is written to a
+/// `type` key next to the variant's own fields, e.g.
+/// `{"type":"Transcription","text":"..."}`.
+#[derive(Serialize)]
+#[serde(tag = "type")]
+enum LiveLessonTextEvent {
+    /// Text received from the lesson's transcript channel.
+    Transcription {
+        text: String,
+    },
+    /// Text received from the lesson's translation channel.
+    Translation {
+        text: String,
+    },
+    /// Viseme marks received from the voice lesson's lip-sync channel.
+    LipSync {
+        visemes: Vec<Viseme>,
+    },
+}
 #[handler]
 async fn stream_listener(ctx: Data<&Context>, query: Query<LessonListenerQuery>, ws: WebSocket) -> impl IntoResponse {
     let lesson_opt = ctx.lessons_manager.get_lesson(query.id).await;
         let mut translate_rx = lang_lesson.translated_channel();
         let mut voice_lesson = lang_lesson.get_or_init(voice_id).await;
         let mut voice_rx = voice_lesson.voice_channel();
+        let mut lip_sync_rx = voice_lesson.lip_sync_channel();
         loop {
             select! {
                 transcript = transcript_rx.recv() => {
                     if let Ok(transcript) = transcript {
+                        let evt = LiveLessonTextEvent::Transcription { text: transcript };
+                        let json = serde_json::to_string(&evt).expect("failed to serialize");
+                        println!("Transcribed: {}", json);
+                        let _ = socket.send(Message::Text(json)).await;
                     }
                 },
                 translated = translate_rx.recv() => {
                     if let Ok(translated) = translated {
+                        let evt = LiveLessonTextEvent::Translation { text: translated };
+                        let json = serde_json::to_string(&evt).expect("failed to serialize");
+                        println!("Translated: {}", json);
+                        let _ = socket.send(Message::Text(json)).await;
                     }
                 },
                 voice = voice_rx.recv() => {
                         let _ = socket.send(Message::Binary(voice)).await;
                     }
                 },
+                visemes = lip_sync_rx.recv() => {
+                    if let Ok(visemes) = visemes {
+                        let evt = LiveLessonTextEvent::LipSync { visemes };
+                        let json = serde_json::to_string(&evt).expect("failed to serialize");
+                        println!("Visemes: {:?}", json);
+                        let _ = socket.send(Message::Text(json)).await;
+                    }
+                },
             }
         }
     })

static/client.js CHANGED Viewed

@@ -29,7 +29,8 @@ let bufferSize = 2048,
 //vars
 let audioElement = document.querySelector('audio'),
     finalWord = false,
-    resultText = document.getElementById('ResultText'),
     removeLastSentence = true,
     streamStreaming = false;
@@ -134,14 +135,21 @@ socket.onmessage = function (msg) {
         audioQueue.next(msg.data)
     } else {
         // text
-        onSpeechData(msg.data)
     }
 }
 socket.onclose = function () {
     processor.stop()
 }
-function onSpeechData(data) {
     var dataFinal = false;
     if (dataFinal === false) {

 //vars
 let audioElement = document.querySelector('audio'),
     finalWord = false,
+    translationText = document.getElementById('Translation'),
+    transcriptionText = document.getElementById('Transcription'),
     removeLastSentence = true,
     streamStreaming = false;
         audioQueue.next(msg.data)
     } else {
         // text
+        const evt = JSON.parse(msg.data)
+        if (evt.type === 'Translation') {
+            onSpeechData(transcriptionText, evt.text)
+        } else if (evt.type === 'Transcription') {
+            onSpeechData(translationText, evt.text)
+        } else {
+            console.log(evt.visemes)
+        }
     }
 }
 socket.onclose = function () {
     processor.stop()
 }
+function onSpeechData(resultText, data) {
     var dataFinal = false;
     if (dataFinal === false) {

static/index.html CHANGED Viewed

@@ -25,7 +25,13 @@
   <br>
   <div>
-    <p id="ResultText">
                 <span class="greyText">No Speech to Text yet
                     <span>
     </p>

   <br>
   <div>
+    <h1>Translation</h1>
+    <p id="Translation">
+                <span class="greyText">No Speech to Text yet
+                    <span>
+    </p>
+    <h1>Transcription</h1>
+    <p id="Transcription">
                 <span class="greyText">No Speech to Text yet
                     <span>
     </p>