split whisper into a standalone crate
Files changed:
- src/asr/aws.rs          +167 -4
- src/lesson.rs           +11 -115
- src/main.rs             +21 -25
- whisper/src/handler.rs  +0 -15
src/asr/aws.rs  CHANGED
@@ -1,17 +1,180 @@
+use std::error::Error;
+use std::fmt::{Display, Formatter};
+use async_stream::stream;
 use async_trait::async_trait;
+use aws_sdk_transcribestreaming::operation::start_stream_transcription::StartStreamTranscriptionOutput;
+use aws_sdk_transcribestreaming::primitives::Blob;
+use aws_sdk_transcribestreaming::types::{
+    AudioEvent, AudioStream, LanguageCode, MediaEncoding, TranscriptResultStream,
+};
+use tokio::select;
 use tokio::sync::broadcast::Receiver;
+use tokio_stream::Stream;
+use futures_util::TryStreamExt;
 use crate::asr::{ASR, Event};
 
-struct AWS_ASR {
-
+pub struct AWS_ASR {
+    client: aws_sdk_transcribestreaming::Client,
+    speaker_voice_channel: tokio::sync::mpsc::Sender<Vec<i16>>,
+    speaker_transcript: tokio::sync::broadcast::Sender<Event>,
+    drop_handler: Option<tokio::sync::oneshot::Sender<()>>,
 }
+
+impl AWS_ASR {
+    pub async fn from_env(lang: LanguageCode) -> anyhow::Result<Self> {
+        let config = aws_config::load_from_env().await;
+        let transcript_client = aws_sdk_transcribestreaming::Client::new(&config);
+        let client = transcript_client.clone();
+
+        let (speaker_voice_channel, mut speaker_voice_rx) = tokio::sync::mpsc::channel::<Vec<i16>>(128);
+        let (speaker_transcript, _) = tokio::sync::broadcast::channel::<Event>(128);
+        let shared_speaker_transcript = speaker_transcript.clone();
+
+        let (drop_handler, drop_rx) = tokio::sync::oneshot::channel::<()>();
+
+        tokio::spawn(async move {
+            let fut = async {
+                let input_stream = stream! {
+                    while let Some(raw) = speaker_voice_rx.recv().await {
+                        let reshape = slice_i16_to_u8(&raw);
+                        yield Ok(AudioStream::AudioEvent(AudioEvent::builder().audio_chunk(Blob::new(reshape)).build()));
+                    }
+                };
+                let output = transcript_client
+                    .start_stream_transcription()
+                    .language_code(lang) //LanguageCode::EnGb
+                    .media_sample_rate_hertz(16000)
+                    .media_encoding(MediaEncoding::Pcm)
+                    .audio_stream(input_stream.into())
+                    .send()
+                    .await
+                    .map_err(|e| StreamTranscriptionError::EstablishStreamError(Box::new(e)))?;
+
+                let output_stream = to_stream(output);
+                output_stream
+                    .try_for_each(|text| async {
+                        let _ = shared_speaker_transcript.send(text);
+                        Ok(())
+                    })
+                    .await?;
+                Ok(()) as anyhow::Result<()>
+            };
+            select! {
+                res = fut => {
+                    if let Err(e) = res {
+                        println!("Error: {:?}", e);
+                    }
+                }
+                _ = drop_rx => {}
+            }
+        });
+
+        Ok(Self {
+            client,
+            speaker_voice_channel,
+            speaker_transcript,
+            drop_handler: Some(drop_handler)
+        })
+    }
+}
+
+#[allow(dead_code)]
+fn slice_i16_to_u8(slice: &[i16]) -> Vec<u8> {
+    slice
+        .iter()
+        .flat_map(|&sample| {
+            [(sample >> 8) as u8, sample as u8]
+        })
+        .collect()
+}
+
+impl Drop for AWS_ASR {
+    fn drop(&mut self) {
+        if let Some(drop_handler) = self.drop_handler.take() {
+            let _ = drop_handler.send(());
+        }
+    }
+}
+
 #[async_trait]
 impl ASR for AWS_ASR {
     async fn frame(&mut self, frame: &[i16]) -> anyhow::Result<()> {
-
+        Ok(self.speaker_voice_channel.send(frame.to_vec()).await?)
     }
 
     fn subscribe(&mut self) -> Receiver<Event> {
-
+        self.speaker_transcript.subscribe()
+    }
+}
+
+#[allow(dead_code)]
+fn to_stream(
+    mut output: StartStreamTranscriptionOutput,
+) -> impl Stream<Item = Result<Event, StreamTranscriptionError>> {
+    stream! {
+        while let Some(event) = output
+            .transcript_result_stream
+            .recv()
+            .await
+            .map_err(|e| StreamTranscriptionError::TranscriptResultStreamError(Box::new(e)))? {
+            match event {
+                TranscriptResultStream::TranscriptEvent(transcript_event) => {
+                    let Some(transcript) = transcript_event.transcript else {
+                        continue
+                    };
+
+                    for result in transcript.results.unwrap_or_default() {
+                        let Some(alternatives) = result.alternatives else {
+                            continue
+                        };
+                        let Some(first_alternative) = alternatives.first() else {
+                            continue
+                        };
+                        let Some(text) = &first_alternative.transcript else {
+                            continue
+                        };
+                        let evt = Event {
+                            transcript: text.clone(),
+                            is_final: !result.is_partial,
+                        };
+                        yield Ok(evt);
+                    }
+                }
+                _ => yield Err(StreamTranscriptionError::Unknown),
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+enum StreamTranscriptionError {
+    EstablishStreamError(Box<dyn Error + Send + Sync>),
+    TranscriptResultStreamError(Box<dyn Error + Send + Sync>),
+    Unknown,
+}
+
+impl Display for StreamTranscriptionError {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            StreamTranscriptionError::EstablishStreamError(e) => {
+                write!(f, "EstablishStreamError: {}", e)
+            }
+            StreamTranscriptionError::TranscriptResultStreamError(e) => {
+                write!(f, "TranscriptResultStreamError: {}", e)
+            }
+            StreamTranscriptionError::Unknown => write!(f, "Unknown"),
+        }
+    }
+}
+
+impl Error for StreamTranscriptionError {
+    fn source(&self) -> Option<&(dyn Error + 'static)> {
+        match self {
+            StreamTranscriptionError::EstablishStreamError(e) => Some(e.as_ref()),
+            StreamTranscriptionError::TranscriptResultStreamError(e) => Some(e.as_ref()),
+            StreamTranscriptionError::Unknown => None,
+        }
     }
 }
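For orientation, a minimal caller sketch for the new AWS_ASR, using only the signatures visible in this diff; the module path crate::asr::aws, the demo function, and the silent test frame are illustrative assumptions, not part of the commit.

// Hypothetical in-crate caller: push i16 PCM frames in, read transcript Events back.
use aws_sdk_transcribestreaming::types::LanguageCode;
use crate::asr::ASR;               // trait providing frame() and subscribe()
use crate::asr::aws::AWS_ASR;      // assumed module path for the file above

async fn demo_transcribe() -> anyhow::Result<()> {
    let mut asr = AWS_ASR::from_env(LanguageCode::EnGb).await?;
    let mut events = asr.subscribe();    // tokio broadcast Receiver<Event>
    asr.frame(&[0i16; 1600]).await?;     // one 100 ms frame of silence at 16 kHz (illustrative)
    if let Ok(evt) = events.recv().await {
        println!("{} (final: {})", evt.transcript, evt.is_final);
    }
    Ok(())
}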
src/lesson.rs  CHANGED
@@ -1,41 +1,32 @@
-use async_stream::stream;
 use aws_config::SdkConfig;
 use aws_sdk_polly::primitives::ByteStream;
 use aws_sdk_polly::types::{Engine, OutputFormat, SpeechMarkType, VoiceId};
-use aws_sdk_transcribestreaming::
-use aws_sdk_transcribestreaming::primitives::Blob;
-use aws_sdk_transcribestreaming::types::{
-    AudioEvent, AudioStream, LanguageCode, MediaEncoding, TranscriptResultStream,
-};
+use aws_sdk_transcribestreaming::types::{LanguageCode};
 use futures_util::future::try_join;
-use futures_util::{Stream, TryStreamExt};
 use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
-use std::
-use std::fmt::{Display, Formatter};
+use std::fmt::{Display};
 use std::io::BufRead;
 use std::sync::{Arc, Weak};
 use tokio::sync::RwLock;
 
 use tokio::select;
+use crate::asr::Event;
 
 #[derive(Clone, Debug)]
 pub struct LessonsManager {
     translate_client: aws_sdk_translate::Client,
     polly_client: aws_sdk_polly::Client,
-    transcript_client: aws_sdk_transcribestreaming::Client,
     lessons: Arc<RwLock<BTreeMap<u32, Lesson>>>,
 }
 
 impl LessonsManager {
     pub(crate) fn new(sdk_config: &SdkConfig) -> Self {
-        let transcript_client = aws_sdk_transcribestreaming::Client::new(sdk_config);
         let translate_client = aws_sdk_translate::Client::new(sdk_config);
         let polly_client = aws_sdk_polly::Client::new(sdk_config);
         LessonsManager {
             translate_client,
             polly_client,
-            transcript_client,
             lessons: Arc::new(RwLock::new(BTreeMap::new())),
         }
     }
@@ -78,11 +69,11 @@ impl Lesson
         }
     }
 
-    pub(crate) fn
-        self.inner.speaker_voice_channel.
+    pub(crate) async fn send(&self, frame: Vec<i16>) -> anyhow::Result<()> {
+        Ok(self.inner.speaker_voice_channel.send(frame).await?)
     }
 
-    pub(crate) fn transcript_channel(&self) -> tokio::sync::broadcast::Receiver<
+    pub(crate) fn transcript_channel(&self) -> tokio::sync::broadcast::Receiver<Event> {
        self.inner.speaker_transcript.subscribe()
    }
 }
@@ -99,56 +90,17 @@ impl From<InnerLesson> for Lesson
 struct InnerLesson {
     parent: LessonsManager,
     speaker_lang: LanguageCode,
-    speaker_voice_channel: tokio::sync::mpsc::Sender<Vec<
-    speaker_transcript: tokio::sync::broadcast::Sender<
+    speaker_voice_channel: tokio::sync::mpsc::Sender<Vec<i16>>,
+    speaker_transcript: tokio::sync::broadcast::Sender<Event>,
     lang_lessons: RwLock<BTreeMap<String, Weak<InnerLangLesson>>>,
     drop_handler: Option<tokio::sync::oneshot::Sender<Signal>>,
 }
 
 impl InnerLesson {
     fn new(parent: LessonsManager, speaker_lang: LanguageCode) -> InnerLesson {
-        let (speaker_transcript, _) = tokio::sync::broadcast::channel::<
-        let shared_speaker_transcript = speaker_transcript.clone();
+        let (speaker_transcript, _) = tokio::sync::broadcast::channel::<Event>(128);
         let (speaker_voice_channel, mut speaker_voice_rx) = tokio::sync::mpsc::channel(128);
         let (drop_handler, drop_rx) = tokio::sync::oneshot::channel::<Signal>();
-        let transcript_client = parent.transcript_client.clone();
-        let shared_speak_lang = speaker_lang.clone();
-
-        tokio::spawn(async move {
-            let fut = async {
-                let input_stream = stream! {
-                    while let Some(raw) = speaker_voice_rx.recv().await {
-                        yield Ok(AudioStream::AudioEvent(AudioEvent::builder().audio_chunk(Blob::new(raw)).build()));
-                    }
-                };
-                let output = transcript_client
-                    .start_stream_transcription()
-                    .language_code(shared_speak_lang) //LanguageCode::EnGb
-                    .media_sample_rate_hertz(16000)
-                    .media_encoding(MediaEncoding::Pcm)
-                    .audio_stream(input_stream.into())
-                    .send()
-                    .await
-                    .map_err(|e| StreamTranscriptionError::EstablishStreamError(Box::new(e)))?;
-
-                let output_stream = to_stream(output);
-                output_stream
-                    .try_for_each(|text| async {
-                        let _ = shared_speaker_transcript.send(text);
-                        Ok(())
-                    })
-                    .await?;
-                Ok(()) as Result<(), StreamTranscriptionError>
-            };
-            select! {
-                res = fut => {
-                    if let Err(e) = res {
-                        println!("Error: {:?}", e);
-                    }
-                }
-                _ = drop_rx => {}
-            }
-        });
 
         InnerLesson {
             parent,
@@ -220,10 +172,10 @@ impl LangLesson
         let (drop_handler, drop_rx) = tokio::sync::oneshot::channel::<Signal>();
         tokio::spawn(async move {
             let fut = async {
-                while let Ok(
+                while let Ok(evt) = transcript_rx.recv().await {
                     let output = translate_client
                         .translate_text()
-                        .text(
+                        .text(evt.transcript)
                         .source_language_code(shared_speaker_lang.as_str())
                         .target_language_code(shared_lang.clone())
                         .send()
@@ -375,31 +327,6 @@ impl Drop for InnerVoiceLesson
     }
 }
 
-fn to_stream(
-    mut output: StartStreamTranscriptionOutput,
-) -> impl Stream<Item = Result<String, StreamTranscriptionError>> {
-    stream! {
-        while let Some(event) = output
-            .transcript_result_stream
-            .recv()
-            .await
-            .map_err(|e| StreamTranscriptionError::TranscriptResultStreamError(Box::new(e)))? {
-            match event {
-                TranscriptResultStream::TranscriptEvent(transcript_event) => {
-                    let transcript = transcript_event.transcript.expect("transcript");
-                    for result in transcript.results.unwrap_or_default() {
-                        if !result.is_partial {
-                            let first_alternative = &result.alternatives.as_ref().expect("should have")[0];
-                            let slice = first_alternative.transcript.as_ref().expect("should have");
-                            yield Ok(slice.clone());
-                        }
-                    }
-                }
-                _ => yield Err(StreamTranscriptionError::Unknown),
-            }
-        }
-    }
-}
 
 // {"time":180,"type":"viseme","value":"r"}
 #[derive(Debug, Deserialize, Clone, Serialize)]
@@ -449,34 +376,3 @@ async fn synthesize_speech(
         .collect();
     Ok((parsed, audio.audio_stream))
 }
-
-#[derive(Debug)]
-enum StreamTranscriptionError {
-    EstablishStreamError(Box<dyn Error + Send + Sync>),
-    TranscriptResultStreamError(Box<dyn Error + Send + Sync>),
-    Unknown,
-}
-
-impl Display for StreamTranscriptionError {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        match self {
-            StreamTranscriptionError::EstablishStreamError(e) => {
-                write!(f, "EstablishStreamError: {}", e)
-            }
-            StreamTranscriptionError::TranscriptResultStreamError(e) => {
-                write!(f, "TranscriptResultStreamError: {}", e)
-            }
-            StreamTranscriptionError::Unknown => write!(f, "Unknown"),
-        }
-    }
-}
-
-impl Error for StreamTranscriptionError {
-    fn source(&self) -> Option<&(dyn Error + 'static)> {
-        match self {
-            StreamTranscriptionError::EstablishStreamError(e) => Some(e.as_ref()),
-            StreamTranscriptionError::TranscriptResultStreamError(e) => Some(e.as_ref()),
-            StreamTranscriptionError::Unknown => None,
-        }
-    }
-}
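The speaker-side surface of Lesson now reduces to the two methods added above; a minimal sketch of how a caller inside the crate might drive them (the pump_frame helper is illustrative, not part of the commit):

// Hypothetical in-crate helper: raw i16 PCM goes in via send(), transcript
// Events come back out of the broadcast channel from transcript_channel().
async fn pump_frame(lesson: &Lesson, pcm: Vec<i16>) -> anyhow::Result<()> {
    let mut transcripts = lesson.transcript_channel(); // broadcast::Receiver<Event>
    lesson.send(pcm).await?;                           // forwarded to the ASR voice channel
    if let Ok(evt) = transcripts.recv().await {
        tracing::debug!("speaker transcript: {}", evt.transcript);
    }
    Ok(())
}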
src/main.rs  CHANGED
@@ -25,8 +25,6 @@ use tracing::debug;
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
 
 use crate::{config::*, lesson::*};
-#[cfg(feature = "whisper")]
-use crate::whisper::*;
 
 mod config;
 mod lesson;
@@ -105,30 +103,17 @@ async fn stream_speaker(
     let prompt = query.prompt.clone().unwrap_or_default();
 
     ws.on_upgrade(|mut socket| async move {
-        let _origin_tx = lesson.voice_channel();
         let mut transcribe_rx = lesson.transcript_channel();
-        #[cfg(feature = "whisper")]
-        let mut whisper = asr::whisper::whisper_asr::CONTEXT.create_handler(&SETTINGS.whisper, prompt)
-            .expect("failed to create whisper");
-        #[cfg(feature = "whisper")]
-        let mut whisper_transcribe_rx = whisper.subscribe();
         loop {
             select! {
-                // w = whisper_transcribe_rx.recv() => {
-                //     let Ok(_txt) = w else {
-                //         // TODO: handle msg
-                //         continue
-                //     };
-                // }
                 msg = socket.next() => {
                     match msg.as_ref() {
                         Some(Ok(Message::Binary(bin))) => {
-
-                            let
-
-
-
-                            // }
+                            let frame = u8_to_i16(bin);
+                            if let Err(e) = lesson.send(frame).await {
+                                tracing::warn!("failed to send voice: {}", e);
+                                break;
+                            }
                         },
                         Some(Ok(_)) => {
                             tracing::warn!("Other: {:?}", msg);
@@ -145,9 +130,9 @@ async fn stream_speaker(
                     }
                 },
                 output = transcribe_rx.recv() => {
-                    if let Ok(
-                        tracing::trace!("Transcribed: {}", transcript);
-                        let evt = LiveLessonTextEvent::Transcription { text: transcript
+                    if let Ok(evt) = output {
+                        tracing::trace!("Transcribed: {}", evt.transcript);
+                        let evt = LiveLessonTextEvent::Transcription { text: evt.transcript };
                         let json = serde_json::to_string(&evt).expect("failed to serialize");
                         let _ = socket.send(Message::Text(json)).await.expect("failed to send");
                     }
@@ -207,8 +192,8 @@ async fn stream_listener(
         loop {
             select! {
                 transcript = transcript_rx.recv() => {
-                    if let Ok(
-                        let evt = LiveLessonTextEvent::Transcription { text: transcript };
+                    if let Ok(evt) = transcript {
+                        let evt = LiveLessonTextEvent::Transcription { text: evt.transcript };
                         match serde_json::to_string(&evt) {
                             Ok(json) => {
                                 tracing::debug!("Transcribed: {}", json);
@@ -250,3 +235,14 @@ async fn stream_listener(
         }
     })
 }
+
+fn u8_to_i16(input: &[u8]) -> Vec<i16> {
+    input
+        .chunks_exact(2)
+        .map(|chunk| {
+            let mut buf = [0u8; 2];
+            buf.copy_from_slice(chunk);
+            i16::from_le_bytes(buf)
+        })
+        .collect::<Vec<i16>>()
+}
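The new u8_to_i16 helper assumes the websocket delivers 16-bit little-endian PCM; a small check of that framing (a hypothetical test, not part of this commit):

#[cfg(test)]
mod pcm_frame_tests {
    use super::u8_to_i16;

    #[test]
    fn decodes_little_endian_sample_pairs() {
        // 0x0001 and 0xFF7F are the little-endian encodings of 1 and -129;
        // chunks_exact(2) silently drops a trailing odd byte (0xAA here).
        assert_eq!(u8_to_i16(&[0x01, 0x00, 0x7F, 0xFF, 0xAA]), vec![1, -129]);
    }
}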
whisper/src/handler.rs  CHANGED
@@ -77,16 +77,6 @@ impl std::error::Error for Error {
     }
 }
 
-fn u8_to_i16(input: &[u8]) -> Vec<i16> {
-    input
-        .chunks_exact(2)
-        .map(|chunk| {
-            let mut buf = [0u8; 2];
-            buf.copy_from_slice(chunk);
-            i16::from_le_bytes(buf)
-        })
-        .collect::<Vec<i16>>()
-}
 
 #[derive(Clone, Debug)]
 pub enum Output {
@@ -206,11 +196,6 @@ impl WhisperHandler {
     pub async fn send_i16(&mut self, data: Vec<i16>) -> Result<(), mpsc::error::SendError<Vec<i16>>> {
         self.tx.send(data).await
     }
-
-    pub async fn send_bytes(&mut self, data: Vec<u8>) -> Result<(), mpsc::error::SendError<Vec<i16>>> {
-        let i16_data = u8_to_i16(&data);
-        self.send_i16(i16_data).await
-    }
 }
 
 #[allow(dead_code)]