|
import streamlit as st |
|
from audio_recorder_streamlit import audio_recorder |
|
from groq import Groq |
|
import os |
|
from dotenv import load_dotenv |
|
from langchain_groq import ChatGroq |
|
from langchain_core.output_parsers import StrOutputParser |
|
from langchain_core.prompts import ChatPromptTemplate |
|
from transformers import BarkModel, AutoProcessor, AutoModelForTextToWaveform, pipeline |
|
import scipy |
|
|
|
# Load variables through python-dotenv before the API key is looked up below.
load_dotenv()

# Shared Groq API client; audio_to_text() sends its requests through it.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Whisper speech-model identifier kept at module level.
model = "whisper-large-v3"
|
|
|
|
|
def _show_exchange(transcription, response):
    """Render one question/answer pair on the page."""
    st.write("Q:" + transcription)
    st.write("A: " + response)


def frontend():
    """Draw the Voice AI demo page and answer a recorded spoken question.

    The page always shows the title, a status line and the microphone widget.
    It then transcribes ``bark_out.wav`` and displays the model's answer to
    it. When the recorder returns audio, that audio is saved to
    ``temp_audio.wav``, transcribed, answered, and the resulting Q/A pair is
    displayed as well. The status line is updated between the steps.
    """
    st.title("Voice AI Demo")
    status_placeholder = st.empty()
    status_placeholder.write("Press Mic button to start asking question")

    recorded_audio = audio_recorder()

    # NOTE(review): this round trip over "bark_out.wav" runs on every call,
    # whether or not anything was recorded. It is kept as-is; confirm it is
    # an intended demo and not leftover debugging code.
    transcription = audio_to_text("bark_out.wav")
    _show_exchange(transcription, answer(transcription))

    # Nothing recorded yet: leave the initial prompt on screen.
    if not recorded_audio:
        return

    status_placeholder.write("Converting audio ...")
    data_to_file(recorded_audio)
    status_placeholder.write("Audio conversion done.")

    status_placeholder.write("Converting audio to text and making transcription...")
    transcription = audio_to_text("temp_audio.wav")
    status_placeholder.write("Transcription is now made.")

    status_placeholder.write("Getting response...")
    response = answer(transcription)
    status_placeholder.write("Press mic button again to ask more questions")

    _show_exchange(transcription, response)
|
|
|
|
|
def data_to_file(recorded_audio, temp_audio_path="temp_audio.wav"):
    """Write recorded audio bytes to disk and return the path written.

    Args:
        recorded_audio: Bytes-like audio payload; written verbatim in
            binary mode.
        temp_audio_path: Destination file. Defaults to ``"temp_audio.wav"``
            in the current working directory, which is the location used
            when the argument is omitted. An existing file is overwritten.

    Returns:
        ``temp_audio_path``, so callers can pass the result straight on
        instead of repeating the file name.

    Raises:
        OSError: If the destination cannot be opened or written.
    """
    with open(temp_audio_path, "wb") as temp_file:
        temp_file.write(recorded_audio)
    return temp_audio_path
|
|
|
|
|
|
|
def audio_to_text(audio_path):
    """Return the text Groq's Whisper model produces for an audio file.

    The file at ``audio_path`` is read fully into memory and uploaded, with
    the path as its upload name, through the module-level Groq ``client``.
    The ``text`` attribute of the API result is returned.

    NOTE(review): the request goes to the ``translations`` endpoint rather
    than ``transcriptions``; presumably that yields English text whatever
    language was spoken. Confirm this is intended.
    """
    with open(audio_path, 'rb') as audio_file:
        audio_bytes = audio_file.read()

    result = client.audio.translations.create(
        file=(audio_path, audio_bytes),
        model='whisper-large-v3',
    )
    return result.text
|
|
|
|
|
def answer(user_question):
    """Ask the Groq-hosted Llama chat model to answer ``user_question``.

    A chat prompt (system instructions plus the user's query), the chat
    model and a ``StrOutputParser`` are piped together into one chain.
    The chain is invoked with the question and its output is returned.
    """
    llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.6)

    # The system message asks for short, punctuated paragraph answers
    # because the reply is meant to be converted back to speech.
    system_instructions = "You are super knowlegable AI chat bot whuch will answer all User Query, answer with confident, also this response will get convert back to speech, so dont make point or anything, but make your answer in para form and dont make it too large, and use proper annotation, comma, full stop, question mark, so that a better text to speach can be genrate back."
    chat_prompt = ChatPromptTemplate([
        ("system", system_instructions),
        ("user", "User Query: {question}"),
    ])

    qa_chain = chat_prompt | llm | StrOutputParser()
    return qa_chain.invoke({'question': user_question})
|
|
|
@st.cache_resource
def _load_synthesiser():
    """Build the Bark text-to-speech pipeline once and reuse it afterwards."""
    return pipeline("text-to-speech", "suno/bark")


# Streamlit re-executes this script from the top on every interaction, so
# the pipeline is fetched through the resource cache and not rebuilt each time.
synthesiser = _load_synthesiser()

# Generate the sample clip that frontend() transcribes as "bark_out.wav".
speech = synthesiser("Hello, my dog is cooler than you!", forward_params={"do_sample": True})

# Import the submodule explicitly: a bare ``import scipy`` is not guaranteed
# to expose ``scipy.io.wavfile`` on every SciPy version.
import scipy.io.wavfile

# wavfile.write reads a 2-D array as (n_samples, n_channels).
# NOTE(review): the Bark output presumably carries a leading batch axis of
# length 1; squeeze() drops it and is a no-op on 1-D audio. Confirm the shape.
scipy.io.wavfile.write(
    "bark_out.wav",
    rate=speech["sampling_rate"],
    data=speech["audio"].squeeze(),
)

frontend()