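# Voice AI demo: record or type a question, transcribe speech with Groq's
# hosted Whisper, answer with a Groq-hosted Llama model via LangChain, and
# speak the reply back with edge-tts.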
import streamlit as st
from audio_recorder_streamlit import audio_recorder
from groq import Groq
import os
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import edge_tts
import asyncio
from dotenv import load_dotenv
load_dotenv()
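# Expects GROQ_API_KEY in a .env file; both the Groq client and ChatGroq read it from the environment.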

# Front end built with Streamlit
def frontend():
    st.title("Voice AI Demo")

    # Initialize session state variables
    if "conversation" not in st.session_state:
        st.session_state.conversation = []  # Stores (question, answer, audio_filename)
    if "audio_count" not in st.session_state:
        st.session_state.audio_count = 1  # Start numbering audio files from output1.mp3

    status_placeholder = st.empty()
    status_placeholder.write("Press Mic button to start asking a question")

    recorded_audio = audio_recorder(sample_rate=8000)
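    # audio_recorder returns the recording as WAV bytes once the user stops, and None on reruns before that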
    text = st.chat_input()

    def process_input(user_input):
        status_placeholder.write("Getting response...")
        response = answer(user_input)
        status_placeholder.write("Converting response to audio...")

        # Generate unique audio filename
        audio_filename = f"output{st.session_state.audio_count}.wav"
        asyncio.run(convert_audio(response, audio_filename))
        st.session_state.audio_count += 1  # Increment for next response

        status_placeholder.write("Press mic button again to ask more questions")

        # Append (question, answer, audio_filename) to conversation history
        st.session_state.conversation.append((f"Q: {user_input}", f"A: {response}", audio_filename))

    # Handle user input
    if text:
        process_input(text)
    elif recorded_audio:
        status_placeholder.write("Converting audio...")
        audio_path = data_to_file(recorded_audio)
        status_placeholder.write("Uploading audio...")
        transcription = audio_to_text(audio_path)
        status_placeholder.write("Transcription completed.")
        process_input(transcription)

    # Display full conversation history
    for i, (q, a, audio_file) in enumerate(st.session_state.conversation):
        st.write(q)
        st.write(a)
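        # Autoplay only the most recent answer; earlier clips render as paused players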
        st.audio(audio_file, format="audio/mp3", loop=False, autoplay=(i == len(st.session_state.conversation) - 1))


# Function to write the recorded audio bytes to a temporary WAV file
def data_to_file(recorded_audio):
    temp_audio_path = "temp_audio.wav"
    with open(temp_audio_path, "wb") as temp_file:
        temp_file.write(recorded_audio)
    return temp_audio_path


# Function to transcribe speech to text with Groq's hosted Whisper
def audio_to_text(audio_path):
    client = Groq(api_key=os.getenv('GROQ_API_KEY'))
    with open(audio_path, 'rb') as file:
        transcription = client.audio.translations.create(
            file=(audio_path, file.read()),
            model='whisper-large-v3',
        )
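    # Note: the translations endpoint always returns English text; for
    # same-language transcripts, client.audio.transcriptions.create also works.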
    return transcription.text

# Function for answering the user query with a Groq-hosted Llama model
def answer(user_question):
    model = ChatGroq(
        model="llama-3.3-70b-versatile",
        temperature=0.6
    )

    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a knowledgeable AI chatbot that answers user queries confidently. Your answer will be converted back to speech, so do not use bullet points or lists; write in paragraph form and keep it reasonably short. Use proper punctuation, with commas, full stops, and question marks, so that natural-sounding speech can be generated."),
        ("user", "User Query: {question}"),
    ])

    parser = StrOutputParser()

    # LCEL pipeline: format the prompt, call the model, parse the output to a string
    chain = prompt | model | parser
    response = chain.invoke({'question': user_question})
    return response

# Text-to-speech: synthesize the response with edge-tts and save it to a file
async def convert_audio(text, filename):
    voice = "fr-FR-VivienneMultilingualNeural"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(filename)

if __name__ == "__main__":
    frontend()