"""Portuguese voice assistant: speech -> text -> Groq LLM -> speech.

Pipeline stages:
1. Transcribe an audio file with a Portuguese-fine-tuned Whisper model.
2. Send the transcript to a Groq-hosted LLM for a concise answer.
3. Synthesise the answer back to audio with the MMS Portuguese TTS model.
"""

import os

import torch
from groq import Groq
from transformers import AutoTokenizer, VitsModel, pipeline

# Speech-to-text: Whisper small fine-tuned for Portuguese.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="SamuelM0422/whisper-small-pt",
)

# Text-to-speech: Meta MMS VITS model for Portuguese, plus its tokenizer.
model = VitsModel.from_pretrained("facebook/mms-tts-por")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-por")


def query(text, groq_api_key):
    """Ask the Groq-hosted LLM *text* and return its reply as a string.

    Parameters
    ----------
    text : str
        The user's question (the transcript of the spoken input).
    groq_api_key : str
        Groq API key used to authenticate the request.

    Returns
    -------
    str
        The assistant message content from the first completion choice.

    Notes
    -----
    The system prompt forces numbers to be written out in words so the
    downstream TTS model reads them naturally.
    """
    client = Groq(
        api_key=groq_api_key,
    )
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "Answer the following question concisely and objectively. If there are numbers in the response, WRITE THEM IN WORDS.",
            },
            {
                "role": "user",
                "content": text,
            },
        ],
        model="llama-3.1-8b-instant",
    )
    return chat_completion.choices[0].message.content


def synthesise(text):
    """Synthesise *text* into a speech waveform.

    Parameters
    ----------
    text : str
        Text to vocalise (expected to be Portuguese for this model).

    Returns
    -------
    torch.Tensor
        The generated waveform, moved to CPU.
    """
    inputs = tokenizer(text, return_tensors="pt")
    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        output = model(**inputs).waveform
    return output.cpu()


def ai_assistant(filepath, groq_key):
    """Run the full assistant pipeline on one audio file.

    Parameters
    ----------
    filepath : str
        Path to the audio file containing the spoken question.
    groq_key : str
        Groq API key forwarded to :func:`query`.

    Returns
    -------
    tuple
        ``((sampling_rate, waveform), response)`` where ``waveform`` is a
        1-D numpy array of audio samples and ``response`` is the LLM's
        text answer.
    """
    transcription = transcriber(filepath)
    response = query(transcription['text'], groq_key)
    audio_response = synthesise(response)
    # Read the sample rate from the TTS model's config (16 kHz for
    # facebook/mms-tts-por) rather than hard-coding 16000, so swapping
    # the model cannot silently desync the playback rate.
    sampling_rate = model.config.sampling_rate
    # synthesise() already returned a CPU tensor; no second .cpu() needed.
    return (sampling_rate, audio_response.squeeze().numpy()), response