"""Portuguese voice assistant: speech -> text -> Groq LLM -> speech.

Pipeline stages:
1. Transcribe an audio file with a Portuguese-fine-tuned Whisper model.
2. Send the transcript to a Groq-hosted LLM for a concise answer.
3. Synthesise the answer back to audio with the MMS Portuguese TTS model.
"""

import os

import torch
from groq import Groq
from transformers import AutoTokenizer, VitsModel, pipeline

# Speech-to-text: Whisper small fine-tuned for Portuguese.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="SamuelM0422/whisper-small-pt",
)

# Text-to-speech: Meta MMS VITS model for Portuguese, plus its tokenizer.
model = VitsModel.from_pretrained("facebook/mms-tts-por")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-por")


def query(text, groq_api_key):
    """Ask the Groq-hosted LLM *text* and return its reply as a string.

    Parameters
    ----------
    text : str
        The user's question (the transcript of the spoken input).
    groq_api_key : str
        Groq API key used to authenticate the request.

    Returns
    -------
    str
        The assistant message content from the first completion choice.

    Notes
    -----
    The system prompt forces numbers to be written out in words so the
    downstream TTS model reads them naturally.
    """
    client = Groq(
        api_key=groq_api_key,
    )
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "Answer the following question concisely and objectively. If there are numbers in the response, WRITE THEM IN WORDS.",
            },
            {
                "role": "user",
                "content": text,
            },
        ],
        model="llama-3.1-8b-instant",
    )
    return chat_completion.choices[0].message.content


def synthesise(text):
    """Synthesise *text* into a speech waveform.

    Parameters
    ----------
    text : str
        Text to vocalise (expected to be Portuguese for this model).

    Returns
    -------
    torch.Tensor
        The generated waveform, moved to CPU.
    """
    inputs = tokenizer(text, return_tensors="pt")
    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        output = model(**inputs).waveform
    return output.cpu()


def ai_assistant(filepath, groq_key):
    """Run the full assistant pipeline on one audio file.

    Parameters
    ----------
    filepath : str
        Path to the audio file containing the spoken question.
    groq_key : str
        Groq API key forwarded to :func:`query`.

    Returns
    -------
    tuple
        ``((sampling_rate, waveform), response)`` where ``waveform`` is a
        1-D numpy array of audio samples and ``response`` is the LLM's
        text answer.
    """
    transcription = transcriber(filepath)
    response = query(transcription['text'], groq_key)
    audio_response = synthesise(response)
    # Read the sample rate from the TTS model's config (16 kHz for
    # facebook/mms-tts-por) rather than hard-coding 16000, so swapping
    # the model cannot silently desync the playback rate.
    sampling_rate = model.config.sampling_rate
    # synthesise() already returned a CPU tensor; no second .cpu() needed.
    return (sampling_rate, audio_response.squeeze().numpy()), response