Voice-To-Text

Sleeping

File size: 1,900 Bytes

309b067
ae43f08
fdbd451
 
 
309b067
ae43f08
fdbd451
 
 
 
 
 
 
 
 
 
 
 
ae43f08
a78e93c
 
fdbd451
a78e93c
ae43f08
fdbd451
a78e93c
fdbd451
a78e93c
 
fdbd451
a78e93c
 
 
 
 
ae43f08
0fe9a40
ae43f08
a78e93c
ae43f08
 
 
 
a78e93c
ae43f08
 
309b067
ae43f08
309b067
ae43f08
fdbd451
309b067
 
ae43f08
309b067
 
0fe9a40

import io
import os
import wave

import gradio as gr
import numpy as np
import requests

# Function to send audio to Groq API and get transcription
def transcribe(audio_data):
    """Transcribe recorded audio to text using the Groq transcription API.

    Args:
        audio_data: Either the ``(sample_rate, samples)`` tuple that a
            ``gr.Audio(type="numpy")`` input delivers, or a bare NumPy array
            of samples (treated as 16 kHz, which is what this function
            originally assumed). ``None`` means nothing was recorded.

    Returns:
        The transcribed text on success, otherwise a human-readable message
        describing what went wrong (no audio, HTTP error, network failure).
    """
    if audio_data is None:
        return "No audio received."

    # Gradio's numpy audio format is a (sample_rate, samples) tuple; calling
    # .tobytes() on the tuple itself would raise AttributeError.
    if isinstance(audio_data, tuple):
        sample_rate, samples = audio_data
    else:
        sample_rate, samples = 16000, audio_data

    samples = np.asarray(samples)
    if samples.size == 0:
        return "No audio received."

    # The WAV header below declares 16-bit PCM, so the payload must really be
    # little-endian int16. Float audio is assumed to be in [-1.0, 1.0].
    if np.issubdtype(samples.dtype, np.floating):
        samples = np.clip(samples, -1.0, 1.0) * 32767
    else:
        samples = np.clip(samples, -32768, 32767)
    pcm = samples.astype("<i2")

    # A 2-D array is (frames, channels); its C-order bytes are already the
    # interleaved layout that WAV expects.
    channels = pcm.shape[1] if pcm.ndim == 2 else 1

    wav_buffer = io.BytesIO()
    with wave.open(wav_buffer, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(2)  # 16-bit audio
        wf.setframerate(int(sample_rate))
        wf.writeframes(pcm.tobytes())
    wav_buffer.seek(0)  # Rewind so the upload starts at the RIFF header

    # Groq API endpoint for audio transcription
    groq_api_endpoint = "https://api.groq.com/openai/v1/audio/transcriptions"

    # Prefer a key from the environment so the secret stays out of the source;
    # the placeholder is kept as the fallback for backward compatibility.
    api_key = os.environ.get("GROQ_API_KEY", "YOUR_GROQ_API_KEY")
    headers = {
        "Authorization": f"Bearer {api_key}",
    }

    files = {
        'file': ('audio.wav', wav_buffer, 'audio/wav'),
    }
    data = {
        'model': 'whisper-large-v3-turbo',  # Specify the model to use
        'response_format': 'json',          # Desired response format
        'language': 'en',                   # Language of the audio
    }

    # Without a timeout, requests can block forever on a stalled connection.
    try:
        response = requests.post(
            groq_api_endpoint,
            headers=headers,
            files=files,
            data=data,
            timeout=60,
        )
    except requests.RequestException as exc:
        return f"Error: request to Groq API failed: {exc}"

    if response.status_code == 200:
        result = response.json()
        return result.get("text", "No transcription available.")
    return f"Error: {response.status_code}, {response.text}"

# Gradio interface
# Microphone recorder whose recording is handed to the callback in Gradio's
# numpy audio format.
mic_input = gr.Audio(source="microphone", type="numpy")

# Wire the recorder to `transcribe` and show the result as plain text.
iface = gr.Interface(
    fn=transcribe,
    inputs=mic_input,
    outputs="text",
    title="Voice to Text Converter",
    description="Record your voice, and it will be transcribed into text using Groq API.",
)

# Start the web app.
iface.launch()