# Voice-To-Text / app.py
# NOTE: non-code Hugging Face Space page chrome (author avatar text, commit
# "fdbd451", raw/history/blame links, file size) removed so the file parses.
import io
import os
import wave

import gradio as gr
import numpy as np
import requests
# Function to send audio to Groq API and get transcription
def transcribe(audio_data):
    """Transcribe microphone audio via the Groq Whisper API.

    Args:
        audio_data: Gradio ``type="numpy"`` audio input — a
            ``(sample_rate, samples)`` tuple, or ``None`` when nothing was
            recorded. A bare NumPy array is also accepted for backward
            compatibility (assumed to be 16 kHz, as the old code did).

    Returns:
        str: The transcribed text, or a human-readable error message.
    """
    if audio_data is None:
        # User submitted without recording anything.
        return "No audio received."

    # Gradio's numpy audio component yields (sample_rate, samples). The old
    # code assumed a bare array at a hard-coded 16 kHz, which produced a WAV
    # at the wrong speed whenever the browser recorded at another rate.
    if isinstance(audio_data, tuple):
        sample_rate, samples = audio_data
    else:
        sample_rate, samples = 16000, audio_data

    samples = np.asarray(samples)
    # Downmix stereo (frames, channels) to mono to match setnchannels(1).
    if samples.ndim == 2:
        samples = samples.mean(axis=1)
    # Gradio may deliver floats in [-1, 1]; the WAV header below declares
    # 16-bit PCM, so scale/clip floats and coerce everything to int16.
    if np.issubdtype(samples.dtype, np.floating):
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    else:
        samples = samples.astype(np.int16)

    # Serialize the samples as an in-memory WAV file.
    audio_bytes = io.BytesIO()
    with wave.open(audio_bytes, "wb") as wf:
        wf.setnchannels(1)            # mono
        wf.setsampwidth(2)            # 16-bit samples
        wf.setframerate(sample_rate)  # actual recording rate, not assumed
        wf.writeframes(samples.tobytes())
    audio_bytes.seek(0)  # rewind so requests reads from the beginning

    # Groq API endpoint for audio transcription.
    groq_api_endpoint = "https://api.groq.com/openai/v1/audio/transcriptions"

    # Read the key from the environment (e.g. a Spaces secret) instead of
    # hard-coding it in source; falls back to the old placeholder so the
    # error surfaced by the API is unchanged when no key is configured.
    import os
    api_key = os.environ.get("GROQ_API_KEY", "YOUR_GROQ_API_KEY")
    headers = {
        "Authorization": f"Bearer {api_key}",
    }

    # Prepare the multipart upload and transcription parameters.
    files = {
        'file': ('audio.wav', audio_bytes, 'audio/wav'),
    }
    data = {
        'model': 'whisper-large-v3-turbo',  # Specify the model to use
        'response_format': 'json',          # Desired response format
        'language': 'en',                   # Language of the audio
    }

    # Send audio to Groq API.
    response = requests.post(groq_api_endpoint, headers=headers, files=files, data=data)

    # Parse response.
    if response.status_code == 200:
        result = response.json()
        return result.get("text", "No transcription available.")
    return f"Error: {response.status_code}, {response.text}"
# Wire up the Gradio UI: microphone audio in, transcript text out.
mic_input = gr.Audio(source="microphone", type="numpy")  # raw numpy audio

iface = gr.Interface(
    fn=transcribe,
    inputs=mic_input,
    outputs="text",
    title="Voice to Text Converter",
    description="Record your voice, and it will be transcribed into text using Groq API.",
)

iface.launch()