# -*- coding: utf-8 -*-
"""Untitled

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12GhPKbBzxei0ZhB0r-m5kvNOaCRyCxiM
"""
!pip install gradio openai gtts pydub numpy requests groq openai-whisper transformers
!apt-get install -y ffmpeg
import os

# Do not hard-code a real API key in a shared notebook; use a placeholder here
# and supply the actual key through an environment variable or a secret store.
os.environ["GROQ_API_KEY"] = "YOUR_GROQ_API_KEY"  # placeholder, not a real key
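# Optional alternative for Colab (a sketch): store the key in the Colab
# "Secrets" panel and load it at runtime, so the raw key never appears in the
# notebook. The secret name GROQ_API_KEY is an assumption.
try:
    from google.colab import userdata  # only importable inside Colab
    os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")
except Exception:
    pass  # not running in Colab, or the secret is not set; keep the value above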
import io

import gradio as gr
import whisper
from gtts import gTTS
from transformers import pipeline
from groq import Groq

# Initialize the Groq client
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Load the Whisper model ("small", "medium", or "large" are also available)
whisper_model = whisper.load_model("base")

# Initialize the grammar-correction pipeline
corrector = pipeline("text2text-generation", model="pszemraj/flan-t5-large-grammar-synthesis")
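# Optional sanity check (a minimal sketch; the sample sentence is only
# illustrative). The text2text-generation pipeline returns a list of dicts with
# a "generated_text" key, which is exactly what process_audio() reads below.
print(corrector("he go to school yesterday")[0]["generated_text"])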
def process_audio(file_path):
    try:
        # Load and transcribe the audio file with Whisper
        audio = whisper.load_audio(file_path)
        result = whisper_model.transcribe(audio)
        user_text = result["text"]

        # Grammar-correct the transcription
        corrected_text = corrector(user_text)[0]["generated_text"].strip()

        # Generate a response using Groq
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": corrected_text}],
            model="llama3-8b-8192",  # replace with another Groq model if necessary
        )
        response_message = chat_completion.choices[0].message.content.strip()

        # Convert the response text to speech
        tts = gTTS(response_message)
        response_audio_io = io.BytesIO()
        tts.write_to_fp(response_audio_io)  # write the MP3 bytes to the BytesIO object
        response_audio_io.seek(0)

        # Save the audio to a file so Gradio can serve it
        with open("response.mp3", "wb") as audio_file:
            audio_file.write(response_audio_io.getvalue())

        # Return the original text, the corrected text, and the saved audio path
        return user_text, corrected_text, "response.mp3"
    except Exception as e:
        return f"An error occurred: {e}", None, None
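# Optional quick test of the pipeline without the web UI. This is a minimal
# sketch; "sample.wav" is a hypothetical local file, so the block is simply
# skipped when no such file exists.
if os.path.exists("sample.wav"):
    text, corrected, audio_path = process_audio("sample.wav")
    print("Transcript:", text)
    print("Corrected:", corrected)
    print("Response audio:", audio_path)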
# Create a Gradio interface with a submit button
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # pass the uploaded file's path to process_audio
    outputs=[
        gr.Textbox(label="User voice input as text"),         # raw transcription
        gr.Textbox(label="Corrected version of user input"),  # grammar-corrected text
        gr.Audio(label="Response Audio"),                      # spoken response
    ],
    live=False,  # keep live mode off so a submit button is shown
    title="Audio Processing with Grammar Correction",
    description="Upload an audio file; it will be transcribed, grammar-corrected, and used to generate a spoken response.",
    allow_flagging="never",
)

iface.launch()
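# Optional variants (sketches, not required): with recent Gradio 4.x releases
# the input component can also accept microphone recordings in addition to
# uploads, e.g.
#   gr.Audio(sources=["microphone", "upload"], type="filepath")
# and in Colab, iface.launch(share=True) prints a public link if the inline
# view does not render.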