Spaces:

m-adil-ali
/

SpeakSmart

Sleeping

File size: 7,331 Bytes

54b3eb0

# -*- coding: utf-8 -*-
"""SpeakSmart: voice chat assistant with grammar correction.

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12GhPKbBzxei0ZhB0r-m5kvNOaCRyCxiM
"""

# NOTE: the two lines below are IPython/Colab shell magics. They are only valid
# inside a notebook cell; in a plain Python file a leading "!" is a SyntaxError,
# so they are kept here as comments for reference. When running outside a
# notebook, install the Python packages with pip (e.g. from a requirements.txt)
# and make sure the ffmpeg system package is available before starting the app.
# !pip install gradio openai gtts pydub numpy requests groq openai-whisper transformers
# !apt-get install -y ffmpeg

import os
import warnings

# SECURITY: the Groq API key must be supplied through the GROQ_API_KEY
# environment variable (for example a deployment secret), never written into
# the source file. A key that has ever been committed to source control should
# be treated as leaked and revoked.
if not os.environ.get("GROQ_API_KEY"):
    # Warn early so a missing key is easy to diagnose; the Groq client created
    # further down reads this variable.
    warnings.warn(
        "GROQ_API_KEY is not set; export it before starting the app.",
        RuntimeWarning,
    )

import io
import os
import tempfile

import gradio as gr
import whisper
from groq import Groq
from gtts import gTTS
from transformers import pipeline

# Groq API client; the key is read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Whisper speech-to-text model. "base" is used here; other checkpoints such as
# "small", "medium" or "large" can be substituted.
whisper_model = whisper.load_model("base")

# Text-to-text pipeline used to rewrite the transcription with corrected grammar.
corrector = pipeline(
    task="text2text-generation",
    model="pszemraj/flan-t5-large-grammar-synthesis",
)

def process_audio(file_path):
    """Transcribe a recording, correct its grammar, and answer it with speech.

    The steps are: Whisper transcription, grammar correction with the
    ``corrector`` pipeline, a Groq chat completion on the corrected text, and
    gTTS synthesis of the reply into an MP3 file.

    Args:
        file_path: Path of the audio file to process. ``None`` or an empty
            string is treated as "no audio submitted".

    Returns:
        A 3-tuple ``(user_text, corrected_text, audio_path)``: the raw
        transcription, the grammar-corrected text, and the path of the MP3
        file holding the spoken reply. When processing cannot complete, the
        first element is a human-readable message and the other two are
        ``None``.
    """
    # Guard clause: nothing to transcribe when no file was submitted.
    if not file_path:
        return "An error occurred: no audio input was provided.", None, None

    try:
        # Decode the audio file and transcribe it with Whisper.
        audio = whisper.load_audio(file_path)
        result = whisper_model.transcribe(audio)
        user_text = result["text"]

        # Silence or unintelligible audio: do not query the LLM with nothing.
        if not user_text.strip():
            return "An error occurred: no speech was detected in the audio.", None, None

        # Grammar-correct the transcription.
        corrected_text = corrector(user_text)[0]["generated_text"].strip()

        # Ask the Groq chat model to respond to the corrected text.
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": corrected_text}],
            model="llama3-8b-8192",  # Replace with the correct model if necessary
        )
        response_message = chat_completion.choices[0].message.content.strip()

        # Synthesize into memory first so a failed synthesis leaves no
        # half-written file behind.
        speech_buffer = io.BytesIO()
        gTTS(response_message).write_to_fp(speech_buffer)

        # Write each reply to its own temporary file: a single fixed filename
        # would be overwritten when two requests are processed concurrently.
        # delete=False keeps the file on disk after the handle is closed so
        # the caller can still read it.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
            audio_file.write(speech_buffer.getvalue())
            response_path = audio_file.name

        return user_text, corrected_text, response_path

    except Exception as e:
        # UI boundary: report the failure in the first output instead of
        # letting the exception propagate.
        return f"An error occurred: {e}", None, None

# Build the Gradio UI: one audio input (passed to the handler as a file path),
# two text outputs and one audio output.
audio_input = gr.Audio(type="filepath")
result_components = [
    gr.Textbox(label="User voice input into text"),  # raw transcription
    gr.Textbox(label="Corrected version of user input"),  # grammar-corrected text
    gr.Audio(label="Response Audio"),  # spoken reply
]

iface = gr.Interface(
    fn=process_audio,
    inputs=audio_input,
    outputs=result_components,
    title="Audio Processing with Grammar Correction",
    description="Upload an audio file, which will be transcribed, corrected for grammar, and then used to generate a response.",
    live=False,  # processing starts from the submit button, not on every change
    allow_flagging="never",
)

# Start the web app.
iface.launch()




# import os
# import gradio as gr
# import whisper
# from gtts import gTTS
# import io
# from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
# from groq import Groq

# # Initialize the Groq client
# client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# # Load the Whisper model
# whisper_model = whisper.load_model("base")  # You can choose other models like "small", "medium", "large"

# # Initialize the grammar correction pipeline
# corrector = pipeline("text2text-generation", model="pszemraj/flan-t5-large-grammar-synthesis")

# def process_audio(file_path):
#     try:
#         # Load the audio file
#         audio = whisper.load_audio(file_path)

#         # Transcribe the audio using Whisper
#         result = whisper_model.transcribe(audio)
#         user_text = result["text"]

#         # Display the user input text
#         corrected_text = corrector(user_text)[0]['generated_text'].strip()

#         # Generate a response using Groq
#         chat_completion = client.chat.completions.create(
#             messages=[{"role": "user", "content": corrected_text}],
#             model="llama3-8b-8192",  # Replace with the correct model if necessary
#         )

#         # Access the response using dot notation
#         response_message = chat_completion.choices[0].message.content.strip()

#         # Convert the response text to speech
#         tts = gTTS(response_message)
#         response_audio_io = io.BytesIO()
#         tts.write_to_fp(response_audio_io)  # Save the audio to the BytesIO object
#         response_audio_io.seek(0)

#         # Save audio to a file to ensure it's generated correctly
#         with open("response.mp3", "wb") as audio_file:
#             audio_file.write(response_audio_io.getvalue())

#         # Return the original text, corrected text, and the path to the saved audio file
#         return user_text, corrected_text, "response.mp3"

#     except Exception as e:
#         return f"An error occurred: {e}", None, None

# iface = gr.Interface(
#     fn=process_audio,
#     inputs=gr.Audio(type="filepath"),  # Use type="filepath"
#     outputs=[
#         gr.Textbox(label="User voice input into text"),  # Original user input text
#         gr.Textbox(label="Corrected version of user input"),  # Corrected text
#         gr.Audio(label="Response Audio")  # Response audio
#     ],
#     live=True
# )

# iface.launch()




# # import os
# # import gradio as gr
# # import whisper
# # from gtts import gTTS
# # import io
# # from groq import Groq

# # # Initialize the Groq client
# # client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# # # Load the Whisper model
# # model = whisper.load_model("base")  # You can choose other models like "small", "medium", "large"

# # def process_audio(file_path):
# #     try:
# #         # Load the audio file
# #         audio = whisper.load_audio(file_path)

# #         # Transcribe the audio using Whisper
# #         result = model.transcribe(audio)
# #         text = result["text"]

# #         # Generate a response using Groq
# #         chat_completion = client.chat.completions.create(
# #             messages=[{"role": "user", "content": text}],
# #             model="llama3-8b-8192",  # Replace with the correct model if necessary
# #         )

# #         # Access the response using dot notation
# #         response_message = chat_completion.choices[0].message.content.strip()

# #         # Convert the response text to speech
# #         tts = gTTS(response_message)
# #         response_audio_io = io.BytesIO()
# #         tts.write_to_fp(response_audio_io)  # Save the audio to the BytesIO object
# #         response_audio_io.seek(0)

# #         # Save audio to a file to ensure it's generated correctly
# #         with open("response.mp3", "wb") as audio_file:
# #             audio_file.write(response_audio_io.getvalue())

# #         # Return the response text and the path to the saved audio file
# #         return response_message, "response.mp3"

# #     except Exception as e:
# #         return f"An error occurred: {e}", None

# # iface = gr.Interface(
# #     fn=process_audio,
# #     inputs=gr.Audio(type="filepath"),  # Use type="filepath"
# #     outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
# #     live=True
# # )

# # iface.launch()