"""Gradio demo: speech-to-text transcription and voice-to-image generation."""

import subprocess
import sys

# Install the required dependencies up front so the imports below succeed.
subprocess.check_call([sys.executable, "-m", "pip", "install", "torch>=1.11.0"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers>=4.31.0"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "diffusers>=0.14.0"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "librosa"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "accelerate>=0.20.1"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "gradio>=3.35.2"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "huggingface_hub"])

import os
import threading
from functools import lru_cache

import numpy as np
import librosa
import torch
import gradio as gr
from transformers import pipeline
from huggingface_hub import login
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

def install_missing_packages():
    """Fallback check: install any required package that is still missing at runtime."""
    required_packages = {
        "librosa": None,
        "diffusers": ">=0.14.0",
        "gradio": ">=3.35.2",
        "huggingface_hub": None,
        "accelerate": ">=0.20.1",
        "transformers": ">=4.31.0",
    }
    for package, version in required_packages.items():
        try:
            __import__(package)
        except ImportError:
            package_name = f"{package}{version}" if version else package
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])


install_missing_packages()

# Authenticate with the Hugging Face Hub using the HF_TOKEN environment variable.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)
else:
    raise ValueError("HF_TOKEN environment variable not set.")

# Speech-to-text: OpenAI Whisper (tiny) via the transformers ASR pipeline.
speech_to_text = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    return_timestamps=True,
)
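
# Note: with return_timestamps=True the pipeline returns timestamped "chunks" in addition to
# the full text; transcribe_audio() below joins the chunk texts into a single string.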

# Text-to-image: Stable Diffusion v1.5 with a DPM-Solver multistep scheduler for faster sampling.
text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image.to(device)
text_to_image.enable_attention_slicing()  # trade a little speed for lower peak memory use
text_to_image.safety_checker = None  # disable the safety checker (images are returned unfiltered)
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
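
# Optional (not part of the original setup): on a CUDA GPU the pipeline can be loaded in half
# precision to roughly halve memory use, e.g.:
#   text_to_image = StableDiffusionPipeline.from_pretrained(
#       "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
#   )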

def preprocess_audio(audio_path):
    """Load an audio file as mono float32 samples resampled to 16 kHz (the rate Whisper expects)."""
    try:
        audio, sr = librosa.load(audio_path, sr=16000)
        return np.array(audio, dtype=np.float32)
    except Exception as e:
        return f"Error in preprocessing audio: {str(e)}"

@lru_cache(maxsize=10)
def transcribe_audio(audio_path):
    """Transcribe an audio file to text; the last few results are cached by file path."""
    try:
        audio_array = preprocess_audio(audio_path)
        if isinstance(audio_array, str):  # preprocessing returned an error message
            return audio_array
        result = speech_to_text(audio_array)
        # Join the chunk texts into one transcription string.
        transcription = " ".join(segment["text"] for segment in result["chunks"])
        return transcription
    except Exception as e:
        return f"Error in transcription: {str(e)}"

@lru_cache(maxsize=10)
def generate_image_from_text(text):
    """Generate a 256x256 image from a text prompt; the last few results are cached by prompt."""
    try:
        image = text_to_image(text, height=256, width=256).images[0]
        return image
    except Exception as e:
        return f"Error in image generation: {str(e)}"

def process_audio_and_generate_results(audio_path):
    """Transcribe the audio file, then generate an image from the transcription."""
    transcription_result = {"result": None}
    image_result = {"result": None}

    def transcription_thread():
        transcription_result["result"] = transcribe_audio(audio_path)

    def image_generation_thread():
        transcription = transcription_result["result"]
        if transcription and "Error" not in transcription:
            image_result["result"] = generate_image_from_text(transcription)

    # Image generation depends on the transcription, so the transcription thread must finish
    # before the image-generation thread starts (running them concurrently would race on
    # transcription_result).
    t1 = threading.Thread(target=transcription_thread)
    t1.start()
    t1.join()

    t2 = threading.Thread(target=image_generation_thread)
    t2.start()
    t2.join()

    transcription = transcription_result["result"]
    image = image_result["result"]

    if "Error" in transcription:
        return None, transcription
    if isinstance(image, str) and "Error" in image:
        return None, image

    return image, transcription
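
# Example (hypothetical local test): process_audio_and_generate_results("sample.wav") returns
# (PIL.Image, transcription) on success, or (None, error_message) if any step fails.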

# Tab 1: plain speech-to-text transcription.
speech_to_text_iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Upload audio file for transcription (WAV/MP3)"),
    outputs=gr.Textbox(label="Transcription"),
    title="Speech-to-Text Transcription",
    description="Upload an audio file to transcribe speech into text.",
)

# Tab 2: speech-to-text followed by text-to-image.
voice_to_image_iface = gr.Interface(
    fn=process_audio_and_generate_results,
    inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
    outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
    title="Voice-to-Image",
    description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
)

# Combine both interfaces into a single tabbed app.
iface = gr.TabbedInterface(
    interface_list=[speech_to_text_iface, voice_to_image_iface],
    tab_names=["Speech-to-Text", "Voice-to-Image"],
)

# share=True serves the app on a temporary public Gradio link in addition to localhost.
iface.launch(debug=True, share=True)