import torch
import os
import random
import gradio as gr
from TTS.api import TTS
from transformers import pipeline
from diffusers import DiffusionPipeline
from huggingface_hub import login
import numpy as np
import spaces
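

# Image classification: run the selected image-classification checkpoint
# on the uploaded image and return the predicted labels with scores.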
@spaces.GPU
def guessanImage(model, image):
    imgclassifier = pipeline("image-classification", model=model)
    if image is not None:
        description = imgclassifier(image)
        return description
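

# Age estimation: the same image-classification pipeline, pointed at an
# age-classifier checkpoint.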
@spaces.GPU
def guessanAge(model, image):
    imgclassifier = pipeline("image-classification", model=model)
    if image is not None:
        description = imgclassifier(image)
        return description
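

# Text to speech: clone the voice from the supplied sample with XTTS v2
# and speak the given text. A longer GPU slot is requested since model
# loading plus synthesis can exceed the default duration.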
@spaces.GPU(duration=120)
def text2speech(text, sample):
    os.environ["COQUI_TOS_AGREED"] = "1"  # accept the Coqui model license non-interactively
    if len(text) > 0:
        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
        # Use the provided voice sample (the Audio widget supplies the bundled
        # default when nothing has been recorded).
        wav = tts.tts_to_file(text=text, file_path="output.wav", speaker_wav=sample, language="en")
        return wav
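

# Text to image: generate a 512x512 image with the selected FLUX checkpoint.
# Authenticates to the Hugging Face Hub with a token read from the "fluxauth"
# environment variable.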
@spaces.GPU
def ImageGenFromText(text, model):
    api_key = os.getenv("fluxauth")
    login(token=api_key)
    if len(text) > 0:
        dtype = torch.bfloat16
        device = "cuda" if torch.cuda.is_available() else "cpu"
        MAX_SEED = np.iinfo(np.int32).max
        seed = random.randint(0, MAX_SEED)  # random seed so each call produces a new image
        pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=dtype).to(device)
        generator = torch.Generator().manual_seed(seed)
        image = pipe(
            prompt=text,
            width=512,
            height=512,
            num_inference_steps=4,  # FLUX.1-schnell is distilled for few-step sampling
            generator=generator,
            guidance_scale=0.0,
        ).images[0]
        return image
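

# --- Gradio UI: one interface per task, combined into tabs below ---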
radio1 = gr.Radio(["microsoft/resnet-50", "google/vit-base-patch16-224", "apple/mobilevit-small"], value="microsoft/resnet-50", label="Select a Classifier", info="Image Classifier")
tab1 = gr.Interface(
    fn=guessanImage,
    inputs=[radio1, gr.Image(type="pil")],
    outputs=["text"],
)
radio2 = gr.Radio(["nateraw/vit-age-classifier"], value="nateraw/vit-age-classifier", label="Select an Age Classifier", info="Age Classifier")
tab2 = gr.Interface(
    fn=guessanAge,
    inputs=[radio2, gr.Image(type="pil")],
    outputs=["text"],
)
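
# The speech tab is built with Blocks so the prompts can sit above each input.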
with gr.Blocks() as tab3:
    gr.Markdown("Type the text to speak:")
    textbox = gr.Textbox(value="good morning pineapple! looking very good very nice!")
    gr.Markdown("Record your voice sample:")
    micinput = gr.Audio(sources=["microphone"], type="filepath", format="wav", value="sampleaudio/abraham.wav")
    gr.Interface(
        fn=text2speech,
        inputs=[textbox, micinput],
        outputs=["audio"],
    )
radio4 = gr.Radio(["black-forest-labs/FLUX.1-schnell"], value="black-forest-labs/FLUX.1-schnell", label="Select", info="text to image")
tab4 = gr.Interface(
    fn=ImageGenFromText,
    inputs=["text", radio4],
    outputs=["image"],
)
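
# Assemble the four interfaces into a single tabbed app.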
demo = gr.TabbedInterface([tab1, tab2, tab3, tab4], ["Describe", "Estimate Age", "Speak", "Generate Image"])
demo.launch()