File size: 3,188 Bytes
cf49c4b
 
 
801ca00
33da63d
1269470
cf49c4b
 
 
 
 
fdd85c1
801ca00
fdd85c1
a1ff6d2
cf49c4b
 
5d7c9cd
cf49c4b
801ca00
fdd85c1
5614a83
313d70a
 
 
 
2997b62
9f9ad3d
33da63d
78384ac
ba6a087
33da63d
fafe326
33da63d
2997b62
fdd85c1
abc4ec6
e8f96e7
abc4ec6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fb404e
c3f5319
cf49c4b
b3fa9dd
e02c7a0
 
 
8fb404e
c3f5319
313d70a
b3fa9dd
c3f5319
 
69c8b67
 
 
045b430
69c8b67
045b430
 
 
69c8b67
 
045b430
69c8b67
 
ba6a087
7ea039d
abc4ec6
 
7ea039d
abc4ec6
 
 
2abedae
e02c7a0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import torch
import os
import random
import gradio as gr
from TTS.api import TTS
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, pipeline
import base64
from datasets import load_dataset
from diffusers import DiffusionPipeline
from huggingface_hub import login
import numpy as np
import spaces

@spaces.GPU
def guessanImage(model, image):
    """Classify an image with a Hugging Face image-classification pipeline.

    Args:
        model: Model identifier handed to ``pipeline("image-classification")``.
        image: The image to classify; may be ``None`` when no image was given.

    Returns:
        The pipeline's output for ``image``, or ``None`` when ``image`` is
        ``None``.
    """
    # Guard before doing any work: without it the function reached its
    # ``return`` with the result name unbound and raised UnboundLocalError.
    if image is None:
        return None

    imgclassifier = pipeline("image-classification", model=model)
    return imgclassifier(image)

@spaces.GPU
def guessanAge(model, image):
    """Run an age-classification model over an image.

    Args:
        model: Model identifier handed to ``pipeline("image-classification")``.
        image: The image to classify; may be ``None`` when no image was given.

    Returns:
        The pipeline's output for ``image``, or ``None`` when ``image`` is
        ``None``.
    """
    # Guard before doing any work: without it the function reached its
    # ``return`` with the result name unbound and raised UnboundLocalError.
    if image is None:
        return None

    imgclassifier = pipeline("image-classification", model=model)
    return imgclassifier(image)

@spaces.GPU(duration=120)
def text2speech(text, sample):
    """Synthesize ``text`` into ``output.wav`` with the XTTS v2 model.

    Args:
        text: Text to speak. Empty or ``None`` produces no audio.
        sample: Path to a reference speaker recording used as ``speaker_wav``.
            When empty or ``None`` the bundled ``sampleaudio/abraham.wav``
            is used instead.

    Returns:
        The value returned by ``TTS.tts_to_file``, or ``None`` when there is
        no text to speak.
    """
    # Set unconditionally, as before, so the Coqui terms prompt never blocks.
    os.environ["COQUI_TOS_AGREED"] = "1"

    # ``not text`` also covers None, where ``len(text)`` would raise TypeError.
    if not text:
        return None

    # Honour the caller's voice sample; it used to be accepted but ignored
    # in favour of a hard-coded file.
    speaker_wav = sample or "sampleaudio/abraham.wav"

    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    return tts.tts_to_file(
        text=text,
        file_path="output.wav",
        speaker_wav=speaker_wav,
        language="en",
    )

@spaces.GPU
def ImageGenFromText(text, model):
    """Generate a 512x512 image from a text prompt with a diffusion pipeline.

    Args:
        text: Prompt to render. Empty or ``None`` produces no image.
        model: Model identifier passed to ``DiffusionPipeline.from_pretrained``.

    Returns:
        The first image produced by the pipeline, or ``None`` when there is
        no prompt.
    """
    # ``not text`` also covers None, where ``len(text)`` would raise TypeError;
    # bail out before touching the network or the GPU.
    if not text:
        return None

    # Authenticate only when a token is configured: calling login() with
    # token=None falls back to an interactive prompt, which cannot work in a
    # headless app.
    api_key = os.getenv("fluxauth")
    if api_key:
        login(token=api_key)

    dtype = torch.bfloat16
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Fresh random seed per request, bounded to the int32 range.
    max_seed = np.iinfo(np.int32).max
    seed = random.randint(0, max_seed)
    generator = torch.Generator().manual_seed(seed)

    pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=dtype).to(device)
    result = pipe(
        prompt=text,
        width=512,
        height=512,
        num_inference_steps=4,
        generator=generator,
        guidance_scale=0.0,
    )
    return result.images[0]


# Image-description tab: pick a classifier checkpoint and run it on an image.
radio1 = gr.Radio(
    choices=[
        "microsoft/resnet-50",
        "google/vit-base-patch16-224",
        "apple/mobilevit-small",
    ],
    value="microsoft/resnet-50",
    label="Select a Classifier",
    info="Image Classifier",
)
tab1 = gr.Interface(fn=guessanImage, inputs=[radio1, gr.Image(type="pil")], outputs=["text"])

# Age-estimation tab: a single age-classifier checkpoint applied to an image.
radio2 = gr.Radio(
    choices=["nateraw/vit-age-classifier"],
    value="nateraw/vit-age-classifier",
    label="Select an Age Classifier",
    info="Age Classifier",
)
tab2 = gr.Interface(fn=guessanAge, inputs=[radio2, gr.Image(type="pil")], outputs=["text"])


# Text-to-speech tab: a textbox and a microphone recorder feed text2speech.
with gr.Blocks() as tab3:
    gr.Markdown("type the text to speak:")
    textbox = gr.Textbox(
        value="good morning pineapple! looking very good very nice!",
    )

    gr.Markdown("record your voice sample:")
    micinput = gr.Audio(
        sources=["microphone"],
        type="filepath",
        format="wav",
        value="sampleaudio/abraham.wav",
    )

    gr.Interface(fn=text2speech, inputs=[textbox, micinput], outputs=["audio"])

# Text-to-image tab: a free-text prompt plus the diffusion checkpoint to use.
radio4 = gr.Radio(
    choices=["black-forest-labs/FLUX.1-schnell"],
    value="black-forest-labs/FLUX.1-schnell",
    label="Select",
    info="text to image",
)
tab4 = gr.Interface(fn=ImageGenFromText, inputs=["text", radio4], outputs=["image"])

# Combine the four tabs into one app; labels pair positionally with the tabs.
demo = gr.TabbedInterface(
    [tab1, tab2, tab3, tab4],
    ["Describe", "Estimate Age", "Speak", "Generate Image"],
)
demo.launch()