#from huggingfaceinferenceclient import HuggingFaceInferenceClient
#from outpaintprocessor import DynamicImageOutpainter
#from aivideopipeline import AIImageVideoPipeline
#from mmig import MultiModelImageGenerator


import os
import tempfile
import requests
from PIL import Image
from io import BytesIO
from huggingface_hub import InferenceClient
import gradio as gr

read_token = os.getenv('HF_READ')
write_token = os.getenv('HF_WRITE')
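# HF_READ authorizes all inference calls below; HF_WRITE is loaded but currently unused.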
# Chat model for text generation
chatmodel = "mistralai/Mistral-Nemo-Instruct-2407"
# Whisper for Speech-to-Text
WHISPER_API_URL = "https://api-inference.huggingface.co/models/distil-whisper/distil-large-v2"
WHISPER_HEADERS = {"Authorization": f"Bearer {read_token}"}
# Bark for Text-to-Speech
BARK_API_URL = "https://api-inference.huggingface.co/models/suno/bark"
BARK_HEADERS = {"Authorization": f"Bearer {read_token}"}
# Flux for Image Generation
FLUX_API_URL = "https://api-inference.huggingface.co/models/enhanceaiteam/Flux-uncensored"
FLUX_HEADERS = {"Authorization": f"Bearer {read_token}"}

def speech_to_text(filename):
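    """Transcribe an uploaded audio file with the hosted Whisper endpoint; returns text or None."""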
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(WHISPER_API_URL, headers=WHISPER_HEADERS, data=data)
    if response.status_code == 200:
        return response.json().get("text", "Could not recognize speech")
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

# Chatbot Logic with Hugging Face InferenceClient
client = InferenceClient(api_key=read_token)

def chatbot_logic(input_text):
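    """Generate a single-turn chat reply from the configured instruct model; returns text or None."""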
    messages = [{"role": "user", "content": input_text}]
    try:
        completion = client.chat.completions.create(
            model=chatmodel, 
            messages=messages, 
            max_tokens=500
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"Error: {e}")
        return None


def text_to_speech(text):
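    """Synthesize speech with Bark; returns raw audio bytes (typically FLAC) or None."""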
    payload = {"inputs": text}
    response = requests.post(BARK_API_URL, headers=BARK_HEADERS, json=payload)
    if response.status_code == 200:
        return response.content
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

def generate_image(prompt):
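    """Generate a PIL image from a text prompt via the Flux endpoint; returns an Image or None."""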
    data = {"inputs": prompt}
    response = requests.post(FLUX_API_URL, headers=FLUX_HEADERS, json=data)
    if response.status_code == 200:
        image_bytes = BytesIO(response.content)
        return Image.open(image_bytes)
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

# Gradio Interface for Chatbot and Image Generator
def create_ui():
    def process_chat(audio_file):
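        """Full pipeline: transcribe the audio, generate a reply, then voice and illustrate it."""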
        # Step 1: Speech to Text
        recognized_text = speech_to_text(audio_file)
        if not recognized_text:
            return "Could not recognize speech", None, None
        
        # Step 2: Chatbot Logic
        response_text = chatbot_logic(recognized_text)
        if not response_text:
            return f"Error generating response for: {recognized_text}", None, None
        
        # Step 3: Text to Speech
        audio_output = text_to_speech(response_text)
        if not audio_output:
            return f"Error synthesizing response: {response_text}", None, None
        
        # Step 4: Image Generation
        generated_image = generate_image(response_text)
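        # generate_image may return None; gr.Image simply renders nothing in that case.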
        
        # gr.Audio cannot play an IPython Audio object; write the raw bytes
        # to a temporary file and return the path, which gr.Audio can play.
        with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as tmp:
            tmp.write(audio_output)
            audio_path = tmp.name

        return response_text, audio_path, generated_image

    with gr.Blocks(title="Voice-to-Voice Chatbot with Image Generation") as ui:
        gr.Markdown("## Voice-to-Voice Chatbot with Image Generation\nUpload an audio file to interact with the chatbot.")
        
        audio_input = gr.Audio(sources=["upload"], type="filepath", label="Input Audio File")
        submit_button = gr.Button("Process")
        
        with gr.Row():
            chatbot_response = gr.Textbox(label="Chatbot Response", lines=2)
        
        with gr.Row():
            audio_output = gr.Audio(label="Generated Audio Response")
            image_output = gr.Image(label="Generated Image")
        
        submit_button.click(
            fn=process_chat,
            inputs=audio_input,
            outputs=[chatbot_response, audio_output, image_output],
            show_progress="full"
        )
    
    return ui

# Run the Gradio Interface
if __name__ == "__main__":
    create_ui().launch(debug=True)