File size: 7,761 Bytes
9fe6b65
 
 
613b43a
 
 
9fe6b65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
613b43a
 
9fe6b65
 
 
 
 
613b43a
 
 
9fe6b65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
613b43a
9fe6b65
613b43a
9fe6b65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
613b43a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fe6b65
 
 
 
613b43a
9fe6b65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
613b43a
 
 
 
 
 
 
 
9fe6b65
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import streamlit as st
import requests
import os
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, VitsModel, AutoTokenizer
import torch
import soundfile as sf

# Credentials for the optional hosted services, read from the environment.
# An unset variable yields None here.
Image_Token = os.environ.get('Image_generation')
Content_Token = os.environ.get('ContentGeneration')
Image_prompt_token = os.environ.get('Prompt_generation')

# Request headers sent to each external service.
Image_generation = {"Authorization": "Bearer " + str(Image_Token)}
Content_generation = {
    "Authorization": "Bearer " + str(Content_Token),
    "Content-Type": "application/json",
}
Image_Prompt = {
    "Authorization": "Bearer " + str(Image_prompt_token),
    "Content-Type": "application/json",
}

# Text-to-image models: model id -> hosted inference endpoint URL.
image_generation_urls = {
    model_id: f"https://api-inference.huggingface.co/models/{model_id}"
    for model_id in (
        "black-forest-labs/FLUX.1-schnell",
        "CompVis/stable-diffusion-v1-4",
        "black-forest-labs/FLUX.1-dev",
    )
}

# Content generation models: display name -> model identifier (identical here).
content_models = {
    model_name: model_name
    for model_name in (
        "llama-3.1-70b-versatile",
        "llama3-8b-8192",
        "gemma2-9b-it",
        "mixtral-8x7b-32768",
    )
}

# Build the local translation model/tokenizer pair (wrapped in st.cache_resource)
@st.cache_resource
def load_translation_model():
    """Return the translation model and its tokenizer as a ``(model, tokenizer)`` pair.

    Both objects are loaded from the
    "facebook/mbart-large-50-many-to-one-mmt" checkpoint.
    """
    checkpoint = "facebook/mbart-large-50-many-to-one-mmt"
    translation_model = MBartForConditionalGeneration.from_pretrained(checkpoint)
    translation_tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint)
    return translation_model, translation_tokenizer

# Function to perform translation locally
def translate_text_local(text, src_lang="ta_IN"):
    """Translate *text* into English with the locally loaded mBART-50 model.

    Args:
        text: Source-language text to translate. Input is truncated to 512
            tokens by the tokenizer call below.
        src_lang: mBART-50 language code of *text*. Defaults to Tamil
            ("ta_IN"), which is the language this app's UI asks for.

    Returns:
        The English translation as a string, or "" when *text* is empty or
        only whitespace (the model is not loaded or invoked in that case).
    """
    # Nothing to translate: skip loading the model and running generation.
    if not text or not text.strip():
        return ""

    model, tokenizer = load_translation_model()
    # Per the mBART-50 documentation the source language must be set on the
    # tokenizer before encoding; otherwise its default language code is
    # prepended to the Tamil input.
    tokenizer.src_lang = src_lang
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

# Function to query Groq content generation model (optional)
def generate_content(english_text, max_tokens, temperature, model, timeout=60):
    """Request educational content about *english_text* from the chat API.

    Args:
        english_text: Topic to write about; interpolated into the user prompt.
        max_tokens: Token budget, sent both in the prompt text and as the
            request's "max_tokens" field.
        temperature: Sampling temperature forwarded in the request payload.
        model: Model identifier forwarded in the request payload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The generated text, or None when the request fails, returns a
        non-200 status, or the response body lacks the expected fields.
        Failures are reported to the user via ``st.error``.
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a creative and insightful writer."},
            {"role": "user", "content": f"Write educational content about {english_text} within {max_tokens} tokens."}
        ],
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    # Without a timeout a stalled connection would block the app indefinitely;
    # connection problems are reported instead of crashing the page.
    try:
        response = requests.post(url, json=payload, headers=Content_generation, timeout=timeout)
    except requests.RequestException as exc:
        st.error(f"Content Generation Error: {exc}")
        return None

    if response.status_code != 200:
        st.error(f"Content Generation Error: {response.status_code}")
        return None

    # A 200 response may still carry a body without the expected structure.
    try:
        return response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError):
        st.error("Content Generation Error: unexpected response format")
        return None

# Function to generate image prompt (optional)
def generate_image_prompt(english_text, timeout=60):
    """Ask the chat API for a short text-to-image prompt about *english_text*.

    Args:
        english_text: Topic the image prompt should describe.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The generated prompt text, or None when the request fails, returns a
        non-200 status, or the response body lacks the expected fields.
        Failures are reported to the user via ``st.error``.
    """
    payload = {
        "model": "mixtral-8x7b-32768",
        "messages": [
            {"role": "system", "content": "You are a professional Text to image prompt generator."},
            {"role": "user", "content": f"Create a text to image generation prompt about {english_text} within 30 tokens."}
        ],
        "max_tokens": 30
    }

    # Without a timeout a stalled connection would block the app indefinitely;
    # connection problems are reported instead of crashing the page.
    try:
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            json=payload,
            headers=Image_Prompt,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        st.error(f"Prompt Generation Error: {exc}")
        return None

    if response.status_code != 200:
        st.error(f"Prompt Generation Error: {response.status_code}")
        return None

    # A 200 response may still carry a body without the expected structure.
    try:
        return response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError):
        st.error("Prompt Generation Error: unexpected response format")
        return None

# Function to generate an image from the prompt (optional)
def generate_image(image_prompt, model_url, timeout=120):
    """Request an image for *image_prompt* from the endpoint at *model_url*.

    Args:
        image_prompt: Text prompt sent as the request's "inputs" field.
        model_url: URL of the image generation endpoint to POST to.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The raw response body (``response.content``) on HTTP 200, otherwise
        None. None is also returned, without any request being made, when
        *image_prompt* is missing or blank.
    """
    # The caller may pass None when prompt generation failed; posting
    # {"inputs": None} would only waste an API call.
    if not image_prompt or not image_prompt.strip():
        return None

    data = {"inputs": image_prompt}
    try:
        response = requests.post(model_url, headers=Image_generation, json=data, timeout=timeout)
    except requests.RequestException as exc:
        st.error(f"Image Generation Error: {exc}")
        return None

    if response.status_code != 200:
        st.error(f"Image Generation Error {response.status_code}: {response.text}")
        return None
    return response.content

# Load the text-to-speech model and tokenizer once and reuse them
@st.cache_resource
def _load_tts_model():
    """Return the ``(model, tokenizer)`` pair for "facebook/mms-tts-eng"."""
    model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    return model, tokenizer


# Function to generate speech from text using VitsModel
def generate_speech(text, audio_path="output.wav"):
    """Synthesize *text* to speech and write it to an audio file.

    Args:
        text: Text to convert to speech.
        audio_path: Destination file for the audio. Defaults to
            "output.wav" in the current working directory; pass a distinct
            path if several callers may synthesize at the same time.

    Returns:
        The path the audio was written to (*audio_path*).
    """
    # Reuse the cached model instead of reloading the checkpoint on every call.
    model, tokenizer = _load_tts_model()

    inputs = tokenizer(text, return_tensors="pt")

    # Generate the speech waveform; no gradients are needed for inference.
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # Write with the rate the model reports rather than a hard-coded value,
    # so the file header always matches the generated samples.
    sf.write(audio_path, waveform.numpy().flatten(), model.config.sampling_rate)

    return audio_path

# User Guide page
def show_user_guide():
    """Render the user guide: a page title followed by the guide text."""
    guide_body = """
        ... [content unchanged] ...
    """
    st.title("FusionMind User Guide")
    st.write(guide_body)

# Main Streamlit app
def main():
    """Render the FusionMind UI and run the generation pipeline.

    The sidebar selects between the main app and the user guide, and exposes
    temperature, token budget and model choices. When "Generate" is pressed
    with non-empty input, the text is translated to English, then used to
    generate written content (with speech for that content) and an image.
    """
    # Sidebar Menu
    st.sidebar.title("FusionMind Options")
    page = st.sidebar.radio("Select a page:", ["Main App", "User Guide"])

    if page == "User Guide":
        show_user_guide()
        return

    st.title("🅰️ℹ️ FusionMind ➡️ Multimodal")

    # Sidebar for temperature, token adjustment, and model selection
    st.sidebar.header("Settings")
    temperature = st.sidebar.slider("Select Temperature", 0.1, 1.0, 0.7)
    max_tokens = st.sidebar.slider("Max Tokens for Content Generation", 100, 400, 200)

    # Content generation model selection
    content_model = st.sidebar.selectbox("Select Content Generation Model", list(content_models.keys()), index=0)

    # Image generation model selection
    image_model = st.sidebar.selectbox("Select Image Generation Model", list(image_generation_urls.keys()), index=0)

    # Suggested inputs
    st.write("## Suggested Inputs")
    suggestions = ["தரவு அறிவியல்", "உளவியல்", "ராக்கெட் எப்படி வேலை செய்கிறது"]
    selected_suggestion = st.selectbox("Select a suggestion or enter your own:", [""] + suggestions)

    # Input box for user
    tamil_input = st.text_input("Enter Tamil text (or select a suggestion):", selected_suggestion)

    if not st.button("Generate"):
        return

    # Tell the user why nothing happens instead of silently ignoring the click.
    if not tamil_input:
        st.warning("Please enter Tamil text or select a suggestion.")
        return

    # Step 1: Translation (Tamil to English)
    st.write("### Translated English Text:")
    english_text = translate_text_local(tamil_input)
    if not english_text:
        return
    st.success(english_text)

    # Step 2: Generate Educational Content
    st.write("### Generated Content:")
    with st.spinner('Generating content...'):
        content_output = generate_content(english_text, max_tokens, temperature, content_models[content_model])
        if content_output:
            st.success(content_output)

            # Step 3: Generate speech from the content
            st.write("### Generated Speech:")
            with st.spinner('Generating speech...'):
                audio_path = generate_speech(content_output)
                # Context manager closes the file handle once the bytes are read.
                with open(audio_path, 'rb') as audio_file:
                    audio_bytes = audio_file.read()
                st.audio(audio_bytes, format="audio/wav")

    # Step 4: Generate Image from the prompt (optional)
    st.write("### Generated Image:")
    with st.spinner('Generating image...'):
        image_prompt = generate_image_prompt(english_text)
        # Skip the image request when no prompt could be generated.
        image_data = None
        if image_prompt:
            image_data = generate_image(image_prompt, image_generation_urls[image_model])
        if image_data:
            st.image(image_data, caption="Generated Image")


if __name__ == "__main__":
    main()