import gradio as gr
from transformers import BlipProcessor, BlipForQuestionAnswering, pipeline
from modelscope.pipelines import pipeline as ms_pipeline
from modelscope.outputs import OutputKeys


def load_models():
    # Image question-answering model (BLIP VQA)
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

    # Speech-to-text model
    audio_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")

    # Text-generation model (free GPT-2 checkpoint) used to enrich the prompt
    text_generator = pipeline("text-generation", model="gpt2")

    # Text-to-video model (ModelScope), loaded once here instead of on every request
    video_pipeline = ms_pipeline(
        "text-to-video-synthesis",
        model="damo/text-to-video-synthesis"
    )

    return blip_processor, blip_model, audio_transcriber, text_generator, video_pipeline


def analyze_image(image, blip_processor, blip_model):
    # Questions used to build a textual description of the image
    questions = [
        "What is in the picture?",
        "What are the main colors?",
        "What is the setting or background?",
        "What is happening in the image?",
    ]

    responses = {}
    for question in questions:
        inputs = blip_processor(images=image, text=question, return_tensors="pt")
        outputs = blip_model.generate(**inputs)
        responses[question] = blip_processor.decode(outputs[0], skip_special_tokens=True)

    description = f"This image shows {responses['What is in the picture?']}. "
    description += f"The main colors are {responses['What are the main colors?']}. "
    description += f"The setting is {responses['What is the setting or background?']}. "
    description += f"In the scene, {responses['What is happening in the image?']}."
    return description


def process_inputs(image, audio, text, models):
    blip_processor, blip_model, audio_transcriber, text_generator, video_pipeline = models
    final_prompt = ""

    # Describe the image, if one was provided
    if image is not None:
        image_description = analyze_image(image, blip_processor, blip_model)
        final_prompt += f"Visual description: {image_description}\n"

    # Transcribe the audio, if provided
    if audio is not None:
        audio_text = audio_transcriber(audio)["text"]
        final_prompt += f"Audio content: {audio_text}\n"

    # Append the free-form text, if provided
    if text:
        final_prompt += f"Additional context: {text}\n"

    # Expand the combined prompt with GPT-2; max_new_tokens avoids errors when
    # the assembled prompt is already long
    prompt_enhancement = text_generator(
        final_prompt,
        max_new_tokens=100,
        num_return_sequences=1
    )[0]["generated_text"]

    # Generate the video with ModelScope and return the path it reports
    result = video_pipeline({"text": prompt_enhancement})
    video_path = result[OutputKeys.OUTPUT_VIDEO]

    return video_path, prompt_enhancement


# Gradio interface
def create_interface():
    models = load_models()

    interface = gr.Interface(
        fn=lambda img, audio, txt: process_inputs(img, audio, txt, models),
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Audio(type="filepath", label="Upload Audio"),
            gr.Textbox(label="Enter Additional Text")
        ],
        outputs=[
            gr.Video(label="Generated Video"),
            gr.Textbox(label="Generated Prompt")
        ],
        title="Multimodal Content to Video Generator",
        description="Upload an image, audio, or text (or any combination) to generate a video."
    )
    return interface


# Application entry point
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
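
# Usage note (a minimal sketch, assuming default Gradio settings): running this
# script serves the interface locally on http://127.0.0.1:7860. For a temporary
# public link, pass share=True to launch(), e.g.:
#
#     interface.launch(share=True)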