import gradio as gr
from transformers import BlipProcessor, BlipForQuestionAnswering, pipeline
from modelscope.pipelines import pipeline as ms_pipeline
from modelscope.outputs import OutputKeys


def load_models():
    # Image question-answering model (BLIP VQA)
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

    # Speech-to-text model
    audio_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")

    # Text-generation model (free GPT-2 checkpoint) used to enrich the prompt
    text_generator = pipeline("text-generation", model="gpt2")

    # Text-to-video model (ModelScope), loaded once here instead of on every request
    video_pipeline = ms_pipeline(
        "text-to-video-synthesis",
        model="damo/text-to-video-synthesis"
    )

    return blip_processor, blip_model, audio_transcriber, text_generator, video_pipeline


def analyze_image(image, blip_processor, blip_model):
    # Questions used to build a textual description of the image
    questions = [
        "What is in the picture?",
        "What are the main colors?",
        "What is the setting or background?",
        "What is happening in the image?",
    ]

    responses = {}
    for question in questions:
        inputs = blip_processor(images=image, text=question, return_tensors="pt")
        outputs = blip_model.generate(**inputs)
        responses[question] = blip_processor.decode(outputs[0], skip_special_tokens=True)

    description = f"This image shows {responses['What is in the picture?']}. "
    description += f"The main colors are {responses['What are the main colors?']}. "
    description += f"The setting is {responses['What is the setting or background?']}. "
    description += f"In the scene, {responses['What is happening in the image?']}."
    return description


def process_inputs(image, audio, text, models):
    blip_processor, blip_model, audio_transcriber, text_generator, video_pipeline = models
    final_prompt = ""

    # Describe the image, if one was provided
    if image is not None:
        image_description = analyze_image(image, blip_processor, blip_model)
        final_prompt += f"Visual description: {image_description}\n"

    # Transcribe the audio, if provided
    if audio is not None:
        audio_text = audio_transcriber(audio)["text"]
        final_prompt += f"Audio content: {audio_text}\n"

    # Append the free-form text, if provided
    if text:
        final_prompt += f"Additional context: {text}\n"

    # Expand the combined prompt with GPT-2; max_new_tokens avoids errors when
    # the assembled prompt is already long
    prompt_enhancement = text_generator(
        final_prompt,
        max_new_tokens=100,
        num_return_sequences=1
    )[0]["generated_text"]

    # Generate the video with ModelScope and return the path it reports
    result = video_pipeline({"text": prompt_enhancement})
    video_path = result[OutputKeys.OUTPUT_VIDEO]

    return video_path, prompt_enhancement


# Gradio interface
def create_interface():
    models = load_models()

    interface = gr.Interface(
        fn=lambda img, audio, txt: process_inputs(img, audio, txt, models),
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Audio(type="filepath", label="Upload Audio"),
            gr.Textbox(label="Enter Additional Text")
        ],
        outputs=[
            gr.Video(label="Generated Video"),
            gr.Textbox(label="Generated Prompt")
        ],
        title="Multimodal Content to Video Generator",
        description="Upload an image, audio, or text (or any combination) to generate a video."
    )
    return interface


# Application entry point
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
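
# Usage note (a minimal sketch, assuming default Gradio settings): running this
# script serves the interface locally on http://127.0.0.1:7860. For a temporary
# public link, pass share=True to launch(), e.g.:
#
#     interface.launch(share=True)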