import gradio as gr
import torch
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor, TextStreamer
from qwen_omni_utils import process_mm_info
import soundfile as sf
import tempfile
import spaces
import gc
# Initialize the model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
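# bfloat16 keeps memory use low on GPU; on CPU fall back to full precision, since
# half-precision kernels are poorly supported there.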
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
def get_model():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
    model = Qwen2_5OmniModel.from_pretrained(
        "Qwen/Qwen2.5-Omni-7B",
        torch_dtype=torch_dtype,
        device_map="auto",
        enable_audio_output=True,
        low_cpu_mem_usage=True,
        attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,
    )
    return model
model = get_model()
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
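# The processor provides the chat template, tokenization and multimodal feature
# extraction (audio/image/video) that process_input relies on below.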
# System prompt
SYSTEM_PROMPT = {
    "role": "system",
    "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
}
# Voice options
VOICE_OPTIONS = {
    "Chelsie (Female)": "Chelsie",
    "Ethan (Male)": "Ethan",
}
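# @spaces.GPU requests a ZeroGPU device on Hugging Face Spaces for up to
# `duration` seconds per call.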
@spaces.GPU(duration=120)
def process_input(video, text, voice_type, enable_audio_output):
    try:
        # Clear GPU memory before processing
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()

        # Build the user turn in the Qwen2.5-Omni conversation format;
        # the video entry is only added when a file was uploaded
        user_content = []
        if video is not None:
            user_content.append({"type": "video", "video": video})
        user_content.append({"type": "text", "text": text})

        # Conversation history passed to the chat template and to process_mm_info
        conversation = [SYSTEM_PROMPT, {"role": "user", "content": user_content}]

        # Extract the media referenced by the conversation
        try:
            audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
        except Exception as e:
            print(f"Error processing multimedia: {str(e)}")
            audios, images, videos = None, None, None

        # Pass all modalities returned by process_mm_info; unused ones are None
        inputs = processor(
            text=processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False),
            audio=audios,
            images=images,
            videos=videos,
            return_tensors="pt",
            padding=True,
        )
        # Move inputs to the model's device; cast only floating-point tensors to the model dtype
        inputs = {
            k: (v.to(device=model.device, dtype=model.dtype)
                if isinstance(v, torch.Tensor) and v.is_floating_point()
                else v.to(model.device) if isinstance(v, torch.Tensor) else v)
            for k, v in inputs.items()
        }
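        # With return_audio=True, model.generate returns a (token_ids, waveform) tuple;
        # the waveform is mono audio at 24 kHz, written to a temporary WAV file below.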
        # Generate response with streaming and audio output
        text_ids = None
        audio_path = None
        if enable_audio_output:
            voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
            try:
                generation_output = model.generate(
                    **inputs,
                    use_audio_in_video=False,
                    return_audio=True,
                    spk=voice_type_value,
                    max_new_tokens=512,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    streamer=TextStreamer(processor, skip_prompt=True),
                )
                if isinstance(generation_output, tuple) and len(generation_output) == 2:
                    text_ids, audio = generation_output
                    if audio is not None:
                        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                            sf.write(tmp_file.name, audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
                            audio_path = tmp_file.name
            except Exception as e:
                print(f"Error during audio generation: {str(e)}")

        # Fall back to text-only generation if audio output is disabled or audio generation failed
        if text_ids is None:
            try:
                text_ids = model.generate(
                    **inputs,
                    use_audio_in_video=False,
                    return_audio=False,
                    max_new_tokens=512,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    streamer=TextStreamer(processor, skip_prompt=True),
                )
            except Exception as e:
                print(f"Error during fallback text generation: {str(e)}")

        # Decode text response
        text_response = (
            processor.batch_decode(text_ids, skip_special_tokens=True)[0]
            if text_ids is not None
            else "Error generating response."
        )
        return text_response.strip(), audio_path
    except Exception as e:
        print(f"Error in process_input: {str(e)}")
        return "Error processing input.", None
# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("## Qwen2.5-Omni-7B Multimodal Demo")
    with gr.Row():
        video_input = gr.Video(label="Upload Video (max 120s)", sources=["upload"], max_length=120)
        prompt_input = gr.Textbox(label="Analysis Prompt", placeholder="Describe or ask about the video...")
        voice_selection = gr.Dropdown(label="Voice Type", choices=list(VOICE_OPTIONS.keys()), value="Chelsie (Female)")
        enable_audio_checkbox = gr.Checkbox(label="Enable Audio Output", value=True)
        submit_btn = gr.Button("Analyze", variant="primary")
        with gr.Column():
            text_output = gr.Textbox(label="Analysis Results", interactive=False)
            audio_output = gr.Audio(label="Speech Response", autoplay=True)
    submit_btn.click(
        process_input,
        inputs=[video_input, prompt_input, voice_selection, enable_audio_checkbox],
        outputs=[text_output, audio_output],
    )
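# Queue incoming requests so that at most two generations run concurrently.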
demo.queue(default_concurrency_limit=2)
demo.launch(server_name="0.0.0.0", server_port=7860)