import os

import gradio as gr
import spaces
import torch
from decord import VideoReader, cpu
from PIL import Image
from transformers import pipeline

hf_token = os.environ.get("HUGGINGFACE_TOKEN")
model_id = "google/gemma-3-27b-it"
NUM_FRAMES = 8  # number of frames to sample from the video


# Sample N evenly spaced frames from the video.
def sample_video_frames(video_path, num_frames=NUM_FRAMES):
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    indices = [int(i) for i in torch.linspace(0, total_frames - 1, steps=num_frames)]
    # Convert each decoded frame (HWC uint8 ndarray) to a PIL image,
    # which the image-text-to-text pipeline accepts directly.
    return [Image.fromarray(vr[i].asnumpy()) for i in indices]


# Inference: load the model, sample video frames, run generation.
@spaces.GPU
def analyze_video(video_path):
    # gr.Video passes the uploaded file to the handler as a path string.
    frames = sample_video_frames(video_path)

    # Single-turn prompt (adapt to whatever evaluation you need).
    system_prompt = (
        "You are a helpful AI assistant that analyzes AR effects in videos. "
        "Evaluate the realism and placement of virtual objects in the provided video frames."
    )
    user_prompt = (
        "Based on the frames, describe how well the AR objects blend "
        "into the real environment."
    )

    # Build the chat history, interleaving the text prompt with the sampled frames.
    history = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt}]
            + [{"type": "image", "image": frame} for frame in frames],
        },
    ]

    # Run the pipeline. Loading inside the handler keeps module import light
    # on ZeroGPU (the GPU is only attached while this function runs), at the
    # cost of reloading the weights on every request.
    pipe = pipeline(
        "image-text-to-text",
        model=model_id,
        token=hf_token,
        torch_dtype=torch.bfloat16,
        model_kwargs={"device_map": "auto"},
    )
    result = pipe(text=history, max_new_tokens=512)
    # The pipeline returns the full conversation; the assistant's reply is
    # the last message.
    return result[0]["generated_text"][-1]["content"]


# Gradio UI
gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(label="Upload an AR Video (.mp4 only)"),
    outputs=gr.Textbox(label="Gemma Analysis Result"),
    title="Gemma-3-27B Video Analysis (ZeroGPU)",
    description="Uploads a video, extracts 8 frames, and uses Gemma-3-27B to analyze AR realism.",
).launch()