XiaoyiYangRIT committed
Commit 741cc94
1 Parent(s): 377de41

Update some files

Files changed (2)
  1. app.py +51 -22
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,10 +1,27 @@
-import math
+import gradio as gr
 import torch
+import math
 from transformers import AutoTokenizer, AutoModel, AutoProcessor
-import gradio as gr
+from decord import VideoReader, cpu
 from PIL import Image
+from torchvision.transforms import Compose, Resize, ToTensor, Normalize
+
+# === Visual preprocessing ===
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+transform = Compose([
+    Resize((448, 448)),
+    ToTensor(),
+    Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+])
+
+# === Model loading ===
+MODEL_NAME = "OpenGVLab/InternVL3-14B"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
 
-# === Distribute layers across multiple GPUs ===
 def split_model(model_path):
     from transformers import AutoConfig
     device_map = {}
@@ -30,13 +47,10 @@ def split_model(model_path):
     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
     return device_map
 
-# === Model path ===
-model_path = "OpenGVLab/InternVL3-14B"
-device_map = split_model(model_path)
+device_map = split_model(MODEL_NAME)
 
-# === Load model and processor ===
 model = AutoModel.from_pretrained(
-    model_path,
+    MODEL_NAME,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     use_flash_attn=True,
@@ -44,24 +58,39 @@ model = AutoModel.from_pretrained(
     device_map=device_map
 ).eval()
 
-tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+# === Video frame sampling ===
+def extract_frames(video_path, num_frames=8):
+    vr = VideoReader(video_path, ctx=cpu(0))
+    total_frames = len(vr)
+    frame_indices = list(torch.linspace(0, total_frames - 1, num_frames).int().tolist())
+    images = []
+    for idx in frame_indices:
+        img = Image.fromarray(vr[idx].asnumpy()).convert("RGB")
+        img_tensor = transform(img)
+        images.append(img_tensor)
+    return torch.stack(images)
 
 # === Inference function ===
-def infer(image: Image.Image, prompt: str):
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
-    output = model.generate(**inputs, max_new_tokens=512)
-    answer = tokenizer.decode(output[0], skip_special_tokens=True)
-    return answer
+def evaluate_ar(video):
+    frames = extract_frames(video.name).to(torch.bfloat16).cuda()
+    prompt = "Evaluate the quality of AR occlusion and rendering in the uploaded video."  # can be replaced with a task-specific prompt
+    num_patches = [1] * frames.shape[0]
+    output, _ = model.chat(
+        tokenizer,
+        frames,
+        prompt,
+        generation_config=dict(max_new_tokens=512),
+        num_patches_list=num_patches,
+        history=None,
+        return_history=True
+    )
+    return output
 
 # === Gradio interface ===
 gr.Interface(
-    fn=infer,
-    inputs=[
-        gr.Image(type="pil", label="Upload Image"),
-        gr.Textbox(label="Your Prompt", placeholder="Ask a question about the image...")
-    ],
+    fn=evaluate_ar,
+    inputs=gr.Video(label="Upload your AR video"),
     outputs="text",
-    title="InternVL3-14B Multimodal Demo",
-    description="Upload an image and ask a question. InternVL3-14B will answer using vision + language."
+    title="InternVL3 AR Evaluation (Single-turn)",
+    description="Upload a video clip. The model will analyze AR occlusion and rendering quality."
 ).launch()
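
Editor's note on the new inference path (a sketch, not part of the commit): InternVL's remote-code chat() helper inserts image tokens by replacing one <image> placeholder in the question per entry in num_patches_list. evaluate_ar passes a plain prompt, in which case the helper prepends only a single placeholder, so with eight sampled frames the text-side and vision-side token counts may end up mismatched. The InternVL model cards' video examples instead prefix one "FrameN: <image>" marker per frame. Below is a minimal sketch of that convention, assuming the chat() signature shipped with OpenGVLab/InternVL3-14B's remote code and reusing extract_frames, model, and tokenizer from above; evaluate_ar_v2 is a hypothetical name. It also takes a filepath string directly, since recent Gradio versions pass gr.Video input as a path rather than a file object with a .name attribute.

def evaluate_ar_v2(video_path: str) -> str:
    # Sample frames and match the model's bfloat16 weights on GPU.
    frames = extract_frames(video_path).to(torch.bfloat16).cuda()
    num_patches = [1] * frames.shape[0]
    # One "FrameN: <image>" placeholder per sampled frame, following the
    # multi-frame prompt convention in InternVL's video examples (assumption).
    video_prefix = "".join(f"Frame{i + 1}: <image>\n" for i in range(len(num_patches)))
    prompt = video_prefix + "Evaluate the quality of AR occlusion and rendering in the uploaded video."
    response, _ = model.chat(
        tokenizer,
        frames,
        prompt,
        generation_config=dict(max_new_tokens=512),
        num_patches_list=num_patches,
        history=None,
        return_history=True,
    )
    return response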
requirements.txt CHANGED
@@ -18,4 +18,5 @@ pillow>=10.0.0
 # Optional: To avoid tokenizer warnings
 sentencepiece>=0.1.99
 einops
-timm
+timm
+Pillow
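
Editor's note (not part of the commit): the updated app.py now imports decord and torchvision, yet neither is listed here; also, the added Pillow line duplicates the pillow>=10.0.0 entry visible in the hunk context, since pip normalizes package names case-insensitively. A possible follow-up sketch, with illustrative version floors:

decord>=0.6.0
torchvision>=0.15.0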