XiaoyiYangRIT committed
Commit 7aa5317 · Parent(s): e041131

Update some files

Files changed (2):
  1. app.py +12 -99
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,105 +1,21 @@
- import os
+ # app.py (simplified main entry point)
  import gradio as gr
- import torch
- import math
- import time
- from PIL import Image
- from decord import VideoReader, cpu
- from torchvision.transforms import Compose, Resize, ToTensor, Normalize
-
- from transformers import (
-     AutoModel,
-     AutoTokenizer,
-     AutoProcessor,
-     AutoConfig
- )
- from huggingface_hub import snapshot_download
-
- start_time = time.time()
-
- # === Constants ===
- MODEL_NAME = "OpenGVLab/InternVL3-14B"
- CACHE_DIR = "/data/internvl3_model"
-
- # === Vision preprocessing ===
- IMAGENET_MEAN = (0.485, 0.456, 0.406)
- IMAGENET_STD = (0.229, 0.224, 0.225)
-
- transform = Compose([
-     Resize((448, 448)),
-     ToTensor(),
-     Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
- ])
-
- # === Model download and caching ===
- if not os.path.exists(CACHE_DIR):
-     print("⏬ First run: downloading model to persistent storage...")
-     snapshot_download(repo_id=MODEL_NAME, local_dir=CACHE_DIR)
- else:
-     print("✅ Loaded model from persistent cache.")
-
- # === Per-GPU layer allocation (multi-GPU support) ===
- def split_model(model_path):
-     device_map = {}
-     world_size = torch.cuda.device_count()
-     config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-     num_layers = config.llm_config.num_hidden_layers
-     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
-     num_layers_per_gpu = [num_layers_per_gpu] * world_size
-     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
-     layer_cnt = 0
-     for i, num_layer in enumerate(num_layers_per_gpu):
-         for _ in range(num_layer):
-             device_map[f'language_model.model.layers.{layer_cnt}'] = i
-             layer_cnt += 1
-     device_map['vision_model'] = 0
-     device_map['mlp1'] = 0
-     device_map['language_model.model.tok_embeddings'] = 0
-     device_map['language_model.model.embed_tokens'] = 0
-     device_map['language_model.output'] = 0
-     device_map['language_model.model.norm'] = 0
-     device_map['language_model.model.rotary_emb'] = 0
-     device_map['language_model.lm_head'] = 0
-     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
-     return device_map
-
- # === Load components (cached) ===
- print("🚀 Loading tokenizer/processor/model from cache...")
- tokenizer = AutoTokenizer.from_pretrained(CACHE_DIR, trust_remote_code=True)
- processor = AutoProcessor.from_pretrained(CACHE_DIR, trust_remote_code=True)
- device_map = split_model(CACHE_DIR)
- model = AutoModel.from_pretrained(
-     CACHE_DIR,
-     torch_dtype=torch.bfloat16,
-     low_cpu_mem_usage=True,
-     use_flash_attn=True,
-     trust_remote_code=True,
-     device_map=device_map
- ).eval()
-
- # === Video frame extraction ===
- def extract_frames(video_path, num_frames=8):
-     vr = VideoReader(video_path, ctx=cpu(0))
-     total_frames = len(vr)
-     frame_indices = list(torch.linspace(0, total_frames - 1, num_frames).int().tolist())
-     images = []
-     for idx in frame_indices:
-         img = Image.fromarray(vr[idx].asnumpy()).convert("RGB")
-         img_tensor = transform(img)
-         images.append(img_tensor)
-     return torch.stack(images)
-
- # === Main inference function ===
+ from src.model_loader import load_model
+ from src.video_utils import process_video_for_internvl3
+
+ # === Initialize the model ===
+ tokenizer, model = load_model()
+
+ # === Inference interface ===
  def evaluate_ar(video):
-     frames = extract_frames(video).to(torch.bfloat16).cuda()
-     prompt = "Evaluate the quality of AR occlusion and rendering in the uploaded video."
-     num_patches = [1] * frames.shape[0]
+     pixel_values, num_patches_list, prompt = process_video_for_internvl3(video)
+     generation_config = dict(max_new_tokens=512)
      output, _ = model.chat(
          tokenizer,
-         frames,
+         pixel_values,
          prompt,
-         generation_config=dict(max_new_tokens=512),
-         num_patches_list=num_patches,
+         generation_config=generation_config,
+         num_patches_list=num_patches_list,
          history=None,
          return_history=True
      )
@@ -112,7 +28,4 @@ gr.Interface(
      outputs="text",
      title="InternVL3 AR Evaluation (Single-turn)",
      description="Upload a short AR video clip. The model will sample frames and assess occlusion/rendering quality."
- ).launch()
-
- # (after the model finishes loading)
- print(f"✅ Model fully loaded. Time elapsed: {time.time() - start_time:.2f} sec.")
+ ).launch()
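The refactor moves everything except the Gradio wiring into two new helper modules, src/model_loader.py and src/video_utils.py, which are not included in this diff. Judging from the code removed above, load_model() plausibly repackages the download-and-cache step plus the from_pretrained call. A minimal sketch, assuming the module keeps the same model name and cache path, with the hand-rolled split_model() device map simplified to device_map="auto":

# src/model_loader.py — hypothetical sketch, not shown in this commit
import os

import torch
from huggingface_hub import snapshot_download
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "OpenGVLab/InternVL3-14B"   # assumed carried over from the old app.py
CACHE_DIR = "/data/internvl3_model"

def load_model():
    # Download once into persistent storage; later runs reuse the local copy.
    if not os.path.exists(CACHE_DIR):
        snapshot_download(repo_id=MODEL_NAME, local_dir=CACHE_DIR)

    tokenizer = AutoTokenizer.from_pretrained(CACHE_DIR, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        CACHE_DIR,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        use_flash_attn=True,       # consistent with the new flash-attn requirement
        trust_remote_code=True,
        device_map="auto",         # stand-in for the removed split_model() map
    ).eval()
    return tokenizer, model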
 
 
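Similarly, process_video_for_internvl3() must return the (pixel_values, num_patches_list, prompt) triple that model.chat() consumes. A sketch reconstructed from the removed extract_frames(), assuming it keeps the 448×448 ImageNet-normalized preprocessing and follows InternVL's multi-frame chat convention of one <image> placeholder per sampled frame:

# src/video_utils.py — hypothetical sketch, not shown in this commit
import torch
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

transform = Compose([
    Resize((448, 448)),
    ToTensor(),
    Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def process_video_for_internvl3(video_path, num_frames=8):
    # Sample num_frames evenly spaced frames across the clip.
    vr = VideoReader(video_path, ctx=cpu(0))
    indices = torch.linspace(0, len(vr) - 1, num_frames).long().tolist()
    frames = [transform(Image.fromarray(vr[i].asnumpy()).convert("RGB"))
              for i in indices]
    pixel_values = torch.stack(frames).to(torch.bfloat16).cuda()

    # One tile per frame, and one <image> placeholder per frame in the prompt.
    num_patches_list = [1] * pixel_values.shape[0]
    prefix = "".join(f"Frame{i + 1}: <image>\n" for i in range(num_frames))
    prompt = prefix + ("Evaluate the quality of AR occlusion and rendering "
                       "in the uploaded video.")
    return pixel_values, num_patches_list, prompt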
 
requirements.txt CHANGED
@@ -19,4 +19,5 @@ pillow>=10.0.0
  sentencepiece>=0.1.99
  einops
  timm
- Pillow
+ Pillow
+ flash-attn
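Note that flash-attn compiles CUDA extensions at install time and needs torch importable during the build, so a bare entry in requirements.txt can fail on a fresh environment; it is typically installed after torch with pip install flash-attn --no-build-isolation, or pinned to a prebuilt wheel.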