XiaoyiYangRIT committed
Commit 7aa5317 · Parent(s): e041131

Update some files

Files changed (2):
  1. app.py +12 -99
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,105 +1,21 @@
- import os
+ # app.py (simplified main entry point)
  import gradio as gr
- import torch
- import math
- import time
- from PIL import Image
- from decord import VideoReader, cpu
- from torchvision.transforms import Compose, Resize, ToTensor, Normalize
-
- from transformers import (
-     AutoModel,
-     AutoTokenizer,
-     AutoProcessor,
-     AutoConfig
- )
- from huggingface_hub import snapshot_download
-
- start_time = time.time()
-
- # === Constants ===
- MODEL_NAME = "OpenGVLab/InternVL3-14B"
- CACHE_DIR = "/data/internvl3_model"
-
- # === Vision preprocessing ===
- IMAGENET_MEAN = (0.485, 0.456, 0.406)
- IMAGENET_STD = (0.229, 0.224, 0.225)
-
- transform = Compose([
-     Resize((448, 448)),
-     ToTensor(),
-     Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
- ])
-
- # === Model download and caching ===
- if not os.path.exists(CACHE_DIR):
-     print("⏬ First run: downloading model to persistent storage...")
-     snapshot_download(repo_id=MODEL_NAME, local_dir=CACHE_DIR)
- else:
-     print("✅ Loaded model from persistent cache.")
-
- # === Per-GPU layer allocation (multi-GPU support) ===
- def split_model(model_path):
-     device_map = {}
-     world_size = torch.cuda.device_count()
-     config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-     num_layers = config.llm_config.num_hidden_layers
-     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
-     num_layers_per_gpu = [num_layers_per_gpu] * world_size
-     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
-     layer_cnt = 0
-     for i, num_layer in enumerate(num_layers_per_gpu):
-         for _ in range(num_layer):
-             device_map[f'language_model.model.layers.{layer_cnt}'] = i
-             layer_cnt += 1
-     device_map['vision_model'] = 0
-     device_map['mlp1'] = 0
-     device_map['language_model.model.tok_embeddings'] = 0
-     device_map['language_model.model.embed_tokens'] = 0
-     device_map['language_model.output'] = 0
-     device_map['language_model.model.norm'] = 0
-     device_map['language_model.model.rotary_emb'] = 0
-     device_map['language_model.lm_head'] = 0
-     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
-     return device_map
-
- # === Load components (cached) ===
- print("🚀 Loading tokenizer/processor/model from cache...")
- tokenizer = AutoTokenizer.from_pretrained(CACHE_DIR, trust_remote_code=True)
- processor = AutoProcessor.from_pretrained(CACHE_DIR, trust_remote_code=True)
- device_map = split_model(CACHE_DIR)
- model = AutoModel.from_pretrained(
-     CACHE_DIR,
-     torch_dtype=torch.bfloat16,
-     low_cpu_mem_usage=True,
-     use_flash_attn=True,
-     trust_remote_code=True,
-     device_map=device_map
- ).eval()
-
- # === Video frame extraction ===
- def extract_frames(video_path, num_frames=8):
-     vr = VideoReader(video_path, ctx=cpu(0))
-     total_frames = len(vr)
-     frame_indices = list(torch.linspace(0, total_frames - 1, num_frames).int().tolist())
-     images = []
-     for idx in frame_indices:
-         img = Image.fromarray(vr[idx].asnumpy()).convert("RGB")
-         img_tensor = transform(img)
-         images.append(img_tensor)
-     return torch.stack(images)
-
- # === Main inference function ===
+ from src.model_loader import load_model
+ from src.video_utils import process_video_for_internvl3
+
+ # === Initialize the model ===
+ tokenizer, model = load_model()
+
+ # === Inference interface ===
  def evaluate_ar(video):
-     frames = extract_frames(video).to(torch.bfloat16).cuda()
-     prompt = "Evaluate the quality of AR occlusion and rendering in the uploaded video."
-     num_patches = [1] * frames.shape[0]
+     pixel_values, num_patches_list, prompt = process_video_for_internvl3(video)
+     generation_config = dict(max_new_tokens=512)
      output, _ = model.chat(
          tokenizer,
-         frames,
+         pixel_values,
          prompt,
-         generation_config=dict(max_new_tokens=512),
-         num_patches_list=num_patches,
+         generation_config=generation_config,
+         num_patches_list=num_patches_list,
          history=None,
          return_history=True
      )
@@ -112,7 +28,4 @@ gr.Interface(
      outputs="text",
      title="InternVL3 AR Evaluation (Single-turn)",
      description="Upload a short AR video clip. The model will sample frames and assess occlusion/rendering quality."
- ).launch()
-
- # (after the model finishes loading)
- print(f"✅ Model fully loaded. Time elapsed: {time.time() - start_time:.2f} sec.")
+ ).launch()
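The refactor moves everything except the Gradio wiring into two new helper modules, src/model_loader.py and src/video_utils.py, which are not included in this diff. Judging from the code removed above, load_model() plausibly repackages the download-and-cache step plus the from_pretrained call. A minimal sketch, assuming the module keeps the same model name and cache path, with the hand-rolled split_model() device map simplified to device_map="auto":

# src/model_loader.py — hypothetical sketch, not shown in this commit
import os

import torch
from huggingface_hub import snapshot_download
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "OpenGVLab/InternVL3-14B"   # assumed carried over from the old app.py
CACHE_DIR = "/data/internvl3_model"

def load_model():
    # Download once into persistent storage; later runs reuse the local copy.
    if not os.path.exists(CACHE_DIR):
        snapshot_download(repo_id=MODEL_NAME, local_dir=CACHE_DIR)

    tokenizer = AutoTokenizer.from_pretrained(CACHE_DIR, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        CACHE_DIR,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        use_flash_attn=True,       # consistent with the new flash-attn requirement
        trust_remote_code=True,
        device_map="auto",         # stand-in for the removed split_model() map
    ).eval()
    return tokenizer, model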
 
 
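Similarly, process_video_for_internvl3() must return the (pixel_values, num_patches_list, prompt) triple that model.chat() consumes. A sketch reconstructed from the removed extract_frames(), assuming it keeps the 448×448 ImageNet-normalized preprocessing and follows InternVL's multi-frame chat convention of one <image> placeholder per sampled frame:

# src/video_utils.py — hypothetical sketch, not shown in this commit
import torch
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

transform = Compose([
    Resize((448, 448)),
    ToTensor(),
    Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def process_video_for_internvl3(video_path, num_frames=8):
    # Sample num_frames evenly spaced frames across the clip.
    vr = VideoReader(video_path, ctx=cpu(0))
    indices = torch.linspace(0, len(vr) - 1, num_frames).long().tolist()
    frames = [transform(Image.fromarray(vr[i].asnumpy()).convert("RGB"))
              for i in indices]
    pixel_values = torch.stack(frames).to(torch.bfloat16).cuda()

    # One tile per frame, and one <image> placeholder per frame in the prompt.
    num_patches_list = [1] * pixel_values.shape[0]
    prefix = "".join(f"Frame{i + 1}: <image>\n" for i in range(num_frames))
    prompt = prefix + ("Evaluate the quality of AR occlusion and rendering "
                       "in the uploaded video.")
    return pixel_values, num_patches_list, prompt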
 
requirements.txt CHANGED
@@ -19,4 +19,5 @@ pillow>=10.0.0
  sentencepiece>=0.1.99
  einops
  timm
- Pillow
+ Pillow
+ flash-attn
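Note that flash-attn compiles CUDA extensions at install time and needs torch importable during the build, so a bare entry in requirements.txt can fail on a fresh environment; it is typically installed after torch with pip install flash-attn --no-build-isolation, or pinned to a prebuilt wheel.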