XiaoyiYangRIT committed
Commit 741cc94
1 Parent(s): 377de41

Update some files

Files changed (2)
  1. app.py +51 -22
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,10 +1,27 @@
-import math
+import gradio as gr
 import torch
+import math
 from transformers import AutoTokenizer, AutoModel, AutoProcessor
-import gradio as gr
+from decord import VideoReader, cpu
 from PIL import Image
+from torchvision.transforms import Compose, Resize, ToTensor, Normalize
+
+# === Visual preprocessing ===
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+transform = Compose([
+    Resize((448, 448)),
+    ToTensor(),
+    Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+])
+
+# === Model loading ===
+MODEL_NAME = "OpenGVLab/InternVL3-14B"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
 
-# === Distribute layers across multiple GPUs ===
 def split_model(model_path):
     from transformers import AutoConfig
     device_map = {}
@@ -30,13 +47,10 @@ def split_model(model_path):
     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
     return device_map
 
-# === Model path ===
-model_path = "OpenGVLab/InternVL3-14B"
-device_map = split_model(model_path)
+device_map = split_model(MODEL_NAME)
 
-# === Load model and processor ===
 model = AutoModel.from_pretrained(
-    model_path,
+    MODEL_NAME,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     use_flash_attn=True,
@@ -44,24 +58,39 @@ model = AutoModel.from_pretrained(
     device_map=device_map
 ).eval()
 
-tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+# === Video frame sampling ===
+def extract_frames(video_path, num_frames=8):
+    vr = VideoReader(video_path, ctx=cpu(0))
+    total_frames = len(vr)
+    frame_indices = list(torch.linspace(0, total_frames - 1, num_frames).int().tolist())
+    images = []
+    for idx in frame_indices:
+        img = Image.fromarray(vr[idx].asnumpy()).convert("RGB")
+        img_tensor = transform(img)
+        images.append(img_tensor)
+    return torch.stack(images)
 
 # === Inference function ===
-def infer(image: Image.Image, prompt: str):
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
-    output = model.generate(**inputs, max_new_tokens=512)
-    answer = tokenizer.decode(output[0], skip_special_tokens=True)
-    return answer
+def evaluate_ar(video):
+    frames = extract_frames(video.name).to(torch.bfloat16).cuda()
+    prompt = "Evaluate the quality of AR occlusion and rendering in the uploaded video."  # can be replaced with a task-specific prompt
+    num_patches = [1] * frames.shape[0]
+    output, _ = model.chat(
+        tokenizer,
+        frames,
+        prompt,
+        generation_config=dict(max_new_tokens=512),
+        num_patches_list=num_patches,
+        history=None,
+        return_history=True
+    )
+    return output
 
 # === Gradio interface ===
 gr.Interface(
-    fn=infer,
-    inputs=[
-        gr.Image(type="pil", label="Upload Image"),
-        gr.Textbox(label="Your Prompt", placeholder="Ask a question about the image...")
-    ],
+    fn=evaluate_ar,
+    inputs=gr.Video(label="Upload your AR video"),
     outputs="text",
-    title="InternVL3-14B Multimodal Demo",
-    description="Upload an image and ask a question. InternVL3-14B will answer using vision + language."
+    title="InternVL3 AR Evaluation (Single-turn)",
+    description="Upload a video clip. The model will analyze AR occlusion and rendering quality."
 ).launch()
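
Editor's note on the new inference path (a sketch, not part of the commit): InternVL's remote-code chat() helper inserts image tokens by replacing one <image> placeholder in the question per entry in num_patches_list. evaluate_ar passes a plain prompt, in which case the helper prepends only a single placeholder, so with eight sampled frames the text-side and vision-side token counts may end up mismatched. The InternVL model cards' video examples instead prefix one "FrameN: <image>" marker per frame. Below is a minimal sketch of that convention, assuming the chat() signature shipped with OpenGVLab/InternVL3-14B's remote code and reusing extract_frames, model, and tokenizer from above; evaluate_ar_v2 is a hypothetical name. It also takes a filepath string directly, since recent Gradio versions pass gr.Video input as a path rather than a file object with a .name attribute.

def evaluate_ar_v2(video_path: str) -> str:
    # Sample frames and match the model's bfloat16 weights on GPU.
    frames = extract_frames(video_path).to(torch.bfloat16).cuda()
    num_patches = [1] * frames.shape[0]
    # One "FrameN: <image>" placeholder per sampled frame, following the
    # multi-frame prompt convention in InternVL's video examples (assumption).
    video_prefix = "".join(f"Frame{i + 1}: <image>\n" for i in range(len(num_patches)))
    prompt = video_prefix + "Evaluate the quality of AR occlusion and rendering in the uploaded video."
    response, _ = model.chat(
        tokenizer,
        frames,
        prompt,
        generation_config=dict(max_new_tokens=512),
        num_patches_list=num_patches,
        history=None,
        return_history=True,
    )
    return response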
requirements.txt CHANGED
@@ -18,4 +18,5 @@ pillow>=10.0.0
 # Optional: To avoid tokenizer warnings
 sentencepiece>=0.1.99
 einops
-timm
+timm
+Pillow
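
Editor's note (not part of the commit): the updated app.py now imports decord and torchvision, yet neither is listed here; also, the added Pillow line duplicates the pillow>=10.0.0 entry visible in the hunk context, since pip normalizes package names case-insensitively. A possible follow-up sketch, with illustrative version floors:

decord>=0.6.0
torchvision>=0.15.0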