Spaces:
Runtime error
Runtime error
File size: 6,174 Bytes
5ef4c3f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
import gradio as gr
import io
import numpy as np
import torch
from decord import cpu, VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
import json
# Model configuration: checkpoint id, target device, and compute dtype.
MODEL_PATH = "THUDM/cogvlm2-llama3-caption"
if torch.cuda.is_available():
    DEVICE = 'cuda'
    # bfloat16 is selected only when the CUDA compute capability major
    # version is at least 8; otherwise fall back to float16.
    if torch.cuda.get_device_capability()[0] >= 8:
        TORCH_TYPE = torch.bfloat16
    else:
        TORCH_TYPE = torch.float16
else:
    DEVICE = 'cpu'
    TORCH_TYPE = torch.float16
# Candidate delay reasons offered for each manufacturing step.
# Steps 2 through 7 share the same two reasons (each step still gets its own
# list object); the first and last steps have their own leading reason.
DELAY_REASONS = {
    "Step 1": ["No raw material available", "Person repatching the tire"],
    **{
        f"Step {number}": ["Person repatching the tire", "Lack of raw material"]
        for number in range(2, 8)
    },
    "Step 8": ["No person available to collect tire", "Person repatching the tire"],
}
def load_video(video_data, strategy='chat'):
    """Decode an in-memory video and sample about one frame per second.

    For each whole second from 0 up to the rounded last timestamp, the frame
    whose timestamp is closest to that second is selected; at most 24 frames
    are kept.

    Args:
        video_data: Raw bytes of the encoded video (wrapped in ``io.BytesIO``).
        strategy: Accepted for interface compatibility; this implementation
            does not consult it.

    Returns:
        The selected frames as returned by decord's ``get_batch`` with its
        axes reordered by ``permute(3, 0, 1, 2)``.
    """
    bridge.set_bridge('torch')
    max_frames = 24
    reader = VideoReader(io.BytesIO(video_data), ctx=cpu(0))
    frame_count = len(reader)

    # First element of every per-frame timestamp entry
    # (presumably the frame's start time in seconds -- confirm against decord).
    start_times = [
        stamp[0] for stamp in reader.get_frame_timestamp(np.arange(frame_count))
    ]
    seconds_covered = round(max(start_times)) + 1

    # One pick per whole second, capped at max_frames picks.
    picked_indices = [
        start_times.index(min(start_times, key=lambda t: abs(t - second)))
        for second in range(min(seconds_covered, max_frames))
    ]

    frames = reader.get_batch(picked_indices)
    # Moves the last axis to the front
    # (presumably (frames, H, W, C) -> (C, frames, H, W) -- TODO confirm).
    return frames.permute(3, 0, 1, 2)
# Process-wide cache so the checkpoint is instantiated at most once.
_MODEL_AND_TOKENIZER = None


def load_model():
    """Return the 4-bit quantized caption model and its tokenizer.

    The first call downloads/instantiates the checkpoint named by
    ``MODEL_PATH``; later calls return the same objects. Without this cache
    every caller (for example a per-request handler) would rebuild the whole
    model each time it asked for it.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been switched to eval
        mode.
    """
    global _MODEL_AND_TOKENIZER
    if _MODEL_AND_TOKENIZER is not None:
        return _MODEL_AND_TOKENIZER

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=TORCH_TYPE,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=TORCH_TYPE,
        trust_remote_code=True,
        quantization_config=quantization_config,
        device_map="auto",
    ).eval()

    _MODEL_AND_TOKENIZER = (model, tokenizer)
    return _MODEL_AND_TOKENIZER
def predict(prompt, video_data, temperature, model, tokenizer):
    """Run the model on a video plus text prompt and return the decoded reply.

    Args:
        prompt: Text query handed to ``build_conversation_input_ids``.
        video_data: Raw video bytes; decoded via ``load_video``.
        temperature: Forwarded to ``model.generate`` as ``temperature``.
        model: Model exposing ``build_conversation_input_ids`` and ``generate``.
        tokenizer: Tokenizer used to build the inputs and decode the output.

    Returns:
        The generated continuation (prompt tokens removed) decoded to text
        with special tokens skipped.
    """
    frames = load_video(video_data, strategy='chat')
    encoded = model.build_conversation_input_ids(
        tokenizer=tokenizer,
        query=prompt,
        images=[frames],
        history=[],
        template_version='chat',
    )

    # Add a leading batch axis to each text tensor and move it to the device.
    model_inputs = {
        key: encoded[key].unsqueeze(0).to(DEVICE)
        for key in ('input_ids', 'token_type_ids', 'attention_mask')
    }
    model_inputs['images'] = [[encoded['images'][0].to(DEVICE).to(TORCH_TYPE)]]

    # NOTE(review): do_sample is False while top_k/top_p/temperature are also
    # supplied; presumably those sampling knobs have no effect on the output
    # -- confirm against the transformers generation documentation.
    generation_options = dict(
        max_new_tokens=2048,
        pad_token_id=128002,
        top_k=1,
        do_sample=False,
        top_p=0.1,
        temperature=temperature,
    )

    with torch.no_grad():
        generated = model.generate(**model_inputs, **generation_options)
        # Drop the prompt portion so only newly generated tokens are decoded.
        prompt_length = model_inputs['input_ids'].shape[1]
        completion = generated[:, prompt_length:]
        return tokenizer.decode(completion[0], skip_special_tokens=True)
def get_analysis_prompt(step_number, possible_reasons):
    """Build the instruction prompt for analysing one manufacturing step.

    Args:
        step_number: Step identifier. Either a bare number (``3`` / ``"3"``)
            or a label that already carries the word, such as ``"Step 3"``.
            A leading "Step " is stripped so the prompt never reads
            "Step Step 3".
        possible_reasons: Iterable of reason strings, listed comma-separated
            in the prompt.

    Returns:
        The multi-line prompt text.
    """
    step_label = str(step_number).strip()
    prefix = "step "
    if step_label.lower().startswith(prefix):
        step_label = step_label[len(prefix):].strip()
    reasons_text = ', '.join(possible_reasons)
    return f"""Analyze the video of Step {step_label} in the tire manufacturing process.
Possible delay reasons for this step are:
{reasons_text}
Based on the video evidence, determine which of these reasons best explains the delay.
Please provide:
1. Your chosen reason from the list above
2. Specific visual evidence supporting this choice
3. Brief explanation of why other reasons are less likely
Focus your analysis on visual cues that support your conclusion."""
def inference(video, step_number, selected_reason):
    """Analyse an uploaded video for one step and return the model's answer.

    Args:
        video: The uploaded video, given either as a filesystem path or as a
            file-like object with a ``read()`` method. Calling ``.read()``
            on a plain path string would raise AttributeError, so both forms
            are handled explicitly.
        step_number: Key into ``DELAY_REASONS`` (e.g. ``"Step 1"``).
        selected_reason: Reason chosen in the UI. Accepted because the
            interface passes it, but it is not used in the analysis.

    Returns:
        The model response text, a prompt to upload a video when none was
        given, or an ``"An error occurred: ..."`` message on failure.
    """
    if not video:
        return "Please upload a video first."
    try:
        # Read the bytes first: it is cheap and fails fast before any model work.
        if hasattr(video, "read"):
            video_data = video.read()
        else:
            with open(video, "rb") as video_file:
                video_data = video_file.read()

        # Get possible reasons for the selected step
        possible_reasons = DELAY_REASONS[step_number]
        # Generate the analysis prompt
        prompt = get_analysis_prompt(step_number, possible_reasons)

        model, tokenizer = load_model()
        # Get model prediction
        temperature = 0.8
        return predict(prompt, video_data, temperature, model, tokenizer)
    except Exception as e:
        # UI boundary: report the failure to the user, but keep the traceback
        # in the logs instead of discarding it.
        import logging
        logging.getLogger(__name__).exception("Delay analysis failed")
        return f"An error occurred: {str(e)}"
def update_reasons(step):
    """Update the reason dropdown for the selected step.

    Both the choices and the selected value are replaced. Updating only the
    choices would leave the previously selected reason in place even when it
    is not among the new step's reasons.

    Args:
        step: Key into ``DELAY_REASONS`` (e.g. ``"Step 2"``).

    Returns:
        A ``gr.Dropdown`` carrying the new choices, with the first reason
        selected (or no selection when the step has no reasons).
    """
    reasons = DELAY_REASONS[step]
    return gr.Dropdown(choices=reasons, value=reasons[0] if reasons else None)
# Gradio interface
def create_interface():
    """Build the Gradio Blocks UI and wire up its events.

    Layout: one row with two columns. The left column holds the video
    upload, the step dropdown, the reason dropdown and the analyze button;
    the right column holds the result textbox.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    step_names = list(DELAY_REASONS)
    initial_reasons = DELAY_REASONS["Step 1"]

    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                video_input = gr.Video(
                    label="Upload Manufacturing Video",
                    sources=["upload"],
                )
                step_dropdown = gr.Dropdown(
                    choices=step_names,
                    label="Manufacturing Step",
                    value="Step 1",
                )
                reason_dropdown = gr.Dropdown(
                    choices=initial_reasons,
                    label="Select Delay Reason",
                    value=initial_reasons[0],
                )
                analyze_button = gr.Button("Analyze Delay", variant="primary")
            with gr.Column():
                result_box = gr.Textbox(label="Analysis Result", lines=10)

        # Refresh the reason dropdown whenever a different step is picked.
        step_dropdown.change(
            fn=update_reasons,
            inputs=[step_dropdown],
            outputs=[reason_dropdown],
        )

        # Run the analysis when the button is pressed.
        analyze_button.click(
            fn=inference,
            inputs=[video_input, step_dropdown, reason_dropdown],
            outputs=[result_box],
        )
    return demo
if __name__ == "__main__":
    # Build the UI and start the Gradio server when run as a script.
    demo = create_interface()
    demo.launch()