import gradio as gr import io import numpy as np import torch from decord import cpu, VideoReader, bridge from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import BitsAndBytesConfig MODEL_PATH = "THUDM/cogvlm2-llama3-caption" DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16 # Delay Reasons for Each Manufacturing Step DELAY_REASONS = { "Step 1": ["Delay in Bead Insertion", "Lack of raw material"], "Step 2": ["Inner Liner Adjustment by Technician", "Person rebuilding defective Tire Sections"], "Step 3": ["Manual Adjustment in Ply1 apply", "Technician repairing defective Tire Sections"], "Step 4": ["Delay in Bead set", "Lack of raw material"], "Step 5": ["Delay in Turnup", "Lack of raw material"], "Step 6": ["Person Repairing sidewall", "Person rebuilding defective Tire Sections"], "Step 7": ["Delay in sidewall stitching", "Lack of raw material"], "Step 8": ["No person available to load Carcass", "No person available to collect tire"] } def get_step_info(step_number): """Returns detailed information about a manufacturing step.""" step_details = { 1: { "Name": "Bead Insertion", "Standard Time": "4 seconds", "Video_substeps_expected": { "0-1 second": "Machine starts bead insertion process.", "1-3 seconds": "Beads are aligned and positioned.", "3-4 seconds": "Final adjustment and confirmation of bead placement." } }, 2: { "Name": "Inner Liner Apply", "Standard Time": "4 seconds", "Video_substeps_expected": { "0-1 second": "Machine applies the first layer of the liner.", "1-3 seconds": "Technician checks alignment and adjusts if needed.", "3-4 seconds": "Final inspection and confirmation." } }, 3: { "Name": "Ply1 Apply", "Standard Time": "4 seconds", "Video_substeps_expected": { "0-2 seconds": "First ply is loaded onto the machine.", "2-4 seconds": "Technician inspects and adjusts ply placement." } }, 4: { "Name": "Bead Set", "Standard Time": "8 seconds", "Video_substeps_expected": { "0-3 seconds": "Bead is positioned and pre-set.", "3-6 seconds": "Machine secures the bead in place.", "6-8 seconds": "Technician confirms the bead alignment." } }, 5: { "Name": "Turnup", "Standard Time": "4 seconds", "Video_substeps_expected": { "0-2 seconds": "Turnup process begins with machine handling.", "2-4 seconds": "Technician inspects the turnup and makes adjustments if necessary." } }, 6: { "Name": "Sidewall Apply", "Standard Time": "14 seconds", "Video_substeps_expected": { "0-5 seconds": "Sidewall material is positioned by the machine.", "5-10 seconds": "Technician checks for alignment and begins application.", "10-14 seconds": "Final adjustments and confirmation of sidewall placement." } }, 7: { "Name": "Sidewall Stitching", "Standard Time": "5 seconds", "Video_substeps_expected": { "0-2 seconds": "Stitching process begins automatically.", "2-4 seconds": "Technician inspects stitching for any irregularities.", "4-5 seconds": "Machine completes stitching process." } }, 8: { "Name": "Carcass Unload", "Standard Time": "7 seconds", "Video_substeps_expected": { "0-3 seconds": "Technician unloads(removes) carcass(tire) from the machine." } } } return step_details.get(step_number, {"Error": "Invalid step number. Please provide a valid step number."}) def load_video(video_data, strategy='chat'): """Loads and processes video data into a format suitable for model input.""" bridge.set_bridge('torch') num_frames = 24 if isinstance(video_data, str): decord_vr = VideoReader(video_data, ctx=cpu(0)) else: decord_vr = VideoReader(io.BytesIO(video_data), ctx=cpu(0)) total_frames = len(decord_vr) if total_frames < num_frames: raise ValueError("Uploaded video is too short for meaningful analysis.") timestamps = [i[0] for i in decord_vr.get_frame_timestamp(np.arange(total_frames))] max_second = round(max(timestamps)) + 1 frame_id_list = [] for second in range(max_second): closest_num = min(timestamps, key=lambda x: abs(x - second)) index = timestamps.index(closest_num) frame_id_list.append(index) if len(frame_id_list) >= num_frames: break video_data = decord_vr.get_batch(frame_id_list) video_data = video_data.permute(3, 0, 1, 2) return video_data def load_model(): """Loads the pre-trained model and tokenizer with quantization configurations.""" quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=TORCH_TYPE, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4" ) tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( MODEL_PATH, torch_dtype=TORCH_TYPE, trust_remote_code=True, quantization_config=quantization_config, device_map="auto" ).eval() return model, tokenizer def predict(prompt, video_data, temperature, model, tokenizer): """Generates predictions based on the video and textual prompt.""" video = load_video(video_data, strategy='chat') inputs = model.build_conversation_input_ids( tokenizer=tokenizer, query=prompt, images=[video], history=[], template_version='chat' ) inputs = { 'input_ids': inputs['input_ids'].unsqueeze(0).to(DEVICE), 'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(DEVICE), 'attention_mask': inputs['attention_mask'].unsqueeze(0).to(DEVICE), 'images': [[inputs['images'][0].to(DEVICE).to(TORCH_TYPE)]], } gen_kwargs = { "max_new_tokens": 2048, "pad_token_id": tokenizer.pad_token_id, "top_k": 1, "do_sample": False, "top_p": 0.1, "temperature": 0.3, } with torch.no_grad(): outputs = model.generate(**inputs, **gen_kwargs) outputs = outputs[:, inputs['input_ids'].shape[1]:] response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip() return f"Analysis Result:\n{response}" def get_analysis_prompt(step_number): """Constructs the prompt for analyzing delay reasons based on the selected step.""" step_info = get_step_info(step_number) if "Error" in step_info: return step_info["Error"] step_name = step_info["Name"] standard_time = step_info["Standard Time"] substeps = step_info["Video_substeps_expected"] delay_reasons = DELAY_REASONS.get(f"Step {step_number}", ["No specific reasons provided."]) substeps_text = "\n".join([f"- {time}: {action}" for time, action in substeps.items()]) reasons_text = "\n".join([f"- {reason}" for reason in delay_reasons]) return f""" You are an AI expert system analyzing manufacturing delays in tire production. Below are the details: Step: {step_number} - {step_name} Standard Time: {standard_time} Substeps Expected in Video: {substeps_text} Potential Delay Reasons: {reasons_text} Task: Analyze the provided video to identify the delay reason. Use the following format: 1. **Selected Reason:** [Choose the most likely reason from the list above] 2. **Visual Evidence:** [Describe specific visual cues from the