VCL3D committed
Commit 6d51761 · verified · Parent: a692629

Update README.md

Files changed (1):
  1. README.md +3 -67
README.md CHANGED

@@ -3,7 +3,7 @@ license: apache-2.0
  ---
  ONNX format of voxerality/rgb_language_cap model

- Run on CPU
+ Model inference example:

  ```python
  import onnxruntime as ort
@@ -64,71 +64,7 @@ for _ in range(max_length):

  # decode the predicted token IDs into text
  predicted_text = tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True)
- # print the generated caption
- print(predicted_text)
- ```
-
-
- Run on GPU
-
- ```python
- import onnxruntime as ort
- from transformers import AutoTokenizer,AutoImageProcessor
- from PIL import Image
- import numpy as np
-
- # load the ONNX models (encoder and decoder)
- encoder_onnx_path = 'models/rgb_language_cap_onnx/encoder_model.onnx' # load from local path
- decoder_onnx_path = 'models/rgb_language_cap_onnx/decoder_model.onnx' # load from local path
- encoder_session = ort.InferenceSession(encoder_onnx_path, providers=["CUDAExecutionProvider"])
- decoder_session = ort.InferenceSession(decoder_onnx_path, providers=["CUDAExecutionProvider"])
-
- # load the tokenizer and image processor
- model_id = "models/rgb_language_cap_onnx"
- processor = AutoImageProcessor.from_pretrained(model_id)
- tokenizer = AutoTokenizer.from_pretrained(model_id)
-
- # load image
- image_path = "img2.jpg"
- image = Image.open(image_path)
- inputs = processor(images=image, return_tensors="np").pixel_values
-
- # run encoder model
- encoder_outputs = encoder_session.run(
-     None,
-     {"pixel_values": inputs}
- )
-
- # extract the encoder hidden states (encoder outputs)
- encoder_hidden_states = encoder_outputs[0]
-
- # prepare decoder inputs
- decoder_input_ids = np.array([[tokenizer.bos_token_id]], dtype=np.int64)
-
- # run decoder model
- max_length = 200 # define maximum length of the sequence
-
- for _ in range(max_length):
-     decoder_outputs = decoder_session.run(
-         None,
-         {
-             "input_ids": decoder_input_ids, # input for the decoder
-             "encoder_hidden_states": encoder_hidden_states # outputs from the encoder
-         }
-     )
-
-     # extract logits and predict next token
-     logits = decoder_outputs[0]
-     predicted_token_id = np.argmax(logits[0, -1, :]) # get the predicted token ID from the logits

-     # if the predicted token is the EOS token, stop the generation
-     if predicted_token_id == tokenizer.eos_token_id:
-         break
-
-     # append predicted token ID to the decoder inputs for the next step
-     decoder_input_ids = np.concatenate([decoder_input_ids, np.array([[predicted_token_id]])], axis=-1)
-
- # decode the predicted token IDs into text
- predicted_text = tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True)
  # print the generated caption
- print(predicted_text)
+ print(predicted_text)
+ ```
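This commit removes the duplicated "Run on GPU" walkthrough, which pinned `providers=["CUDAExecutionProvider"]` in its `ort.InferenceSession` calls, leaving a single inference example. For reference, a minimal sketch of covering both cases with runtime provider selection; the fallback logic is an assumption based on standard onnxruntime usage, not part of this commit:

```python
import onnxruntime as ort

# Prefer the CUDA execution provider when the GPU build of onnxruntime
# is installed, otherwise fall back to the CPU provider.
# (Assumption: runtime selection replaces the two hard-coded variants.)
providers = ["CPUExecutionProvider"]
if "CUDAExecutionProvider" in ort.get_available_providers():
    providers.insert(0, "CUDAExecutionProvider")

# Same local model paths as in the README example.
encoder_session = ort.InferenceSession(
    "models/rgb_language_cap_onnx/encoder_model.onnx", providers=providers
)
decoder_session = ort.InferenceSession(
    "models/rgb_language_cap_onnx/decoder_model.onnx", providers=providers
)
```

onnxruntime assigns each graph node to the first provider in the list that supports it, so the same script runs unchanged on CPU-only machines.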