---
license: apache-2.0
---
ONNX export of the [voxerality/rgb_language_cap](https://huggingface.co/voxerality/rgb_language_cap) image-captioning model.
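Both examples below assume `onnxruntime` (or `onnxruntime-gpu` for the GPU variant), `transformers`, `pillow`, and `numpy` are installed, and that the exported ONNX files live in a local `models/rgb_language_cap_onnx` directory. If the files are instead hosted on the Hugging Face Hub, a download step along these lines can fetch them first (the repository id below is an assumption; substitute the id of the repo that actually hosts the ONNX files):

```python
# minimal sketch: fetch the ONNX export locally
# the repo id is assumed, not confirmed by this README
from huggingface_hub import snapshot_download

local_dir = snapshot_download("voxerality/rgb_language_cap_onnx")  # hypothetical repo id
```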
Run on CPU

```python
import numpy as np
import onnxruntime as ort
from PIL import Image
from transformers import AutoTokenizer, AutoImageProcessor

# load the ONNX models (encoder and decoder) from local paths
encoder_onnx_path = 'models/rgb_language_cap_onnx/encoder_model.onnx'
decoder_onnx_path = 'models/rgb_language_cap_onnx/decoder_model.onnx'
encoder_session = ort.InferenceSession(encoder_onnx_path, providers=["CPUExecutionProvider"])
decoder_session = ort.InferenceSession(decoder_onnx_path, providers=["CPUExecutionProvider"])

# load the tokenizer and image processor
model_id = "models/rgb_language_cap_onnx"
processor = AutoImageProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# load the image and preprocess it into the model's input tensor
image_path = "img2.jpg"
image = Image.open(image_path).convert("RGB")
inputs = processor(images=image, return_tensors="np").pixel_values

# run the encoder model once
encoder_outputs = encoder_session.run(
    None,
    {"pixel_values": inputs}
)

# extract the encoder hidden states (encoder outputs)
encoder_hidden_states = encoder_outputs[0]

# prepare decoder inputs: start from the BOS token
decoder_input_ids = np.array([[tokenizer.bos_token_id]], dtype=np.int64)

# greedy decoding loop
max_length = 200  # maximum length of the generated sequence

for _ in range(max_length):
    decoder_outputs = decoder_session.run(
        None,
        {
            "input_ids": decoder_input_ids,                  # tokens generated so far
            "encoder_hidden_states": encoder_hidden_states,  # outputs from the encoder
        }
    )

    # extract the logits and pick the most likely next token
    logits = decoder_outputs[0]
    predicted_token_id = np.argmax(logits[0, -1, :])

    # if the predicted token is the EOS token, stop the generation
    if predicted_token_id == tokenizer.eos_token_id:
        break

    # append the predicted token ID to the decoder inputs for the next step
    decoder_input_ids = np.concatenate(
        [decoder_input_ids, np.array([[predicted_token_id]], dtype=np.int64)], axis=-1
    )

# decode the predicted token IDs into text and print the generated caption
predicted_text = tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True)
print(predicted_text)
```
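The loop above is plain greedy decoding: at every step the decoder is re-run on the full token prefix, since the exported decoder is used here without a past key/value cache. That keeps the example simple at the cost of quadratic work in the caption length, which is usually acceptable for short captions like these.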

Run on GPU

```python
import numpy as np
import onnxruntime as ort
from PIL import Image
from transformers import AutoTokenizer, AutoImageProcessor

# load the ONNX models (encoder and decoder) from local paths
encoder_onnx_path = 'models/rgb_language_cap_onnx/encoder_model.onnx'
decoder_onnx_path = 'models/rgb_language_cap_onnx/decoder_model.onnx'
encoder_session = ort.InferenceSession(encoder_onnx_path, providers=["CUDAExecutionProvider"])
decoder_session = ort.InferenceSession(decoder_onnx_path, providers=["CUDAExecutionProvider"])

# load the tokenizer and image processor
model_id = "models/rgb_language_cap_onnx"
processor = AutoImageProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# load the image and preprocess it into the model's input tensor
image_path = "img2.jpg"
image = Image.open(image_path).convert("RGB")
inputs = processor(images=image, return_tensors="np").pixel_values

# run the encoder model once
encoder_outputs = encoder_session.run(
    None,
    {"pixel_values": inputs}
)

# extract the encoder hidden states (encoder outputs)
encoder_hidden_states = encoder_outputs[0]

# prepare decoder inputs: start from the BOS token
decoder_input_ids = np.array([[tokenizer.bos_token_id]], dtype=np.int64)

# greedy decoding loop
max_length = 200  # maximum length of the generated sequence

for _ in range(max_length):
    decoder_outputs = decoder_session.run(
        None,
        {
            "input_ids": decoder_input_ids,                  # tokens generated so far
            "encoder_hidden_states": encoder_hidden_states,  # outputs from the encoder
        }
    )

    # extract the logits and pick the most likely next token
    logits = decoder_outputs[0]
    predicted_token_id = np.argmax(logits[0, -1, :])

    # if the predicted token is the EOS token, stop the generation
    if predicted_token_id == tokenizer.eos_token_id:
        break

    # append the predicted token ID to the decoder inputs for the next step
    decoder_input_ids = np.concatenate(
        [decoder_input_ids, np.array([[predicted_token_id]], dtype=np.int64)], axis=-1
    )

# decode the predicted token IDs into text and print the generated caption
predicted_text = tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True)
print(predicted_text)
```
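The CUDA provider is only available with the `onnxruntime-gpu` package (installed in place of `onnxruntime`). If the CUDA provider may be missing at runtime, ONNX Runtime accepts an ordered provider list and falls back to later entries, for example:

```python
# optional: prefer CUDA but fall back to CPU if the CUDA provider is unavailable
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
encoder_session = ort.InferenceSession(encoder_onnx_path, providers=providers)
decoder_session = ort.InferenceSession(decoder_onnx_path, providers=providers)
```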