codeShare committed on
Commit
811a69b
·
verified ·
1 Parent(s): 1028385

Upload sd_token_similarity_calculator.ipynb

Browse files
sd_token_similarity_calculator.ipynb CHANGED
@@ -78,8 +78,8 @@
78
  {
79
  "cell_type": "code",
80
  "source": [
81
- "print(vocab[12432]) #the vocab item for ID 12432\n",
82
- "print(token[12432].shape) #dimension of the token"
83
  ],
84
  "metadata": {
85
  "id": "S_Yh9gH_OUA1"
@@ -104,7 +104,7 @@
104
  "\n",
105
  "from transformers import AutoTokenizer\n",
106
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
107
- "prompt= \"blah\" # @param {type:'string'}\n",
108
  "tokenizer_output = tokenizer(text = prompt)\n",
109
  "input_ids = tokenizer_output['input_ids']\n",
110
  "print(input_ids)"
@@ -135,7 +135,7 @@
135
  "\n",
136
  "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
137
  "#----#\n",
138
- "print(f'Calculated all cosine-similarities between ID = {id_A} the rest of the IDs as a 1x{sorted.shape[0]} tensor')\n",
139
  "print(f'Calculated indices as a 1x{indices.shape[0]} tensor')"
140
  ],
141
  "metadata": {
@@ -149,8 +149,9 @@
149
  "source": [
150
  "list_size = 10 # @param {type:'number'}\n",
151
  "for index in range(list_size):\n",
152
- " print(f'{vocab[indices[index]]}') # vocab item\n",
153
- " print(f'ID = {indices[index]}') # IDs\n",
 
154
  " print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n",
155
  " print('--------')\n"
156
  ],
 
78
  {
79
  "cell_type": "code",
80
  "source": [
81
+ "print(vocab[8922]) #the vocab item for ID 8922\n",
82
+ "print(token[8922].shape) #dimension of the token"
83
  ],
84
  "metadata": {
85
  "id": "S_Yh9gH_OUA1"
 
104
  "\n",
105
  "from transformers import AutoTokenizer\n",
106
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
107
+ "prompt= \"banana\" # @param {type:'string'}\n",
108
  "tokenizer_output = tokenizer(text = prompt)\n",
109
  "input_ids = tokenizer_output['input_ids']\n",
110
  "print(input_ids)"
 
135
  "\n",
136
  "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
137
  "#----#\n",
138
+ "print(f'Calculated all cosine-similarities between the token {vocab[id_A]} with ID = {id_A} the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
139
  "print(f'Calculated indices as a 1x{indices.shape[0]} tensor')"
140
  ],
141
  "metadata": {
 
149
  "source": [
150
  "list_size = 10 # @param {type:'number'}\n",
151
  "for index in range(list_size):\n",
152
+ " id = indices[index].item()\n",
153
+ " print(f'{vocab[id]}') # vocab item\n",
154
+ " print(f'ID = {id}') # IDs\n",
155
  " print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n",
156
  " print('--------')\n"
157
  ],