codeShare committed on
Commit
811a69b
·
verified ·
1 Parent(s): 1028385

Upload sd_token_similarity_calculator.ipynb

Browse files
sd_token_similarity_calculator.ipynb CHANGED
@@ -78,8 +78,8 @@
78
  {
79
  "cell_type": "code",
80
  "source": [
81
- "print(vocab[12432]) #the vocab item for ID 12432\n",
82
- "print(token[12432].shape) #dimension of the token"
83
  ],
84
  "metadata": {
85
  "id": "S_Yh9gH_OUA1"
@@ -104,7 +104,7 @@
104
  "\n",
105
  "from transformers import AutoTokenizer\n",
106
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
107
- "prompt= \"blah\" # @param {type:'string'}\n",
108
  "tokenizer_output = tokenizer(text = prompt)\n",
109
  "input_ids = tokenizer_output['input_ids']\n",
110
  "print(input_ids)"
@@ -135,7 +135,7 @@
135
  "\n",
136
  "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
137
  "#----#\n",
138
- "print(f'Calculated all cosine-similarities between ID = {id_A} the rest of the IDs as a 1x{sorted.shape[0]} tensor')\n",
139
  "print(f'Calculated indices as a 1x{indices.shape[0]} tensor')"
140
  ],
141
  "metadata": {
@@ -149,8 +149,9 @@
149
  "source": [
150
  "list_size = 10 # @param {type:'number'}\n",
151
  "for index in range(list_size):\n",
152
- " print(f'{vocab[indices[index]]}') # vocab item\n",
153
- " print(f'ID = {indices[index]}') # IDs\n",
 
154
  " print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n",
155
  " print('--------')\n"
156
  ],
 
78
  {
79
  "cell_type": "code",
80
  "source": [
81
+ "print(vocab[8922]) #the vocab item for ID 8922\n",
82
+ "print(token[8922].shape) #dimension of the token"
83
  ],
84
  "metadata": {
85
  "id": "S_Yh9gH_OUA1"
 
104
  "\n",
105
  "from transformers import AutoTokenizer\n",
106
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
107
+ "prompt= \"banana\" # @param {type:'string'}\n",
108
  "tokenizer_output = tokenizer(text = prompt)\n",
109
  "input_ids = tokenizer_output['input_ids']\n",
110
  "print(input_ids)"
 
135
  "\n",
136
  "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
137
  "#----#\n",
138
+ "print(f'Calculated all cosine-similarities between the token {vocab[id_A]} with ID = {id_A} the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
139
  "print(f'Calculated indices as a 1x{indices.shape[0]} tensor')"
140
  ],
141
  "metadata": {
 
149
  "source": [
150
  "list_size = 10 # @param {type:'number'}\n",
151
  "for index in range(list_size):\n",
152
+ " id = indices[index].item()\n",
153
+ " print(f'{vocab[id]}') # vocab item\n",
154
+ " print(f'ID = {id}') # IDs\n",
155
  " print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n",
156
  " print('--------')\n"
157
  ],