{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "This notebook is a Stable Diffusion tool that lets you find similar tokens in the SD 1.5 vocab.json that you can use for text-to-image generation. Try this free online SD 1.5 generator with the results: https://perchance.org/fusion-ai-image-generator" ], "metadata": { "id": "L7JTcbOdBPfh" } }, { "cell_type": "code", "source": [ "# @title Load/initialize values\n", "# Load the tokens into the colab\n", "!git clone https://huggingface.co/datasets/codeShare/sd_tokens\n", "import torch\n", "from torch import linalg as LA\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "%cd /content/sd_tokens\n", "token = torch.load('sd15_tensors.pt', map_location=device, weights_only=True)\n", "#-----#\n", "\n", "#Import the vocab.json\n", "import json\n", "import pandas as pd\n", "with open('vocab.json', 'r') as f:\n", " data = json.load(f)\n", "\n", "_df = pd.DataFrame({'count': data})['count']\n", "\n", "vocab = {\n", " value: key for key, value in _df.items()\n", "}\n", "#-----#\n", "\n", "# Define functions/constants\n", "NUM_TOKENS = 49407\n", "\n", "def absolute_value(x):\n", " return max(x, -x)\n", "\n", "\n", "def token_similarity(A, B):\n", " #Tensor vector length (L2 norm, i.e. (a^2 + b^2 + ...)^(1/2))\n", " _A = LA.vector_norm(A, ord=2)\n", " _B = LA.vector_norm(B, ord=2)\n", " #----#\n", " result = torch.dot(A,B)/(_A*_B)\n", " #similarity_pcnt = absolute_value(result.item()*100)\n", " similarity_pcnt = result.item()*100\n", " similarity_pcnt_aprox = round(similarity_pcnt, 3)\n", " result = f'{similarity_pcnt_aprox} %'\n", " return result\n", "\n", "def similarity(id_A , id_B):\n", " #Tensors\n", " A = token[id_A]\n", " B = token[id_B]\n", " return token_similarity(A, B)\n", "#----#\n", "\n", 
"#print(vocab[8922]) #the vocab item for ID 8922\n", "#print(token[8922].shape) #dimension of the token\n", "\n", "mix_with = \"\"\n", "mix_method = \"None\"" ], "metadata": { "id": "Ch9puvwKH1s3", "collapsed": true, "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "982a9210-a3fd-4d90-bef7-5aa6f5864797" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cloning into 'sd_tokens'...\n", "remote: Enumerating objects: 10, done.\u001b[K\n", "remote: Counting objects: 100% (7/7), done.\u001b[K\n", "remote: Compressing objects: 100% (7/7), done.\u001b[K\n", "remote: Total 10 (delta 1), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n", "Unpacking objects: 100% (10/10), 306.93 KiB | 4.72 MiB/s, done.\n", "/content/sd_tokens\n" ] } ] }, { "cell_type": "code", "source": [ "# @title 📝 -> 🆔 Tokenize prompt into IDs\n", "from transformers import AutoTokenizer\n", "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n", "\n", "prompt= \"banana\" # @param {type:'string'}\n", "\n", "tokenizer_output = tokenizer(text = prompt)\n", "input_ids = tokenizer_output['input_ids']\n", "print(input_ids)\n", "\n", "\n", "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n", "\n", "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random value, it will not correspond to any tensor in the vocab.json list , and this it will have no ID." 
], "metadata": { "id": "RPdkYzT2_X85", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "86f2f01e-6a04-4292-cee7-70fd8398e07f" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[49406, 8922, 49407]\n" ] } ] }, { "cell_type": "code", "source": [ "# @title 🆔->🥢 Take the ID at index 1 from above result and get its corresponding tensor value\n", "\n", "id_A = input_ids[1]\n", "A = token[id_A]\n", "_A = LA.vector_norm(A, ord=2)\n", "\n", "#if no imput exists we just randomize the entire thing\n", "if (prompt == \"\"):\n", " id_A = -1\n", " print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n", " R = torch.rand(768)\n", " _R = LA.vector_norm(R, ord=2)\n", " A = R*(_A/_R)\n", "\n", "#Save a copy of the tensor A\n", "id_P = id_A\n", "P = A\n", "_P = LA.vector_norm(A, ord=2)\n" ], "metadata": { "id": "YqdiF8DIz9Wu" }, "execution_count": 4, "outputs": [] }, { "cell_type": "code", "source": [ "# @title 🥢 -> 🥢🔀 Take the ID at index 1 from above result and modify it (optional)\n", "mix_with = \"\" # @param {type:'string'}\n", "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n", "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n", "\n", "#------#\n", "#If set to TRUE , this will use the output of this cell , tensor A, as the input of this cell the 2nd time we run it. 
Use this feature to mix many tokens into A\n", "re_iterate_tensor_A = True # @param {\"type\":\"boolean\"}\n", "if (re_iterate_tensor_A == False) :\n", " #prevent re-iterating A by reading from stored copy\n", " id_A = id_P\n", " A = P\n", " _A = _P\n", "#----#\n", "\n", "tokenizer_output = tokenizer(text = mix_with)\n", "input_ids = tokenizer_output['input_ids']\n", "id_C = input_ids[1]\n", "C = token[id_C]\n", "_C = LA.vector_norm(C, ord=2)\n", "\n", "#if no imput exists we just randomize the entire thing\n", "if (mix_with == \"\"):\n", " id_C = -1\n", " print(\"Tokenized prompt 'mix_with' tensor C is a random valued tensor with no ID\")\n", " R = torch.rand(768)\n", " _R = LA.vector_norm(R, ord=2)\n", " C = R*(_C/_R)\n", "\n", "if (mix_method == \"None\"):\n", " print(\"No operation\")\n", "\n", "if (mix_method == \"Average\"):\n", " A = w*A + (1-w)*C\n", " _A = LA.vector_norm(A, ord=2)\n", " print(\"Tokenized prompt tensor A has been recalculated as A = w*A + (1-w)*C , where C is the tokenized prompt 'mix_with' tensor C\")\n", "\n", "if (mix_method == \"Subtract\"):\n", " tmp = (A/_A) - (C/_C)\n", " _tmp = LA.vector_norm(tmp, ord=2)\n", " A = tmp*((w*_A + (1-w)*_C)/_tmp)\n", " _A = LA.vector_norm(A, ord=2)\n", " print(\"Tokenized prompt tensor A has been recalculated as A = (w*_A + (1-w)*_C) * norm(w*A - (1-w)*C) , where C is the tokenized prompt 'mix_with' tensor C\")\n", "\n", "#OPTIONAL : Add/subtract + normalize above result with another token. 
Leave field empty to get a random value tensor" ], "metadata": { "id": "oXbNSRSKPgRr", "collapsed": true, "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "76f8ec94-d29c-46d9-893b-49875f3a1949" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Tokenized prompt 'mix_with' tensor C is a random valued tensor with no ID\n", "No operation\n" ] } ] }, { "cell_type": "code", "source": [ "\n", "# @title 🥢->🧾🥢 Find Similiar Tokens to ID at index 1 from above result\n", "dots = torch.zeros(NUM_TOKENS)\n", "for index in range(NUM_TOKENS):\n", " id_B = index\n", " B = token[id_B]\n", " _B = LA.vector_norm(B, ord=2)\n", " result = torch.dot(A,B)/(_A*_B)\n", " #result = absolute_value(result.item())\n", " result = result.item()\n", " dots[index] = result\n", "\n", "name_A = \"A of random type\"\n", "if (id_A>-1):\n", " name_A = vocab[id_A]\n", "\n", "name_C = \"token C of random type\"\n", "if (id_C>-1):\n", " name_C = vocab[id_C]\n", "\n", "\n", "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n", "#----#\n", "if (mix_method == \"Average\"):\n", " print(f'Calculated all cosine-similarities between the average of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n", "if (mix_method == \"Subtract\"):\n", " print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n", "if (mix_method == \"None\"):\n", " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n", "\n", "#Produce a list of IDs that are most similar to the prompt ID at position 1, based on the above result" ], "metadata": { "id": "juxsvco9B0iV", "collapsed": true, "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "dc893bbf-e9cb-425c-95b8-ffafd3ab2fbc" }, 
"execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Calculated all cosine-similarities between the token banana with Id_A = 8922 with the the rest of the 49407 tokens as a 1x49407 tensor\n" ] } ] }, { "cell_type": "markdown", "source": [], "metadata": { "id": "cYYu5C5C6MHH" } }, { "cell_type": "code", "source": [ "# @title 🥢🧾 -> 🖨️ Print Result from the 'Similiar Tokens' list from above result\n", "list_size = 100 # @param {type:'number'}\n", "print_ID = False # @param {type:\"boolean\"}\n", "print_Similarity = True # @param {type:\"boolean\"}\n", "print_Name = True # @param {type:\"boolean\"}\n", "print_Divider = True # @param {type:\"boolean\"}\n", "\n", "for index in range(list_size):\n", " id = indices[index].item()\n", " if (print_Name):\n", " print(f'{vocab[id]}') # vocab item\n", " if (print_ID):\n", " print(f'ID = {id}') # IDs\n", " if (print_Similarity):\n", " print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n", " if (print_Divider):\n", " print('--------')\n", "\n", "#Print the sorted list from above result" ], "metadata": { "id": "YIEmLAzbHeuo", "collapsed": true, "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "4a2fa70f-16ff-4bba-fb01-d39ad697d4ff" }, "execution_count": 7, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "banana\n", "similiarity = 100.0 %\n", "--------\n", "bananas\n", "similiarity = 38.93 %\n", "--------\n", "banan\n", "similiarity = 30.8 %\n", "--------\n", "ðŁįĮ\n", "similiarity = 27.12 %\n", "--------\n", "pineapple\n", "similiarity = 19.7 %\n", "--------\n", "chicken\n", "similiarity = 19.24 %\n", "--------\n", "potassium\n", "similiarity = 19.21 %\n", "--------\n", "sausage\n", "similiarity = 19.07 %\n", "--------\n", "lemon\n", "similiarity = 18.82 %\n", "--------\n", "orange\n", "similiarity = 18.42 %\n", "--------\n", "peanut\n", "similiarity = 17.84 %\n", "--------\n", "parachute\n", "similiarity = 17.19 %\n", "--------\n", 
"duck\n", "similiarity = 16.8 %\n", "--------\n", "yellow\n", "similiarity = 16.21 %\n", "--------\n", "grape\n", "similiarity = 16.19 %\n", "--------\n", "kangaroo\n", "similiarity = 16.13 %\n", "--------\n", "apple\n", "similiarity = 16.13 %\n", "--------\n", "tangerine\n", "similiarity = 16.08 %\n", "--------\n", "giraffe\n", "similiarity = 16.04 %\n", "--------\n", "mango\n", "similiarity = 16.03 %\n", "--------\n", "rubber\n", "similiarity = 15.95 %\n", "--------\n", "bamboo\n", "similiarity = 15.88 %\n", "--------\n", "umbrella\n", "similiarity = 15.82 %\n", "--------\n", "nutella\n", "similiarity = 15.69 %\n", "--------\n", "ferrari\n", "similiarity = 15.69 %\n", "--------\n", "oranges\n", "similiarity = 15.65 %\n", "--------\n", "peanuts\n", "similiarity = 15.62 %\n", "--------\n", "ali\n", "similiarity = 15.49 %\n", "--------\n", "cucumber\n", "similiarity = 15.32 %\n", "--------\n", "potato\n", "similiarity = 15.22 %\n", "--------\n", "monkey\n", "similiarity = 15.2 %\n", "--------\n", "croissant\n", "similiarity = 15.18 %\n", "--------\n", "papaya\n", "similiarity = 15.17 %\n", "--------\n", "christmas\n", "similiarity = 15.12 %\n", "--------\n", "sandwich\n", "similiarity = 15.0 %\n", "--------\n", "rainbow\n", "similiarity = 14.98 %\n", "--------\n", "tomato\n", "similiarity = 14.96 %\n", "--------\n", "martini\n", "similiarity = 14.93 %\n", "--------\n", "cabaret\n", "similiarity = 14.83 %\n", "--------\n", "ginger\n", "similiarity = 14.82 %\n", "--------\n", "animal\n", "similiarity = 14.76 %\n", "--------\n", "vanilla\n", "similiarity = 14.73 %\n", "--------\n", "mustache\n", "similiarity = 14.64 %\n", "--------\n", "lime\n", "similiarity = 14.62 %\n", "--------\n", "sickle\n", "similiarity = 14.6 %\n", "--------\n", "vista\n", "similiarity = 14.53 %\n", "--------\n", "coconut\n", "similiarity = 14.52 %\n", "--------\n", "kara\n", "similiarity = 14.46 %\n", "--------\n", "alligator\n", "similiarity = 14.39 %\n", "--------\n", "blueberry\n", 
"similiarity = 14.34 %\n", "--------\n", "squirrel\n", "similiarity = 14.29 %\n", "--------\n", "atore\n", "similiarity = 14.19 %\n", "--------\n", "watermelon\n", "similiarity = 14.13 %\n", "--------\n", "nana\n", "similiarity = 14.09 %\n", "--------\n", "latex\n", "similiarity = 14.08 %\n", "--------\n", "agricultural\n", "similiarity = 14.02 %\n", "--------\n", "zucchini\n", "similiarity = 14.0 %\n", "--------\n", "saxophone\n", "similiarity = 13.93 %\n", "--------\n", "mozzarella\n", "similiarity = 13.91 %\n", "--------\n", "eggplant\n", "similiarity = 13.9 %\n", "--------\n", "pickle\n", "similiarity = 13.89 %\n", "--------\n", "tortilla\n", "similiarity = 13.88 %\n", "--------\n", "maniac\n", "similiarity = 13.84 %\n", "--------\n", "milk\n", "similiarity = 13.83 %\n", "--------\n", "cellphone\n", "similiarity = 13.78 %\n", "--------\n", "duck\n", "similiarity = 13.73 %\n", "--------\n", "umbrel\n", "similiarity = 13.71 %\n", "--------\n", "fanny\n", "similiarity = 13.69 %\n", "--------\n", "twister\n", "similiarity = 13.67 %\n", "--------\n", "moustache\n", "similiarity = 13.66 %\n", "--------\n", "manafort\n", "similiarity = 13.66 %\n", "--------\n", "grapefruit\n", "similiarity = 13.6 %\n", "--------\n", "broom\n", "similiarity = 13.59 %\n", "--------\n", "scorpion\n", "similiarity = 13.59 %\n", "--------\n", "fruit\n", "similiarity = 13.57 %\n", "--------\n", "agan\n", "similiarity = 13.53 %\n", "--------\n", "sunflower\n", "similiarity = 13.49 %\n", "--------\n", "banc\n", "similiarity = 13.46 %\n", "--------\n", "literature\n", "similiarity = 13.45 %\n", "--------\n", "pelican\n", "similiarity = 13.43 %\n", "--------\n", "breakfast\n", "similiarity = 13.42 %\n", "--------\n", "pear\n", "similiarity = 13.42 %\n", "--------\n", "orange\n", "similiarity = 13.4 %\n", "--------\n", "monet\n", "similiarity = 13.4 %\n", "--------\n", "snake\n", "similiarity = 13.32 %\n", "--------\n", "vampire\n", "similiarity = 13.32 %\n", "--------\n", "cinnamon\n", 
"similiarity = 13.3 %\n", "--------\n", "strawberries\n", "similiarity = 13.29 %\n", "--------\n", "butternut\n", "similiarity = 13.22 %\n", "--------\n", "sausages\n", "similiarity = 13.22 %\n", "--------\n", "iphone\n", "similiarity = 13.21 %\n", "--------\n", "egg\n", "similiarity = 13.2 %\n", "--------\n", "capu\n", "similiarity = 13.2 %\n", "--------\n", "mannequin\n", "similiarity = 13.19 %\n", "--------\n", "cucumbers\n", "similiarity = 13.16 %\n", "--------\n", "champagne\n", "similiarity = 13.15 %\n", "--------\n", "triangle\n", "similiarity = 13.14 %\n", "--------\n", "apples\n", "similiarity = 13.09 %\n", "--------\n", "dynamite\n", "similiarity = 13.08 %\n", "--------\n", "chocolate\n", "similiarity = 13.08 %\n", "--------\n" ] } ] }, { "cell_type": "code", "source": [ "\n", "# @title 🆔 Get similarity % of two token IDs\n", "id_for_token_A = 4567 # @param {type:'number'}\n", "id_for_token_B = 4343 # @param {type:'number'}\n", "\n", "similarity_str = 'The similarity between tokens A and B is ' + similarity(id_for_token_A , id_for_token_B)\n", "\n", "print(similarity_str)\n", "\n", "#Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407" ], "metadata": { "id": "MwmOdC9cNZty", "collapsed": true, "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "0dd984d0-e253-4981-d72f-40aa83d57d8b" }, "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The similarity between tokens A and B is 3.671 %\n" ] } ] }, { "cell_type": "code", "source": [ "# @title 💫 Compare Text encodings\n", "\n", "prompt_A = \"\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n", "prompt_B = \"\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n", "use_token_padding = True # @param {type:\"boolean\"}\n", "\n", "from transformers import CLIPProcessor, CLIPModel\n", "\n", "\n", "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = 
True)\n", "\n", "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n", "\n", "ids_A = processor.tokenizer(text=prompt_A, padding=use_token_padding, return_tensors=\"pt\")\n", "text_encoding_A = model.get_text_features(**ids_A)\n", "\n", "ids_B = processor.tokenizer(text=prompt_B, padding=use_token_padding, return_tensors=\"pt\")\n", "text_encoding_B = model.get_text_features(**ids_B)\n", "\n", "similarity_str = 'The similarity between the text_encoding for A:\"' + prompt_A + '\" and B: \"' + prompt_B +'\" is ' + token_similarity(text_encoding_A[0] , text_encoding_B[0])\n", "\n", "\n", "print(similarity_str)\n", "#outputs = model(**inputs)\n", "#logits_per_image = outputs.logits_per_image # this is the image-text similarity score\n", "#probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities\n", "\n", "\n", "\n" ], "metadata": { "id": "QQOjh5BvnG8M", "collapsed": true, "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8bd6eb94-c5a7-47e6-913b-346941b144a6" }, "execution_count": 11, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The similarity between the text_encoding for A:\"one ripe banana\" and B: \"a long yellow fruit\" is 83.696 %\n" ] } ] }, { "cell_type": "markdown", "source": [ "\n", "\n", "This is how the notebook works:\n", "\n", "Similar vectors = similar output in the SD 1.5 / SDXL / FLUX models\n", "\n", "CLIP converts the prompt text to vectors (“tensors”), with float32 values usually ranging from -1 to 1\n", "\n", "Dimensions are [ 1x768 ] tensors for SD 1.5 , and a [ 1x768 , 1x1024 ] tensor for SDXL and FLUX.\n", "\n", "The SD models and FLUX convert these vectors to an image.\n", "\n", "This notebook takes an input string, tokenizes it and matches the first token against the 49407 token vectors in the vocab.json: https://huggingface.co/black-forest-labs/FLUX.1-dev/tree/main/tokenizer\n", "\n", "It finds the “most similar tokens” in the list. 
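Similarity is the cosine of the angle (theta) between the token vectors, shown as a percentage: 100 % means the vectors point in the same direction and 0 % means they are orthogonal.\n", "\n", "\n", "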