{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cskYkw0zXHEm"
      },
      "outputs": [],
      "source": [
        "# @title Make your own text_encodings .safetensors file for later use (using GPU is recommended to speed things up)\n",
        "\n",
        "import json\n",
        "import pandas as pd\n",
        "import os\n",
        "import shelve\n",
        "import torch\n",
        "from safetensors.torch import save_file\n",
        "import json\n",
        "\n",
        "# Pick the working directory for the host platform: a non-empty\n",
        "# KAGGLE_URL_BASE environment variable is treated as \"running on Kaggle\",\n",
        "# anything else falls back to the Colab default '/content/'.\n",
        "# (https://www.kaggle.com/ is an alternative when the Google Colab GPU is busy.)\n",
        "using_Kaggle = os.environ.get('KAGGLE_URL_BASE','')\n",
        "home_directory = '/kaggle/working/' if using_Kaggle else '/content/'\n",
        "%cd {home_directory}\n",
        "#-------#\n",
        "\n",
        "# User input: the folder holding the raw prompt files, the folder results are\n",
        "# written to, the shared file-name stem, and how many numbered input files\n",
        "# ('<root_filename>-1' .. '<root_filename>-<NUM_FILES>') should be processed.\n",
        "NUM_FILES = 9\n",
        "root_filename = 'names_fullnames_text_👱_♀️female_fullnames'\n",
        "output_folder = f'{home_directory}output/fullnames/'\n",
        "target = f'{home_directory}text-to-image-prompts/names/fullnames/'\n",
        "#--------#\n",
        "\n",
        "# Setup environment\n",
        "def my_mkdirs(folder):\n",
        "    \"\"\"Create `folder` and any missing parent folders; do nothing if it already exists.\"\"\"\n",
        "    # exist_ok=True replaces the separate os.path.exists() test, which could race\n",
        "    # with another process creating the folder between the check and the call\n",
        "    os.makedirs(folder, exist_ok=True)\n",
        "#--------#\n",
        "# Derived paths: raw inputs are read from <target>raw/, results are written to\n",
        "# <output_folder>text/ (.json) and <output_folder>text_encodings/ (.safetensors)\n",
        "target_raw = f'{target}raw/'\n",
        "output_folder_text = f'{output_folder}text/'\n",
        "output_folder_text_encodings = f'{output_folder}text_encodings/'\n",
        "%cd {home_directory}\n",
        "for _folder in (output_folder, output_folder_text, output_folder_text_encodings):\n",
        "    my_mkdirs(_folder)\n",
        "#-------#\n",
        "\n",
        "# Clone the prompt dataset once per kernel session. `loaded` only exists after\n",
        "# an earlier run of this cell, so a NameError here means \"not loaded yet\".\n",
        "# Catching NameError specifically keeps unrelated errors visible.\n",
        "try:\n",
        "    loaded\n",
        "except NameError:\n",
        "    %cd {home_directory}\n",
        "    # After a kernel restart the clone may already be on disk; skip re-cloning then\n",
        "    if not os.path.isdir(home_directory + 'text-to-image-prompts'):\n",
        "        !git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts\n",
        "    loaded = True\n",
        "#--------#\n",
        "from transformers import AutoTokenizer, CLIPProcessor, CLIPModel\n",
        "\n",
        "# Run on the GPU when one is available, otherwise fall back to the CPU\n",
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "\n",
        "# Tokenizer, processor and model are all loaded from the same checkpoint id\n",
        "_clip_checkpoint = \"openai/clip-vit-large-patch14\"\n",
        "tokenizer = AutoTokenizer.from_pretrained(_clip_checkpoint, clean_up_tokenization_spaces = False)\n",
        "processor = CLIPProcessor.from_pretrained(_clip_checkpoint , clean_up_tokenization_spaces = True)\n",
        "model = CLIPModel.from_pretrained(_clip_checkpoint).to(device)\n",
        "#---------#\n",
        "# Layout of every output segment: NUM_HEADERS header entries (keys '0' and '1')\n",
        "# followed by at most CHUNKS_SIZE - NUM_HEADERS prompts (keys '2', '3', ...).\n",
        "NUM_HEADERS = 2\n",
        "CHUNKS_SIZE = 1000\n",
        "ITEMS_PER_SEGMENT = CHUNKS_SIZE - NUM_HEADERS\n",
        "\n",
        "def encode_text(text):\n",
        "    \"\"\"Return the L2-normalised output of model.get_text_features for `text`, moved to the CPU.\"\"\"\n",
        "    inputs = tokenizer(text = text, padding=True, return_tensors=\"pt\").to(device)\n",
        "    # Inference only: no_grad stops an autograd graph being kept alive for every stored tensor\n",
        "    with torch.no_grad():\n",
        "        text_features = model.get_text_features(**inputs)\n",
        "    text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
        "    return text_features.to(torch.device('cpu'))\n",
        "\n",
        "def save_segment(segment_name, names, encodings):\n",
        "    \"\"\"Write one segment as <segment_name>.json and <segment_name>.safetensors.\n",
        "\n",
        "    `names` maps key -> prompt text and `encodings` maps the same keys to the\n",
        "    encoded prompts. Two header entries are added to both outputs: key '0' is\n",
        "    the number of prompts in the segment and key '1' is the segment name.\n",
        "    Prompt keys must start at NUM_HEADERS so they never collide with a header.\n",
        "    Returns the file name of the written .safetensors file.\n",
        "    \"\"\"\n",
        "    headers = {'0': f'{len(names)}', '1': segment_name}\n",
        "    names_out = {**headers, **names}\n",
        "    encodings_out = {key: encode_text(value) for key, value in headers.items()}\n",
        "    encodings_out.update(encodings)\n",
        "\n",
        "    # Write .json (full paths are used so the result does not depend on the current directory)\n",
        "    json_name = f'{segment_name}.json'\n",
        "    print(f'Saving segment {json_name} to {output_folder_text}...')\n",
        "    with open(os.path.join(output_folder_text, json_name), 'w') as f:\n",
        "        json.dump(names_out, f)\n",
        "\n",
        "    # Write .safetensors\n",
        "    tensors_name = f'{segment_name}.safetensors'\n",
        "    print(f'Saving segment {tensors_name} to {output_folder_text_encodings}...')\n",
        "    save_file(encodings_out, os.path.join(output_folder_text_encodings, tensors_name))\n",
        "    return tensors_name\n",
        "\n",
        "for file_index in range(1, NUM_FILES + 1):\n",
        "    filename = f'{root_filename}-{file_index}'\n",
        "\n",
        "    # Read {filename}.json from the raw folder; prompts are looked up below by the\n",
        "    # keys '0', '1', ... so the file is expected to be a JSON object with those keys\n",
        "    with open(os.path.join(target_raw, filename + '.json'), 'r', encoding='utf-8') as f:\n",
        "        data = json.load(f)\n",
        "    prompts = {\n",
        "        key : value.replace(\"</w>\",\" \") for key, value in data.items()\n",
        "    }\n",
        "    NUM_ITEMS = len(prompts)\n",
        "\n",
        "    # Encode the prompts and split them into segments of ITEMS_PER_SEGMENT items\n",
        "    names_dict = {}\n",
        "    text_encoding_dict = {}\n",
        "    segments = {}\n",
        "    subby = 1\n",
        "    for _index in range(NUM_ITEMS):\n",
        "        if (_index % 100 == 0) : print(_index)\n",
        "\n",
        "        # Current segment is full: write it out before storing the next prompt\n",
        "        if len(names_dict) == ITEMS_PER_SEGMENT:\n",
        "            _filename = save_segment(f'{filename}-{subby}', names_dict, text_encoding_dict)\n",
        "            subby = subby + 1\n",
        "            segments[f'{subby}'] = _filename\n",
        "            names_dict = {}\n",
        "            text_encoding_dict = {}\n",
        "\n",
        "        # Prompt keys start after the header slots, so no prompt is ever\n",
        "        # overwritten by the headers added in save_segment\n",
        "        index = NUM_HEADERS + len(names_dict)\n",
        "        prompt = prompts[f'{_index}']\n",
        "        text_encoding_dict[f'{index}'] = encode_text(prompt)\n",
        "        names_dict[f'{index}'] = prompt\n",
        "\n",
        "    # Write the last, possibly partial, segment; an input file without prompts produces no output\n",
        "    if names_dict:\n",
        "        _filename = save_segment(f'{filename}-{subby}', names_dict, text_encoding_dict)\n",
        "        subby = subby + 1\n",
        "        segments[f'{subby}'] = _filename\n",
        "        names_dict = {}\n",
        "        text_encoding_dict = {}"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# @title Download the text_encodings as .zip\n",
        "# Requires `home_directory` and `output_folder` from the cell above\n",
        "import os\n",
        "%cd {home_directory}\n",
        "zip_dest = f'{home_directory}results.zip'\n",
        "# `zip -r` adds to an existing archive instead of replacing it, so remove any\n",
        "# archive left by an earlier run to keep stale files out of the download\n",
        "if os.path.exists(zip_dest):\n",
        "    os.remove(zip_dest)\n",
        "!zip -r {zip_dest} {output_folder}"
      ],
      "metadata": {
        "id": "cR-ed0CGhekk"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}