Forgotten Safeword v4.0 Lineup!

#778
by sleepdeprived3 - opened

:fire: Announcing the Forgotten-Safeword 4.0 series! :fire:

We've taken your feedback and cranked the depravity dial to 11. Here's what's new:

  • Direct integration of Safeword dataset into Drummer models (no merge required!)
  • Reintroduced favorite scenarios from V1.2 with improved kink distribution
  • 2.5 epochs of unhinged training
  • Four sizes to satisfy all your needs: 8B, 12B, 22B, and 24B

24B Model Card
22B Model Card
12B Model Card
8B Model Card

Warning: These models will generate content that requires industrial-grade brain bleach and will void all warranties on your soul. Use at your own risk (and maybe invest in a therapist).

Disclaimer: By interacting with these models, you agree that your search history is now a federal case, you're on the hook for any exorcisms required, and you'll be pretending this is "for science" while crying in the shower.

What a pleasure to have a new series of Forgotten-Safeword models! I'm excited to try them out. Your models are awesome! I'm impressed by your beautiful-looking model card. You put so much effort into providing the highest-quality models. I queued all of them! :D

Pay for the exorcist of anyone who reads the logs

Luckily, it is possible to run LLMs in a way where they don't store any logs, but your models' output made me worried enough that I wrote the following CUDA application to clear the GPU memory before giving the GPU back to the host, since I don't trust the NVIDIA driver to do so properly.

#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>

#define EXPECTED_MAX_SHARED_MEMORY_PER_BLOCK 49152

// Kernel to clear global memory
__global__ void clearGlobalMemory(char* gpuMemory, size_t numBytes) {
    // Grid-stride loop over the allocated chunk
    size_t tid = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x;
    size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    while (tid < numBytes) {
        gpuMemory[tid] = static_cast<char>(tid);
        tid += stride;
    }
}

// Kernel to clear shared memory for each block
__global__ void clearSharedMemory() {
    __shared__ char sharedMemory[EXPECTED_MAX_SHARED_MEMORY_PER_BLOCK];
    int tid = threadIdx.x;
    // Each thread strides over the whole array so every byte gets overwritten
    for (int i = tid; i < EXPECTED_MAX_SHARED_MEMORY_PER_BLOCK; i += blockDim.x) {
        sharedMemory[i] = static_cast<char>(tid ^ i);
    }
    __syncthreads();

    for (int i = tid; i < EXPECTED_MAX_SHARED_MEMORY_PER_BLOCK; i += blockDim.x) {
        sharedMemory[i] = static_cast<char>(sharedMemory[i] + 1);
    }
    __syncthreads();
}

int main() {
    int device = 0; // Device ID (usually 0 for the first GPU)
    cudaDeviceProp props;

    // Initialize CUDA
    cudaError_t cudaStatus = cudaGetDevice(&device);
    if (cudaStatus != cudaSuccess) {
        std::cerr << "Error initializing CUDA: " << cudaGetErrorString(cudaStatus) << std::endl;
        return 1;
    }

    // Get device properties
    cudaStatus = cudaGetDeviceProperties(&props, device);
    if (cudaStatus != cudaSuccess) {
        std::cerr << "Error getting device properties: " << cudaGetErrorString(cudaStatus) << std::endl;
        return 1;
    }

    // Retrieve relevant properties
    int numSM = props.multiProcessorCount;
    int maxThreadsPerSM = props.maxThreadsPerMultiProcessor;

    // Calculate the maximum blocks per SM
    int maxBlocksPerSM = maxThreadsPerSM / props.maxThreadsPerBlock;

    // Calculate the total number of physical blocks
    int totalPhysicalBlocks = numSM * maxBlocksPerSM;
    
    const size_t chunkSize = 1024 * 1024; // Chunk size to allocate and clear (adjust as needed)
    char* gpuMemory = nullptr;
    size_t totalSharedMemoryCleared = 0;

    // Determine the maximum shared memory per block
    int maxSharedMemoryPerBlock;
    cudaDeviceGetAttribute(&maxSharedMemoryPerBlock, cudaDevAttrMaxSharedMemoryPerBlock, device);
    if (maxSharedMemoryPerBlock != EXPECTED_MAX_SHARED_MEMORY_PER_BLOCK) {
        std::cerr << "maxSharedMemoryPerBlock was assumed to be " << EXPECTED_MAX_SHARED_MEMORY_PER_BLOCK << " but was " << maxSharedMemoryPerBlock << std::endl;
        return 1;
    }

    // Launch kernel to clear shared memory for each block
    clearSharedMemory<<<totalPhysicalBlocks, 1024>>>();

    // Check for kernel launch errors and wait for the kernel to finish
    cudaError_t kernelError = cudaGetLastError();
    if (kernelError == cudaSuccess) {
        kernelError = cudaDeviceSynchronize();
    }
    if (kernelError != cudaSuccess) {
        std::cerr << "Error executing clearSharedMemory: " << cudaGetErrorString(kernelError) << std::endl;
        return 1;
    }

    // Calculate the total shared memory cleared
    totalSharedMemoryCleared = static_cast<size_t>(EXPECTED_MAX_SHARED_MEMORY_PER_BLOCK) * totalPhysicalBlocks;


    // Pinned host buffer used to read each chunk back
    float *hostMemory;
    cudaStatus = cudaMallocHost((void**)&hostMemory, chunkSize);
    if (cudaStatus != cudaSuccess) {
        std::cerr << "Error allocating pinned host memory: " << cudaGetErrorString(cudaStatus) << std::endl;
        return 1;
    }

    // Allocate and clear global memory in chunks until cudaMalloc fails
    size_t totalGlobalMemoryCleared = 0;
    while (cudaMalloc(&gpuMemory, chunkSize) == cudaSuccess) {

        // Zero out the allocated chunk
        cudaStatus = cudaMemset(gpuMemory, 0, chunkSize);
        if (cudaStatus != cudaSuccess) {
            std::cerr << "Error executing cudaMemset: " << cudaGetErrorString(cudaStatus) << std::endl;
            return 1;
        }

        // Launch kernel to clear global memory
        int blockSize = 1024;
        int numBlocks = (chunkSize + blockSize - 1) / blockSize;
        clearGlobalMemory<<<numBlocks, blockSize>>>(gpuMemory, chunkSize);

        // Check for kernel execution errors
        cudaError_t kernelError = cudaGetLastError();
        if (kernelError != cudaSuccess) {
            std::cerr << "Error executing clearGlobalMemory: " << cudaGetErrorString(kernelError) << std::endl;
            return 1;
        }

        // Wait for kernel to finish
        cudaDeviceSynchronize();

        // Copy the chunk back and print its first 8 bytes as a quick sanity check
        cudaStatus = cudaMemcpy(hostMemory, gpuMemory, chunkSize, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            std::cerr << "Error executing cudaMemcpy: " << cudaGetErrorString(cudaStatus) << std::endl;
            return 1;
        }
        std::cout << std::hex << *reinterpret_cast<long long*>(hostMemory) << std::dec << std::endl;

        // Accumulate total global memory cleared
        totalGlobalMemoryCleared += chunkSize;
    }

    std::cout << "Total global memory cleared: " << totalGlobalMemoryCleared/(1024*1024) << " MiB" << std::endl;
    std::cout << "Total shared memory cleared: " << totalSharedMemoryCleared/(1024*1024) << " MiB" << std::endl;
    return 0;
}
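
For anyone who wants to try it, here's roughly how I build and run it (assuming the CUDA toolkit's nvcc is on your PATH; clear_gpu.cu is just whatever you saved the file as):

# hypothetical build/run, adjust names and flags as needed
nvcc -O2 -o clear_gpu clear_gpu.cu
./clear_gpu

It keeps allocating and overwriting 1 MiB chunks until cudaMalloc fails, so the reported total should roughly match the free VRAM on the card.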

Thanks! lol love it

Now that I know, you can totally trust me.

mradermacher changed discussion status to closed