Sprite batching with SDL3

Sprite batching is an optimisation that allows multiple sprites to be drawn with a single draw call. In this example we use one simple texture, but you can also use a texture atlas and draw only a region of it for each sprite. This makes it possible to draw a lot of different sprites with one draw call, which is useful for a bullet-hell game, for example.

Setting up the project

You need to use this cmake file to set up the project:

cmake_minimum_required(VERSION 3.16)
project(SDL3_SpriteBatchGPU)

# Set C++ standard
# The example code uses designated initializers (".field = value"),
# which are only part of standard C++ since C++20.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Find SDL3
find_package(SDL3 REQUIRED)

# Add executable
add_executable(SDL3_SpriteBatchGPU main.cpp)

# Link SDL3 (the only library this example depends on)
target_link_libraries(SDL3_SpriteBatchGPU PRIVATE SDL3::SDL3)

Once the base project is set up in your IDE, we can start writing the code.

Context creation and helper functions

Static variables

There is a lot of setup to do before we can do the sprite batching. For that we will use the following static variables:

/* GPU objects shared by the setup code, the render loop and the cleanup code. */

/* Graphics pipeline bound before the sprites are drawn. */
static SDL_GPUGraphicsPipeline *RenderPipeline = NULL;

/* Sampler bound together with Texture for the fragment shader. */
static SDL_GPUSampler *Sampler = NULL;

/* Texture sampled when drawing the sprites. */
static SDL_GPUTexture *Texture = NULL;

/* Upload (CPU -> GPU) staging buffer the per-sprite data is written into each frame. */
static SDL_GPUTransferBuffer *SpriteDataTransferBuffer = NULL;

/* GPU buffer the staged sprite data is copied to; bound as a vertex storage buffer. */
static SDL_GPUBuffer *SpriteDataBuffer = NULL;

/* Number of sprite instances; sizes both sprite buffers and the draw call. */
static const Uint32 SPRITE_COUNT = 512;

/* Directory prefix used by LoadShader() and LoadImage(); set from SDL_GetBasePath(). */
static const char *BasePath = NULL;

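The render pipeline describes the graphics workflow: which shaders are used, how blending is done, and other rendering options.
The sampler controls how the texture is read in the shader: filtering, address (clamping) mode, LOD range, and other options.
The transfer buffer is the staging buffer used to transfer the sprite data to the GPU.
The sprite data buffer is the GPU buffer that receives that data. SDL GPU buffers in general can hold vertices, indices, indirect draw commands, and general compute data; in this example the buffer holds the per-sprite data and is read by the vertex shader as a storage buffer.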

Structs

There are also two structs, used to store the data for the sprites and the projection matrix:

/*
 * Per-sprite data written to the sprite transfer buffer each frame.
 * The struct is sixteen floats, arranged as four groups of four.
 * NOTE(review): the layout presumably has to match the sprite struct declared
 * in the PullSpriteBatch vertex shader - confirm against the shader source.
 */
typedef struct SpriteInstance
{
    /* Position of the sprite. */
    float x;
    float y;
    float z;
    /* Rotation of the sprite (unit not visible here - presumably radians; confirm in the shader). */
    float rotation;

    /* Size of the sprite, followed by two unused floats that complete the group of four. */
    float w;
    float h;
    float padding_a;
    float padding_b;

    /* Region of the texture to draw: origin (tex_u, tex_v) and extent (tex_w, tex_h). */
    float tex_u;
    float tex_v;
    float tex_w;
    float tex_h;

    /* Colour of the sprite (red, green, blue, alpha). */
    float r;
    float g;
    float b;
    float a;
} SpriteInstance;

/*
 * 4x4 matrix of floats stored row after row: mRC is the element at
 * row R, column C (m11..m14 first, m41..m44 last).
 */
typedef struct Matrix4x4
{
    /* Row 1 */
    float m11;
    float m12;
    float m13;
    float m14;

    /* Row 2 */
    float m21;
    float m22;
    float m23;
    float m24;

    /* Row 3 */
    float m31;
    float m32;
    float m33;
    float m34;

    /* Row 4 */
    float m41;
    float m42;
    float m43;
    float m44;
} Matrix4x4;

Helper functions

We need a function to create an orthographic projection matrix:

Matrix4x4 Matrix4x4_CreateOrthographicOffCenter(
    float left,
    float right,
    float bottom,
    float top,
    float zNearPlane,
    float zFarPlane
) {
    return (Matrix4x4) {
        2.0f / (right - left), 0, 0, 0,
        0, 2.0f / (top - bottom), 0, 0,
        0, 0, 1.0f / (zNearPlane - zFarPlane), 0,
        (left + right) / (left - right), (top + bottom) / (bottom - top), zNearPlane / (zNearPlane - zFarPlane), 1
    };
}

A function to load the shaders:

/*
 * Loads a pre-compiled shader from disk and creates an SDL_GPUShader from it.
 *
 * The shader stage is derived from the file name: a name containing ".vert"
 * is a vertex shader, a name containing ".frag" is a fragment shader.
 * The binary format is chosen from the formats reported by the device, tried
 * in this order: SPIR-V (".spv"), MSL (".msl"), DXIL (".dxil"). The file is
 * looked up as "<BasePath>/<shaderFilename>.<extension>".
 *
 * device              GPU device the shader is created on.
 * shaderFilename      Shader name without the format extension, e.g. "Foo.vert".
 * samplerCount        Number of samplers the shader uses.
 * uniformBufferCount  Number of uniform buffers the shader uses.
 * storageBufferCount  Number of storage buffers the shader uses.
 * storageTextureCount Number of storage textures the shader uses.
 *
 * Returns the new shader, or NULL on failure (a message is logged). The caller
 * releases the shader with SDL_ReleaseGPUShader().
 */
SDL_GPUShader* LoadShader(
    SDL_GPUDevice* device,
    const char* shaderFilename,
    Uint32 samplerCount,
    Uint32 uniformBufferCount,
    Uint32 storageBufferCount,
    Uint32 storageTextureCount
) {
    // Auto-detect the shader stage from the file name for convenience
    SDL_GPUShaderStage stage;
    if (SDL_strstr(shaderFilename, ".vert"))
    {
        stage = SDL_GPU_SHADERSTAGE_VERTEX;
    }
    else if (SDL_strstr(shaderFilename, ".frag"))
    {
        stage = SDL_GPU_SHADERSTAGE_FRAGMENT;
    }
    else
    {
        SDL_Log("Invalid shader stage!");
        return NULL;
    }

    // Pick the first shader format supported by the device, together with the
    // file extension and the entry point name that go with it
    SDL_GPUShaderFormat backendFormats = SDL_GetGPUShaderFormats(device);
    SDL_GPUShaderFormat format;
    const char *extension;
    const char *entrypoint;

    if (backendFormats & SDL_GPU_SHADERFORMAT_SPIRV) {
        format = SDL_GPU_SHADERFORMAT_SPIRV;
        extension = "spv";
        entrypoint = "main";
    } else if (backendFormats & SDL_GPU_SHADERFORMAT_MSL) {
        format = SDL_GPU_SHADERFORMAT_MSL;
        extension = "msl";
        entrypoint = "main0";
    } else if (backendFormats & SDL_GPU_SHADERFORMAT_DXIL) {
        format = SDL_GPU_SHADERFORMAT_DXIL;
        extension = "dxil";
        entrypoint = "main";
    } else {
        SDL_Log("%s", "Unrecognized backend shader format!");
        return NULL;
    }

    // Build the path and refuse a truncated one: loading a cut-off path would
    // either fail with a confusing message or open the wrong file
    char fullPath[256];
    int pathLength = SDL_snprintf(fullPath, sizeof(fullPath), "%s/%s.%s", BasePath, shaderFilename, extension);
    if (pathLength < 0 || (size_t)pathLength >= sizeof(fullPath))
    {
        SDL_Log("Shader path is too long! %s", shaderFilename);
        return NULL;
    }

    size_t codeSize;
    void* code = SDL_LoadFile(fullPath, &codeSize);
    if (code == NULL)
    {
        SDL_Log("Failed to load shader from disk! %s", fullPath);
        return NULL;
    }

    // Zero the struct and assign the members one by one: unlike designated
    // initializers, this does not depend on the declaration order of the
    // members, so it compiles as C and as C++
    SDL_GPUShaderCreateInfo shaderInfo = {0};
    shaderInfo.code = (Uint8*)code;
    shaderInfo.code_size = codeSize;
    shaderInfo.entrypoint = entrypoint;
    shaderInfo.format = format;
    shaderInfo.stage = stage;
    shaderInfo.num_samplers = samplerCount;
    shaderInfo.num_uniform_buffers = uniformBufferCount;
    shaderInfo.num_storage_buffers = storageBufferCount;
    shaderInfo.num_storage_textures = storageTextureCount;

    SDL_GPUShader* shader = SDL_CreateGPUShader(device, &shaderInfo);

    // The file contents are no longer needed, whether creation succeeded or not
    SDL_free(code);

    if (shader == NULL)
    {
        SDL_Log("Failed to create shader!");
    }
    return shader;
}

A function to load the image:

SDL_Surface* LoadImage(const char* imageFilename, int desiredChannels)
{
    char fullPath[256];
    SDL_Surface *result;
    SDL_PixelFormat format;

    SDL_snprintf(fullPath, sizeof(fullPath), "%s/%s", BasePath, imageFilename);

    result = SDL_LoadBMP(fullPath);
    if (result == NULL)
    {
        SDL_Log("Failed to load BMP: %s", SDL_GetError());
        return NULL;
    }

    if (desiredChannels == 4)
    {
        format = SDL_PIXELFORMAT_ABGR8888;
    }
    else
    {
        SDL_assert(!"Unexpected desiredChannels");
        SDL_DestroySurface(result);
        return NULL;
    }
    if (result->format != format)
    {
        SDL_Surface *next = SDL_ConvertSurface(result, format);
        SDL_DestroySurface(result);
        result = next;
    }

    return result;
}

The main function

Context initialization

The first thing to do is to initialize SDL, then create the window and the renderer.
To use the GPU functions we need to create a GPU device and then associate the window with the GPU device.

// Initialize the video subsystem; nothing else can work without it
if (!SDL_Init(SDL_INIT_VIDEO)) {
    SDL_Log("Failed to initialize SDL: %s", SDL_GetError());
    return -1;
}

// Directory used by LoadShader() and LoadImage() to locate the assets
BasePath = SDL_GetBasePath();
if (BasePath == NULL) {
    SDL_Log("Failed to get the base path: %s", SDL_GetError());
    SDL_Quit();
    return -1;
}

// Create SDL Window
// NOTE(review): the official SDL GPU examples create the window without the
// SDL_WINDOW_VULKAN flag - confirm that it is needed on the targeted platforms.
SDL_Window* window = SDL_CreateWindow("SDL3 Unicode Text", 420, 300, SDL_WINDOW_VULKAN  | SDL_WINDOW_RESIZABLE);
if (!window) {
    SDL_Log("Failed to create window: %s", SDL_GetError());
    SDL_Quit();
    return -1;
}

// Create Renderer
// NOTE(review): all the drawing below goes through the GPU device, not through
// this renderer - confirm that the renderer is really needed.
SDL_Renderer* renderer = SDL_CreateRenderer(window, NULL);
if (!renderer) {
    SDL_Log("Failed to create renderer: %s", SDL_GetError());
    SDL_DestroyWindow(window);
    SDL_Quit();
    return -1;
}

// Create the GPU device, accepting any of the shader formats LoadShader() can load
SDL_GPUDevice* device = SDL_CreateGPUDevice(
    SDL_GPU_SHADERFORMAT_SPIRV | SDL_GPU_SHADERFORMAT_DXIL | SDL_GPU_SHADERFORMAT_MSL,
    false,
    NULL);

if (!device)
{
    SDL_Log("SDL_CreateGPUDevice failed: %s", SDL_GetError());
    SDL_DestroyRenderer(renderer);
    SDL_DestroyWindow(window);
    SDL_Quit();
    return -1;
}

// Associate the window with the GPU device so that we can render to it
if (!SDL_ClaimWindowForGPUDevice(device, window))
{
    SDL_Log("GPUClaimWindow failed: %s", SDL_GetError());
    SDL_DestroyGPUDevice(device);
    SDL_DestroyRenderer(renderer);
    SDL_DestroyWindow(window);
    SDL_Quit();
    return -1;
}

The swapchain parameters

Then we choose a present mode and use it to set up the swapchain parameters:

// VSYNC is the fallback; prefer IMMEDIATE, then MAILBOX, when the window supports them
SDL_GPUPresentMode presentMode = SDL_GPU_PRESENTMODE_VSYNC;
if (SDL_WindowSupportsGPUPresentMode(
    device,
    window,
    SDL_GPU_PRESENTMODE_IMMEDIATE
)) {
    presentMode = SDL_GPU_PRESENTMODE_IMMEDIATE;
}
else if (SDL_WindowSupportsGPUPresentMode(
    device,
    window,
    SDL_GPU_PRESENTMODE_MAILBOX
)) {
    presentMode = SDL_GPU_PRESENTMODE_MAILBOX;
}

// Not fatal when it fails: the swapchain keeps the parameters it already has
if (!SDL_SetGPUSwapchainParameters(
    device,
    window,
    SDL_GPU_SWAPCHAINCOMPOSITION_SDR,
    presentMode
)) {
    SDL_Log("SetGPUSwapchainParameters failed: %s", SDL_GetError());
}

Loading the shaders

Then we load the shaders :

// Create the shaders
SDL_GPUShader* vertShader = LoadShader(
    device,
    "PullSpriteBatch.vert",
    0,
    1,
    1,
    0
);

SDL_GPUShader* fragShader = LoadShader(
    device,
    "TexturedQuadColor.frag",
    1,
    0,
    0,
    0
);

The render pipeline

Then it's time to set up the pipeline. To create the render pipeline we need information about the color targets; each color target has its own format and blend state parameters.
The pipeline renders a list of triangles, and we need to pass the shaders to the pipeline.
Once the pipeline is created, we can release the shaders.

SDL_GPUColorTargetDescription target_desc[] {{
    .format = SDL_GetGPUSwapchainTextureFormat(device, window),
    .blend_state = {
        .enable_blend = true,
        .color_blend_op = SDL_GPU_BLENDOP_ADD,
        .alpha_blend_op = SDL_GPU_BLENDOP_ADD,
        .src_color_blendfactor = SDL_GPU_BLENDFACTOR_SRC_ALPHA,
        .dst_color_blendfactor = SDL_GPU_BLENDFACTOR_ONE_MINUS_SRC_ALPHA,
        .src_alpha_blendfactor = SDL_GPU_BLENDFACTOR_SRC_ALPHA,
        .dst_alpha_blendfactor = SDL_GPU_BLENDFACTOR_ONE_MINUS_SRC_ALPHA,
    }
}};
    
SDL_GPUGraphicsPipelineTargetInfo pipeline_target_info{
    .num_color_targets = 1,
    .color_target_descriptions = target_desc
};

SDL_GPUGraphicsPipelineCreateInfo pipeline_info{
    .target_info = pipeline_target_info,
    .primitive_type = SDL_GPU_PRIMITIVETYPE_TRIANGLELIST,
    .vertex_shader = vertShader,
    .fragment_shader = fragShader
};

// Create the sprite render pipeline
RenderPipeline = SDL_CreateGPUGraphicsPipeline(
    device,
    &pipeline_info
);

SDL_ReleaseGPUShader(device, vertShader);
SDL_ReleaseGPUShader(device, fragShader);

Loading the texture

There are multiple steps to load the texture. The first one is to load the image into a surface.
Then we create a GPU transfer buffer; to create it we need to give its usage and its size.
Then we map the buffer to get a pointer to its memory and copy the pixels of the surface into it using a memcpy.
Once the buffer is filled, it can be unmapped.

// Load the image data
SDL_Surface *imageData = LoadImage("check-mark.bmp", 4);
if (imageData == NULL)
{
    SDL_Log("Could not load image data!");
    return -1;
}

SDL_GPUTransferBufferCreateInfo buffer_info {
    .usage = SDL_GPU_TRANSFERBUFFERUSAGE_UPLOAD,
    .size = static_cast<Uint32>(imageData->w * imageData->h * 4)
};

SDL_GPUTransferBuffer* textureTransferBuffer = SDL_CreateGPUTransferBuffer(
    device,
    &buffer_info
);

Uint8 *textureTransferPtr = (Uint8*) SDL_MapGPUTransferBuffer(
    device,
    textureTransferBuffer,
    false
);
SDL_memcpy(textureTransferPtr, imageData->pixels, imageData->w * imageData->h * 4);
SDL_UnmapGPUTransferBuffer(device, textureTransferBuffer);

Then the GPU texture is created empty; its pixels are uploaded later, in the copy pass:

SDL_GPUTextureCreateInfo texture_info{
    .type = SDL_GPU_TEXTURETYPE_2D,
    .format = SDL_GPU_TEXTUREFORMAT_R8G8B8A8_UNORM,
    .width = static_cast<Uint32>(imageData->w),
    .height = static_cast<Uint32>(imageData->h),
    .layer_count_or_depth = 1,
    .num_levels = 1,
    .usage = SDL_GPU_TEXTUREUSAGE_SAMPLER
};

// Create the GPU resources
Texture = SDL_CreateGPUTexture(
    device,
    &texture_info
);

The sampler

SDL_GPUSamplerCreateInfo sampler_info {
    .min_filter = SDL_GPU_FILTER_NEAREST,
    .mag_filter = SDL_GPU_FILTER_NEAREST,
    .mipmap_mode = SDL_GPU_SAMPLERMIPMAPMODE_NEAREST,
    .address_mode_u = SDL_GPU_SAMPLERADDRESSMODE_CLAMP_TO_EDGE,
    .address_mode_v = SDL_GPU_SAMPLERADDRESSMODE_CLAMP_TO_EDGE,
    .address_mode_w = SDL_GPU_SAMPLERADDRESSMODE_CLAMP_TO_EDGE
};

Sampler = SDL_CreateGPUSampler(
    device,
    &sampler_info
);

The sprite data transfer buffer

SDL_GPUTransferBufferCreateInfo transfer_buffer_info {
    .usage = SDL_GPU_TRANSFERBUFFERUSAGE_UPLOAD,
    .size = SPRITE_COUNT * sizeof(SpriteInstance)
};

SpriteDataTransferBuffer = SDL_CreateGPUTransferBuffer(
    device,
    &transfer_buffer_info
);

The sprite data buffer

SDL_GPUBufferCreateInfo buffer_info_read {
    .usage = SDL_GPU_BUFFERUSAGE_GRAPHICS_STORAGE_READ,
    .size = SPRITE_COUNT * sizeof(SpriteInstance)
};

SpriteDataBuffer = SDL_CreateGPUBuffer(
    device,
    &buffer_info_read
);

Transfer the data to the GPU

// Transfer the up-front data
SDL_GPUCommandBuffer* uploadCmdBuf = SDL_AcquireGPUCommandBuffer(device);
SDL_GPUCopyPass* copyPass = SDL_BeginGPUCopyPass(uploadCmdBuf);

SDL_GPUTextureTransferInfo transfer_info {
    .transfer_buffer = textureTransferBuffer,
    .offset = 0, /* Zeroes out the rest */
};

SDL_GPUTextureRegion texture_region {
    .texture = Texture,
    .w = static_cast<Uint32>(imageData->w),
    .h = static_cast<Uint32>(imageData->h),
    .d = 1
};

SDL_UploadToGPUTexture(
    copyPass,
    &transfer_info,
    &texture_region,
    false
);

SDL_GPUTextureSamplerBinding sampler_binding {
    .texture = Texture,
    .sampler = Sampler
};

SDL_EndGPUCopyPass(copyPass);
SDL_SubmitGPUCommandBuffer(uploadCmdBuf);

SDL_DestroySurface(imageData);
SDL_ReleaseGPUTransferBuffer(device, textureTransferBuffer);

The main loop

bool running = true;
SDL_Event event;

static float uCoords[4] = { 0.0f, 0.5f, 0.0f, 0.5f };
static float vCoords[4] = { 0.0f, 0.0f, 0.5f, 0.5f };

while (running)
{
    // Handle events
    while (SDL_PollEvent(&event))
    {
        if (event.type == SDL_EVENT_QUIT)
        {
            running = false;
        }
    }

    Matrix4x4 cameraMatrix = Matrix4x4_CreateOrthographicOffCenter(
            0,
            640,
            480,
            0,
            0,
            -1
        );

        SDL_GPUCommandBuffer* cmdBuf = SDL_AcquireGPUCommandBuffer(device);
        if (cmdBuf == NULL)
        {
            SDL_Log("AcquireGPUCommandBuffer failed: %s", SDL_GetError());
            return -1;
        }

        SDL_GPUTexture* swapchainTexture;
        if (!SDL_WaitAndAcquireGPUSwapchainTexture(cmdBuf, window, &swapchainTexture, NULL, NULL)) {
            SDL_Log("WaitAndAcquireGPUSwapchainTexture failed: %s", SDL_GetError());
            return -1;
        }

        if (swapchainTexture != NULL)
        {
            // Build sprite instance transfer
            SpriteInstance* dataPtr = (SpriteInstance*)SDL_MapGPUTransferBuffer(
                device,
                SpriteDataTransferBuffer,
                true
            );

            int j = 0;
            for (Uint32 i = 0; i < SPRITE_COUNT; i += 1)
            {
                dataPtr[i].x = (32*i)%640;
                dataPtr[i].y = (64*j)%480;
                dataPtr[i].z = 0;
                dataPtr[i].rotation = 0;
                dataPtr[i].w = 24;
                dataPtr[i].h = 24;
                dataPtr[i].tex_u = 0;
                dataPtr[i].tex_v = 0;
                dataPtr[i].tex_w = 1.0f;
                dataPtr[i].tex_h = 1.0f;
                dataPtr[i].r = 1.0f;
                dataPtr[i].g = 1.0f;
                dataPtr[i].b = 1.0f;
                dataPtr[i].a = 1.0f;
                
                if(i%20 == 0)
                    j++;
            }

            SDL_UnmapGPUTransferBuffer(device, SpriteDataTransferBuffer);

            SDL_GPUTransferBufferLocation buffer_location {
                .transfer_buffer = SpriteDataTransferBuffer,
                .offset = 0
            };
            
            SDL_GPUBufferRegion buffer_region {
                .buffer = SpriteDataBuffer,
                .offset = 0,
                .size = SPRITE_COUNT * sizeof(SpriteInstance)
            };
            
            // Upload instance data
            SDL_GPUCopyPass* copyPass = SDL_BeginGPUCopyPass(cmdBuf);
            SDL_UploadToGPUBuffer(
                copyPass,
                &buffer_location,
                &buffer_region,
                true
            );
            SDL_EndGPUCopyPass(copyPass);

            SDL_GPUColorTargetInfo target_info{
                .texture = swapchainTexture,
                .cycle = false,
                .load_op = SDL_GPU_LOADOP_CLEAR,
                .store_op = SDL_GPU_STOREOP_STORE,
                .clear_color = { 0, 0, 0, 1 }
            };
            
            // Render sprites
            SDL_GPURenderPass* renderPass = SDL_BeginGPURenderPass(
                cmdBuf,
                &target_info,
                1,
                NULL
            );

            SDL_BindGPUGraphicsPipeline(renderPass, RenderPipeline);
            SDL_BindGPUVertexStorageBuffers(
                renderPass,
                0,
                &SpriteDataBuffer,
                1
            );
            SDL_GPUTextureSamplerBinding sampler_binding{
                .texture = Texture,
                .sampler = Sampler
            };
            SDL_BindGPUFragmentSamplers(
                renderPass,
                0,
                &sampler_binding,
                1
            );
            SDL_PushGPUVertexUniformData(
                cmdBuf,
                0,
                &cameraMatrix,
                sizeof(Matrix4x4)
            );
            SDL_DrawGPUPrimitives(
                renderPass,
                SPRITE_COUNT * 6,
                1,
                0,
                0
            );

            SDL_EndGPURenderPass(renderPass);
        }

        SDL_SubmitGPUCommandBuffer(cmdBuf);


    // Present the frame
    SDL_GL_SwapWindow(window);
}

Cleanup

// Cleanup
SDL_ReleaseGPUGraphicsPipeline(device, RenderPipeline);
SDL_ReleaseGPUSampler(device, Sampler);
SDL_ReleaseGPUTexture(device, Texture);
SDL_ReleaseGPUTransferBuffer(device, SpriteDataTransferBuffer);
SDL_ReleaseGPUBuffer(device, SpriteDataBuffer);

Full project

Download the full project:

For mac: SDL3_SpriteBatchGPU_mac.7z
For windows pick the file: SDL3_SpriteBatchGPU_win.7z