Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Add 10

Implement a kernel that adds 10 to each position of vector a and stores it in vector out.

More threads than data — guard against out-of-bounds access.

### Add 10
### Implement a kernel that adds 10 to each position of vector a and stores it in vector out.
### More threads than data — guard against out-of-bounds access.

from gpu.host import DeviceContext
from memory import UnsafePointer
from gpu import thread_idx, block_dim, block_idx
from std.testing import assert_equal

# Number of elements in the input and output vectors; also the bound used by
# the kernel's out-of-bounds guard.
comptime SIZE = 4
# Passed as grid_dim at launch: a single block.
comptime BLOCKS_PER_GRID = 1
# Passed as block_dim at launch: 8 x 1 threads, i.e. more threads than the
# SIZE elements available, which is why the kernel needs a guard.
comptime THREADS_PER_BLOCK = (8, 1)
# Element type shared by the device buffers, the host buffer and the kernel
# pointer arguments.
comptime dtype = DType.float32


def add_10_with_guard(
    out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]]
):
    """Write `array[i] + 10` into `out[i]` for every in-range thread index.

    Each thread derives one flat index from its (x, y, z) position inside
    the block. Threads whose index is `SIZE` or larger do nothing, so a
    launch with more threads than elements never touches memory past the
    end of either buffer.

    NOTE: the index is built from `thread_idx` and `block_dim` only;
    `block_idx` is not part of it, so every block of a multi-block launch
    would compute the same indices.

    Args:
        out: Destination pointer; elements `0..SIZE-1` are written.
        array: Source pointer; elements `0..SIZE-1` are read.
    """
    # Row-major flattening of the 3-D thread coordinate: x varies fastest,
    # then y, then z.
    row_stride = block_dim.x
    plane_stride = block_dim.y * row_stride
    flat_idx = (
        thread_idx.z * plane_stride
        + thread_idx.y * row_stride
        + thread_idx.x
    )

    # Surplus threads (index past the data) leave without reading or writing.
    if flat_idx >= SIZE:
        return

    out[flat_idx] = array[flat_idx] + 10


def main() raises:
    """Run `add_10_with_guard` on the device and check its output.

    Fills a SIZE-element input with 0, 1, ..., SIZE-1, launches the kernel
    with BLOCKS_PER_GRID blocks of THREADS_PER_BLOCK threads, then prints the
    expected and the actual values and asserts that they match element by
    element.

    Raises:
        Any error propagated by the device-context calls, or by
        `assert_equal` when an output element differs from its expected
        value.
    """
    device = DeviceContext()

    # Device-side storage for the input and the result; the result starts
    # out zero-filled.
    input_dev = device.enqueue_create_buffer[dtype](SIZE)
    output_dev = device.enqueue_create_buffer[dtype](SIZE)
    _ = output_dev.enqueue_fill(0)

    # Host-side buffer holding the reference answer.
    reference = device.enqueue_create_host_buffer[dtype](SIZE)

    # Populate the input with its own indices: 0, 1, ..., SIZE-1.
    with input_dev.map_to_host() as input_host:
        for idx in range(SIZE):
            input_host[idx] = idx

    device.enqueue_function[add_10_with_guard](
        output_dev.unsafe_ptr(),
        input_dev.unsafe_ptr(),
        grid_dim=BLOCKS_PER_GRID,
        block_dim=THREADS_PER_BLOCK,
    )

    # Block until the work enqueued above has completed before the host
    # touches any results.
    device.synchronize()

    # Reference values: each input element plus 10.
    for idx in range(SIZE):
        reference[idx] = idx + 10

    print(reference)

    with output_dev.map_to_host() as output_host:
        print(output_host)
        for idx in range(SIZE):
            assert_equal(output_host[idx], reference[idx])

View source on GitHub