Keyboard shortcuts

Press or to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Add 10

Implement a kernel that adds 10 to each position of vector a and stores it in vector out.

from gpu.host import DeviceContext
from memory import UnsafePointer
from gpu import thread_idx

comptime SIZE = 4
comptime BLOCKS_PER_GRID = 1
comptime THREADS_PER_BLOCK = SIZE
comptime dtype = DType.float32


def add_10(
    out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]]
):
    tid = thread_idx.x
    out[tid] = array[tid] + 10


def main() raises:
    ctx = DeviceContext()
    d_array_buff = ctx.enqueue_create_buffer[dtype](SIZE)
    expected = ctx.enqueue_create_buffer[dtype](SIZE)
    d_out_buff = ctx.enqueue_create_buffer[dtype](SIZE)

    _ = d_out_buff.enqueue_fill(0)

    with d_array_buff.map_to_host() as h_array_buff:
        for i in range(SIZE):
            h_array_buff[i] = i

    ctx.enqueue_function[add_10](
        d_out_buff.unsafe_ptr(),
        d_array_buff.unsafe_ptr(),
        grid_dim=BLOCKS_PER_GRID,
        block_dim=THREADS_PER_BLOCK,
    )

    ctx.synchronize()

    with d_out_buff.map_to_host() as h_out_buff:
        print(h_out_buff)

View source on GitHub