Add a constant 10

Implement a kernel that adds 10 to each element of the 2D matrix a and stores the result in the 2D matrix out.

from gpu.host import DeviceContext
from memory import UnsafePointer
from gpu import thread_idx, block_dim
from std.testing import assert_equal

# Side length of the square matrix; buffers in main hold SIZE * SIZE elements.
comptime SIZE = 2
# Passed as grid_dim when launching the kernel: a single block.
comptime BLOCKS_PER_GRID = 1
# Passed as block_dim when launching the kernel: 3 x 3 = 9 threads, which is
# more than the SIZE * SIZE = 4 elements, so the kernel bounds-checks its index.
comptime THREADS_PER_BLOCK = (3,3)
# Element type used for every buffer and for the kernel's pointer arguments.
comptime dtype = DType.float32


def add_10_2d(
    out: UnsafePointer[Scalar[dtype]], array: UnsafePointer[Scalar[dtype]], size: Int
):
    """Add 10 to one element of `array` and write the result to `out`.

    Each thread derives a single flat index from its position inside the
    block and handles the element at that index. Only `thread_idx` and
    `block_dim` are consulted, so the index is unique within one block only.

    Args:
        out: Destination; element `i` receives `array[i] + 10`.
        array: Source values, read at the same flat index.
        size: Side length of the matrix; indices at or beyond `size * size`
            are skipped.
    """
    # Number of threads in one z-slice of the block.
    slice_threads = block_dim.y * block_dim.x
    # Offset of this thread's row within its z-slice.
    row_offset = thread_idx.y * block_dim.x
    flat_idx = thread_idx.z * slice_threads + row_offset + thread_idx.x

    # Threads that do not map to a matrix element have nothing to do.
    if flat_idx >= size * size:
        return

    out[flat_idx] = array[flat_idx] + 10


def main():
    """Run `add_10_2d` on a SIZE x SIZE matrix and check the result.

    Creates zero-filled input and output buffers plus a host-side reference
    buffer, fills the input with 0, 1, 2, ... while recording `value + 10`
    as the expected result, launches the kernel, and compares every output
    element against the reference. Any error raised along the way
    (including a failed `assert_equal`) is caught and printed.
    """
    try:
        gpu = DeviceContext()
        input_dev = gpu.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        output_dev = gpu.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        expected = gpu.enqueue_create_host_buffer[dtype](SIZE * SIZE).enqueue_fill(0)

        with input_dev.map_to_host() as input_host:
            # Row-major positions row * SIZE + col enumerate 0 .. SIZE*SIZE - 1
            # in order, so a single flat loop visits the same elements.
            for flat in range(SIZE * SIZE):
                input_host[flat] = flat
                expected[flat] = input_host[flat] + 10
            print("Input: ", input_host)

        gpu.enqueue_function[add_10_2d](
            output_dev.unsafe_ptr(),
            input_dev.unsafe_ptr(),
            SIZE,
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )

        # Wait for the enqueued work before reading the output back.
        gpu.synchronize()

        with output_dev.map_to_host() as output_host:
            print(output_host)
            print(expected)
            for pos in range(SIZE * SIZE):
                assert_equal(output_host[pos], expected[pos])

    except err:
        print(err)

View source on GitHub

Keyboard shortcuts

Mojo Programming

Add a constant 10