Topic 1: Arrays and Pointers
Arrays can only be passed to or returned from functions used at compile time. For functions used at runtime, pointers should be used instead. This example demonstrates a function incrementAndSum(), which accepts a pointer to an array and a pointer to a scalar. When declaring an array pointer, CSL requires that the type specification contain the size of the array. CSL does not have a null pointer.
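For instance, a minimal sketch of such a declaration (the function and parameter names below are illustrative and not part of this example's source files) looks as follows; the parameter type *[3]i16 reads as "pointer to an array of exactly three i16 values", with the length fixed in the type:

// Illustrative sketch only; not part of layout.csl or pe_program.csl
fn zeroBuffer(bufPtr: *[3]i16) void {
  // The pointee type includes the array length, so the compiler knows how
  // many elements may be accessed through bufPtr
  (bufPtr.*)[0] = 0;
  (bufPtr.*)[1] = 0;
  (bufPtr.*)[2] = 0;
}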
Pointers are dereferenced using the .* syntax. Once dereferenced, they can be used just like non-pointer variables; for example, (dataPtr.*)[0] indexes into the first element of the pointed-to array.
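As a condensed preview of the pattern used by incrementAndSum() below (the names here are hypothetical), the callee dereferences and indexes the array pointer, while the caller takes addresses with &:

// Illustrative sketch only; the complete program appears in pe_program.csl
fn sumThree(dataPtr: *[3]i16, outPtr: *i16) void {
  // dataPtr.* yields the pointed-to array, which is then indexed normally
  outPtr.* = (dataPtr.*)[0] + (dataPtr.*)[1] + (dataPtr.*)[2];
}

task demoTask() void {
  var values = [3]i16 { 4, 5, 6 };
  var total: i16 = 0;

  // & takes the address of a variable, yielding *[3]i16 and *i16 here
  sumThree(&values, &total);
}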
layout.csl
// The core kernel must start at P4.1 so the memcpy infrastructure has enough
// resources to route the data between the host and the device.
// color/ task ID map
//
//  ID var              ID var  ID var               ID var
//   0                   9      18                   27 reserved (memcpy)
//   1 MEMCPYD2H_DATA_1 10      19                   28 reserved (memcpy)
//   2 LAUNCH           11      20                   29 reserved
//   3                  12      21 reserved (memcpy) 30 reserved (memcpy)
//   4                  13      22 reserved (memcpy) 31 reserved
//   5                  14      23 reserved (memcpy) 32
//   6                  15      24                   33
//   7                  16      25                   34
//   8 main_task_id     17      26                   35
// IDs for memcpy streaming colors
param MEMCPYD2H_DATA_1_ID: i16;
// Colors
const MEMCPYD2H_DATA_1: color = @get_color(MEMCPYD2H_DATA_1_ID);
const LAUNCH: color = @get_color(2); // Color used by memcpy for kernel launch
// Task IDs
const main_task_id: local_task_id = @get_local_task_id(8);
const memcpy = @import_module( "<memcpy/get_params>", .{
  .width = 1,
  .height = 1,
  .MEMCPYD2H_1 = MEMCPYD2H_DATA_1,
  .LAUNCH = LAUNCH,
});

layout {
  @set_rectangle(1, 1);
  @set_tile_code(0, 0, "pe_program.csl", .{
    .memcpy_params = memcpy.get_params(0),
    .main_task_id = main_task_id
  });

  // export symbol name
  @export_name("f_run", fn()void);
}
pe_program.csl
// Not a complete program; the top-level source file is layout.csl
param memcpy_params: comptime_struct;
// Task IDs
param main_task_id: local_task_id;
const sys_mod = @import_module( "<memcpy/memcpy>", memcpy_params);
// Output DSD: streams a single value to the host over the D2H color
const dsd = @get_dsd(fabout_dsd, .{
  .extent = 1,
  .fabric_color = sys_mod.MEMCPYD2H_1
});

fn incrementAndSum(dataPtr: *[3]i16, resultPtr: *i16) void {
  // Write an updated value to each element of the array
  (dataPtr.*)[0] += 1;
  (dataPtr.*)[1] += 1;
  (dataPtr.*)[2] += 1;

  // Read all array values, sum them, and write the result
  resultPtr.* = (dataPtr.*)[0] + (dataPtr.*)[1] + (dataPtr.*)[2];
}

task mainTask() void {
  var result: i16 = 0;
  var data = [3]i16 { 1, 2, 3 };

  incrementAndSum(&data, &result);

  // Send the 16-bit result to the host through the output DSD
  @mov16(dsd, result);
}

fn f_run() void {
  @activate(main_task_id);

  // RPC returns early before the data is sent out via D2H color
  // The host must wait for streaming D2H
  // WARNING: the user must unblock cmd color for every PE
  sys_mod.unblock_cmd_stream();
}

comptime {
  @bind_local_task(mainTask, main_task_id);
  @export_symbol(f_run);
  @rpc(@get_data_task_id(sys_mod.LAUNCH));
}
run.py
#!/usr/bin/env cs_python
import argparse
import json
import numpy as np
from cerebras.sdk.sdk_utils import memcpy_view
from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType # pylint: disable=no-name-in-module
from cerebras.sdk.runtime.sdkruntimepybind import MemcpyOrder # pylint: disable=no-name-in-module
parser = argparse.ArgumentParser()
parser.add_argument('--name', help='the test name')
parser.add_argument("--cmaddr", help="IP:port for CS system")
args = parser.parse_args()
dirname = args.name
# Parse the compile metadata
with open(f"{dirname}/out.json", encoding="utf-8") as json_file:
compile_data = json.load(json_file)
params = compile_data["params"]
MEMCPYD2H_DATA_1 = int(params["MEMCPYD2H_DATA_1_ID"])
print(f"MEMCPYD2H_DATA_1 = {MEMCPYD2H_DATA_1}")
memcpy_dtype = MemcpyDataType.MEMCPY_16BIT
runner = SdkRuntime(dirname, cmaddr=args.cmaddr)
runner.load()
runner.run()
print("step 1: call f_run to send global of type i16 via streaming D2H")
runner.launch("f_run", nonblock=False)
print("step 2: streaming D2H")
# The D2H buffer must be of type u32
out_tensors_u32 = np.zeros(1, np.uint32)
runner.memcpy_d2h(out_tensors_u32, MEMCPYD2H_DATA_1, 0, 0, 1, 1, 1, \
    streaming=True, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=False)
# Remove the upper 16 bits of each u32 to recover the 16-bit values
result_tensor = memcpy_view(out_tensors_u32, np.dtype(np.int16))
runner.stop()
# Ensure that the result matches our expectation
np.testing.assert_equal(result_tensor, [9])
print("SUCCESS!")
commands.sh
#!/usr/bin/env bash
set -e
cslc ./layout.csl --fabric-dims=8,3 \
--fabric-offsets=4,1 -o out \
--params=MEMCPYD2H_DATA_1_ID:1 \
--memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
cs_python run.py --name out