GEMV 9: Memcpy Streaming Mode¶
We present an alternative version of the previous example, in which we use the streaming mode of memcpy to stream x and b onto the device and to stream y off the device. All of the previous examples used the copy mode of memcpy. This example simply presents the basics of streaming mode; future tutorials will demonstrate some use cases for this mode.
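For reference, the two modes are distinguished on the host by the streaming flag and by the destination argument of the SdkRuntime call. The sketch below is illustrative only and uses placeholder names (dst_symbol, dst_color, data, width, height, elems_per_pe); the actual calls for this example appear in run.py below.

# Copy mode: the destination is a device symbol obtained from runner.get_id()
runner.memcpy_h2d(dst_symbol, data, 0, 0, width, height, elems_per_pe,
                  streaming=False, order=MemcpyOrder.ROW_MAJOR,
                  data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False)

# Streaming mode: the destination is a memcpy streaming color; each wavelet
# activates the data task bound to that color on the receiving PE
runner.memcpy_h2d(dst_color, data, 0, 0, width, height, elems_per_pe,
                  streaming=True, order=MemcpyOrder.ROW_MAJOR,
                  data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False)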
The host code no longer includes an explicit kernel launch. Instead, computation is started by the wavelet-triggered tasks that receive elements of x and b along the top row and left column of PEs, respectively. Computation finishes when the kernel streams the result y back to the host.
The colors MEMCPYH2D_DATA_1 and MEMCPYH2D_DATA_2 are used to stream x and b onto the device, respectively, while MEMCPYD2H_DATA_1 is used to stream y off the device.
Note that, because memcpy commands are serialized, the order of these streaming-mode memcpy_h2d calls in this example is important. If the b values were streamed in before x, the program would hang: the forwarded b values on recv_west_color are consumed only by the reduce task, which does not activate until a PE has received all of its x elements, and the serialized x transfer would never be issued.
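To make the ordering concrete, here is a condensed view of the three streaming transfers in the order they must be issued; these are the same calls that appear in run.py below.

# 1. Stream x to the top row of the kernel; it drives the fmacs computation
runner.memcpy_h2d(MEMCPYH2D_DATA_1, x, 0, 0, kernel_x_dim, 1, N_per_PE, streaming=True,
                  order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False)
# 2. Then stream b to the left column; it is consumed by the reduce task
runner.memcpy_h2d(MEMCPYH2D_DATA_2, b, 0, 0, 1, kernel_y_dim, M_per_PE, streaming=True,
                  order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False)
# 3. Finally, receive y from the right column
runner.memcpy_d2h(y_result, MEMCPYD2H_DATA_1, kernel_x_dim-1, 0, 1, kernel_y_dim, M_per_PE,
                  streaming=True, order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT,
                  nonblock=False)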
layout.csl¶
// kernel dimensions
param kernel_x_dim: i16;
param kernel_y_dim: i16;
// total matrix dimensions
param M: i16;
param N: i16;
// IDs for memcpy streaming colors
param MEMCPYH2D_DATA_1_ID: i16; // streams x from host to top row
param MEMCPYH2D_DATA_2_ID: i16; // streams b from host to left column
param MEMCPYD2H_DATA_1_ID: i16; // streams y from right column to host
// Colors
const ax_color_1: color = @get_color(3); // sends/recvs partial result Ax EAST
const ax_color_2: color = @get_color(4); // sends/recvs partial result Ax EAST
const x_color: color = @get_color(5); // sends/recvs elems x
const MEMCPYH2D_DATA_1: color = @get_color(MEMCPYH2D_DATA_1_ID);
const MEMCPYH2D_DATA_2: color = @get_color(MEMCPYH2D_DATA_2_ID);
const MEMCPYD2H_DATA_1: color = @get_color(MEMCPYD2H_DATA_1_ID);
const LAUNCH: color = @get_color(8); // Color used by memcpy for kernel launch
// Task IDs
const recv_x_task_id: data_task_id = @get_data_task_id(x_color); // Task ID for WTT
const memcpy_recv_x_task_id: data_task_id = @get_data_task_id(MEMCPYH2D_DATA_1);
const memcpy_recv_b_task_id: data_task_id = @get_data_task_id(MEMCPYH2D_DATA_2);
const reduce_task_id: local_task_id = @get_local_task_id(10);
// This example uses kernel_x_dim x kernel_y_dim PEs
const memcpy = @import_module("<memcpy/get_params>", .{
  .width = kernel_x_dim,
  .height = kernel_y_dim,
  .MEMCPYH2D_1 = MEMCPYH2D_DATA_1,
  .MEMCPYH2D_2 = MEMCPYH2D_DATA_2,
  .MEMCPYD2H_1 = MEMCPYD2H_DATA_1,
  .LAUNCH = LAUNCH
});
layout {
  // PE coordinates are (column, row)
  @set_rectangle(kernel_x_dim, kernel_y_dim);
  const common_params = .{
    .M_per_PE = M / kernel_y_dim,
    .N_per_PE = N / kernel_x_dim,
    .kernel_x_dim = kernel_x_dim,
    .kernel_y_dim = kernel_y_dim,
    // Colors
    .x_color = x_color,
    // Task IDs
    .recv_x_task_id = recv_x_task_id,
    .memcpy_recv_x_task_id = memcpy_recv_x_task_id,
    .memcpy_recv_b_task_id = memcpy_recv_b_task_id,
    .reduce_task_id = reduce_task_id
  };
  const even_params = @concat_structs(common_params, .{
    .send_east_color = ax_color_1, .recv_west_color = ax_color_2
  });
  const odd_params = @concat_structs(common_params, .{
    .send_east_color = ax_color_2, .recv_west_color = ax_color_1
  });
  for (@range(i16, kernel_x_dim)) |pe_x| {
    for (@range(i16, kernel_y_dim)) |pe_y| {
      if (pe_x % 2 == 0) {
        @set_tile_code(pe_x, pe_y, "pe_program.csl", @concat_structs(
          .{ .memcpy_params = memcpy.get_params(pe_x) }, even_params));
      } else {
        @set_tile_code(pe_x, pe_y, "pe_program.csl", @concat_structs(
          .{ .memcpy_params = memcpy.get_params(pe_x) }, odd_params));
      }
    }
  }
  // Create route values
  const RX_R_TX_RS = .{ .rx = .{RAMP}, .tx = .{RAMP, SOUTH} };
  const RX_N_TX_RS = .{ .rx = .{NORTH}, .tx = .{RAMP, SOUTH} };
  const RX_N_TX_R = .{ .rx = .{NORTH}, .tx = .{RAMP} };
  const RX_R_TX_R = .{ .rx = .{RAMP}, .tx = .{RAMP} };
  const RX_W_TX_R = .{ .rx = .{WEST}, .tx = .{RAMP} };
  const RX_R_TX_E = .{ .rx = .{RAMP}, .tx = .{EAST} };
  for (@range(i16, kernel_x_dim)) |pe_x| {
    for (@range(i16, kernel_y_dim)) |pe_y| {
      if (pe_y == 0) {
        @set_color_config(pe_x, pe_y, x_color, .{ .routes = RX_R_TX_RS });
      } else if (pe_y == kernel_y_dim-1) {
        @set_color_config(pe_x, pe_y, x_color, .{ .routes = RX_N_TX_R });
      } else {
        @set_color_config(pe_x, pe_y, x_color, .{ .routes = RX_N_TX_RS });
      }
      if (pe_x == 0) {
        @set_color_config(pe_x, pe_y, ax_color_1, .{ .routes = RX_R_TX_E });
        @set_color_config(pe_x, pe_y, ax_color_2, .{ .routes = RX_R_TX_R });
      } else if (pe_x % 2 == 0) {
        @set_color_config(pe_x, pe_y, ax_color_1, .{ .routes = RX_R_TX_E });
        @set_color_config(pe_x, pe_y, ax_color_2, .{ .routes = RX_W_TX_R });
      } else {
        @set_color_config(pe_x, pe_y, ax_color_1, .{ .routes = RX_W_TX_R });
        @set_color_config(pe_x, pe_y, ax_color_2, .{ .routes = RX_R_TX_E });
      }
    }
  }
  // export symbol names
  @export_name("A", [*]f32, true);
}
pe_program.csl¶
param memcpy_params: comptime_struct;
// Matrix dimensions
param M_per_PE: i16;
param N_per_PE: i16;
// Program rectangle dimensions
param kernel_x_dim: u16;
param kernel_y_dim: u16;
// Colors
param send_east_color: color; // sends partial result Ax EAST
param recv_west_color: color; // recvs partial result Ax WEST
param x_color: color; // sends elems x SOUTH
// Task IDs
param recv_x_task_id: data_task_id; // Task ID for data task recv_x, consumes x_color wlts
param memcpy_recv_x_task_id: data_task_id; // Task ID for data task that recvs x from memcpy
param memcpy_recv_b_task_id: data_task_id; // Task ID for data task that recvs b from memcpy
param reduce_task_id: local_task_id; // Task ID used by reduce task
// memcpy module provides infrastructure for copying data
// and launching functions from the host
const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
// layout module provides PE coordinates at runtime
const layout_mod = @import_module("<layout>");
// 48 kB of global memory contain A and y
var A: [M_per_PE*N_per_PE]f32; // A is stored column major
var y: [M_per_PE]f32;
// DSDs for accessing A, x, y
// A_dsd accesses column of A
var A_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M_per_PE} -> A[i] });
var y_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M_per_PE} -> y[i] });
// ptr to A will be advertised as symbol to host
var A_ptr: [*]f32 = &A;
fn is_right_col() bool {
  return (layout_mod.get_x_coord() == kernel_x_dim-1);
}
task reduce() void {
  const in_dsd = @get_dsd(fabin_dsd, .{
    .fabric_color = recv_west_color, .extent = M_per_PE,
    .input_queue = @get_input_queue(1)
  });
  if (is_right_col()) {
    const out_dsd = @get_dsd(fabout_dsd, .{
      .fabric_color = sys_mod.MEMCPYD2H_1, .extent = M_per_PE,
      .output_queue = @get_output_queue(2)
    });
    @fadds(out_dsd, y_dsd, in_dsd, .{ .async = true });
  } else {
    const out_dsd = @get_dsd(fabout_dsd, .{
      .fabric_color = send_east_color, .extent = M_per_PE,
      .output_queue = @get_output_queue(2)
    });
    @fadds(out_dsd, y_dsd, in_dsd, .{ .async = true });
  }
}
// Used to keep track of # of invocations of recv_x task
// when num_recv_x == N_per_PE, we are done receiving x elements
var num_recv_x: i16 = 0;
task recv_x(x_val: f32) void {
  @fmacs(y_dsd, y_dsd, A_dsd, x_val);
  A_dsd = @increment_dsd_offset(A_dsd, M_per_PE, f32);
  num_recv_x += 1;
  if (num_recv_x == N_per_PE) {
    @activate(reduce_task_id);
  }
}
// buf stores an element in memory to be used by a microthreaded operation
var buf = @zeros([1]f32);
const mem_buf_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{1} -> buf[i] });
const memcpy_x_dsd = @get_dsd(fabout_dsd, .{
  .fabric_color = x_color, .extent = 1,
  .output_queue = @get_output_queue(3)
});
// 1st row receives x from MEMCPYH2D_DATA_1, then
// forwards data to the whole column, including itself, via color "x_color"
task memcpy_recv_x(data : f32) void {
  @block(memcpy_recv_x_task_id);
  buf[0] = data;
  @fmovs(memcpy_x_dsd, mem_buf_dsd, .{.async = true, .unblock = memcpy_recv_x_task_id });
}
const memcpy_b_dsd = @get_dsd(fabout_dsd, .{
  .fabric_color = recv_west_color, .extent = 1,
  .output_queue = @get_output_queue(3)
});
// 1st column receives b from MEMCPYH2D_DATA_2, then
// forwards data to itself via color "recv_west_color"
task memcpy_recv_b(data : f32) void {
  @block(memcpy_recv_b_task_id);
  buf[0] = data;
  @fmovs(memcpy_b_dsd, mem_buf_dsd, .{.async = true, .unblock = memcpy_recv_b_task_id });
}
comptime {
  // These WTTs are activated by receiving wavelets streamed from host
  @bind_data_task(memcpy_recv_x, memcpy_recv_x_task_id);
  @bind_data_task(memcpy_recv_b, memcpy_recv_b_task_id);
  // recv_x is WTT activated by receiving wavelets along color x_color,
  // which corresponds to recv_x_task_id
  @bind_data_task(recv_x, recv_x_task_id);
  // reduce is local task activated by reduce_task_id
  @bind_local_task(reduce, reduce_task_id);
  @export_symbol(A_ptr, "A");
  @rpc(@get_data_task_id(sys_mod.LAUNCH));
}
run.py¶
#!/usr/bin/env cs_python
import argparse
import json
import numpy as np
from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType, MemcpyOrder # pylint: disable=no-name-in-module
# Read arguments
parser = argparse.ArgumentParser()
parser.add_argument('--name', help="the test compile output dir")
parser.add_argument('--cmaddr', help="IP:port for CS system")
args = parser.parse_args()
# Get matrix dimensions from compile metadata
with open(f"{args.name}/out.json", encoding='utf-8') as json_file:
  compile_data = json.load(json_file)
# Matrix dimensions
N = int(compile_data['params']['N']) # columns
M = int(compile_data['params']['M']) # rows
# PE grid dimensions
kernel_x_dim = int(compile_data['params']['kernel_x_dim'])
kernel_y_dim = int(compile_data['params']['kernel_y_dim'])
# Colors used for memcpy streaming
MEMCPYH2D_DATA_1 = int(compile_data['params']['MEMCPYH2D_DATA_1_ID'])
MEMCPYH2D_DATA_2 = int(compile_data['params']['MEMCPYH2D_DATA_2_ID'])
MEMCPYD2H_DATA_1 = int(compile_data['params']['MEMCPYD2H_DATA_1_ID'])
# Construct A, x, b
A = np.arange(M*N, dtype=np.float32).reshape(M,N)
x = np.full(shape=N, fill_value=1.0, dtype=np.float32)
b = np.full(shape=M, fill_value=2.0, dtype=np.float32)
# Calculate expected y
y_expected = A@x + b
# Size of N dimension on each PE
N_per_PE = N // kernel_x_dim
M_per_PE = M // kernel_y_dim
# Construct a runner using SdkRuntime
runner = SdkRuntime(args.name, cmaddr=args.cmaddr)
# Get symbol for A on device
A_symbol = runner.get_id('A')
# Load and run the program
runner.load()
runner.run()
# Copy chunks of A into all PEs
# Each chunk on each PE is stored column major
A_prepared = A.reshape(kernel_y_dim, M_per_PE, kernel_x_dim, N_per_PE).transpose(0, 2, 3, 1).ravel()
runner.memcpy_h2d(A_symbol, A_prepared, 0, 0, kernel_x_dim, kernel_y_dim, M_per_PE*N_per_PE,
streaming=False, order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT,
nonblock=False)
# Stream x into the top row of PEs: (0, 0) through (kernel_x_dim-1, 0)
# Each top-row PE receives N_per_PE consecutive elements of x
runner.memcpy_h2d(MEMCPYH2D_DATA_1, x, 0, 0, kernel_x_dim, 1, N_per_PE, streaming=True,
order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False)
# Stream b into PEs (0, 0) to (0, kernel_y_dim-1)
runner.memcpy_h2d(MEMCPYH2D_DATA_2, b, 0, 0, 1, kernel_y_dim, M_per_PE, streaming=True,
order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False)
# Stream y back from the right column of PEs: (kernel_x_dim-1, 0) through (kernel_x_dim-1, kernel_y_dim-1)
y_result = np.zeros([M], dtype=np.float32)
runner.memcpy_d2h(y_result, MEMCPYD2H_DATA_1, kernel_x_dim-1, 0, 1, kernel_y_dim, M_per_PE,
streaming=True, order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT,
nonblock=False)
# Stop the program
runner.stop()
# Ensure that the result matches our expectation
np.testing.assert_allclose(y_result, y_expected)
print("SUCCESS!")
commands.sh¶
#!/usr/bin/env bash
set -e
cslc ./layout.csl --fabric-dims=11,5 \
  --fabric-offsets=4,1 --params=kernel_x_dim:4,kernel_y_dim:3,M:6,N:8 \
  --params=MEMCPYH2D_DATA_1_ID:0 \
  --params=MEMCPYH2D_DATA_2_ID:1 \
  --params=MEMCPYD2H_DATA_1_ID:2 \
  -o out --memcpy --channels 1
cs_python run.py --name out
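As a quick sanity check on these compile parameters: the M x N = 6 x 8 matrix is split evenly across the 4 x 3 rectangle of PEs, so each PE holds an M_per_PE x N_per_PE = (6/3) x (8/4) = 2 x 2 block of A, each top-row PE receives 2 elements of x, each left-column PE receives 2 elements of b, and each right-column PE streams 2 elements of y back to the host. The fabric dimensions 11,5 with offsets 4,1 leave room around the 4 x 3 kernel rectangle for the additional PEs used by the memcpy infrastructure.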