Topic 6: Libraries

The CSL compiler comes bundled with a few standard libraries, which can be imported into the user’s program using the @import_module() builtin. This example shows three such compiler-bundled libraries:

  • the random library for generating uniform random numbers,

  • the time library (imported as <time>) for reading the on-chip timestamp counter, and

  • the math library for computing square roots.

layout.csl

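// Color / task ID map
//
// Note: D2H_1 is not hard-coded in this file; its ID is whatever value the
// MEMCPYD2H_DATA_1_ID param is given at compile time. The table lists it at 0.
//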
//  ID var           ID var     ID var                ID var
//   0 D2H_1          9         18                    27 reserved (memcpy)
//   1               10         19                    28 reserved (memcpy)
//   2 LAUNCH        11         20                    29 reserved
//   3               12         21 reserved (memcpy)  30 reserved (memcpy)
//   4               13         22 reserved (memcpy)  31 reserved
//   5               14         23 reserved (memcpy)  32
//   6               15         24                    33
//   7               16         25                    34
//   8 main_task_id  17         26                    35
//

// ID of the color that carries streaming device-to-host (D2H) data.
// Supplied at compile time and turned into a color below.
param MEMCPYD2H_DATA_1_ID: i16; // ID for memcpy streaming colors

// Number of loop iterations; forwarded unchanged to pe_program.csl.
param iterations: u32;

// Colors
const MEMCPYD2H_DATA_1: color = @get_color(MEMCPYD2H_DATA_1_ID);
const LAUNCH:           color = @get_color(2);

// Task IDs
const main_task_id: local_task_id = @get_local_task_id(8);

// Instantiate the memcpy parameter helper for a 1 x 1 rectangle, telling it
// which colors this program uses for streaming D2H and for LAUNCH.
const memcpy = @import_module( "<memcpy/get_params>", .{
    .width = 1,
    .height = 1,
    .MEMCPYD2H_1 = MEMCPYD2H_DATA_1,
    .LAUNCH = LAUNCH
    });

// Program layout: a single PE at (0, 0) running pe_program.csl.
layout {
  // The program occupies a 1 x 1 rectangle of PEs.
  @set_rectangle(1, 1);

  // Assign pe_program.csl to the only PE and pass it the memcpy parameters
  // for column 0, the task ID, and the iteration count.
  @set_tile_code(0, 0, "pe_program.csl", .{
    .memcpy_params = memcpy.get_params(0),
    .main_task_id = main_task_id,
    .iterations = iterations
  });

  // Export the names of the two device functions that run.py launches.
  @export_name("f_run", fn()void);
  @export_name("f_send_timestamps", fn()void);
}

pe_program.csl

// Not a complete program; the top-level source file is layout.csl.
// Parameters for the memcpy module; set by layout.csl via @set_tile_code.
param memcpy_params: comptime_struct;

// Number of random points sampled by mainTask.
param iterations: u32;

// Task IDs
param main_task_id: local_task_id;

// memcpy module reserves input queue 0 and output queue 0
const sys_mod = @import_module( "<memcpy/memcpy>", memcpy_params);

// Import compiler-bundled libraries, which are identified by names surrounded
// by angular brackets ('<' and '>').
const random = @import_module("<random>");
const tsc = @import_module("<time>");
const math = @import_module("<math>");

// Declare variables for storing the timestamp counter at the start and the end
// of the core computation.
var startBuffer = @zeros([tsc.tsc_size_words]u16);
var finishBuffer = @zeros([tsc.tsc_size_words]u16);
// Staging buffer sent to the host: the start words followed by the finish
// words, hence twice the size of one timestamp.
var timeBuffer = @zeros([tsc.tsc_size_words*2]u16);

/// Send the final result to the host.
///
/// Writes the single f32 `result` to a fabric-output DSD on the memcpy D2H
/// color (sys_mod.MEMCPYD2H_1), using output queue 1.
fn sendResult(result: f32) void {
  // One-element fabric output on the streaming D2H color.
  const resultDsd = @get_dsd(fabout_dsd, .{
    .extent = 1,
    .fabric_color = sys_mod.MEMCPYD2H_1,
    .output_queue = @get_output_queue(1)
  });
  // The sync operation works here because the length is 1
  // It would be better to use {.async=true}
  @fmovs(resultDsd, result);
}

/// Send the begin and end timestamp counters to the host, which then performs a
/// 48-bit subtraction to get the final cycle count.
///
/// timeBuffer is filled with the tsc.tsc_size_words words of startBuffer
/// followed by the tsc.tsc_size_words words of finishBuffer, and is then
/// streamed out asynchronously as 16-bit elements on the memcpy D2H color.
fn sendTimeStampCounters() void {
  // Pack both timestamps back to back. The loop bound is the same constant
  // that sizes the buffers and the DSDs below, so the copy cannot drift out of
  // step with them.
  var word: u16 = 0;
  while (word < tsc.tsc_size_words) : (word += 1) {
    timeBuffer[word] = startBuffer[word];
    timeBuffer[tsc.tsc_size_words + word] = finishBuffer[word];
  }

  // Memory DSD covering every element of timeBuffer.
  const timeBufferDsd = @get_dsd(mem1d_dsd, .{
    .tensor_access = |i|{tsc.tsc_size_words*2} -> timeBuffer[i]
  });

  // Fabric-output DSD of the same length on the streaming D2H color.
  const timeStampDsd = @get_dsd(fabout_dsd, .{
    .extent = tsc.tsc_size_words*2,
    .fabric_color = sys_mod.MEMCPYD2H_1,
    .output_queue = @get_output_queue(1)
  });

  // More than one element is sent, so the move is issued asynchronously.
  @mov16(timeStampDsd, timeBufferDsd, .{.async=true});
}

/// Estimate pi by random sampling and record how long the sampling took.
///
/// The timestamp counter is read into startBuffer before the loop and into
/// finishBuffer after it; the estimate itself is handed to sendResult().
task mainTask() void {
  var hits: u32 = 0;
  var trial: u32 = 0;

  // Turn the timestamp counter on and capture the starting value.
  tsc.enable_tsc();
  tsc.get_timestamp(&startBuffer);

  // Each trial draws a point with both coordinates between -1 and +1 and
  // counts it as a hit when it lies inside the circle of unit radius.
  while (trial < iterations) : (trial += 1) {
    const px = random.random_f32(-1.0, 1.0);
    const py = random.random_f32(-1.0, 1.0);
    const radius = math.sqrt_f32(px * px + py * py);

    if (radius <= 1.0) {
      hits += 1;
    }
  }

  // Capture the finishing value before any data is sent.
  tsc.get_timestamp(&finishBuffer);

  // Four times the hit fraction is the estimate of pi.
  const estimate = 4.0 * @as(f32, hits) / @as(f32, iterations);
  sendResult(estimate);
}

// Bind mainTask to main_task_id so that @activate(main_task_id) runs it.
comptime {
  @bind_local_task(mainTask, main_task_id);
}

/// Host-callable entry point: activate mainTask, then unblock the memcpy
/// command stream.
fn f_run() void {
  // Schedule mainTask; the result is sent from inside that task.
  @activate(main_task_id);

  // RPC returns early before the data is sent out via D2H color
  // The host must wait for streaming D2H

  // WARNING: the user must unblock cmd color for every PE
  sys_mod.unblock_cmd_stream();
}

/// Host-callable entry point: start sending the recorded start/finish
/// timestamps to the host, then unblock the memcpy command stream.
fn f_send_timestamps() void {
  // Issues an asynchronous send of timeBuffer on the D2H color.
  sendTimeStampCounters();

  // RPC returns early before the data is sent out via D2H color
  // The host must wait for streaming D2H

  // WARNING: the user must unblock cmd color for every PE
  sys_mod.unblock_cmd_stream();
}

// Export the two host-callable functions and set up RPC on the data task
// derived from the LAUNCH color.
comptime{
  @export_symbol(f_run);
  @export_symbol(f_send_timestamps);
  @rpc(@get_data_task_id(sys_mod.LAUNCH));
}

run.py

#!/usr/bin/env cs_python

import argparse
import json
import numpy as np

from cerebras.sdk.sdk_utils import memcpy_view
from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType # pylint: disable=no-name-in-module
from cerebras.sdk.runtime.sdkruntimepybind import MemcpyOrder # pylint: disable=no-name-in-module

# Command-line options: the compile-output directory (--name), an optional
# system address (--cmaddr), and the absolute tolerance for the final check.
parser = argparse.ArgumentParser()
parser.add_argument('--name', help='the test name')
parser.add_argument("--cmaddr", help="IP:port for CS system")
parser.add_argument("--tolerance", type=float, help="tolerance for result")
args = parser.parse_args()
# --name doubles as the directory that holds the compile output (out.json).
dirname = args.name

# Parse the compile metadata
with open(f"{dirname}/out.json", encoding="utf-8") as json_file:
  compile_data = json.load(json_file)
params = compile_data["params"]
# Color ID passed to the streaming D2H copies further down.
MEMCPYD2H_DATA_1 = int(params["MEMCPYD2H_DATA_1_ID"])
print(f"MEMCPYD2H_DATA_1 = {MEMCPYD2H_DATA_1}")

print("The simfab may take 25 sec more")
# Create the runtime for the compiled program; cmaddr is None unless --cmaddr
# was given on the command line.
runner = SdkRuntime(dirname, cmaddr=args.cmaddr)

runner.load()
runner.run()

print("step 1: call f_run to start streaming D2H (result)")
runner.launch("f_run", nonblock=False)

print("step 2: streaming D2H (result)")
# The result is one 32-bit element, received into a float32 buffer with
# MEMCPY_32BIT.
# NOTE(review): positional args are presumably (x, y, width, height,
# elements per PE) -- confirm against the SdkRuntime documentation.
result = np.zeros(1, np.float32)
runner.memcpy_d2h(result, MEMCPYD2H_DATA_1, 0, 0, 1, 1, 1, \
    streaming=True, data_type=MemcpyDataType.MEMCPY_32BIT, \
    order=MemcpyOrder.COL_MAJOR, nonblock=False)

print("step 3: call f_send_timestamps to start streaming D2H (timestamp)")
runner.launch("f_send_timestamps", nonblock=False)

print("step 4: streaming D2H (timestamps)")
# The six 16-bit timestamp words are received with MEMCPY_16BIT into a u32
# buffer, one word per u32 element.
timestamps_u32 = np.zeros(6, np.uint32)
runner.memcpy_d2h(timestamps_u32, MEMCPYD2H_DATA_1, 0, 0, 1, 1, 6, \
    streaming=True, data_type=MemcpyDataType.MEMCPY_16BIT, \
    order=MemcpyOrder.COL_MAJOR, nonblock=False)
# Remove the upper 16 bits of each u32, leaving a view of uint16 words
timestamps = memcpy_view(timestamps_u32, np.dtype(np.uint16))

runner.stop()

# Helper functions for computing the delta in the cycle count
def make_u48(words):
  """Assemble a 48-bit unsigned integer from three 16-bit words.

  Args:
    words: indexable holding at least three integers (Python ints or NumPy
      integer scalars), least-significant word first. Only the first three
      entries are used.

  Returns:
    The combined value as a Python int.
  """
  # Convert to Python ints before shifting: a NumPy uint16 scalar shifted by
  # 16 or 32 can be evaluated in 16 bits (NumPy 2 promotion rules), which
  # silently drops the middle and high words.
  low = int(words[0])
  mid = int(words[1])
  high = int(words[2])
  return low + (mid << 16) + (high << 32)

def subtract_timestamps(words):
  """Return the finish timestamp minus the start timestamp.

  `words` holds the three start words followed by the finish words; each
  group is combined with make_u48 before subtracting.
  """
  start = make_u48(words[0:3])
  finish = make_u48(words[3:])
  return finish - start

# Elapsed cycles between the two timestamp reads taken on the device.
cycles = subtract_timestamps(timestamps)
print("cycle count:", cycles)

# The device result is an estimate of pi; check it against np.pi using the
# absolute tolerance from --tolerance (rtol is disabled).
print(f"result = {result}, np.pi = {np.pi}, tol = {args.tolerance}")
np.testing.assert_allclose(result, np.pi, atol=args.tolerance, rtol=0)
print("SUCCESS!")

commands.sh

#!/usr/bin/env bash

# Stop at the first command that fails, so run.py never runs after a failed
# compile.
set -e

# Compile layout.csl into ./out with iterations=200 and D2H color ID 1.
cslc ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 \
--params=iterations:200 -o out \
--params=MEMCPYD2H_DATA_1_ID:1 \
--memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
# Run the host script against the compile output, accepting an estimate of pi
# within an absolute tolerance of 0.1.
cs_python run.py --name out --tolerance 0.1