Sample
The sample in this section implements tensor addition. The preceding TIK optimization mechanisms are applied in this sample to enhance your understanding of TIK programming and optimization.
After learning the tensor addition sample in this section, you can learn more TIK operator samples by referring to Sample Reference.
To make the sample structure clear and readable, this sample is organized and implemented in the format of a class. The class is defined as follows:
class Vadd():
    # Receive data and complete initialization.
    def __init__(self, input_x, input_y, kernel_name="vadd_sample"):
    # Compute and build the operator.
    def vadd_compute(self):
    # Define the operation on each AI Core.
    def vadd_compute_each_core(self, move_offset, move_num):
    # Define the tiled compute process on each AI Core.
    def vadd_compute_each_loop(self, move_offset, move_num):

# For function and performance testing.
def vadd_sample(input_x, input_y, output_z, kernel_name):
The complete sample code is as follows.
import math
from functools import reduce as functools_reduce

import numpy as np
from te import tik
from te import platform as tbe_platform
from topi.cce import util


class Vadd():
    """Element-wise addition of two tensors (z = x + y) written with TIK.

    The workload is split evenly across AI Cores; each core moves tiles
    of the inputs from Global Memory (GM) into the Unified Buffer (UB),
    adds them with the vector unit, and moves the result back to GM.
    The add result is written in place into the first input's UB tensor,
    so the UB is split between only two buffers.
    """

    def __init__(self, input_x, input_y, kernel_name="vadd_sample"):
        """Record shapes/dtypes and derive the tiling parameters.

        Parameters
        ----------
        input_x : dict
            Holds "shape" and "dtype" of the first input tensor.
        input_y : dict
            Holds "shape" and "dtype" of the second input tensor.
        kernel_name : str
            Name used when building the kernel.
        """
        self.shape_x = input_x.get("shape")
        self.dtype_x = input_x.get("dtype")
        self.shape_y = input_y.get("shape")
        self.dtype_y = input_y.get("dtype")
        self.kernel_name = kernel_name

        self.tik_instance = tik.Tik()
        self.aicore_num = 2

        # Access to the Unified Buffer must be performed in units of
        # 32 bytes; this drives both the tensor tiling policy and the
        # data-movement instructions.
        block_bite_size = 32
        # Unified Buffer size in bytes for the target SoC.
        ub_size_bytes = tbe_platform.get_soc_spec("UB_SIZE")
        # Number of elements per 32-byte block for this dtype.
        dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
            self.dtype_x) // 8
        self.data_each_block = block_bite_size // dtype_bytes_size
        # Split the UB between the two input tensors (the result reuses
        # input_x's buffer), rounded down to a 32-byte boundary.
        self.ub_tensor_size = (
            ub_size_bytes // dtype_bytes_size // 2 //
            self.data_each_block * self.data_each_block)

        # Total number of input elements.
        self.input_num = functools_reduce(lambda x, y: x * y, self.shape_x)
        # Elements handled by each AI Core.
        # NOTE(review): any remainder of input_num / aicore_num is
        # silently dropped -- this sample assumes input_num is a
        # multiple of aicore_num; confirm before production use.
        self.data_num_each_core = self.input_num // self.aicore_num

        # A vector instruction covers at most 8 blocks per iteration
        # repeat, so this is the maximum value of mask.
        self.vector_mask_max = 8 * self.data_each_block

        self.input_x_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_x_gm",
            scope=tik.scope_gm)
        self.input_y_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_y_gm",
            scope=tik.scope_gm)
        self.output_z_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="output_z_gm",
            scope=tik.scope_gm)

    def vadd_compute(self):
        """Launch the per-core computation, build and return the kernel."""
        with self.tik_instance.for_range(
                0, self.aicore_num, block_num=self.aicore_num) as index:
            # Per-core UB tensors for the two inputs; the add result is
            # written back into input_x_ub before moving out to GM.
            self.input_x_ub = self.tik_instance.Tensor(
                self.dtype_x, (self.ub_tensor_size,), name="input_x_ub",
                scope=tik.scope_ubuf)
            self.input_y_ub = self.tik_instance.Tensor(
                self.dtype_y, (self.ub_tensor_size,), name="input_y_ub",
                scope=tik.scope_ubuf)
            # Base GM offset of the slice owned by this core.
            move_offset = index * self.data_num_each_core
            self.vadd_compute_each_core(move_offset,
                                        self.data_num_each_core)

        self.tik_instance.BuildCCE(
            kernel_name=self.kernel_name,
            inputs=[self.input_x_gm, self.input_y_gm],
            outputs=[self.output_z_gm])
        return self.tik_instance

    def vadd_compute_each_core(self, move_offset, move_num):
        """Tile one core's slice into UB-sized chunks and process each.

        move_offset is the base GM offset of this core's slice and
        move_num the number of elements in the slice.
        """
        # BUGFIX: the original sample overwrote move_offset with
        # loop_index * ub_tensor_size, discarding the per-core base
        # offset, so whenever a slice needed more than one UB loop every
        # core addressed the region starting at offset 0.  Keep the base
        # and advance relative to it.
        base_offset = move_offset
        loop_time = move_num // self.ub_tensor_size
        if loop_time > 0:
            with self.tik_instance.for_range(0, loop_time) as loop_index:
                move_offset = base_offset + loop_index * self.ub_tensor_size
                self.vadd_compute_each_loop(move_offset,
                                            self.ub_tensor_size)
            move_offset = base_offset + loop_time * self.ub_tensor_size
        # Tail chunk smaller than one full UB tensor.
        last_num = move_num % self.ub_tensor_size
        if last_num > 0:
            self.vadd_compute_each_loop(move_offset, last_num)

    def vadd_compute_each_loop(self, move_offset, move_num):
        """Add one UB-sized chunk: GM -> UB, vec_add, UB -> GM."""
        # burst_len of each data movement: number of 32-byte blocks,
        # rounded up.
        burst_len = math.ceil(move_num / self.data_each_block)
        self.tik_instance.data_move(self.input_x_ub,
                                    self.input_x_gm[move_offset],
                                    0, 1, burst_len, 0, 0)
        self.tik_instance.data_move(self.input_y_ub,
                                    self.input_y_gm[move_offset],
                                    0, 1, burst_len, 0, 0)

        # vec_add covers vector_mask_max elements per repeat and at most
        # 255 repeats per call, so the chunk is processed in up to three
        # stages: full 255-repeat calls, one partial-repeat call, and a
        # sub-mask tail.
        vadd_loop = move_num // (self.vector_mask_max * 255)
        add_offset = 0
        if vadd_loop > 0:
            with self.tik_instance.for_range(0, vadd_loop) as add_index:
                add_offset = add_index * self.vector_mask_max * 255
                self.tik_instance.vec_add(self.vector_mask_max,
                                          self.input_x_ub[add_offset],
                                          self.input_x_ub[add_offset],
                                          self.input_y_ub[add_offset],
                                          255, 8, 8, 8)
            # BUGFIX: the original referenced a bare vector_mask_max
            # here (NameError at trace time); the attribute is on self.
            add_offset = vadd_loop * self.vector_mask_max * 255

        repeat_time = (move_num % (self.vector_mask_max * 255) //
                       self.vector_mask_max)
        if repeat_time > 0:
            self.tik_instance.vec_add(self.vector_mask_max,
                                      self.input_x_ub[add_offset],
                                      self.input_x_ub[add_offset],
                                      self.input_y_ub[add_offset],
                                      repeat_time, 8, 8, 8)
            add_offset += repeat_time * self.vector_mask_max

        # Tail of fewer than vector_mask_max elements.
        last_num = move_num % self.vector_mask_max
        if last_num > 0:
            self.tik_instance.vec_add(last_num,
                                      self.input_x_ub[add_offset],
                                      self.input_x_ub[add_offset],
                                      self.input_y_ub[add_offset],
                                      1, 8, 8, 8)

        self.tik_instance.data_move(self.output_z_gm[move_offset],
                                    self.input_x_ub,
                                    0, 1, burst_len, 0, 0)


@util.check_input_type(dict, dict, dict, str)
def vadd_sample(input_x, input_y, output_z, kernel_name):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    input_y : dict
        shape and dtype of input
    output_z : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "vadd_sample"

    Returns
    -------
    None
    """
    vadd_instance = Vadd(input_x, input_y, kernel_name)
    tik_instance = vadd_instance.vadd_compute()
    return tik_instance