Sample
The sample in this section implements tensor addition. The preceding TIK optimization mechanisms are applied in this sample to enhance your understanding of TIK programming and optimization.
After learning the tensor addition sample in this section, you can learn more TIK operator samples by referring to Sample Reference.
To make the sample structure clear and readable, this sample is organized and implemented in the format of a class. The class is defined as follows:
class Vadd():
    # Receive data and complete initialization.
    def __init__(self, input_x, input_y, kernel_name="vadd_sample"):
        ...

    # Perform operator computation and building.
    def vadd_compute(self):
        ...

    # Define the operations on each AI Core.
    def vadd_compute_each_core(self, move_offset, move_num):
        ...

    # Define the tiled computation on each AI Core.
    def vadd_compute_each_loop(self, move_offset, move_num):
        ...

# Module-level entry point, used for function and performance tests.
def vadd_sample(input_x, input_y, output_z, kernel_name):
    ...
The complete sample code is as follows:
import math
from functools import reduce as functools_reduce

import numpy as np
from te import tik
from te import platform as cce
from topi.cce import util


class Vadd():
    """Element-wise tensor addition (z = x + y) implemented with TIK.

    The work is split evenly across the AI Cores; each core tiles its
    share into Unified-Buffer-sized loops and adds the tiles on the
    Vector unit.
    """

    def __init__(self, input_x, input_y, kernel_name="vadd_sample"):
        """Receive the operator description and derive tiling parameters.

        input_x / input_y are dicts carrying at least "shape" and "dtype".
        """
        self.shape_x = input_x.get("shape")
        self.dtype_x = input_x.get("dtype")
        self.shape_y = input_y.get("shape")
        self.dtype_y = input_y.get("dtype")
        self.kernel_name = kernel_name

        self.tik_instance = tik.Tik()
        self.aicore_num = 2
        # Reads/writes on the Unified Buffer must be done in units of
        # 32 bytes; this constant drives the tensor tiling and the
        # data-movement instruction parameters.
        block_bite_size = 32
        # Unified Buffer size of the target SoC, in bytes.
        # BUG FIX: the original listing called te.platform.get_soc_spec,
        # but "te" itself is never imported -- "cce" IS te.platform.
        ub_size_bytes = cce.get_soc_spec("UB_SIZE")
        # Number of elements of this dtype that fit into one 32-byte block.
        dtype_bytes_size = cce.cce_intrin.get_bit_len(self.dtype_x) // 8
        self.data_each_block = block_bite_size // dtype_bytes_size
        # UB space granted to each of the two inputs (the result reuses
        # input_x's buffer), rounded down to a 32-byte boundary.
        self.ub_tensor_size = (
            ub_size_bytes // dtype_bytes_size // 2 //
            self.data_each_block * self.data_each_block)
        # Total number of input elements.
        self.input_num = functools_reduce(lambda x, y: x * y, self.shape_x)
        # Evenly distribute the data across the AI Cores.
        # NOTE(review): assumes input_num divides evenly by aicore_num and
        # that each core's share is 32-byte aligned -- confirm for
        # production shapes.
        self.data_num_each_core = self.input_num // self.aicore_num
        # A Vector instruction processes at most 8 blocks per iteration
        # repeat, so this is the maximum mask value.
        self.vector_mask_max = 8 * self.data_each_block

        self.input_x_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_x_gm",
            scope=tik.scope_gm)
        self.input_y_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_y_gm",
            scope=tik.scope_gm)
        self.output_z_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="output_z_gm",
            scope=tik.scope_gm)

    def vadd_compute(self):
        """Emit the multi-core computation and build the kernel.

        Returns the tik.Tik instance after BuildCCE.
        """
        with self.tik_instance.for_range(
                0, self.aicore_num, block_num=self.aicore_num) as index:
            # UB tensors holding the current tile of each input.
            self.input_x_ub = self.tik_instance.Tensor(
                self.dtype_x, (self.ub_tensor_size,),
                name="input_x_ub", scope=tik.scope_ubuf)
            self.input_y_ub = self.tik_instance.Tensor(
                self.dtype_y, (self.ub_tensor_size,),
                name="input_y_ub", scope=tik.scope_ubuf)
            # Each AI Core starts at its own base offset in Global Memory.
            move_offset = index * self.data_num_each_core
            self.vadd_compute_each_core(move_offset, self.data_num_each_core)
        self.tik_instance.BuildCCE(
            kernel_name=self.kernel_name,
            inputs=[self.input_x_gm, self.input_y_gm],
            outputs=[self.output_z_gm])
        return self.tik_instance

    def vadd_compute_each_core(self, move_offset, move_num):
        """Tile one core's share of the data into UB-sized loops.

        move_offset: this core's starting element offset in Global Memory.
        move_num:    number of elements this core must process.
        """
        loop_time = move_num // self.ub_tensor_size
        if loop_time > 0:
            with self.tik_instance.for_range(0, loop_time) as loop_index:
                # BUG FIX: the original listing overwrote move_offset with
                # loop_index * ub_tensor_size, dropping the per-core base
                # offset, so every core processed core 0's region.
                self.vadd_compute_each_loop(
                    move_offset + loop_index * self.ub_tensor_size,
                    self.ub_tensor_size)
            move_offset += loop_time * self.ub_tensor_size
        last_num = move_num % self.ub_tensor_size
        if last_num > 0:
            self.vadd_compute_each_loop(move_offset, last_num)

    def vadd_compute_each_loop(self, move_offset, move_num):
        """Load one tile into UB, add it on the Vector unit, store it back."""
        # burst length (in 32-byte blocks) of each data movement.
        burst_len = math.ceil(move_num / self.data_each_block)
        self.tik_instance.data_move(self.input_x_ub,
                                    self.input_x_gm[move_offset],
                                    0, 1, burst_len, 0, 0)
        self.tik_instance.data_move(self.input_y_ub,
                                    self.input_y_gm[move_offset],
                                    0, 1, burst_len, 0, 0)
        # One vec_add handles at most vector_mask_max elements per repeat
        # and at most 255 repeats, so very large tiles need an outer loop.
        vadd_loop = move_num // (self.vector_mask_max * 255)
        add_offset = 0
        if vadd_loop > 0:
            with self.tik_instance.for_range(0, vadd_loop) as add_index:
                add_offset = add_index * self.vector_mask_max * 255
                self.tik_instance.vec_add(self.vector_mask_max,
                                          self.input_x_ub[add_offset],
                                          self.input_x_ub[add_offset],
                                          self.input_y_ub[add_offset],
                                          255, 8, 8, 8)
            # BUG FIX: "vector_mask_max" was referenced without "self." in
            # the original listing, raising NameError when vadd_loop > 0.
            add_offset = vadd_loop * self.vector_mask_max * 255
        repeat_time = (move_num % (self.vector_mask_max * 255) //
                       self.vector_mask_max)
        if repeat_time > 0:
            self.tik_instance.vec_add(self.vector_mask_max,
                                      self.input_x_ub[add_offset],
                                      self.input_x_ub[add_offset],
                                      self.input_y_ub[add_offset],
                                      repeat_time, 8, 8, 8)
            add_offset += repeat_time * self.vector_mask_max
        # Tail shorter than one full mask: a single repeat with mask
        # narrowed to the remaining element count.
        last_num = move_num % self.vector_mask_max
        if last_num > 0:
            self.tik_instance.vec_add(last_num,
                                      self.input_x_ub[add_offset],
                                      self.input_x_ub[add_offset],
                                      self.input_y_ub[add_offset],
                                      1, 8, 8, 8)
        # The sum was accumulated in-place in input_x_ub; move it out.
        self.tik_instance.data_move(self.output_z_gm[move_offset],
                                    self.input_x_ub,
                                    0, 1, burst_len, 0, 0)


@util.check_input_type(dict, dict, dict, str)
def vadd_sample(input_x, input_y, output_z, kernel_name):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    input_y : dict
        shape and dtype of input
    output_z : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "vadd_sample"

    Returns
    -------
    None
    """
    vadd_instance = Vadd(input_x, input_y, kernel_name)
    tik_instance = vadd_instance.vadd_compute()
    return tik_instance