自定义算子开发系列：AICPU Tiling下沉编程方式介绍

昇腾CANN

800人浏览 · 2025-12-19 18:55:36

昇腾CANN · 2025-12-19 18:55:36 发布

《自定义算子开发系列：AICPU Tiling下沉编程方式介绍》

基础知识准备

本文内容基于Ascend C算子开发衍生而来，对于算子开发还不了解的读者可以通过以下资源进行学习：
《Ascend C算子开发文档手册》
《Ascend C算子开发系列课程：入门》
《Ascend C算子开发系列课程：进阶》
《Ascend C算子开发系列课程：高级》

背景介绍

Host Bound一直是算子调用的显著性能瓶颈，造成Host Bound的核心原因就在于算子在Kernel执行前都需要计算出TilingData，而TilingData的计算通常是在Host侧完成再拷贝到Device侧的。针对这一问题我们推出了AICPU Tiling下沉编程方式，使用Device侧的AICPU计算TilingData，节省了Host侧拷贝TilingData到Device侧的步骤，降低算子执行耗时。

亮点介绍

通过减少Host与Device的交互，提升算子执行性能
通过<<<>>>调用AICPU的方式，降低了编程成本

AICPU Tiling下沉编程使用详解

一、开发流程

目录结构

以一个简单的abs算子的demo为例：

📁 aicpu_demo/             # demo目录
├── 📄 main.cpp            # 算子入口，分别调用AICPU和AICORE
├── 📄 abs.aicpu           # 算子AICPU实现
├── 📄 abs.asc             # 算子AICORE实现
├── 📄 kernel_args.h       # 结构体定义
├── 📄 CMakeLists.txt      # cmake文件

编写AICPU Tiling实现逻辑

定义AICPU Tiling的KernelArgs入参（对应kernel_args.h文件）

当前<<<>>>方式调用AICPU函数可以通过传入一个结构体指针的方式进行调用，如下，将算子需要的用于计算Tiling的入参和输出的TilingData地址定义在一个struct中。
```
 
 // kernel_args.h
 struct TilingInfo {
   uint32_t data_size_per_block;
 };
 
 struct KernelArgs {
     uint32_t block_num;
     uint32_t data_size;
     TilingInfo *ti; // 与aicore共享的参数
 };
 
```

AICPU Tiling的实现

将上一步定义的KernelArgs作为入参，实现AICPU Tiling的逻辑，将计算好的结果写入TilingData中。

// abs.aicpu
__global__ __aicpu__ int32_t TemplateAicpuKernel(T *args)
{
    // 计算每个核需要处理的数据量，将结果保存在tiling地址对应的device空间中
    args->ti->data_size_per_block = args->data_size / args->block_num;
    return 0;
}

编写AICORE实现逻辑

实现一个简单的abs算子示例：只使用一个核计算所有输入的abs结果，通过tiling地址来访问计算好的tiling数据。

// abs.asc
template<typename T>
__aicore__ void hello_world_impl(GM_ADDR src_gm, GM_ADDR dst_gm, GM_ADDR tiling_addr)
{
    __gm__ struct TilingInfo *tiling = (__gm__ struct TilingInfo *)tiling_addr;
    uint64_t dataSize =  tiling->data_size_per_block;

    AscendC::printf("aicore get dataSize %d\n", dataSize);

    AscendC::GlobalTensor<float> inputGlobal; 
    AscendC::GlobalTensor<float> outputGlobal; 
    inputGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(src_gm), dataSize);
    outputGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(dst_gm), dataSize);
    AscendC::TPipe pipe;

    AscendC::TBuf<AscendC::TPosition::VECCALC> calcBuf;
    pipe.InitBuffer(calcBuf, dataSize * sizeof(float));
    AscendC::LocalTensor<float> tempTensor1 = calcBuf.Get<float>();

    AscendC::DataCopy(tempTensor1, inputGlobal, dataSize);

    event_t eventID1 = static_cast<event_t>(pipe.FetchEventID(AscendC::HardEvent::MTE2_V));
    AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(eventID1);
    AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(eventID1);

    AscendC::Abs(tempTensor1, tempTensor1, dataSize);
    
    event_t eventIdVToMte3 = static_cast<event_t>(pipe.FetchEventID(AscendC::HardEvent::V_MTE3));
    AscendC::SetFlag<AscendC::HardEvent::V_MTE3>(eventIdVToMte3);
    AscendC::WaitFlag<AscendC::HardEvent::V_MTE3>(eventIdVToMte3);

    AscendC::DataCopy(outputGlobal, tempTensor1, dataSize);
}

通过两条不同的流分别调用AICPU和AICORE任务

出于性能的考虑，需要使用不同的两条流来分别执行AICPU和AICORE任务，目的是在网络场景中让AICPU和AICORE的计算能够并行；同时对于单算子内部的实现，需要使用event机制，来保证AICPU的计算结束后再执行AICORE上的任务。

// main.cpp
int32_t main(void) {
    CHECK_ACL(aclInit(nullptr));
    int32_t deviceId = 0;
    printf("acl init ok! \n");
    CHECK_ACL(aclrtSetDevice(deviceId));
    printf("set device ok! \n");

    aclrtStream aicpu_stream = nullptr;
    aclrtStream aicore_stream = nullptr;
    CHECK_ACL(aclrtCreateStream(&aicpu_stream));
    CHECK_ACL(aclrtCreateStream(&aicore_stream));
    printf("create stream ok! \n");

    aclrtEvent event;
    CHECK_ACL(aclrtCreateEventExWithFlag(&event, ACL_EVENT_SYNC));

    void *srcDevice;
    void *dstDevice;
    void *ti;
    CHECK_ACL(aclrtMalloc((void **)&srcDevice, 4096, ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMalloc((void **)&dstDevice, 4096, ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMalloc((void **)&ti, 4096, ACL_MEM_MALLOC_HUGE_FIRST));
    void *zHost = malloc(4096);
    memset(zHost, 0, 4096);
    CHECK_ACL(aclrtMemcpy(srcDevice, 4096, zHost, 4096, ACL_MEMCPY_HOST_TO_DEVICE));
    CHECK_ACL(aclrtMemcpy(dstDevice, 4096, zHost, 4096, ACL_MEMCPY_HOST_TO_DEVICE));
    struct KernelArgs args = {0};
    args.block_num = 1;
    args.data_size = 10;
    args.ti = (TilingInfo *)ti;

    TemplateAicpuKernel_do(aicpu_stream, &args);

    CHECK_ACL(aclrtRecordEvent(event, aicpu_stream));
    CHECK_ACL(aclrtStreamWaitEvent(aicore_stream, event));

    hello_world_do(1, aicore_stream, (uint8_t *)srcDevice, (uint8_t *)dstDevice, (uint8_t *)ti);
    printf("launch ok! \n");

    CHECK_ACL(aclrtSynchronizeStreamWithTimeout(aicore_stream, 10000));
    printf("sync ok!\n");

    CHECK_ACL(aclrtFree(srcDevice));
    CHECK_ACL(aclrtFree(dstDevice));
    free(zHost);
    CHECK_ACL(aclrtDestroyStream(aicpu_stream));
    CHECK_ACL(aclrtDestroyStream(aicore_stream));
    CHECK_ACL(aclrtResetDevice(deviceId));
    CHECK_ACL(aclFinalize());
    return 0;
}

AICPU Tiling入口

// abs.asc
template<typename T>
extern __global__ __aicpu__ int32_t TemplateAicpuKernel(T *args);

template extern __global__ __aicpu__ int32_t TemplateAicpuKernel<KernelArgs>(KernelArgs *args);

void TemplateAicpuKernel_do(void *stream, KernelArgs *args) // aicpu entrance
{
    TemplateAicpuKernel<KernelArgs><<<1, nullptr, stream>>>(args, sizeof(KernelArgs));
}

AICORE入口

// abs.asc
template<typename T>
__global__ __aicore__ void hello_world(GM_ADDR src, GM_ADDR dst, GM_ADDR tiling)
{
    hello_world_impl<T>(src, dst, tiling);
}
extern "C" {
    void hello_world_do(uint32_t blockDim, void *stream, uint8_t *src, uint8_t *dst, uint8_t *ti)  // aicore entrance
    {
        hello_world<int><<<1, nullptr, stream>>>(src, dst, ti);
    }
}

CMake编译

在CMakeLists.txt文件中分别使用不同的编译配置编译AICORE和AICPU，最终将结果打包成一个静态库。

// CMakeLists.txt
cmake_minimum_required(VERSION 3.18)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(ASCEND_CANN_PACKAGE_PATH "$ENV{ASCEND_HOME_PATH}" CACHE PATH "ASCEND CANN package installation directory" FORCE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCEND_LIB_DIR "$ENV{ASCEND_HOME_PATH}/x86_64-linux/lib64")
link_directories(${ASCEND_LIB_DIR})
find_package(ASC REQUIRED)
find_package(AICPU REQUIRED)
add_library(my_kernel SHARED
    abs.aicpu
    abs.asc
)
set_target_properties(my_kernel PROPERTIES LINKER_LANGUAGE CXX)
project(my_ops LANGUAGES ASC AICPU CXX)
target_link_libraries(my_kernel PRIVATE
    ascendc_runtime
    profapi
    ascendalog
    ascendcl 
    runtime 
    c_sec 
    mmpa
    error_manager
    ascend_dump
    pthread
)
target_compile_options(my_kernel PRIVATE
    $<$<COMPILE_LANGUAGE:ASC>: --npu-arch=dav-2201>
)
target_include_directories(my_kernel PUBLIC
    $ENV{ASCEND_HOME_PATH}/lib64
    $ENV{ASCEND_HOME_PATH}/x86_64-linux/include
    $ENV{ASCEND_HOME_PATH}/x86_64-linux/lib64
    ${ASCEND_CANN_PACKAGE_PATH}/include/ascendc/aicpu_api
)

add_executable(main main.cpp)

target_link_libraries(main PRIVATE
    my_kernel
    ascendcl
)

二. 代码调测

Host侧
可使用通用C++语言的维测手段，包括打印，GDB等。

Device侧

AICORE 可直接使用AscendC::printf或AscendC::DumpTensor，打印变量调试。

AICPU 也可使用AscendC::printf打印变量调试。

__global__ __aicpu__ int32_t TemplateAicpuKernel(T *args)
{
    int32_t var = 0;
    AscendC::printf("TemplateAicpuKernel inited! %d\n", var);
    ...
}

三. 性能调优

该方案中由于把Tiling计算移动到了AICPU上，因此Tilingkey无法在Host上获取，只能将原本的Tilingkey分发逻辑移动到AICORE Kernel中进行判断；在实际开发dequant_swiglu_quant算子时，初步性能测试时发现这一改动导致了额外的icache miss，算子整体性能下降5%。

__global__ __aicore__ __attribute__((aiv)) void dequant_swiglu_quant(GM_ADDR x, GM_ADDR weight_scale, GM_ADDR activation_scale, GM_ADDR bias, GM_ADDR quant_scale, GM_ADDR quant_offset, GM_ADDR y, GM_ADDR scale, GM_ADDR tiling_data) {
      __gm__ struct DequantSwigluQuantTiling *tiling = (__gm__ struct DequantSwigluQuantTiling *)tiling_data;
    
      if (AscendC::GetBlockIdx() >= tiling->core_num) {
        return;
      }
    
      // 原本是在Host上进行判断
      if (tiling->tiling_key == 0) {
        swiglu_quant_impl<float16_t, 2>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
      } else if (tiling->tiling_key == 1) {
        swiglu_quant_impl<bfloat16_t, 2>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
      } else if (tiling->tiling_key == 2) {
        swiglu_quant_impl<float16_t, 1>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
      } else if (tiling->tiling_key == 3) {
        swiglu_quant_impl<bfloat16_t, 1>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
      } else if (tiling->tiling_key == 4) {
        dequant_swiglu_quant_impl<2>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
      } else if (tiling->tiling_key == 5) {
        dequant_swiglu_quant_impl<1>(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
      }
  }

针对这一现象，可以使用Ascend C提供的ICachePreLoad接口将代码段预加载到ICache中，使得该算子整体性能相较于原本提升了15%。

 template <int BufferNum>
 __aicore__  void dequant_swiglu_quant_impl(GM_ADDR x, GM_ADDR weight_scale, GM_ADDR activation_scale, GM_ADDR bias,
                            GM_ADDR quant_scale, GM_ADDR quant_offset, GM_ADDR y, GM_ADDR scale,
                            GM_ADDR tiling_data) {
  AscendC::ICachePreLoad(2); // 按照实际代码段长度根据接口文档来设置参数
  AscendC::TPipe pipe;
  DequantSwigluQuantKernel<BufferNum> op(&pipe);
  op.init(x, weight_scale, activation_scale, bias, quant_scale, quant_offset, y, scale, tiling_data);
  op.process();
 }