0
点赞
收藏
分享

微信扫一扫

智能计算系统2 bangc算子开发的demo (CPU和MLU270的异构编程流程)


文章目录

  • ​​1. 首先加载环境​​
  • ​​2. 代码​​
  • ​​2.1 Makefile 项目管理编写​​
  • ​​2.2 MLU270需要执行的代码​​
  • ​​2.3 CPU上执行的代码调度MLU​​
  • ​​2.4 主函数​​
  • ​​2.5 实验结果​​


本文主要介绍如何利用寒武纪的 bangc 语言和 cnrt.h 运行时库, 实现 CPU 与 MLU 异构编程的流程

​​本文独立代码gitee中​​​​另一个实验是后面bangc实验一中的powerdifference​​

这是《智能计算系统》第259页累加例子的一个实现

整个流程如下图所示

智能计算系统2 bangc算子开发的demo (CPU和MLU270的异构编程流程)_bangc算子开发

1. 首先加载环境

  • 环境位置
    ​/home/zjq/AICSE-demo-student/env/env.sh​
  • 修改环境内容改成相对路径

#!/bin/bash
# Environment setup for the AICSE demo; meant to be loaded with `source env.sh`.

# Model and dataset directories (absolute paths under /opt/Cambricon-Test).
export AICSE_MODELS_MODEL_HOME=/opt/Cambricon-Test/models
export AICSE_MODELS_DATA_HOME=/opt/Cambricon-Test/datasets/
# Alternative locations relative to the current directory, kept for reference.
#export AICSE_MODELS_MODEL_HOME=$PWD/../data/models
#export AICSE_MODELS_DATA_HOME=$PWD/../data/data

# Both variables resolve against $PWD, so the result depends on the directory
# this script is sourced from.
export NEUWARE=$PWD/neuware
export NEUWARE_HOME=$PWD/neuware
export TENSORFLOW_MODELS_DATA_HOME=$AICSE_MODELS_DATA_HOME

# Append the local neuware bin directory first, then the system-wide one.
export PATH=$PATH:$NEUWARE/bin
export PATH=$PATH:/usr/local/neuware/bin

# Discard any inherited library path, then list the local neuware lib64
# ahead of the system-wide one.
unset LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$NEUWARE/lib64
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/neuware/lib64

# NOTE(review): /etc/profile runs last and may reassign PATH or other
# variables exported above -- confirm on the target machine.
source /etc/profile

  • 运行环境
    ​source env.sh​
  • 为了方便, 执行下面代码, 就可以通过​​bangcstart​​每次直接进入环境了

# Register a `bangcstart` alias that loads env.sh from its own directory
# (env.sh resolves NEUWARE against $PWD) and then returns to where it was
# called. `popd` is used for the return trip so the directory stack does not
# grow by one entry per invocation. The grep guard keeps repeated runs of this
# snippet from appending duplicate alias lines to ~/.bashrc.
grep -q 'alias bangcstart=' ~/.bashrc 2>/dev/null || echo 'alias bangcstart="pushd /home/zjq/AICSE-demo-student/env/ && source env.sh && popd"' >> ~/.bashrc
source ~/.bashrc

2. 代码

  • 下载代码
    ​cd /home/zjq && git clone https://gitee.com/nwu_zjq/cambrian-demo.git​
  • 下面的代码是例子 ​​powerDifference​​ 对应于​​智能计算系统​​ 里面的实验一, ​​bangc 算子实验​

只不过因为里面的项目管理有点混乱, 为了更有条理的理解整个CPU和MLU的异构编程流程, 我重新整理了项目, 利用Makefile进行管理, 标准的 src 放源码, ​include​ 放头文件

其实项目的原始代码是在 ​​/home/zjq/AICSE-demo-student/demo/style_transfer_bcl/src/bangc/PluginPowerDifferenceOp/​

2.1 Makefile 项目管理编写

# Requires the Cambricon environment to be loaded first, e.g.:
#   pushd /home/zjq/AICSE-demo-student/env/ && source env.sh && popd

CNCCParams = --bang-mlu-arch=MLU200
# -g: debug info; -std=c++11: language standard; -Iinclude: project headers.
GPPParams = -g -std=c++11 -Iinclude -I${NEUWARE_HOME}/include
LINKParams = -L ${NEUWARE_HOME}/lib64 -lcnrt

object= obj/plugin_power_difference_kernel.o obj/plugin_power_difference_op.o obj/main.o

# Neither target names a file it produces, so never treat them as up to date.
.PHONY: all clean

# Default goal: build the executable if needed, then run it, so a bare `make`
# prints the program output.
all: main
	./main

# Every source, main.cpp included, is compiled to an object under obj/ and
# linked here; the link is skipped when no object changed.
main: ${object}
	g++ ${object} -o main ${LINKParams}

# Device code (.mlu) is compiled with cncc.
obj/%.o: src/%.mlu
	mkdir -p $(@D)
	cncc ${CNCCParams} -o $@ -c $<

# Host code (.cpp) is compiled with g++.
obj/%.o: src/%.cpp
	mkdir -p $(@D)
	g++ ${GPPParams} -o $@ -c $<

clean:
	rm -rf obj main

上面的Makefile文件能够实现运行 ​​make​​ 会直接获得结果

2.2 MLU270需要执行的代码

_kernel.mlu 这里面定义的就是如何在MLU上申请内存, 如何将CPU上的数据拷贝到MLU, 然后在MLU上利用core进行计算, 完成后再将结果拷贝回CPU对应的内存

// /home/zjq/cambrian-demo/powerDifference/src/plugin_power_difference_kernel.mlu

// TODO:PowerDifference BCL单核实现

#define ONELINE 64
/**
 * Single-core BANG C kernel: output[k] = |input1[k] - input2[k]| ^ pow
 * for k in [0, dims_a).
 *
 * The data is processed in tiles of ONELINE elements staged through NRAM.
 *
 * @param input1  X operand in GDRAM, dims_a elements.
 * @param input2  Y operand in GDRAM, dims_a elements.
 * @param pow     Exponent Z. Values below 2 leave |X - Y| unchanged.
 * @param output  Result buffer in GDRAM, dims_a elements.
 * @param dims_a  Number of elements; need not be a multiple of ONELINE.
 *
 * NOTE(review): the absolute value makes odd exponents differ from a plain
 * (X - Y)^Z -- confirm this is the intended definition.
 */
__mlu_entry__ void PowerDifferenceKernel( half* input1, // X
                                          half* input2, // Y
                                          int32_t pow,  // Z
                                          half* output, // result
                                          int32_t dims_a) // element count
{
  // Single-core implementation: every task except task 0 exits immediately.
  if (taskId > 0) return;
  __bang_printf("总长度 %d 任务维度%d\n", dims_a, taskDim);

  // Split the input into full tiles plus an optional shorter tail tile.
  int32_t full_tiles = dims_a / ONELINE;
  int32_t rem = dims_a % ONELINE;
  int32_t tiles = full_tiles + (rem != 0 ? 1 : 0);

  // On-chip staging buffers, one tile each.
  __nram__ half inputx_nram[ONELINE];
  __nram__ half inputy_nram[ONELINE];
  __nram__ half temp_nram[ONELINE];

  for (int32_t tile = 0; tile < tiles; tile++) {
    // Only `count` elements of this tile exist in GDRAM. Copying a full
    // ONELINE for the tail tile would read and write past dims_a.
    int32_t count = (tile < full_tiles) ? ONELINE : rem;
    int32_t offset = tile * ONELINE;

    // Copy-in.
    __memcpy(inputx_nram, input1 + offset, count * sizeof(half), GDRAM2NRAM);
    __memcpy(inputy_nram, input2 + offset, count * sizeof(half), GDRAM2NRAM);

    // The vector ops always run over the whole NRAM tile; lanes beyond
    // `count` hold stale data whose results are never copied out.
    __bang_sub(temp_nram, inputx_nram, inputy_nram, ONELINE);
    __bang_active_abs(temp_nram, temp_nram, ONELINE);

    // Keep the base |x - y| in inputx_nram (its input role is finished) and
    // multiply by it pow-1 times. Squaring temp_nram in place instead would
    // produce base^(2^(pow-1)), which equals base^pow only for pow <= 2.
    __memcpy(inputx_nram, temp_nram, ONELINE * sizeof(half), NRAM2NRAM);
    for (int32_t step = 1; step < pow; step++) {
      __bang_mul(temp_nram, temp_nram, inputx_nram, ONELINE);
    }

    // Copy-out, again limited to the elements that really exist.
    __memcpy(output + offset, temp_nram, count * sizeof(half), NRAM2GDRAM);
  }
}

  • 对应的头文件

// /home/zjq/cambrian-demo/powerDifference/include/plugin_power_difference_kernel.h
#ifndef _PLUGIN_POWER_DIFFERENCE_KERNEL_H_
#define _PLUGIN_POWER_DIFFERENCE_KERNEL_H_

/*
 * Host-side declaration of the PowerDifference MLU kernel.
 *
 * Headers are included before the extern "C" block so that system and CNRT
 * headers are not forced into C linkage when this file is used from C++.
 */
#include <stdint.h>   /* uint16_t, int32_t */
#include <stdio.h>
#include <stdlib.h>
#include "cnrt.h"      /* MLU runtime library */
#include "cnrt_data.h"

#ifdef __cplusplus
extern "C" {
#endif

/* On the host a half value is carried as its raw 16 bits. */
typedef uint16_t half;

/*
 * Kernel entry symbol; the host passes its address to the CNRT launch call.
 *   input1, input2 : X and Y operands
 *   pow            : exponent Z
 *   output         : result buffer
 *   dims_a         : number of elements
 */
void PowerDifferenceKernel(half* input1,half* input2,int32_t pow, half* output,int32_t dims_a);

#ifdef __cplusplus
}
#endif
#endif // _PLUGIN_POWER_DIFFERENCE_KERNEL_H_

2.3 CPU上执行的代码调度MLU

_op.cpp 完成了CPU端内存的开辟以及与MLU之间的数据传递, 并封装成 op 的 API, 供 main 函数调用

// /home/zjq/cambrian-demo/powerDifference/src/plugin_power_difference_op.cpp

#include "cnrt.h" // 调用mlu的库函数
#include "cnrt_data.h"
#include "plugin_power_difference_kernel.h"
#include "plugin_power_difference_op.h"



int MLUPowerDifferenceOp(float* input1,float* input2, int pow, float*output, int dims_a) {

cnrtInit(0); // 初始化设备
cnrtDev_t dev;
cnrtGetDeviceHandle(&dev, 0);
cnrtSetCurrentDevice(dev);
cnrtQueue_t pQueue;
cnrtCreateQueue(&pQueue);

// 设置任务划分,
cnrtDim3_t dim;
dim.x = 1; // 这里使单核, 如果是dim.x=4, 则是4核, 也就是一行能计算64*4=256位
dim.y = 1;
dim.z = 1;
float hardware_time = 0.0;
cnrtNotifier_t event_start;
cnrtNotifier_t event_end;
cnrtCreateNotifier(&event_start);
cnrtCreateNotifier(&event_end);
cnrtFunctionType_t c = CNRT_FUNC_TYPE_BLOCK;

//prepare data
half* input1_half = (half*)malloc(dims_a * sizeof(half));
half* input2_half = (half*)malloc(dims_a * sizeof(half));
half* output_half = (half*)malloc(dims_a * sizeof(half));

cnrtConvertFloatToHalfArray(input1_half, input1, dims_a);
cnrtConvertFloatToHalfArray(input2_half, input2, dims_a);
cnrtConvertFloatToHalfArray(output_half, output,dims_a);

half *mlu_input1, *mlu_input2, *mlu_output;
if (CNRT_RET_SUCCESS != cnrtMalloc((void**)&mlu_input1, dims_a * sizeof(half))) {
printf("cnrtMalloc Failed!\n");
exit(-1);
}
if (CNRT_RET_SUCCESS != cnrtMalloc((void**)&mlu_input2, dims_a * sizeof(half))) {
printf("cnrtMalloc Failed!\n");
exit(-1);
}
if (CNRT_RET_SUCCESS != cnrtMalloc((void**)&mlu_output, dims_a * sizeof(half))) {
printf("cnrtMalloc output Failed!\n");
exit(-1);
}
// TODO:完成cnrtMemcpy拷入函数
cnrtMemcpy(mlu_input1,input1_half,dims_a*sizeof(half),CNRT_MEM_TRANS_DIR_HOST2DEV);
cnrtMemcpy(mlu_input2,input2_half,dims_a*sizeof(half),CNRT_MEM_TRANS_DIR_HOST2DEV);

//kernel parameters
cnrtKernelParamsBuffer_t params;
cnrtGetKernelParamsBuffer(¶ms);
cnrtKernelParamsBufferAddParam(params, &mlu_input1, sizeof(half*));
cnrtKernelParamsBufferAddParam(params, &mlu_input2, sizeof(half*));
cnrtKernelParamsBufferAddParam(params, &pow, sizeof(int));
cnrtKernelParamsBufferAddParam(params, &mlu_output, sizeof(half*));
cnrtKernelParamsBufferAddParam(params, &dims_a, sizeof(int));
cnrtPlaceNotifier(event_start, pQueue);

// TODO:完成cnrtInvokeKernel函数
cnrtInvokeKernel_V2((void*)&PowerDifferenceKernel,dim,params,c,pQueue);


if (CNRT_RET_SUCCESS != cnrtSyncQueue(pQueue))
{
printf("syncQueue Failed!\n");
exit(-1);
}
cnrtPlaceNotifier(event_end, pQueue);

//get output data
// TODO:完成cnrtMemcpy拷出函数
cnrtMemcpy(output_half,mlu_output,dims_a*sizeof(half),CNRT_MEM_TRANS_DIR_DEV2HOST);

cnrtConvertHalfToFloatArray(output, output_half,dims_a );

//free data
if (CNRT_RET_SUCCESS != cnrtFree(mlu_input1)) {
printf("cnrtFree Failed!\n");
exit(-1);
}
if (CNRT_RET_SUCCESS != cnrtFree(mlu_input2)) {
printf("cnrtFree Failed!\n");
exit(-1);
}
if (CNRT_RET_SUCCESS != cnrtFree(mlu_output)) {
printf("cnrtFree output Failed!\n");
exit(-1);
}
if (CNRT_RET_SUCCESS != cnrtDestroyQueue(pQueue)) {
printf("cnrtDestroyQueue Failed!\n");
exit(-1);
}
if (CNRT_RET_SUCCESS != cnrtDestroyKernelParamsBuffer(params)) {
printf("cnrtDestroyKernelParamsBuffer Failed!\n");
return -1;
}
cnrtDestroy();
free(input1_half);
free(input2_half);
free(output_half);
return 0;
}

  • 对应的头文件

// /home/zjq/cambrian-demo/powerDifference/include/plugin_power_difference_op.h
#ifndef _PLUGIN_POWER_DIFFERENCE_OP_H_
#define _PLUGIN_POWER_DIFFERENCE_OP_H_

/*
 * Host-side entry point of the PowerDifference operator.
 *
 * System headers are included before the extern "C" block so they are not
 * forced into C linkage when this file is used from C++.
 */
#include <stdio.h>
#include <stdlib.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Runs the PowerDifference computation for dims_a elements.
 *   input1, input2 : X and Y operands, dims_a floats each
 *   pow            : exponent
 *   output         : receives dims_a floats
 *   dims_a         : number of elements
 * Returns an int status code.
 */
int MLUPowerDifferenceOp(float* input1,float* input2, int pow, float*output, int dims_a);

#ifdef __cplusplus
}
#endif
#endif

2.4 主函数

// /home/zjq/cambrian-demo/powerDifference/src/main.cpp
#include <math.h>
#include <time.h>
#include "stdio.h"
#include <stdlib.h>
#include <sys/time.h>
#include "plugin_power_difference_op.h" // 这里包含CPU和MLU交互逻辑

#define DATA_COUNT 32768
#define POW_COUNT 2
// int MLUPowerDifferenceOp(float* input1,float* input2, int pow, float*output, int dims_a);

// Milliseconds elapsed between two gettimeofday() samples. The microsecond
// difference is held in a float, as in the original timing code.
static double elapsed_ms(const struct timeval* start, const struct timeval* end) {
  float usec = 1000000 * (end->tv_sec - start->tv_sec) + end->tv_usec - start->tv_usec;
  return usec / 1000.0;
}

// Loads X, Y and the reference output from ./data, runs the operator through
// MLUPowerDifferenceOp, prints timings and a few samples, and reports the
// relative error against the reference.
// Returns 0 on success and 1 on any failure (allocation, file open, short or
// malformed input, operator error).
int main() {
  // All locals are declared before the first `goto cleanup` so that no jump
  // crosses an initialisation (this file is compiled as C++).
  int status = 1;
  float err = 0.0;
  float cpu_sum = 0.0;
  struct timeval tpend, tpstart;
  float* input_x = (float*)malloc(DATA_COUNT * sizeof(float));
  float* input_y = (float*)malloc(DATA_COUNT * sizeof(float));
  float* output_data = (float*)malloc(DATA_COUNT * sizeof(float));
  float* output_data_cpu = (float*)malloc(DATA_COUNT * sizeof(float));
  FILE* f_input_x = fopen("./data/in_x.txt", "r");
  FILE* f_input_y = fopen("./data/in_y.txt", "r");
  FILE* f_output_data = fopen("./data/out.txt", "r");

  if (input_x == NULL || input_y == NULL || output_data == NULL || output_data_cpu == NULL) {
    printf("malloc fail!\n");
    goto cleanup;
  }
  if (f_input_x == NULL|| f_input_y == NULL || f_output_data == NULL) {
    printf("Open file fail!\n");
    goto cleanup;
  }

  // Read one value per record from each file. A short or malformed file is
  // reported instead of silently leaving uninitialised values in the arrays.
  gettimeofday(&tpstart, NULL);
  for (int i = 0; i < DATA_COUNT; i++) {
    if (fscanf(f_input_x, "%f\n", &input_x[i]) != 1 ||
        fscanf(f_input_y, "%f\n", &input_y[i]) != 1 ||
        fscanf(f_output_data, "%f\n", &output_data_cpu[i]) != 1) {
      printf("Read data fail at record %d!\n", i);
      goto cleanup;
    }
  }
  gettimeofday(&tpend, NULL);
  printf("get data cost time %f ms\n", elapsed_ms(&tpstart, &tpend));

  gettimeofday(&tpstart, NULL);
  if (MLUPowerDifferenceOp(input_x,input_y,POW_COUNT,output_data,DATA_COUNT) != 0) {
    printf("MLUPowerDifferenceOp fail!\n");
    goto cleanup;
  }
  gettimeofday(&tpend, NULL);
  printf("compute data cost time %f ms\n", elapsed_ms(&tpstart, &tpend));
  printf("input x %f\n",input_x[0]);
  printf("input y %f\n",input_y[0]);
  printf("output data %f\n",output_data[0]);
  printf("output data %f\n",output_data[1]);
  printf("output data %f\n",output_data[2]);

  // Relative error: sum of absolute differences over sum of |reference|.
  for(int i = 0; i < DATA_COUNT;++i)
  {
    err +=fabs(output_data_cpu[i] - output_data[i]) ;
    cpu_sum +=fabs(output_data_cpu[i]);
  }
  if (cpu_sum > 0.0) {
    printf("err rate = %0.4f%%\n", err*100.0/cpu_sum);
  } else {
    // An all-zero reference would make the ratio a division by zero.
    printf("err rate undefined: reference sum is zero\n");
  }
  status = 0;

cleanup:
  if (f_input_x != NULL) fclose(f_input_x);
  if (f_input_y != NULL) fclose(f_input_y);
  if (f_output_data != NULL) fclose(f_output_data);
  free(input_x);
  free(input_y);
  free(output_data);
  free(output_data_cpu);
  return status;
}

智能计算系统2 bangc算子开发的demo (CPU和MLU270的异构编程流程)_#endif_02

2.5 实验结果

root@localhost:/home/zjq/cambrian-demo/powerDifference# make

/*
g++ obj/plugin_power_difference_kernel.o obj/powerDiff.o obj/main.o -o main -L /home/zjq/AICSE-demo-student/env/neuware/lib64 -lcnrt
./main
get data cost time 27.130000 ms
CNRT: 4.2.1 fa5e44c
compute data cost time 31.934000 ms
input x 139.000000
input y 70.000000
output data 4760.000000
output data 15872.000000
output data 14880.000000
err rate = 0.0117%
*/


举报

相关推荐

0 条评论