目的
从服务器将数据拷贝到昇腾310 ai卡
结论
数据进入到服务器后的内存不能直接用昇腾acl接口拷贝到AI卡。
需要
1)先用acl接口申请内存;
2) 将数据拷贝到acl申请的内存
3)用acl接口将数据拷贝到AI卡
makefile
# Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
# CMake lowest version requirement
cmake_minimum_required(VERSION 3.5.1)
# project information
project(mem310p)
set(LIB_PATH $ENV{NPU_HOST_LIB})
# Dynamic libraries in the stub directory can only be used for compilation
if (NOT DEFINED ENV{NPU_HOST_LIB})
#set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/CANN-6.4/runtime/lib64/stub/aarch64")
set(LIB_PATH "/usr/local/Ascend/nnrt/latest/runtime/lib64/stub/")
message(STATUS "set default LIB_PATH: ${LIB_PATH}")
else ()
message(STATUS "env LIB_PATH: ${LIB_PATH}")
endif()
set(INC_PATH $ENV{NPU_HOST_INC})
# Dynamic libraries in the stub directory can only be used for compilation
if (NOT DEFINED ENV{NPU_HOST_INC})
#set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest/CANN-6.4/runtime/include")
# set(INC_PATH "/usr/local/Ascend/nnrt/6.2.RC2/aarch64-linux/include")
# set(INC_PATH "/usr/local/Ascend/nnrt/7.0.0/aarch64-linux/include")
set(INC_PATH "/usr/local/Ascend/nnrt/latest/aarch64-linux/include")
message(STATUS "set default INC_PATH: ${INC_PATH}")
else ()
message(STATUS "env INC_PATH: ${INC_PATH}")
endif()
link_directories(
${LIB_PATH}
${LIB_PATH}/stub
)
add_executable(mem310p
#add_library(zh_sensor SHARED
mem310p.c
)
include_directories(mem310p
${INC_PATH}
${INC_PATH}/acl/media
)
add_compile_options(mem310p
-O2
-Wall
-fpic
)
# sns_zh
target_link_libraries(mem310p
acl_dvpp_mpi
ascendcl
pthread
)
测试代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <errno.h>
#include <stdint.h>
#include "acl/acl.h"
#define SAMPLE_PRT(fmt...) \
do { \
printf("[%s]-%d: ", __FUNCTION__, __LINE__); \
printf(fmt); \
} while (0)
int SUCCESS = 0;
int FAILED = 1;
aclrtContext g_context = NULL;
uint32_t g_device_id = 0;
typedef uint64_t u64;
typedef uint32_t u32;
int32_t setup_acl_device()
{
aclError aclRet = aclInit(NULL);
if (aclRet != ACL_SUCCESS)
{
SAMPLE_PRT("aclInit fail with %d.\n", aclRet);
return aclRet;
}
SAMPLE_PRT("aclInit succ.\n");
aclRet = aclrtSetDevice(g_device_id);
if (aclRet != ACL_SUCCESS)
{
SAMPLE_PRT("aclrtSetDevice %u fail with %d.\n", g_device_id, aclRet);
aclFinalize();
return aclRet;
}
SAMPLE_PRT("aclrtSetDevice(%u) succ.\n", g_device_id);
aclRet = aclrtCreateContext(&g_context, g_device_id);
if (aclRet != ACL_SUCCESS)
{
SAMPLE_PRT("acl create context failed with %d.\n", aclRet);
aclrtResetDevice(g_device_id);
aclFinalize();
return aclRet;
}
SAMPLE_PRT("create context success\n");
aclRet = aclrtGetCurrentContext(&g_context);
if (aclRet != ACL_SUCCESS)
{
SAMPLE_PRT("get current context failed\n");
aclrtDestroyContext(g_context);
g_context = NULL;
aclrtResetDevice(g_device_id);
aclFinalize();
return aclRet;
}
SAMPLE_PRT("get current context success\n");
return SUCCESS;
}
void destroy_acl_device()
{
if (g_context)
{
aclrtDestroyContext(g_context);
g_context = NULL;
aclrtResetDevice(g_device_id);
aclFinalize();
}
}
struct dev_addr
{
u64 phy_addr;
void *virt_addr;
u32 reg_len;
void *mapbase;
};
static void *ioremap_system_mem(struct dev_addr *addr)
{
void *map_base;
int fd;
off_t target;
target = addr->phy_addr;
// if ((fd = open("/dev/mem", O_RDWR | O_SYNC)) == -1)
if ((fd = open("/dev/mem", O_RDWR | O_NONBLOCK)) == -1)
{
fprintf(stderr, "Error at line %d, file %s (%d) [%s]\n", \
__LINE__, __FILE__, errno, strerror(errno));
return NULL ;
}
fflush(stdout);
map_base = mmap(0, addr->reg_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
target);
if (map_base == (void *) -1)
{
fprintf(stderr, "Error at line %d, file %s (%d) [%s]\n", \
__LINE__, __FILE__, errno, strerror(errno));
return NULL ;
}
fflush(stdout);
addr->mapbase = map_base;
addr->virt_addr = map_base;//(map_base + (target & MAP_MASK));
close(fd);
return addr->virt_addr;
}
//#ifdef __ARM__
#if 1
void neon_memcpy(volatile void *dst, volatile void *src, int sz)
{
if (sz & 63)
{
sz = (sz & -64) + 64;
}
asm volatile(
"NEONCopyPLD: \n"
"sub %[dst], %[dst], #64 \n"
"1: \n"
"ldnp q0, q1, [%[src]] \n"
"ldnp q2, q3, [%[src], #32] \n"
"add %[dst], %[dst], #64 \n"
"subs %[sz], %[sz], #64 \n"
"add %[src], %[src], #64 \n"
"stnp q0, q1, [%[dst]] \n"
"stnp q2, q3, [%[dst], #32] \n"
"b.gt 1b \n"
: [dst]"+r"(dst), [src]"+r"(src), [sz]"+r"(sz) : : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
}
#else
#include <arm_neon.h>
void *neon_memcpy(void *dest, void *src, size_t count)
{
int i;
unsigned long *s = (unsigned long *)src;
unsigned long *d = (unsigned long *)dest;
for (i = 0; i < count / 64; i++)
{
vst1q_u64(&d[0], vld1q_u64(&s[0]));
vst1q_u64(&d[2], vld1q_u64(&s[2]));
vst1q_u64(&d[4], vld1q_u64(&s[4]));
vst1q_u64(&d[6], vld1q_u64(&s[6]));
d += 8;
s += 8;
}
return dest;
}
#endif
//#endif
struct timeval start_time;
struct timeval end_time;
double elapsed_time = 0.0;
static double elapsed(struct timeval start_time, struct timeval end_time)
{
elapsed_time = (end_time.tv_sec * 1000000 + end_time.tv_usec - start_time.tv_sec
* 1000000 - start_time.tv_usec);
return elapsed_time;
}
int32_t main(int32_t argc, char *argv[])
{
int ret = SUCCESS;
aclError aclRet = ACL_SUCCESS;
// acl资源初始化
ret = setup_acl_device();
if (ret != SUCCESS)
{
SAMPLE_PRT("Setup Device failed! ret code:%#x\n", ret);
return FAILED;
}
struct dev_addr atu_addr;
//atu_addr.phy_addr = 0x2000000000;
atu_addr.phy_addr = 0x2200000000; //0x22 0000 0000
atu_addr.virt_addr = 0;
atu_addr.reg_len = 0x2000000;
size_t g_memory_size = 4752 * 3795;
if (NULL == ioremap_system_mem(&atu_addr))
{
printf("ioremap fail\n");
return -1;
}
void *host_buffer = NULL;
void *host_buffer1 = NULL;
void *device_buffer = NULL;
void *my_host_buffer = NULL;
my_host_buffer = malloc(g_memory_size);
if (NULL == my_host_buffer)
{
printf("malloc fail\n");
return 0;
}
memset((unsigned char *)my_host_buffer, 0xaa, g_memory_size);
memset((unsigned char *)atu_addr.virt_addr, 0x55, g_memory_size);
// 申请device内存
aclRet = aclrtMalloc(&device_buffer, g_memory_size, ACL_MEM_MALLOC_HUGE_FIRST);
if (aclRet != ACL_SUCCESS)
{
SAMPLE_PRT("aclrtMalloc failed\n");
}
// 申请host内存
aclRet = aclrtMallocHost(&host_buffer, g_memory_size);
if (aclRet != ACL_SUCCESS)
{
SAMPLE_PRT("aclrtMallocHost failed\n");
}
// 申请host内存
aclRet = aclrtMallocHost(&host_buffer1, g_memory_size);
if (aclRet != ACL_SUCCESS)
{
SAMPLE_PRT("aclrtMallocHost failed\n");
}
gettimeofday(&(start_time), NULL);
memcpy(host_buffer1, (unsigned char *)atu_addr.virt_addr, g_memory_size);
gettimeofday(&(end_time), NULL);
elapsed_time = elapsed(start_time, end_time);
// 内存拷贝H2D
aclRet = aclrtMemcpy(device_buffer, g_memory_size,
host_buffer1 /* atu_addr.virt_addr*/, g_memory_size, ACL_MEMCPY_HOST_TO_DEVICE);
if (aclRet != ACL_SUCCESS)
{
SAMPLE_PRT("aclrtMemcpy H2D failed\n");
}
printf("....,dma time %f\n", elapsed_time);
// 内存拷贝D2H
aclRet = aclrtMemcpy(host_buffer, g_memory_size, device_buffer,
g_memory_size, ACL_MEMCPY_DEVICE_TO_HOST);
if (aclRet != ACL_SUCCESS)
{
SAMPLE_PRT("aclrtMemcpy D2H failed\n");
}
SAMPLE_PRT("\data is %x\n", *(unsigned char *)host_buffer);
if (munmap(atu_addr.mapbase, atu_addr.reg_len) == -1)
{
printf("un ioremap fail\n");
return -1;
}
// 释放al资源
destroy_acl_device();
SAMPLE_PRT("run success!\n");
return SUCCESS;
}
上述测试代码,测试从mmap映射的内存到acl分配的内存,耗时70几个ms,这是不正确的。
耗时分析
待续。。。。