IMX8M系列 OpenCL FFT 示例编译及其他demo测试（MYD-JX8MX）

IMX8M系列 OpenCL FFT 示例编译测试及其他demo测试（MYD-JX8MX）

上一篇文章已经将如何编译镜像，如何用官方的方式FslBuild.py 脚本编译demo。不知道有没有成功的朋友，如果你成功了，那么可以依然用这种方式，如果你没有成功，那么你可以参考接下来的方式来编译你的demo

导出文件系统

上一篇文章有描述如何导出文件系统，这里的目的是为了提供编译所需的头文件和库文件等
具体方法如下：

#进入源码root目录
cd fsl-release-yocto
#执行设置环境变量的脚本
. ./setup-environment build-xwayland
#编译工具链
bitbake meta-toolchain
#安装工具链
cd ~/fsl-release-yocto/build-xwayland/tmp/deploy/sdk
./fsl-imx-xwayland-glibc-x86_64-meta-toolchain-aarch64-toolchain-4.9.88-2.0.0.sh
./opt/fsl-imx-xwayland/4.19-warrior/environment-setup-aarch64-poky-linux
#提取文件系统
runqemu-extract-sdk ~/fsl-release-yocto/build-xwayland/tmp/deploy/images/imx8mqevk/fsl-image-qt5-validation-imx-imx8mqevk-20210809020904.rootfs.tar.bz2 ~/imx8mqevk-rootfs

提取好的文件系统在~/imx8mqevk-rootfs 目录下
具体内容如下：

该文件系统时利用交叉编译方式编译出来的，有我们所需的头文件和库等。

编译FFT

这里我利用gtec-demo-frameworkNXP提供的示例来单独编译FFT用于简单的测试其GPU的性能

main.cpp


#include "clutil.h"int
main(const int argc,const char* argv[])
{if (argc < 2){printf("Usage: %s fftlen \n", argv[0]);return -1;}const unsigned len = atoi(argv[1]);if (len > FFT_MAX){printf("FFT length cannot be greater than %d.\n", FFT_MAX);return -1;}if (len < 16){printf("FFT length has to at least be 16.\n");return -1;}if ((len != 1) && (len & (len - 1))){printf("FFT length (%d) must be a power-of-2.\n", len);return -1;}printf("Block size: %d \n", blockSize);printf("Print result: %s \n", print ? "yes" : "no");int result = 0;result = runFFT(len);if (result == 0){printf("Successful.\n");if (print) printResult(len);}else{printf("Failed.\n");}cleanup();
}

fft.cpp


#include "clutil.h"
#include <time.h>
static unsigned workOffset;
static unsigned workSize;
static int p[FFT_MAX_LOG2N]       = {  1,   2,   4,   8,   16,   32,   64,   128,   256,   512,   1024,   2048,   4096,   8192,   16384,   32768};
static int twop[FFT_MAX_LOG2N]    = {2*1, 2*2, 2*4, 2*8, 2*16, 2*32, 2*64, 2*128, 2*256, 2*512, 2*1024, 2*2048, 2*4096, 2*8192, 2*16384, 2*32768};
static int threep[FFT_MAX_LOG2N]  = {3*1, 3*2 ,3*4, 3*8, 3*16, 3*32, 3*64, 3*128, 3*256, 3*512, 3*1024, 3*2048, 3*4096, 3*8192, 3*16384, 3*32768};
static int pminus1[FFT_MAX_LOG2N] = {1-1, 2-1, 4-1, 8-1, 16-1, 32-1, 64-1, 128-1, 256-1, 512-1, 1024-1, 2048-1, 4096-1, 8192-1, 16384-1, 32768-1};#ifndef M_PI
#define M_PI 3.14159265358979f
#endif
static cl_float minusPIoverp[FFT_MAX_LOG2N]     = {    -M_PI,         -M_PI/2.f,     -M_PI/4.f,     -M_PI/ 8.f,     -M_PI/16.f,     -M_PI/32.f,     -M_PI/ 64.f,     -M_PI/128.f,     -M_PI/256.f,     -M_PI/ 512.f,     -M_PI/1024.f,     -M_PI/2048.f,     -M_PI/4096.f,     -M_PI/ 8192.f,     -M_PI/16384.f,     -M_PI/32768.f};
static cl_float minusPIover2p[FFT_MAX_LOG2N]    = {    -M_PI/2.f,     -M_PI/4.f,     -M_PI/8.f,     -M_PI/16.f,     -M_PI/32.f,     -M_PI/64.f,     -M_PI/128.f,     -M_PI/256.f,     -M_PI/512.f,     -M_PI/1024.f,     -M_PI/2048.f,     -M_PI/4096.f,     -M_PI/8192.f,     -M_PI/16384.f,     -M_PI/32768.f,     -M_PI/65536.f};
static cl_float minusPIover2p_2x[FFT_MAX_LOG2N] = {-2.f*M_PI/2.f, -2.f*M_PI/4.f, -2.f*M_PI/8.f, -2.f*M_PI/16.f, -2.f*M_PI/32.f, -2.f*M_PI/64.f, -2.f*M_PI/128.f, -2.f*M_PI/256.f, -2.f*M_PI/512.f, -2.f*M_PI/1024.f, -2.f*M_PI/2048.f, -2.f*M_PI/4096.f, -2.f*M_PI/8192.f, -2.f*M_PI/16384.f, -2.f*M_PI/32768.f, -2.f*M_PI/65536.f};
static cl_float minusPIover2p_3x[FFT_MAX_LOG2N] = {-3.f*M_PI/2.f, -3.f*M_PI/4.f, -3.f*M_PI/8.f, -3.f*M_PI/16.f, -3.f*M_PI/32.f, -3.f*M_PI/64.f, -3.f*M_PI/128.f, -3.f*M_PI/256.f, -3.f*M_PI/512.f, -3.f*M_PI/1024.f, -3.f*M_PI/2048.f, -3.f*M_PI/4096.f, -3.f*M_PI/8192.f, -3.f*M_PI/16384.f, -3.f*M_PI/32768.f, -3.f*M_PI/65536.f};static int
radix(int N)
{int i = 0, j = 0;for (; i <= 31; i++){if ((N & (1 << i)) == 0){j++;}else{break;}}return (0 == (j%2)) ? 4 : 2;
}static unsigned int
log2NFFT(unsigned int size)
{unsigned int v = size;unsigned int log2n = 0;while (v >>= 1){log2n++;}return log2n;
}#define RADIX2_FFT_KERNEL "fft_radix2"
#define RADIX4_FFT_KERNEL "fft_radix4"static void
FFTGpu(const unsigned len)
{if (len == 0){return;}// figure out if we can use a radix-4 FFT : otherwise radix-2int rad = radix(len);if (4==rad && ((16==len) || (256==len) || (4096==len) || (65536==len) || (1048576 == len) ))rad = 2;// log2(n) is the # of kernels that will be invoked (for a radix-2 FFT)unsigned int log2n = log2NFFT(len);printf("log2(fft size) = log2(%d)=%d\n", len, log2n);printf("Compiling  radix-%d FFT Program for GPU...\n", rad);compileProgram("fft.cl");printf("creating radix-%d kernels...\n", rad);if (2 == rad){for (unsigned kk = 0; kk < log2n; kk++){printf("Creating kernel %s %d (p=%d)...\n", RADIX2_FFT_KERNEL, kk, p[kk]);createFFTKernel(RADIX2_FFT_KERNEL, kk);}}else{ // radix-4for (unsigned kk = 0; kk < log2n; kk+=2){printf("Creating kernel %s %d...\n", RADIX4_FFT_KERNEL, kk>>1);createFFTKernel(RADIX4_FFT_KERNEL, kk>>1);}}workSize = len;allocateDeviceMemory(workSize, workOffset);if (2 == rad){// FFT kernel invoked for p=1, p=2, ..., p=n/2// input and output swapped each timefor (unsigned kk = 0; kk < log2n; kk++){void *in = (0 == (kk&1)) ? &d_intime : &d_outfft;void *out = (0 == (kk&1)) ? &d_outfft : &d_intime;printf("Setting kernel args for kernel %d (p=%d)...\n", kk, p[kk]);clSetKernelArg(kernels[kk], 0, sizeof(cl_mem), in);clSetKernelArg(kernels[kk], 1, sizeof(cl_mem), out);clSetKernelArg(kernels[kk], 2, sizeof(unsigned), &p[kk]);clSetKernelArg(kernels[kk], 3, sizeof(unsigned), &pminus1[kk]);clSetKernelArg(kernels[kk], 4, sizeof(cl_float), &minusPIoverp[kk]);} // end (for 1,2,4,8,...N/2)}else{// radix-4, FFT kernel invoked for p=1, p=4, ..., p=n/4for (unsigned kk = 0; kk < log2n; kk+=2){int idx   = kk>>1;void *in  = (0 == (idx&1)) ? &d_intime : &d_outfft;void *out = (0 == (idx&1)) ? &d_outfft : &d_intime;printf("Setting kernel args for kernel %d (p=%d)...\n", idx, p[kk]);clSetKernelArg(kernels[idx], 0, sizeof(cl_mem), in);clSetKernelArg(kernels[idx], 1, sizeof(cl_mem), out);clSetKernelArg(kernels[idx], 2, sizeof(unsigned), &p[kk]);clSetKernelArg(kernels[idx], 3, sizeof(unsigned), &pminus1[kk]);clSetKernelArg(kernels[idx], 4, sizeof(unsigned), &twop[kk]);clSetKernelArg(kernels[idx], 5, sizeof(unsigned), &threep[kk]);clSetKernelArg(kernels[idx], 6, sizeof(cl_float), &minusPIover2p[kk]);clSetKernelArg(kernels[idx], 7, sizeof(cl_float), &minusPIover2p_2x[kk]);clSetKernelArg(kernels[idx], 8, sizeof(cl_float), &minusPIover2p_3x[kk]);} // end (for 1,4,16,...,N/4)} // end (if radix-2 or radix-4)size_t globalWorkSize[] = { (2==rad) ? (1<<(log2n-1)) : (len>>2) };size_t localWorkSize[] = { (blockSize <= globalWorkSize[0]) ? blockSize : globalWorkSize[0] };cl_int ciErrNum = 0;cl_mem d_result;clock_t start,end1,end2,end3;start = clock();Cl_finish();if (2==rad){for (unsigned kk = 0; kk < log2n; kk++){// note to self: up to 8 it works, beyond that it does notprintf("running kernel %d (p=%d)...\n", kk, p[kk]);runKernelFFT(localWorkSize, globalWorkSize, kk);d_result = (0 == (kk&1)) ? d_outfft : d_intime;}}else{// radix-4for (unsigned kk = 0; kk < log2n; kk+=2){int idx = kk>>1;printf("running kernel %d (p=%d)...\n", idx, p[kk]);runKernelFFT(localWorkSize, globalWorkSize, idx);d_result = (0 == (kk&1)) ? d_outfft : d_intime;}}Cl_finish();end1 = clock();//printf("time_1: %f s\n",double(end-start)/CLOCKS_PER_SEC);copyFromDevice(d_result, h_outfft + workOffset,  2*workSize);end2 = clock();//printf("time_2: %f s\n",double(end-start)/CLOCKS_PER_SEC);printGpuTime((2==rad)?log2n:(log2n>>1));end3 = clock();printf("time_1: %f s\ntime_2: %f s\ntime_3: %f s\n",\double(end1-start)/CLOCKS_PER_SEC,\double(end2-start)/CLOCKS_PER_SEC,\double(end3-start)/CLOCKS_PER_SEC);
}int
runFFT(const unsigned len)
{cl_int err;err = initExecution(len);if (err){return err;}FFTGpu(len);return 0;
}

clutil.cpp


#include "clutil.h"
#ifdef UNDER_CE
#include <windows.h>
#endif// global variables
cl_context cxContext = 0;
cl_program cpProgram = 0;
cl_device_id cdDeviceID[2];
cl_kernel kernels[FFT_MAX_LOG2N];
cl_command_queue commandQueue;
cl_event gpuExecution[FFT_MAX_LOG2N];#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))// default configs
unsigned blockSize = 16;
unsigned print = 1;// h_Freal and h_Fimag represent the input signal to be transformed.
// h_Rreal and h_Rimag represent the transformed output.
float*  h_Freal = 0;
float*  h_Fimag = 0;
float*  h_Rreal = 0;
float*  h_Rimag = 0;
//  real & imag interleaved
float* h_intime = 0; // time-domain input samples
float* h_outfft = 0; // freq-domain output samples// d_Freal and d_Fimag represent the input signal to be transformed.
// d_Rreal and d_Rimag represent the transformed output.
cl_mem d_Freal;
cl_mem d_Fimag;
cl_mem d_Rreal;
cl_mem d_Rimag;
//  real & imag interleaved
cl_mem d_intime; // time-domain input samples
cl_mem d_outfft; // freq-domain output samplesint
initExecution(const unsigned len)
{// Allocate host memory (and initialize input signal)allocateHostMemory(len);printf("Initializing device(s)...\n");// create the OpenCL context on available GPU devicesinit_cl_context(CL_DEVICE_TYPE_GPU);const cl_uint ciDeviceCount =  getDeviceCount();printf("ciDeviceCount:%d \n",ciDeviceCount);if (!ciDeviceCount){printf("No opencl specific devices!\n");return -1;}const cl_uint ciComputeUnitsCount = getNumComputeUnits();printf("# compute units = %d\n", ciComputeUnitsCount);printf("Creating Command Queue...\n");// create a command queue on device 0createCommandQueue();return 0;
}void
printGpuTime(const unsigned int kernelCount)
{double t, total = 0;for (unsigned k = 0; k<kernelCount; ++k){t = executionTime(gpuExecution[k]);printf("Kernel execution time on GPU (kernel %d) : %10.6f seconds\n", k, t);total += t;}printf("Total Kernel execution time on GPU : %10.6f seconds\n",total);
}void
printResult(const unsigned size)
{FILE *fp;
#ifdef UNDER_CEwchar_t moduleName[MAX_PATH];char path[MAX_PATH], * p;GetModuleFileName(NULL, moduleName, MAX_PATH);wcstombs(path, moduleName, MAX_PATH);p = strrchr(path, '\\');strcpy(p + 1, "fft_output.csv");fp = fopen(path, "w+");
#elsefp = fopen("fft_output.csv", "w+");
#endifif (fp == NULL) return;for (unsigned i = 0; i < size; ++i){fprintf(fp, "%f,%f\n", h_outfft[2*i], h_outfft[2*i+1]);}fclose(fp);
}double
executionTime(const cl_event event)
{cl_ulong start, end;cl_int err;err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);err |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);if (err){return 0;}printf("start:%llu, end:%llu \n", start, end);return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds
}void
allocateHostMemory(const unsigned len)
{h_Freal = (float *) malloc(sizeof(float) * len);checkError((h_Freal != NULL), CL_TRUE, "Could not allocate memory");h_Fimag = (float *) malloc(sizeof(float) * len);checkError((h_Fimag != NULL), CL_TRUE, "Could not allocate memory");h_Rreal = (float *) malloc(sizeof(float) * len);checkError((h_Rreal != NULL), CL_TRUE, "Could not allocate memory");h_Rimag = (float *) malloc(sizeof(float) * len);checkError((h_Rimag != NULL), CL_TRUE, "Could not allocate memory");//  real/imag interleaved input time-domain samplesh_intime = (float *) malloc(sizeof(float) * len * 2);checkError((h_intime != NULL), CL_TRUE, "Could not allocate memory");//  real/imag interleaved output FFT datah_outfft = (float *) malloc(sizeof(float) * len * 2);checkError((h_outfft != NULL), CL_TRUE, "Could not allocate memory");const unsigned n = 16;for (unsigned i = 0 ; i < len; ++i){h_Freal[i] = (i + 1) % n;h_Fimag[i] = (i + 1) % n;h_intime[2*i] = h_intime[2*i+1] = (i + 1) % n;h_Rreal[i] = 0;h_Rimag[i] = 0;h_outfft[2*i] = h_outfft[2*i+1] = 0;}if (print){FILE *fp = NULL;
#ifdef UNDER_CEwchar_t moduleName[MAX_PATH];char path[MAX_PATH], * p;GetModuleFileName(NULL, moduleName, MAX_PATH);wcstombs(path, moduleName, MAX_PATH);p = strrchr(path, '\\');strcpy(p + 1, "fft_input.csv");fp = fopen(path, "w+");
#elsefp = fopen("fft_input.csv", "w+");
#endifif (fp == NULL) return;for (unsigned int kk=0; kk<len; kk++){fprintf(fp, "%f,%f\n", h_intime[2*kk], h_intime[2*kk+1]);}fclose(fp);}
}void
allocateDeviceMemory(const unsigned size,const unsigned copyOffset)
{d_Freal = createDeviceBuffer(CL_MEM_READ_ONLY, sizeof(float) * size, h_Freal + copyOffset);copyToDevice(d_Freal,  h_Freal + copyOffset, size);d_Fimag = createDeviceBuffer(CL_MEM_READ_ONLY, sizeof(float) * size, h_Fimag + copyOffset);copyToDevice(d_Fimag,  h_Fimag + copyOffset, size);//  copy real/imag interleaved input data to deviced_intime = createDeviceBuffer(CL_MEM_READ_WRITE, sizeof(float) * size * 2, h_intime + copyOffset * 2);copyFromDevice(d_intime, h_outfft, size * 2); // debugd_Rreal = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Rreal + copyOffset);copyToDevice(d_Rreal,  h_Rreal + copyOffset, size);d_Rimag = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Rimag + copyOffset);copyToDevice(d_Rimag,  h_Rimag + copyOffset, size);//  copy real/imag interleaved out FFT to deviced_outfft = createDeviceBuffer(CL_MEM_READ_WRITE, sizeof(float) * size * 2, h_outfft + copyOffset * 2);copyToDevice(d_intime,  h_outfft + copyOffset * 2, size * 2);
}void
cleanup(void)
{if (d_Freal)  clReleaseMemObject(d_Freal);if (d_Fimag)  clReleaseMemObject(d_Fimag);if (d_Rreal)  clReleaseMemObject(d_Rreal);if (d_Rimag)  clReleaseMemObject(d_Rimag);if (d_intime) clReleaseMemObject(d_intime);if (d_outfft) clReleaseMemObject(d_outfft);for (unsigned kk=0; kk<ARRAY_SIZE(kernels); kk++) {if (gpuExecution[kk]) clReleaseEvent(gpuExecution[kk]);}if (commandQueue) clReleaseCommandQueue(commandQueue);if (cpProgram) clReleaseProgram(cpProgram);if (cxContext) clReleaseContext(cxContext);free(h_Freal);h_Freal = 0;free(h_Fimag);h_Fimag = 0;free(h_Rreal);h_Rreal = 0;free(h_Rimag);h_Rimag = 0;free(h_intime);h_intime = 0;free(h_outfft);h_outfft = 0;
}void
checkError(const cl_int ciErrNum,const cl_int ref,const char* const operation)
{if (ciErrNum != ref) {printf("ERROR:: %d %s failed\n\n", ciErrNum, operation);cleanup();exit(EXIT_FAILURE);}
}void
init_cl_context(const cl_device_type device_type)
{cl_int ciErrNum = CL_SUCCESS;#ifndef WIN32cxContext = clCreateContextFromType(0, /* cl_context_properties */device_type,NULL, /* error function ptr */NULL, /* user data to be passed to err fn */&ciErrNum);checkError(ciErrNum, CL_SUCCESS, "clCreateContextFromType");
#elsecl_platform_id cpPlatform;ciErrNum =     clGetPlatformIDs(1, &cpPlatform, NULL);checkError(ciErrNum, CL_SUCCESS, "clGetPlatformIDs");cl_uint uiNumDevices;ciErrNum = clGetDeviceIDs(cpPlatform, device_type, 0, NULL, &uiNumDevices);checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");cl_device_id cdDevices[20];ciErrNum = clGetDeviceIDs(cpPlatform, device_type, uiNumDevices, cdDevices, NULL);checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");cl_uint targetDevice=0, uiNumDevsUsed=1;cxContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum);checkError(ciErrNum, CL_SUCCESS, "clCreateContextFromType");
#endif
}cl_uint
getDeviceCount(void)
{size_t nDeviceBytes;const cl_int ciErrNum = clGetContextInfo(cxContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);checkError(ciErrNum, CL_SUCCESS, "clGetContextInfo");return ((cl_uint)nDeviceBytes/sizeof(cl_device_id));
}cl_uint
getNumComputeUnits(void)
{cl_platform_id cpPlatform;cl_int ciErrNum = clGetPlatformIDs(1, &cpPlatform, NULL);checkError(ciErrNum, CL_SUCCESS, "clGetPlatformIDs");//Get all the devicesprintf("Get the Device info and select Device...\n");cl_uint uiNumDevices;ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");// Set target device and Query number of compute units on targetDeviceprintf("# of Devices Available = %d\n", uiNumDevices);cl_uint num_compute_units;clGetDeviceInfo(cdDevices[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL);printf("# of Compute Units = %d\n", num_compute_units);free(cdDevices);return num_compute_units;
}void
createCommandQueue(void)
{cl_int ciErrNum = CL_SUCCESS;ciErrNum = clGetContextInfo(cxContext, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*2, &cdDeviceID, NULL);commandQueue = clCreateCommandQueue(cxContext, cdDeviceID[0], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);checkError(ciErrNum, CL_SUCCESS, "clCreateCommandQueue");
}void
compileProgram(const char* const kernel_file)
{size_t program_length;FILE* pFileStream = NULL;cl_int ciErrNum;#ifdef _WIN32
#ifdef UNDER_CEwchar_t moduleName[MAX_PATH];char path[MAX_PATH], * p;GetModuleFileName(NULL, moduleName, MAX_PATH);wcstombs(path, moduleName, MAX_PATH);p = strrchr(path, '\\');strcpy(p + 1, kernel_file);pFileStream = fopen(path, "rb");if (pFileStream == NULL){checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");}
#elseif(fopen_s(&pFileStream, kernel_file, "rb") != 0){checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");}
#endif
#elsepFileStream = fopen(kernel_file, "rb");if(pFileStream == 0){checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");}
#endif// get the length of the source codefseek(pFileStream, 0, SEEK_END);program_length = ftell(pFileStream);fseek(pFileStream, 0, SEEK_SET);// allocate a buffer for the source code string and read it inchar* source = (char *)malloc(program_length + 1);if (fread((source), program_length, 1, pFileStream) != 1){fclose(pFileStream);free(source);checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on read source");}fclose(pFileStream);source[program_length] = '\0';// Create the program for all GPUs in the contextcpProgram = clCreateProgramWithSource( cxContext, 1, (const char **) &source, &program_length, &ciErrNum);free(source);checkError(ciErrNum, CL_SUCCESS, "clCreateProgramWithSource");ciErrNum = clBuildProgram(cpProgram, 0, NULL, "", NULL, NULL);if (ciErrNum != CL_SUCCESS){char cBuildLog[10240];clGetProgramBuildInfo(cpProgram, cdDeviceID[0], CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL );printf("\nBuild Log : \n%s\n", cBuildLog);checkError(ciErrNum, CL_SUCCESS, "clBuildProgram");}
}void
createFFTKernel(const char* const kernelName,int kk)
{cl_int ciErrNum = CL_SUCCESS;kernels[kk] = clCreateKernel(cpProgram, kernelName, &ciErrNum);checkError(ciErrNum, CL_SUCCESS, "clCreateKernel");
}cl_mem
createDeviceBuffer(const cl_mem_flags flags,const size_t size,void* const hostPtr)
{cl_int ciErrNum = CL_SUCCESS;const cl_mem d_mem = clCreateBuffer(cxContext, flags | CL_MEM_COPY_HOST_PTR, size, hostPtr, &ciErrNum);checkError(ciErrNum, CL_SUCCESS,  "clCreateBuffer");return d_mem;
}void
copyToDevice(const cl_mem mem,float* const hostPtr,const unsigned size)
{const cl_int ciErrNum = clEnqueueWriteBuffer(commandQueue, mem, CL_TRUE, 0, sizeof(float) * size, hostPtr, 0, NULL, NULL);checkError(ciErrNum, CL_SUCCESS,  "clEnqueueWriteBuffer");
}void
copyFromDevice(const cl_mem dMem,float* const hostPtr,const unsigned size)
{cl_int ciErrNum = clEnqueueReadBuffer(commandQueue, dMem, CL_TRUE, 0, sizeof(float) * size, hostPtr, 0, NULL, NULL);checkError(ciErrNum, CL_SUCCESS, "clEnqueueReadBuffer");
}void
runKernelFFT(const size_t localWorkSize[],const size_t globalWorkSize[],const int kk)
{const cl_int ciErrNum = clEnqueueNDRangeKernel(commandQueue, kernels[kk], 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &gpuExecution[kk]);checkError(ciErrNum, CL_SUCCESS, "clEnqueueNDRangeKernel");
}void Cl_finish(void)
{clFinish(commandQueue);
}

clutil.h


#ifndef __CLUTIL__
#define __CLUTIL__#include <CL/opencl.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#ifndef UNDER_CE
#include <sys/types.h>
#endifint initExecution(const unsigned len);
void checkError(const cl_int ciErrNum, const cl_int ref, const char* const operation);
void printResult(const unsigned size);
void init_cl_context(cl_device_type device_type);
cl_uint getDeviceCount();
cl_uint getNumComputeUnits();
void createCommandQueue();
void compileProgram(const char* const kernel_file);
void createFFTKernel(const char* const kernelName, int kk);
cl_mem createDeviceBuffer(const cl_mem_flags flags, const size_t size, void* const  hostPtr);
void runKernelFFT(const size_t localWorkSize[], const size_t globalWorkSize[], const int kk);
void copyToDevice(const cl_mem mem, float* const hostPtr, const unsigned size);
void copyFromDevice(const cl_mem dMem, float* const hostPtr, const unsigned size);
double executionTime(const cl_event event);
void allocateHostMemory(const unsigned len);
void allocateDeviceMemory(const unsigned size, const unsigned copyOffset);
void printGpuTime(const unsigned int kernelCount);
void cleanup();
int runFFT(const unsigned len);
void Cl_finish(void);// Support 2^16 = 65536 point FFT
#define FFT_MAX_LOG2N   20
#define FFT_MAX         (1 << FFT_MAX_LOG2N)extern unsigned blockSize;
extern unsigned print;extern float*  h_Freal;
extern float*  h_Fimag;
extern float*  h_Rreal;
extern float*  h_Rimag;
extern float*  h_intime; // time-domain input samples
extern float*  h_outfft; // freq-domain output samplesextern cl_mem d_Freal;
extern cl_mem d_Fimag;
extern cl_mem d_Rreal;
extern cl_mem d_Rimag;
extern cl_mem d_intime; // time-domain input samples
extern cl_mem d_outfft; // freq-domain output samplesextern cl_context cxContext;
extern cl_program cpProgram;
extern cl_kernel kernels[FFT_MAX_LOG2N];
extern cl_event gpuExecution[FFT_MAX_LOG2N];
extern cl_command_queue commandQueue;#endif

fft.cl

#define M_PI            3.14159265358979f
#define MUL_RE(a, b)    (a.even*b.even - a.odd*b.odd)
#define MUL_IM(a, b)    (a.even*b.odd + a.odd*b.even)typedef float2 real2_t;
typedef float real_t;// complex multiply
real2_t mul(real2_t a,real2_t b)
{return (real2_t) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); // no mad
}// twiddle_P_Q(A) returns A * EXP(-P*PI*i/Q)
real2_t
twiddle_1_2(real2_t a)
{// A * (-i)return (real2_t) (a.y, -a.x);
}// Return A * exp(K*ALPHA*i)
real2_t
twiddle(real2_t a,int k,real_t alpha)
{real_t cs, sn;//sn = sincos((real_t)k*alpha, &cs);cs = native_cos((real_t) k * alpha);sn = native_sin((real_t) k * alpha);return mul(a, (real2_t) (cs, sn));
}// Return A * exp(KALPHA*i)
real2_t
twiddle_kalpha(real2_t a,real_t kalpha)
{real_t cs, sn;//sn = sincos((real_t) alpha, &cs);cs = native_cos((real_t) kalpha);sn = native_sin((real_t) kalpha);return mul(a, (real2_t) (cs, sn));
}// In-place DFT-2, output is (a, b).  Arguments must be variables.
#define DFT2(a, b)  { real2_t tmp = a - b; a += b; b = tmp; }// Compute T x DFT-2.
// T is the number of threads.
// N = 2*T is the size of input vectors.
// X[N], Y[N]
// P is the length of input sub-sequences: 1,2,4,...,T.
// Each DFT-2 has input (X[I],X[I+T]), I=0..T-1,
// and output Y[J], Y|J+P], J = I with one 0 bit inserted at postion P. */
__kernel void
fft_radix2(__global const real2_t * x,__global real2_t * y,int p,int pminus1,real_t minusPIoverp)
{int t = get_global_size(0); // thread countint i = get_global_id(0);   // thread indexint k = i&pminus1;          // index in input sequence, in 0..P-1int j = ((i-k)<<1) + k;     // output indexreal_t alpha = minusPIoverp * (real_t) k;  // -M_PI*(real_t)k/(real_t)p;// Read and twiddle inputx += i;real2_t u0 = x[0];//real2_t u1 = twiddle(x[t], 1, alpha);real_t cs,sn;//sn = sincos(alpha, &cs);cs = native_cos(alpha);sn = native_sin(alpha);real2_t u1 = mul(x[t], (real2_t) (cs, sn));// In-place DFT-2DFT2(u0,u1);// Write outputy += j;y[0] = u0;y[p] = u1;
}// In-place DFT-4, output is (a, c, b, d). Arguments must be variables.
#define DFT4(a, b, c, d) { DFT2(a, c); DFT2(b, d); d=twiddle_1_2(d); DFT2(a, b); DFT2(c, d); }// Compute T x DFT-4.
// T is the number of threads.
// N = 4*T is the size of input vectors.
// X[N], Y[N]
// P is the length of input sub-sequences: 1,4,16,...,T.
// Each DFT-4 has input (X[I],X[I+T],X[I+2*T],X[I+3*T]), I=0..T-1,
// and output (Y[J],Y|J+P],Y[J+2*P],Y[J+3*P], J = I with two 0 bits inserted at postion P.
__kernel void
fft_radix4(__global const float2 * x,__global float2 * y,int p,int pminus1,int twop,int threep,real_t minusPIover2p,real_t minusPIover2p_2x,real_t minusPIover2p_3x)
{int t = get_global_size(0); // thread countint i = get_global_id(0);   // thread indexint k = i&pminus1;          //(p-1); // index in input sequence, in 0..P-1int j = ((i - k) << 2) + k; // output indexreal_t alpha   = minusPIover2p    * (real_t) k; //-M_PI*(real_t)k/(real_t)(2*p);real_t alpha2x = minusPIover2p_2x * (real_t) k;real_t alpha3x = minusPIover2p_3x * (real_t) k;// Read and twiddle inputx += i;real2_t u0 = x[0];real2_t u1 = twiddle_kalpha(x[t],   alpha);     //twiddle(x[t],   1, alpha);real2_t u2 = twiddle_kalpha(x[2*t], alpha2x);   //twiddle(x[2*t], 2, alpha);real2_t u3 = twiddle_kalpha(x[3*t], alpha3x);   //twiddle(x[3*t], 3, alpha);// In-place DFT-4DFT4(u0, u1, u2, u3);// Shuffle and write outputy        += j;y[0]      = u0;y[p]      = u2;y[twop]   = u1;y[threep] = u3;
}

将该部分代码拷贝至~/imx8mqevk-rootfs/code/fft目录下
执行编译命令

$CXX -I../../usr/include -L../../usr/lib -lOpenCL -o fft main.cpp fft.cpp clutil.cpp
# 注意：  这里的$CXX的作用就是交叉编译工具， 他在脚本中设置为环境变量了。

结果如下

IMX8M系列 OpenCL FFT 示例编译及其他demo测试（MYD-JX8MX）相关推荐

ubuntu编译ffmpeg并且demo测试
一.Ubuntu下编译ffmpeg源码指令下载ffmpeg,解压(跳过),需要安装一些基本的依赖库,如x264等: 切换到ffmpeg源码的目录,config指令 $ ./configure --e ...
IMX8M系列 yocto编译镜像及demo编译（MYD-JX8MX）
IMX8M系列 yocto编译镜像及demo编译(MYD-JX8MX) 前段时间由于工作需要,研究了一下米尔的MYD-JX8MX开发板,用的是NXP 的IMX8M型号芯片,说实话,官方提供的文档描述的 ...
OpenCL，OpenGL编译
OpenCL,OpenGL编译 TVM已经支持多个硬件后端:CPU,GPU,移动设备等-添加了另一个后端:OpenGL / WebGL. OpenGL / WebGL能够在没有安装CUDA的环境中利用 ...
opencl fft实例整理
ocl版的: https://github.com/betaupsx86/FFT_OCL python版的: A Python wrapper for the OpenCL FFT library c ...
CUDA并行算法系列之FFT快速卷积
CUDA并行算法系列之FFT快速卷积卷积定义在维基百科上,卷积定义为: 离散卷积定义为: [ 0, 1, 2, 3]和[0, 1, 2]的卷积例子如下图所示: Python实现(直接卷积) 根据离 ...
libev学习系列之三：libev编译安装
libev学习系列之三:libev编译安装版本说明版本作者日期备注 0.1 ZY 2019.5.31 初稿目录文章目录 libev学习系列之三:libev编译安装版本说明目录源码结 ...
【已解决】海康威视MFC综合示例(C++ 官网Demo)采用VS2019编译异常如何解决？
采用VS2019编译运行海康威视MFC综合示例Demo 一.文章背景: 二.操作步骤: 1.海康威视设备网络SDK下载: 2.VS2019 MFC开发环境配置: 3.MFC综合示例编译运行: 三.小结 ...
ESP32-WROOM-32D模组上传Arduino IDE示例编译的固件后总是重启，该如何解决
手头的ESP32-WROOM-32D模组搭载的是单核CPU,因此Arduino IDE需要配置成单核编译模式.用ESP32的示例编译并上传固件后不断重启: 例如GetChipID,源代码: uint3 ...
laya3d系列——如何用vscode编译laya项目
laya3d系列--如何用vscode编译laya项目 ---------------------------------------------转载请说明出处,抄袭必举报查封------------ ...

IMX8M系列 OpenCL FFT 示例编译及其他demo测试（MYD-JX8MX）

IMX8M系列 OpenCL FFT 示例编译测试及其他demo测试（MYD-JX8MX）

导出文件系统

编译FFT

IMX8M系列 OpenCL FFT 示例编译及其他demo测试（MYD-JX8MX）相关推荐

最新文章

热门文章