cuda编程python接口_CUDA共享内存问题(以及将CUDA与python / ctypes一...

(请注意,此答案中的代码还提供了有关如何在与使用python ctypes的python应用程序共享的库中使用CUDA代码(例如CUDA设备内核)的完整秘诀/示例.如果您希望使用CUDA库功能,答案here提供了一个使用python ctypes的示例.)

这里的问题是内核正在写越界,并且显然编译器/运行时将分配定位在设备内存中足够近的位置,这超出了第一个分配的界限,导致代码写入了第二个分配：

cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map

cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));

越界访问即将到来是因为内核启动涉及的线程数量过多(在这种情况下,它正在启动1024个线程),而我们实际上仅“需要” SIZE_X * SIZE_Y线程(在此示例中为16)：

#define blockSize 1024

...

int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;

...

update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);

当然,这在CUDA编程中是“典型的”,以启动足够多的线程,但是在执行此操作时,在内核中包括“线程检查”,以防止任何“多余的”线程使任何非法的,过时的线程变得很重要.边界访问.在这种情况下,可能的内核线程检查可能是这样的：

if ((row >= SIZE_Y) || (col >= SIZE_X)) return;

这是一个基于提供的代码的完整示例(尽管在Linux上,并且在python代码中删除了Blender依赖项),显示了之前和之后的效果.请注意,我们甚至可以使用cuda-memcheck运行这样的代码,在这种情况下,该代码将指出越界访问(为清晰起见,在下面的第一个示例中省略了该访问)：

$cat t383.cu

extern "C"

void init(float *t_height_map,

float *w_height_map,

float *s_height_map,

int SIZE_X,

int SIZE_Y);

extern "C"

void run_hydro_erosion(int cycles,

float t_step,

float min_tilt_angle,

float SEDIMENT_CAP,

float DISSOLVE_CONST,

float DEPOSIT_CONST,

int SIZE_X,

int SIZE_Y,

float PIPE_LENGTH,

float ADJACENT_LENGTH,

float TIME_STEP,

float MIN_TILT_ANGLE);

extern "C"

void free_mem();

extern "C"

void procedural_rain(float *water_height_map, float *rain_map, int SIZE_X, int SIZE_Y);

// includes, system

#include

// includes CUDA

#include

using namespace std;

#define FLOW_RIGHT 0

#define FLOW_UP 1

#define FLOW_LEFT 2

#define FLOW_DOWN 3

#define X_VEL 0

#define Y_VEL 1

#define LEFT_CELL row, col - 1

#define RIGHT_CELL row, col + 1

#define ABOVE_CELL row - 1, col

#define BELOW_CELL row + 1, col

// CUDA API error checking macro

#define T 1024

#define M 1536

#define blockSize 1024

#define cudaCheck(error) \n if (error != cudaSuccess) { \n printf("Fatal error: %s at %s:%d

", \n cudaGetErrorString(error), \n __FILE__, __LINE__); \n exit(1); \n }

__global__ void update_water_flow(float *water_height_map, float *water_flow_map, float *d_updated_water_flow_map, int SIZE_X, int SIZE_Y)

{

int index = blockIdx.x * blockDim.x + threadIdx.x;

int col = index % SIZE_X;

int row = index / SIZE_X;

index = row * (SIZE_X * 4) + col * 4; // 3D index

#ifdef FIX

if ((row >= SIZE_Y) || (col >= SIZE_X)) return;

#endif

d_updated_water_flow_map[index + FLOW_RIGHT] = 0;

d_updated_water_flow_map[index + FLOW_UP] = 0;

d_updated_water_flow_map[index + FLOW_LEFT] = 0;

d_updated_water_flow_map[index + FLOW_DOWN] = 0;

}

static float *terrain_height_map;

static float *water_height_map;

static float *sediment_height_map;

void init(float *t_height_map,

float *w_height_map,

float *s_height_map,

int SIZE_X,

int SIZE_Y)

{

/* set vars HOST*/

terrain_height_map = t_height_map;

water_height_map = w_height_map;

sediment_height_map = s_height_map;

}

void run_hydro_erosion(int cycles,

float t_step,

float min_tilt_angle,

float SEDIMENT_CAP,

float DISSOLVE_CONST,

float DEPOSIT_CONST,

int SIZE_X,

int SIZE_Y,

float PIPE_LENGTH,

float ADJACENT_LENGTH,

float TIME_STEP,

float MIN_TILT_ANGLE)

{

int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;

int SIZE = SIZE_X * SIZE_Y * sizeof(float);

float *d_terrain_height_map, *d_updated_terrain_height_map;

float *d_water_height_map, *d_updated_water_height_map;

float *d_sediment_height_map, *d_updated_sediment_height_map;

float *d_suspended_sediment_level;

float *d_updated_suspended_sediment_level;

float *d_water_flow_map;

float *d_updated_water_flow_map;

float *d_prev_water_height_map;

float *d_water_velocity_vec;

float *d_rain_map;

cudaCheck(cudaMalloc(&d_water_height_map, SIZE));

cudaCheck(cudaMalloc(&d_updated_water_height_map, SIZE));

cudaCheck(cudaMalloc(&d_prev_water_height_map, SIZE));

cudaCheck(cudaMalloc(&d_water_flow_map, SIZE * 4));

cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map

cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));

cudaCheck(cudaMalloc(&d_updated_terrain_height_map, SIZE));

cudaCheck(cudaMalloc(&d_sediment_height_map, SIZE));

cudaCheck(cudaMalloc(&d_updated_sediment_height_map, SIZE));

cudaCheck(cudaMalloc(&d_suspended_sediment_level, SIZE));

cudaCheck(cudaMalloc(&d_updated_suspended_sediment_level, SIZE));

cudaCheck(cudaMalloc(&d_rain_map, SIZE));

cudaCheck(cudaMalloc(&d_water_velocity_vec, SIZE * 2));

cudaCheck(cudaMemcpy(d_terrain_height_map, terrain_height_map, SIZE, cudaMemcpyHostToDevice));

cudaCheck(cudaMemcpy(d_water_height_map, water_height_map, SIZE, cudaMemcpyHostToDevice));

cudaCheck(cudaMemcpy(d_sediment_height_map, sediment_height_map, SIZE, cudaMemcpyHostToDevice));

cout << "init terrain_height_map" << endl;

for (int i = 0; i < SIZE_X * SIZE_Y; i++) {

cout << terrain_height_map[i] << ", ";

if (i % SIZE_X == 0 && i != 0) cout << endl;

}

/* launch the kernel on the GPU */

float *temp;

while (cycles--) {

update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);

temp = d_water_flow_map;

d_water_flow_map = d_updated_water_flow_map;

d_updated_water_flow_map = temp;

}

cudaCheck(cudaMemcpy(terrain_height_map, d_terrain_height_map, SIZE, cudaMemcpyDeviceToHost));

cout << "updated terrain" << endl;

for (int i = 0; i < SIZE_X * SIZE_Y; i++) {

cout << terrain_height_map[i] << ", ";

if (i % SIZE_X == 0 && i != 0) cout << endl;

}

$cat t383.py

import numpy

import ctypes

import random

width = 4

height = 4

size_x = width

size_y = height

N = size_x * size_y

scrpt_cycles = 1

kernel_cycles = 1

time_step = 0.005

pipe_length = 1.0

adjacent_length = 1.0

min_tilt_angle = 10

sediment_cap = 0.01

dissolve_const = 0.01

deposit_const = 0.01

# initialize arrays

ter_height_map = numpy.ones((N), dtype=numpy.float32)

water_height_map = numpy.zeros((N), dtype=numpy.float32)

sed_height_map = numpy.zeros((N), dtype=numpy.float32)

rain_map = numpy.ones((N), dtype=numpy.float32)

# load terrain height from image

for i in range(0, len(ter_height_map)):

ter_height_map[i] = 1

# import DLL

E = ctypes.cdll.LoadLibrary("./t383.so")

# initialize device memory

E.init( ctypes.c_void_p(ter_height_map.ctypes.data),

ctypes.c_void_p(water_height_map.ctypes.data),

ctypes.c_void_p(sed_height_map.ctypes.data),

ctypes.c_int(size_x),

ctypes.c_int(size_y))

# run erosion

while(scrpt_cycles):

scrpt_cycles = scrpt_cycles - 1

E.run_hydro_erosion(ctypes.c_int(kernel_cycles),

ctypes.c_float(time_step),

ctypes.c_float(min_tilt_angle),

ctypes.c_float(sediment_cap),

ctypes.c_float(dissolve_const),

ctypes.c_float(deposit_const),

ctypes.c_int(size_x),

ctypes.c_int(size_y),

ctypes.c_float(pipe_length),

ctypes.c_float(adjacent_length),

ctypes.c_float(time_step),

ctypes.c_float(min_tilt_angle))

$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu

$python t383.py

init terrain_height_map

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, updated terrain

0, 0, 0, 0, 0,

0, 0, 0, 0,

0, 0, 0,

$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu -DFIX

$cuda-memcheck python t383.py

========= CUDA-MEMCHECK

init terrain_height_map

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, updated terrain

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1,

========= ERROR SUMMARY: 0 errors

如果我们编译前一个没有修复的示例,但是使用cuda-memcheck运行它,我们将获得指示越界访问的输出：

$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu

$cuda-memcheck python t383.py

========= CUDA-MEMCHECK

init terrain_height_map

1, 1, 1, 1, 1,

1, 1, 1, 1,

========= Invalid __global__ write of size 4

========= at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)

========= by thread (31,0,0) in block (0,0,0)

========= Address 0x1050d6009f0 is out of bounds

========= Saved host backtrace up to driver entry point at kernel launch time

========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]

========= Host Frame:./t383.so [0x1c291]

========= Host Frame:./t383.so [0x39e33]

========= Host Frame:./t383.so [0x6879]

========= Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]

========= Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]

========= Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]

========= Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]

========= Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]

========= Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]

========= Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]

========= Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]

========= Host Frame:python [0x167d14]

========= Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]

========= Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]

========= Host Frame:python (Py_Main + 0xb5e) [0x66d92]

========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]

========= Host Frame:python [0x177c2e]

=========

========= Invalid __global__ write of size 4

========= at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)

========= by thread (30,0,0) in block (0,0,0)

========= Address 0x1050d6009e0 is out of bounds