(请注意,此答案中的代码同时给出了一个完整的方法/示例:如何在共享库中使用CUDA代码(例如CUDA设备内核),并通过python ctypes在python应用程序中调用该库.如果您希望使用CUDA库功能,答案here提供了一个使用python ctypes的示例.)

这里的问题是内核的写入越界了.显然,编译器/运行时把这两个分配放在了设备内存中足够靠近的位置,以至于超出第一个分配边界的写入落到了第二个分配中:

cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map

cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));

之所以发生越界访问,是因为内核启动时使用的线程数多于所需数量(在这种情况下启动了1024个线程),而我们实际上只“需要” SIZE_X * SIZE_Y 个线程(在此示例中为16个):

#define blockSize 1024

...

int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;

...

update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);

当然,在CUDA编程中,启动恰好够用或多于所需数量的线程是很“典型”的做法;但这样做时,必须在内核中加入“线程检查”,以防止任何“多余的”线程进行非法的越界访问.在这种情况下,一种可行的内核线程检查如下:

if ((row >= SIZE_Y) || (col >= SIZE_X)) return;

这是一个基于所提供代码的完整示例(在Linux上运行,并且删除了python代码中的Blender依赖项),展示了修复之前和之后的效果.请注意,我们也可以用cuda-memcheck运行这样的代码,此时它会报告越界访问(为清晰起见,下面第一次运行时没有使用cuda-memcheck):

$cat t383.cu

// Prototypes for the entry points of this shared library. Each one is declared
// extern "C" so it has C linkage (an unmangled symbol name); the accompanying
// Python script calls init and run_hydro_erosion by those names through ctypes.

// Stores the three host array pointers for later use by run_hydro_erosion.
// SIZE_X and SIZE_Y are accepted but not used by the definition in this listing.
extern "C"

void init(float *t_height_map,

float *w_height_map,

float *s_height_map,

int SIZE_X,

int SIZE_Y);

// Copies the host arrays registered by init to the device, launches the
// update_water_flow kernel `cycles` times, then copies the terrain height map
// back to the host and prints it. In the definition shown in this listing only
// cycles, SIZE_X and SIZE_Y are read; the remaining parameters are unused.
extern "C"

void run_hydro_erosion(int cycles,

float t_step,

float min_tilt_angle,

float SEDIMENT_CAP,

float DISSOLVE_CONST,

float DEPOSIT_CONST,

int SIZE_X,

int SIZE_Y,

float PIPE_LENGTH,

float ADJACENT_LENGTH,

float TIME_STEP,

float MIN_TILT_ANGLE);

// NOTE(review): declared only; no definition of free_mem appears in the
// portion of the listing shown here.
extern "C"

void free_mem();

// NOTE(review): declared only; no definition of procedural_rain appears in the
// portion of the listing shown here.
extern "C"

void procedural_rain(float *water_height_map, float *rain_map, int SIZE_X, int SIZE_Y);

// includes, system

#include

#include

#include

#include

#include

#include

#include

#include

// includes CUDA

#include

using namespace std;

#define FLOW_RIGHT 0

#define FLOW_UP 1

#define FLOW_LEFT 2

#define FLOW_DOWN 3

#define X_VEL 0

#define Y_VEL 1

#define LEFT_CELL row, col - 1

#define RIGHT_CELL row, col + 1

#define ABOVE_CELL row - 1, col

#define BELOW_CELL row + 1, col

// CUDA API error checking macro

#define T 1024

#define M 1536

#define blockSize 1024

// CUDA API error checking macro.
//
// Evaluates `error` (normally a CUDA runtime call) exactly once. If the result
// is not cudaSuccess, prints the error string together with the source file
// and line of the call site, then terminates the process with exit(1).
//
// The do { ... } while (0) wrapper makes the macro behave as a single
// statement, so `cudaCheck(x);` is safe inside an unbraced if/else. Capturing
// the status in a local avoids invoking the CUDA call a second time when the
// message is formatted.
#define cudaCheck(error)                                          \
    do {                                                          \
        cudaError_t cudaCheckStatus_ = (error);                   \
        if (cudaCheckStatus_ != cudaSuccess) {                    \
            printf("Fatal error: %s at %s:%d\n",                  \
                   cudaGetErrorString(cudaCheckStatus_),          \
                   __FILE__, __LINE__);                           \
            exit(1);                                              \
        }                                                         \
    } while (0)

// Kernel: clears the four flow components (FLOW_RIGHT, FLOW_UP, FLOW_LEFT,
// FLOW_DOWN) of one grid cell in d_updated_water_flow_map.
//
// Launch layout: 1D grid of 1D blocks; each thread derives one (row, col) cell
// from its flat global index. The output buffer is indexed as
// [row][col][direction] with 4 floats per cell.
//
// water_height_map and water_flow_map are accepted but not read here.
//
// The bounds guard is compiled in only when FIX is defined. Without it, any
// thread whose flat index lies beyond the SIZE_X * SIZE_Y cells writes past
// the end of the buffer -- this is the defect the surrounding example is
// built to demonstrate, so the conditional compilation is kept as-is.
__global__ void update_water_flow(float *water_height_map, float *water_flow_map, float *d_updated_water_flow_map, int SIZE_X, int SIZE_Y)
{
    const int cell = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = cell % SIZE_X;
    const int row = cell / SIZE_X;

    // Offset of this cell's first flow component (4 floats per cell).
    const int base = row * (SIZE_X * 4) + col * 4;

#ifdef FIX
    // Surplus threads (launched only to round up to a full block) do nothing.
    if (!((row < SIZE_Y) && (col < SIZE_X))) return;
#endif

    float *cell_flow = d_updated_water_flow_map + base;
    cell_flow[FLOW_RIGHT] = 0;
    cell_flow[FLOW_UP] = 0;
    cell_flow[FLOW_LEFT] = 0;
    cell_flow[FLOW_DOWN] = 0;
}

static float *terrain_height_map;

static float *water_height_map;

static float *sediment_height_map;

// Registers the caller's host arrays for later use by run_hydro_erosion.
//
// Only the pointers are stored in the file-level statics; no data is copied,
// so the caller's buffers must still be valid when run_hydro_erosion is called.
// SIZE_X and SIZE_Y are part of the exported signature but are not used here.
void init(float *t_height_map,
          float *w_height_map,
          float *s_height_map,
          int SIZE_X,
          int SIZE_Y)
{
    (void)SIZE_X;
    (void)SIZE_Y;

    /* remember the HOST-side buffers */
    sediment_height_map = s_height_map;
    water_height_map = w_height_map;
    terrain_height_map = t_height_map;
}

// Runs `cycles` iterations of the update_water_flow kernel on a
// SIZE_X x SIZE_Y grid and prints the terrain height map before and after.
//
// Steps:
//   1. allocate the device buffers and upload the terrain, water and sediment
//      host arrays that were registered through init();
//   2. print the host terrain map;
//   3. launch update_water_flow once per cycle, swapping the current/updated
//      flow buffers after every launch;
//   4. copy the device terrain map back into the host array and print it;
//   5. release every device buffer.
//
// Parameters:
//   cycles          number of kernel launches; values <= 0 launch nothing.
//   SIZE_X, SIZE_Y  grid dimensions in cells; the call returns immediately
//                   if either is not positive.
//   t_step, min_tilt_angle, SEDIMENT_CAP, DISSOLVE_CONST, DEPOSIT_CONST,
//   PIPE_LENGTH, ADJACENT_LENGTH, TIME_STEP, MIN_TILT_ANGLE
//                   accepted for interface compatibility; not used here.
//
// Any failing CUDA call terminates the process through cudaCheck.
void run_hydro_erosion(int cycles,
                       float t_step,
                       float min_tilt_angle,
                       float SEDIMENT_CAP,
                       float DISSOLVE_CONST,
                       float DEPOSIT_CONST,
                       int SIZE_X,
                       int SIZE_Y,
                       float PIPE_LENGTH,
                       float ADJACENT_LENGTH,
                       float TIME_STEP,
                       float MIN_TILT_ANGLE)
{
    // An empty or negative grid would yield a zero-block launch and
    // meaningless allocation sizes, so there is nothing useful to do.
    if (SIZE_X <= 0 || SIZE_Y <= 0) return;

    // Ceil-divide so every cell gets a thread; the last block may contain
    // surplus threads that the kernel has to guard against.
    int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;

    // Size in BYTES of one float-per-cell map. Held in size_t so that large
    // grids do not overflow an int.
    const size_t SIZE = static_cast<size_t>(SIZE_X) * SIZE_Y * sizeof(float);

    float *d_terrain_height_map, *d_updated_terrain_height_map;
    float *d_water_height_map, *d_updated_water_height_map;
    float *d_sediment_height_map, *d_updated_sediment_height_map;
    float *d_suspended_sediment_level;
    float *d_updated_suspended_sediment_level;
    float *d_water_flow_map;
    float *d_updated_water_flow_map;
    float *d_prev_water_height_map;
    float *d_water_velocity_vec;
    float *d_rain_map;

    // NOTE: the allocation order is kept exactly as in the original listing;
    // the out-of-bounds demonstration relies on d_terrain_height_map being
    // allocated right after d_updated_water_flow_map.
    cudaCheck(cudaMalloc(&d_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_prev_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_flow_map, SIZE * 4));
    cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
    cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_updated_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_rain_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_velocity_vec, SIZE * 2));

    cudaCheck(cudaMemcpy(d_terrain_height_map, terrain_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_water_height_map, water_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_sediment_height_map, sediment_height_map, SIZE, cudaMemcpyHostToDevice));

    // The line break is emitted after the first value of each subsequent row
    // (i % SIZE_X == 0, i != 0). This layout is kept unchanged so the output
    // matches the sample runs that accompany this listing.
    cout << "init terrain_height_map" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }

    /* launch the kernel on the GPU */
    // A counted loop, so a negative `cycles` performs no launches instead of
    // decrementing without bound.
    for (int cycle = 0; cycle < cycles; ++cycle) {
        update_water_flow<<<numBlocks, blockSize>>>(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);

        // Kernel launches return no status; fetch it explicitly. The status is
        // stored first because cudaGetLastError() clears the error it returns.
        cudaError_t launchStatus = cudaGetLastError();
        cudaCheck(launchStatus);

        // Ping-pong: the buffer just written becomes the current flow map.
        float *temp = d_water_flow_map;
        d_water_flow_map = d_updated_water_flow_map;
        d_updated_water_flow_map = temp;
    }

    // Blocking copy on the default stream: it waits for the kernels above, and
    // any asynchronous execution error is reported here through cudaCheck.
    cudaCheck(cudaMemcpy(terrain_height_map, d_terrain_height_map, SIZE, cudaMemcpyDeviceToHost));

    cout << "updated terrain" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }

    // Release every device buffer so repeated calls do not leak device memory.
    float *deviceBuffers[] = {
        d_water_height_map,
        d_updated_water_height_map,
        d_prev_water_height_map,
        d_water_flow_map,
        d_updated_water_flow_map,
        d_terrain_height_map,
        d_updated_terrain_height_map,
        d_sediment_height_map,
        d_updated_sediment_height_map,
        d_suspended_sediment_level,
        d_updated_suspended_sediment_level,
        d_rain_map,
        d_water_velocity_vec
    };
    const int bufferCount = sizeof(deviceBuffers) / sizeof(deviceBuffers[0]);
    for (int b = 0; b < bufferCount; ++b) {
        cudaError_t freeStatus = cudaFree(deviceBuffers[b]);
        cudaCheck(freeStatus);
    }
}

$cat t383.py

import numpy

import ctypes

import random

# Grid dimensions in cells; every map below is a flat array of size_x * size_y.
width = 4
height = 4
size_x, size_y = width, height
N = size_x * size_y

# Number of calls into the library, and kernel launches per call.
scrpt_cycles = 1
kernel_cycles = 1

# Simulation parameters forwarded to run_hydro_erosion.
time_step = 0.005
pipe_length = 1.0
adjacent_length = 1.0
min_tilt_angle = 10
sediment_cap = 0.01
dissolve_const = 0.01
deposit_const = 0.01


def _host_ptr(array):
    """Return the address of a numpy array's data buffer as a ctypes void pointer."""
    return ctypes.c_void_p(array.ctypes.data)


# initialize arrays (float32 to match the `float *` parameters of the library)
ter_height_map = numpy.ones((N), dtype=numpy.float32)
water_height_map = numpy.zeros((N), dtype=numpy.float32)
sed_height_map = numpy.zeros((N), dtype=numpy.float32)
rain_map = numpy.ones((N), dtype=numpy.float32)

# load terrain height from image (stand-in: every cell is simply set to 1)
ter_height_map[:] = 1

# import DLL
E = ctypes.cdll.LoadLibrary("./t383.so")

# initialize device memory: hand the host buffers and grid size to the library
E.init(_host_ptr(ter_height_map),
       _host_ptr(water_height_map),
       _host_ptr(sed_height_map),
       ctypes.c_int(size_x),
       ctypes.c_int(size_y))

# run erosion
# Arguments are wrapped explicitly because no argtypes are declared for the
# function; they are listed in the order of the C prototype.
erosion_args = (
    ctypes.c_int(kernel_cycles),
    ctypes.c_float(time_step),
    ctypes.c_float(min_tilt_angle),
    ctypes.c_float(sediment_cap),
    ctypes.c_float(dissolve_const),
    ctypes.c_float(deposit_const),
    ctypes.c_int(size_x),
    ctypes.c_int(size_y),
    ctypes.c_float(pipe_length),
    ctypes.c_float(adjacent_length),
    ctypes.c_float(time_step),
    ctypes.c_float(min_tilt_angle),
)
while scrpt_cycles:
    scrpt_cycles -= 1
    E.run_hydro_erosion(*erosion_args)

$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu

$python t383.py

init terrain_height_map

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, updated terrain

0, 0, 0, 0, 0,

0, 0, 0, 0,

0, 0, 0, 0,

0, 0, 0,

$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu -DFIX

$cuda-memcheck python t383.py

========= CUDA-MEMCHECK

init terrain_height_map

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, updated terrain

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1,

========= ERROR SUMMARY: 0 errors

$

如果我们编译前面未加修复的版本(不带-DFIX),并用cuda-memcheck运行它,就会得到指示越界访问的输出:

$nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu

$cuda-memcheck python t383.py

========= CUDA-MEMCHECK

init terrain_height_map

1, 1, 1, 1, 1,

1, 1, 1, 1,

1, 1, 1, 1,

========= Invalid __global__ write of size 4

========= at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)

========= by thread (31,0,0) in block (0,0,0)

========= Address 0x1050d6009f0 is out of bounds

========= Saved host backtrace up to driver entry point at kernel launch time

========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]

========= Host Frame:./t383.so [0x1c291]

========= Host Frame:./t383.so [0x39e33]

========= Host Frame:./t383.so [0x6879]

========= Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]

========= Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]

========= Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]

========= Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]

========= Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]

========= Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]

========= Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]

========= Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]

========= Host Frame:python [0x167d14]

========= Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]

========= Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]

========= Host Frame:python (Py_Main + 0xb5e) [0x66d92]

========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]

========= Host Frame:python [0x177c2e]

=========

========= Invalid __global__ write of size 4

========= at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)

========= by thread (30,0,0) in block (0,0,0)

========= Address 0x1050d6009e0 is out of bounds

========= Saved host backtrace up to driver entry point at kernel launch time

========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]

========= Host Frame:./t383.so [0x1c291]

========= Host Frame:./t383.so [0x39e33]

========= Host Frame:./t383.so [0x6879]

========= Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]

========= Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]

========= Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]

========= Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]

========= Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]

========= Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]

========= Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]

========= Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]

========= Host Frame:python [0x167d14]

========= Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]

========= Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]

========= Host Frame:python (Py_Main + 0xb5e) [0x66d92]

========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]

========= Host Frame:python [0x177c2e]

=========

... (output truncated for brevity of presentation)

========= ERROR SUMMARY: 18 errors

$

cuda编程python接口_CUDA共享内存问题(以及将CUDA与python / ctypes一...相关推荐

  1. python共享内存mmap_python - IPC在单独的Docker容器中的Python脚本之间共享内存 - 堆栈内存溢出...

    问题 我已经编写了一个神经网络分类器,该分类器可以获取海量图像(每张图像约1-3 GB),将其打补丁,然后分别通过网络传递这些补丁. 培训的进行过程非常缓慢,因此我对其进行了基准测试,发现用大约50秒 ...

  2. python slice是共享内存吗_python共享内存实现进程通信

    1.概述 共享内存可以说是最有用的进程间通信方式.两个不同的进程共享内存的意思是:同一块物理内存被映射到两个进程的各自的进程地址空间.一个进程可以及时看到另一个进程对共享内存的更新,反之亦然.采用共享 ...

  3. python跨进程共享内存

    生成内存地址name # In the first Python interactive shell import numpy as np import time a = np.array([1, 1 ...

  4. cuda合并访问的要求_在 CUDA C / C ++ 中使用共享内存

    在 上一篇文章 中,我研究了如何将一组线程访问的全局内存合并到一个事务中,以及对齐和跨步如何影响 CUDA 各代硬件的合并.对于最新版本的 CUDA 硬件,未对齐的数据访问不是一个大问题.然而,不管 ...

  5. c++ fork 进程时 共享内存_尚学堂百战程序员:Python多进程与共享内存

    多进程使用 linux下可使用 fork 函数 #!/bin/env python import os print 'Process (%s) start...' % os.getpid() pid ...

  6. Python并行计算使用共享内存

    在使用并行计算的时候希望维护同一个变量,比如将高分辨率的全球数据(例如30m)重采样为0.25度的数据,全球(720,1440),原始数据是10×10度的单个文件(存到单一文件太大了),全球的话就是1 ...

  7. python多进程之间共享内存

    一.为什么要用到共享内存 进程之间交换数据我们可以通过建立本地RPC,但往往比较慢,因为要花时间去执行数据传递. 此时,如果有一个实时性要求比较高的跨进程功能,共享内存就是一个不错的选择. 1.什么是 ...

  8. python slice是共享内存吗_在共享内存中使用numpy数组进行多处理

    在共享内存中使用numpy数组进行多处理 我希望在共享内存中使用numpy数组,以便与多处理模块一起使用.困难之处在于它像一个numpy数组一样使用,而不仅仅是作为一个ctype数组使用.from m ...

  9. Linux学习之系统编程篇:shm 共享内存及其操作函数

    一.shm 和 mmap 的区别 (1)mmap 是在磁盘上建立一个文件,每个进程地址空间中开辟出一块空间进行映射.shm 每个进程最终会映射到同一块物理内存.shm 保存在物理内存,这样读写的速度最 ...

最新文章

  1. 合并代码还在用git merge吗?我们都用git rebase!
  2. Java多线程中的Runnable和Thread
  3. AI发现人类肾细胞有一半结构未知,UCSD最新研究登上Nature,算法已开源
  4. eclipse卸载插件小记
  5. 【文本信息抽取与结构化】目前NLP领域最有应用价值的子任务之一
  6. 开个定时器给echarts组件配置定时更新
  7. 利用pandas,matplotlib画饼图
  8. python解决数据不均衡,上采样方法解决
  9. android RecyclerView实战
  10. 【python】python读取命令行选项参数
  11. jvm 面试之参数实战
  12. 135编辑器html点击图片播放音乐,135微信编辑器怎样添加音乐 135编辑器添加音乐图文教程...
  13. 北斗导航 | 北斗/GNSS精密定位:从PPP-RTK 到 Vision-PPP(第十一届中国卫星导航年会报告)
  14. 使用python连接clickhouse并发送邮件
  15. 如果一觉醒来已是光年之远
  16. 出租屋管理员计算机测试,出租屋管理员的个人工作总结
  17. 关于实验室信息管理系统(LIMS)三大问题
  18. 算法第一章作业(c++代码规范+数学之美读后感+规划)
  19. 关于11月28日CSDN遭受网络攻击的情况说明
  20. 程序员可以选择哪些平台写技术博客?

热门文章

  1. 苹果手机配对手表总是显示服务器,苹果手机配对手表时一直显示正在验证ID~这是什么情况,在线等...
  2. 大疆网上测评题库_大疆校招笔试实录
  3. Spring Boot实现MyBatis分页查询
  4. FBEC大会 | 元境技术负责人郭旷野:云游戏新一轮浪潮下的规模化之路
  5. Python PIL.Image模块:图片变更尺寸大小(宽x高)
  6. 基于微信小程序在线电子书阅读、在线小说系统 系统的设计与实现 开题报告和效果图
  7. 卓威显示器能用html线吗,普通玩家到底用得到电竞显示器吗?卓威XL2540测评
  8. 如何制作RTS游戏的寻路系统?
  9. [HPU] LianLianKan [STLstack]
  10. 物联网概述(全网最全)