1. Algorithm Principle

The op takes N levels of score, bbox_pred and anchor plus the image_shape as input, decodes the anchors and box deltas (bbox_pred) into proposals, runs NMS on those proposals, and finally keeps the top num of them.

2. Execution Steps

  1. For each level, sort score, bbox_pred and anchor by score in descending order and keep the top nms_pre entries (usually 1000), giving N*nms_pre boxes in total.
  2. Decode the anchors and box deltas (bbox_pred) into proposals.
  3. Remove boxes with non-positive width or height, and add to each level's proposals an offset large enough that boxes from different levels can never overlap, which turns the multi-class NMS into a single-class NMS (see the sketch after this list).
  4. Concatenate the scores and proposals of all N levels and sort them by score in descending order.
  5. Run NMS.
  6. Keep the top num proposals and subtract the previously added offset from them.
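Step 3 is the part worth pausing on: all levels are concatenated and passed through a single NMS call, so the per-level offset must be large enough that boxes from different levels can never overlap, making one class-agnostic NMS behave like an independent NMS per level. A minimal Python sketch of this idea (it uses torchvision's nms for illustration; mmcv's batched_nms used in the source applies the same offset trick internally):

import torch
from torchvision.ops import nms

def nms_across_levels(proposals, scores, level_ids, iou_thr=0.7):
    # proposals: (N, 4), scores: (N,), level_ids: (N,) level index of each box
    # the offset must exceed the largest coordinate so that shifted boxes
    # from different levels can never overlap
    offset = proposals.max() + 1
    shifted = proposals + level_ids[:, None].to(proposals) * offset
    keep = nms(shifted, scores, iou_thr)   # one single-class NMS call
    return proposals[keep], scores[keep]   # return the un-shifted boxes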

3. Python Source Code Analysis

# path: mmdetection/mmdet/models/dense_heads/cascade_rpn_head.py:StageCascadeRPNHead::_get_bboxes_single
level_ids = []
mlvl_scores = []
mlvl_bbox_preds = []
mlvl_valid_anchors = []
for idx in range(len(cls_scores)):  # len(cls_scores) is the number of cascade levels N
    rpn_cls_score = cls_scores[idx]  # score of this level
    rpn_bbox_pred = bbox_preds[idx]  # bbox_pred of this level
    assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
    # per-level score shape is (num_anchors * num_classes, H, W),
    # bbox_pred shape is (num_anchors * 4, H, W)
    rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
    if self.use_sigmoid_cls:  # binary classification on the score with sigmoid
        rpn_cls_score = rpn_cls_score.reshape(-1)
        scores = rpn_cls_score.sigmoid()
    else:  # binary classification on the score with softmax
        rpn_cls_score = rpn_cls_score.reshape(-1, 2)
        # We set FG labels to [0, num_class-1] and BG label to
        # num_class in RPN head since mmdet v2.5, which is unified to
        # be consistent with other head since mmdet v2.0. In mmdet v2.0
        # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head.
        scores = rpn_cls_score.softmax(dim=1)[:, 0]
    rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
    anchors = mlvl_anchors[idx]
    if 0 < nms_pre < scores.shape[0]:
        # sort is faster than topk
        # _, topk_inds = scores.topk(cfg.nms_pre)
        ranked_scores, rank_inds = scores.sort(descending=True)
        topk_inds = rank_inds[:nms_pre]
        scores = ranked_scores[:nms_pre]
        rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
        anchors = anchors[topk_inds, :]
    mlvl_scores.append(scores)
    mlvl_bbox_preds.append(rpn_bbox_pred)
    mlvl_valid_anchors.append(anchors)
    level_ids.append(
        scores.new_full((scores.size(0), ), idx, dtype=torch.long))

scores = torch.cat(mlvl_scores)
anchors = torch.cat(mlvl_valid_anchors)
rpn_bbox_pred = torch.cat(mlvl_bbox_preds)
# decode anchors and box deltas (bbox_pred) into proposals
proposals = self.bbox_coder.decode(
    anchors, rpn_bbox_pred, max_shape=img_shape)
ids = torch.cat(level_ids)

if cfg.min_bbox_size >= 0:  # remove boxes whose width or height is too small
    w = proposals[:, 2] - proposals[:, 0]
    h = proposals[:, 3] - proposals[:, 1]
    valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
    if not valid_mask.all():
        proposals = proposals[valid_mask]
        scores = scores[valid_mask]
        ids = ids[valid_mask]
# NMS
if proposals.numel() > 0:
    dets, _ = batched_nms(proposals, scores, ids, cfg.nms)
else:
    return proposals.new_zeros(0, 5)
# keep the top max_per_img proposals
return dets[:cfg.max_per_img]
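
The self.bbox_coder.decode call above is a DeltaXYWHBBoxCoder whose core is the delta2bbox function (mmdet/core/bbox/coder/delta_xywh_bbox_coder.py), which the CPU and CUDA implementations below re-implement. A simplified sketch of the decoding math, assuming means = 0, stds = 1 and omitting the wh-ratio clipping of the real function:

import torch

def delta2bbox_sketch(anchors, deltas, max_shape=None):
    # anchor center and size
    px = (anchors[:, 0] + anchors[:, 2]) * 0.5
    py = (anchors[:, 1] + anchors[:, 3]) * 0.5
    pw = anchors[:, 2] - anchors[:, 0]
    ph = anchors[:, 3] - anchors[:, 1]
    dx, dy, dw, dh = deltas.unbind(dim=1)
    # shift the center and rescale the size
    gx = px + pw * dx
    gy = py + ph * dy
    gw = pw * dw.exp()
    gh = ph * dh.exp()
    # back to corner format, then clip to the image
    bboxes = torch.stack([gx - gw * 0.5, gy - gh * 0.5,
                          gx + gw * 0.5, gy + gh * 0.5], dim=-1)
    if max_shape is not None:
        bboxes[:, 0::2] = bboxes[:, 0::2].clamp(0, max_shape[1])
        bboxes[:, 1::2] = bboxes[:, 1::2].clamp(0, max_shape[0])
    return bboxes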

4. CPU Source Code Analysis

float* score_ptr = new float[level*nms_pre];  // level is the number of levels, nms_pre is the number of boxes kept per level
memset(score_ptr, 0, sizeof(float)*level*nms_pre);  // some levels have fewer than nms_pre boxes; zero out the scores of the padding boxes
float* score_sorted_ptr = new float[level*nms_pre];       // buffer for the sorted scores
float* bbox_pred = new float[level*nms_pre*4];            // bbox_pred deltas, 4 per box
float* anchor = new float[level*nms_pre*4];               // anchor coordinates, 4 per box
float* proposal_ptr = new float[level*nms_pre*4];         // decoded proposal coordinates, 4 per box
float* proposal_sorted_ptr = new float[level*nms_pre*4];  // buffer for the sorted proposal coordinates

// step1: merge and sort the N levels of score, bbox_pred and anchor
vector<thread> vec_thread;
for(int i = 0; i < level; i++){
    float* score = score_ptr + i * nms_pre;
    float* bbox = bbox_pred + i * nms_pre * 4;
    float* anch = anchor + i * nms_pre * 4;
    vec_thread.push_back(thread(merge_input, i, nms_pre, score, bbox, anch));
}
for(int i = 0; i < level; i++){
    vec_thread[i].join();
}
void merge_input(int i, int nms_pre, ...){
    const float* input_score = Input<Tensor>(i)->template Data<float>();
    const float* input_bbox = Input<Tensor>(i + level)->template Data<float>();
    const float* input_anchor = Input<Tensor>(i + level*2)->template Data<float>();
    // sort the scores and keep the top nms_pre of them
    vector<KeyValuePair> vec_node;
    vec_node.resize(Input<Tensor>(i).Size());  // sort all scores of this level
    vector<int> sorted_id = SortedIdx(input_score, vec_node, nms_pre);
    for(int j = 0; j < nms_pre; j++){
        score[j] = input_score[sorted_id[j]];
        bbox[j*4]   = input_bbox[sorted_id[j]*4];
        bbox[j*4+1] = input_bbox[sorted_id[j]*4+1];
        bbox[j*4+2] = input_bbox[sorted_id[j]*4+2];
        bbox[j*4+3] = input_bbox[sorted_id[j]*4+3];
        anch[j*4]   = input_anchor[sorted_id[j]*4];
        anch[j*4+1] = input_anchor[sorted_id[j]*4+1];
        anch[j*4+2] = input_anchor[sorted_id[j]*4+2];
        anch[j*4+3] = input_anchor[sorted_id[j]*4+3];
    }
}
typedef struct{
    float key;
    int value;
}KeyValuePair;
bool compareNode(KeyValuePair node1, KeyValuePair node2){ return node1.key > node2.key; }
vector<int> SortedIdx(const float* input_score, vector<KeyValuePair>& vec_node, int nms_pre){
    for(int i = 0; i < (int)vec_node.size(); i++){
        vec_node[i] = {input_score[i], i};
    }
    sort(vec_node.begin(), vec_node.end(), compareNode);  // descending order: scores with their original indices
    vector<int> sorted_id(nms_pre);
    for(int i = 0; i < nms_pre; i++) sorted_id[i] = vec_node[i].value;
    return sorted_id;
}

// step2: decode anchors and box deltas (bbox_pred) into proposals
float means[4] = {0,0,0,0};
float stds[4] = {1,1,1,1};
int num_boxes = level*nms_pre;
const int* image_shape = Input<Tensor>(15)->template Data<int>();
// refer to the Python delta2bbox implementation, path: mmdetection/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py:delta2bbox
delta2bbox(bbox_pred, anchor, image_shape, means, stds, proposal_ptr, num_boxes);
// note that bbox_pred and anchor are not globally sorted here, only sorted within each level

// step3: sanitize the proposal coordinates, remove invalid values, add the per-level offset, and sort all scores and proposals globally
for(int i = 0; i < num_boxes; i++){
    float w = proposal_ptr[i*4+2] - proposal_ptr[i*4];
    float h = proposal_ptr[i*4+3] - proposal_ptr[i*4+1];
    if(w <= min_w || h <= min_h){  // min_w and min_h are usually 0
        proposal_ptr[i*4+2] = proposal_ptr[i*4] + 0.5;    // shrink it into a tiny box
        proposal_ptr[i*4+3] = proposal_ptr[i*4+1] + 0.5;
    }
    int level_id = i / nms_pre;  // add the per-level offset
    proposal_ptr[i*4]   += level_id * offset;
    proposal_ptr[i*4+1] += level_id * offset;
    proposal_ptr[i*4+2] += level_id * offset;
    proposal_ptr[i*4+3] += level_id * offset;
}
vector<KeyValuePair> vec_node;
vec_node.resize(num_boxes);  // sort all scores across levels
vector<int> sorted_id = SortedIdx(score_ptr, vec_node, num_boxes);
for(int i = 0; i < num_boxes; i++){
    score_sorted_ptr[i] = score_ptr[sorted_id[i]];
    proposal_sorted_ptr[i*4]   = proposal_ptr[sorted_id[i]*4];
    proposal_sorted_ptr[i*4+1] = proposal_ptr[sorted_id[i]*4+1];
    proposal_sorted_ptr[i*4+2] = proposal_ptr[sorted_id[i]*4+2];
    proposal_sorted_ptr[i*4+3] = proposal_ptr[sorted_id[i]*4+3];
}

// step4: NMS, following the onnxruntime CPU implementation:
// onnxruntime/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc
vector<int> selected_indices;  // the output: indices into score_sorted_ptr
vector<BoxInfo> selected_boxes_inside_classes;
selected_boxes_inside_classes.reserve(num_boxes);
nms(selected_indices, selected_boxes_inside_classes, score_sorted_ptr, proposal_sorted_ptr);
inline void MaxMin(float lhs, float rhs, float& min, float& max){
    if(lhs >= rhs){
        min = rhs;
        max = lhs;
    }else{
        min = lhs;
        max = rhs;
    }
}
struct BoxInfo{
    float score_{};
    int index_{};
    float box_[4]{};
    BoxInfo() = default;
    explicit BoxInfo(float score, int idx, int center_point_box, const float* box)
        : score_(score), index_(idx){
        if(center_point_box == 0){  // box format is [y1, x1, y2, x2]
            MaxMin(box[1], box[3], box_[1], box_[3]);  // put the smaller input coordinate on the left
            MaxMin(box[0], box[2], box_[0], box_[2]);
        }else{  // box format is [x_center, y_center, width, height]
            float box_width_half = box[2] / 2;
            float box_height_half = box[3] / 2;
            box_[1] = box[0] - box_width_half;
            box_[3] = box[0] + box_width_half;
            box_[0] = box[1] - box_height_half;
            box_[2] = box[1] + box_height_half;
        }
    }
    inline bool operator<(const BoxInfo& rhs) const {  // order by score descending, breaking ties by the smaller index
        return score_ < rhs.score_ || (score_ == rhs.score_ && index_ > rhs.index_);
    }
};

// step5: keep the top num boxes
Tensor* output = ctx->Output(0, {num, 5});  // output 0 has shape (num, 5): num is the number of boxes, the first four values are the box coordinates and the last one is the score
float* output_ptr = output->template MutableData<float>();  // get the output pointer
for(int i = 0; i < num; i++){
    int index = selected_indices[i];
    // subtract the per-level offset that was added before NMS
    output_ptr[i*5]   = proposal_sorted_ptr[index*4]   - (int)(proposal_sorted_ptr[index*4]   / offset) * offset;
    output_ptr[i*5+1] = proposal_sorted_ptr[index*4+1] - (int)(proposal_sorted_ptr[index*4+1] / offset) * offset;
    output_ptr[i*5+2] = proposal_sorted_ptr[index*4+2] - (int)(proposal_sorted_ptr[index*4+2] / offset) * offset;
    output_ptr[i*5+3] = proposal_sorted_ptr[index*4+3] - (int)(proposal_sorted_ptr[index*4+3] / offset) * offset;
    output_ptr[i*5+4] = score_sorted_ptr[index];
}
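
The subtraction at the end of the loop above undoes the per-level offset added in step 3: because offset is larger than any valid image coordinate, int(x / offset) recovers exactly the level index, and subtracting it times offset restores the original coordinate. A quick numeric check in Python (the offset value here is hypothetical; it only needs to exceed the largest image dimension):

offset = 100000.0
for level_id, coord in [(0, 123.5), (2, 640.0), (4, 0.25)]:
    shifted = coord + level_id * offset                   # what step 3 does
    restored = shifted - int(shifted / offset) * offset   # what the output loop does
    assert abs(restored - coord) < 1e-6

One practical caveat: with float32 coordinates the offset should not be made arbitrarily large, otherwise adding it costs precision in the low-order bits of the coordinates.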

5. CUDA Source Code Analysis

int threadPerBlock = 32*4;
int blockPerGrid = 0;
void* storage_ptr = nullptr;  // pointer to the temporary workspace
size_t storage_bytes = 0;     // size of the temporary workspace

// step1: merge and sort the N levels of score, bbox_pred and anchor
for(int i = 0; i < level; i++){
    int score_num = Input<Tensor>(i).Size();
    blockPerGrid = (score_num + threadPerBlock - 1) / threadPerBlock;
    Iota<int><<<blockPerGrid, threadPerBlock, 0, stream>>>(score_id, 0, score_num);  // fill score_id with the indices of the scores
    const float* input_score = Input<Tensor>(i)->template Data<float>();
    const float* input_bbox = Input<Tensor>(i + level)->template Data<float>();
    const float* input_anchor = Input<Tensor>(i + level*2)->template Data<float>();
    cub::DoubleBuffer<float> d_keys(input_score, input_score_bak);  // input_score_bak is a buffer of the same size as input_score, used as the second half of the double buffer
    cub::DoubleBuffer<int> d_values(score_id, score_id_bak);
    void* temp_storage_ptr = nullptr;
    size_t temp_storage_bytes = 0;
    // sorts d_keys and d_values together on the stream; when temp_storage_ptr is null it only computes temp_storage_bytes and does nothing else
    cub::DeviceRadixSort::SortPairsDescending(temp_storage_ptr, temp_storage_bytes, d_keys, d_values, score_num, 0, 8*sizeof(float), stream);
    RE_CUDA_MALLOC(storage_ptr, storage_bytes, temp_storage_bytes, 1);  // reallocate only when the required workspace is larger than what was allocated before
    temp_storage_ptr = storage_ptr;  // use the allocated workspace for the real sort
    cub::DeviceRadixSort::SortPairsDescending(temp_storage_ptr, temp_storage_bytes, d_keys, d_values, score_num, 0, 8*sizeof(float), stream);
    blockPerGrid = (nms_pre + threadPerBlock - 1) / threadPerBlock;
    merge_input<<<blockPerGrid, threadPerBlock, 0, stream>>>(input_score_bak, score_ptr, input_bbox, bbox_pred, input_anchor, anchor, score_id, nms_pre, i);
}
template <typename T>
__global__ void Iota(T* to_fill, const T offset, const int num){
    for(int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < num; idx += blockDim.x*gridDim.x){
        to_fill[idx] = static_cast<T>(idx) + offset;
    }
}

// reallocate only when the required workspace is larger than what was allocated before
#define RE_CUDA_MALLOC(ptr, pre_size, now_size, ele_size) \
    if(pre_size < now_size){ \
        if(ptr == nullptr) cudaMalloc(&ptr, now_size*ele_size); \
        else{ \
            cudaFree(ptr); \
            cudaMalloc(&ptr, now_size*ele_size); \
        } \
        pre_size = now_size; \
    }

__global__ void merge_input(...){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < nms_pre){
        int index = score_id[idx];
        int dest = i * nms_pre + idx;
        score_ptr[dest] = input_score_bak[idx];  // input_score_bak already holds the sorted scores
        bbox_pred[dest*4]   = input_bbox[index*4];
        bbox_pred[dest*4+1] = input_bbox[index*4+1];
        bbox_pred[dest*4+2] = input_bbox[index*4+2];
        bbox_pred[dest*4+3] = input_bbox[index*4+3];
        anchor[dest*4]   = input_anchor[index*4];
        anchor[dest*4+1] = input_anchor[index*4+1];
        anchor[dest*4+2] = input_anchor[index*4+2];
        anchor[dest*4+3] = input_anchor[index*4+3];
    }
}

// step2: decode anchors and box deltas (bbox_pred) into proposals
float means[4] = {0,0,0,0};
float stds[4] = {1,1,1,1};
int num_boxes = level*nms_pre;
const int* image_shape = Input<Tensor>(15)->template Data<int>();
// refer to the Python delta2bbox implementation, path: mmdetection/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py:delta2bbox
blockPerGrid = (num_boxes + threadPerBlock - 1) / threadPerBlock;
delta2bbox<<<blockPerGrid, threadPerBlock, 0, stream>>>(bbox_pred, anchor, image_shape, means, stds, proposal_ptr, num_boxes);

// step3: sanitize the proposal coordinates, remove invalid values, add the per-level offset, and sort all scores and proposals globally
valid_w_h<<<blockPerGrid, threadPerBlock, 0, stream>>>(proposal_ptr);
__global__ void valid_w_h(...){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < num_boxes){
        float w = proposal_ptr[i*4+2] - proposal_ptr[i*4];
        float h = proposal_ptr[i*4+3] - proposal_ptr[i*4+1];
        if(w <= min_w || h <= min_h){  // min_w and min_h are usually 0
            proposal_ptr[i*4+2] = proposal_ptr[i*4] + 0.5;    // shrink it into a tiny box
            proposal_ptr[i*4+3] = proposal_ptr[i*4+1] + 0.5;
        }
        int level_id = i / nms_pre;  // add the per-level offset
        proposal_ptr[i*4]   += level_id * offset;
        proposal_ptr[i*4+1] += level_id * offset;
        proposal_ptr[i*4+2] += level_id * offset;
        proposal_ptr[i*4+3] += level_id * offset;
    }
}

// step4: NMS, following the onnxruntime CUDA implementation:
// onnxruntime/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc
struct __align__(16) Box{
    float x1, y1, x2, y2;
};
cub::DoubleBuffer<float> d_keys(score_ptr, score_ptr_bak);
cub::DoubleBuffer<Box> d_values((Box*)proposal_ptr, (Box*)proposal_ptr_bak);
void* temp_storage_ptr = nullptr;
size_t temp_storage_bytes = 0;
// sorts d_keys and d_values together on the stream; when temp_storage_ptr is null it only computes temp_storage_bytes and does nothing else
cub::DeviceRadixSort::SortPairsDescending(temp_storage_ptr, temp_storage_bytes, d_keys, d_values, num_boxes, 0, 8*sizeof(float), stream);
RE_CUDA_MALLOC(storage_ptr, storage_bytes, temp_storage_bytes, 1);  // reallocate only when the required workspace is larger than what was allocated before
temp_storage_ptr = storage_ptr;  // use the allocated workspace for the real sort
cub::DeviceRadixSort::SortPairsDescending(temp_storage_ptr, temp_storage_bytes, d_keys, d_values, num_boxes, 0, 8*sizeof(float), stream);
nms(stream, 0, score_ptr_bak, proposal_ptr_bak, proposal_sorted_ptr, selected_indices);

// step5: keep the top num boxes
blockPerGrid = (num + threadPerBlock - 1) / threadPerBlock;
select_top_n<<<blockPerGrid, threadPerBlock, 0, stream>>>(proposal_sorted_ptr, score_ptr_bak, output_ptr, selected_indices, num);
__global__ void select_top_n(...){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < num){
        int index = selected_indices[i];
        // subtract the per-level offset that was added before NMS
        output_ptr[i*5]   = proposal_sorted_ptr[index*4]   - (int)(proposal_sorted_ptr[index*4]   / offset) * offset;
        output_ptr[i*5+1] = proposal_sorted_ptr[index*4+1] - (int)(proposal_sorted_ptr[index*4+1] / offset) * offset;
        output_ptr[i*5+2] = proposal_sorted_ptr[index*4+2] - (int)(proposal_sorted_ptr[index*4+2] / offset) * offset;
        output_ptr[i*5+3] = proposal_sorted_ptr[index*4+3] - (int)(proposal_sorted_ptr[index*4+3] / offset) * offset;
        output_ptr[i*5+4] = score_ptr_bak[index];
    }
}
