1. Algorithm Principle

The op takes N levels of score, bbox_pred and anchor plus the image_shape as input, decodes the anchors and box deltas (bbox_pred) into proposals, runs NMS on those proposals, and finally keeps the top num of them.

2. Execution Steps

  1. For each level, sort score, bbox_pred and anchor by score in descending order and keep the top nms_pre entries (usually 1000), giving N*nms_pre boxes in total.
  2. Decode the anchors and box deltas (bbox_pred) into proposals.
  3. Remove boxes with non-positive width or height, and add to each level's proposals an offset large enough that boxes from different levels can never overlap, which turns the multi-class NMS into a single-class NMS (see the sketch after this list).
  4. Concatenate the scores and proposals of all N levels and sort them by score in descending order.
  5. Run NMS.
  6. Keep the top num proposals and subtract the previously added offset from them.
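Step 3 is the part worth pausing on: all levels are concatenated and passed through a single NMS call, so the per-level offset must be large enough that boxes from different levels can never overlap, making one class-agnostic NMS behave like an independent NMS per level. A minimal Python sketch of this idea (it uses torchvision's nms for illustration; mmcv's batched_nms used in the source applies the same offset trick internally):

import torch
from torchvision.ops import nms

def nms_across_levels(proposals, scores, level_ids, iou_thr=0.7):
    # proposals: (N, 4), scores: (N,), level_ids: (N,) level index of each box
    # the offset must exceed the largest coordinate so that shifted boxes
    # from different levels can never overlap
    offset = proposals.max() + 1
    shifted = proposals + level_ids[:, None].to(proposals) * offset
    keep = nms(shifted, scores, iou_thr)   # one single-class NMS call
    return proposals[keep], scores[keep]   # return the un-shifted boxes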

3. Python Source Code Analysis

# path: mmdetection/mmdet/models/dense_heads/cascade_rpn_head.py:StageCascadeRPNHead::_get_bboxes_single
level_ids = []
mlvl_scores = []
mlvl_bbox_preds = []
mlvl_valid_anchors = []
for idx in range(len(cls_scores)):  # len(cls_scores) is the number of cascade levels N
    rpn_cls_score = cls_scores[idx]  # score of this level
    rpn_bbox_pred = bbox_preds[idx]  # bbox_pred of this level
    assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
    # per-level score shape is (num_anchors * num_classes, H, W),
    # bbox_pred shape is (num_anchors * 4, H, W)
    rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
    if self.use_sigmoid_cls:  # binary classification on the score with sigmoid
        rpn_cls_score = rpn_cls_score.reshape(-1)
        scores = rpn_cls_score.sigmoid()
    else:  # binary classification on the score with softmax
        rpn_cls_score = rpn_cls_score.reshape(-1, 2)
        # We set FG labels to [0, num_class-1] and BG label to
        # num_class in RPN head since mmdet v2.5, which is unified to
        # be consistent with other head since mmdet v2.0. In mmdet v2.0
        # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head.
        scores = rpn_cls_score.softmax(dim=1)[:, 0]
    rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
    anchors = mlvl_anchors[idx]
    if 0 < nms_pre < scores.shape[0]:
        # sort is faster than topk
        # _, topk_inds = scores.topk(cfg.nms_pre)
        ranked_scores, rank_inds = scores.sort(descending=True)
        topk_inds = rank_inds[:nms_pre]
        scores = ranked_scores[:nms_pre]
        rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
        anchors = anchors[topk_inds, :]
    mlvl_scores.append(scores)
    mlvl_bbox_preds.append(rpn_bbox_pred)
    mlvl_valid_anchors.append(anchors)
    level_ids.append(
        scores.new_full((scores.size(0), ), idx, dtype=torch.long))

scores = torch.cat(mlvl_scores)
anchors = torch.cat(mlvl_valid_anchors)
rpn_bbox_pred = torch.cat(mlvl_bbox_preds)
# decode anchors and box deltas (bbox_pred) into proposals
proposals = self.bbox_coder.decode(
    anchors, rpn_bbox_pred, max_shape=img_shape)
ids = torch.cat(level_ids)

if cfg.min_bbox_size >= 0:  # remove boxes whose width or height is too small
    w = proposals[:, 2] - proposals[:, 0]
    h = proposals[:, 3] - proposals[:, 1]
    valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
    if not valid_mask.all():
        proposals = proposals[valid_mask]
        scores = scores[valid_mask]
        ids = ids[valid_mask]
# NMS
if proposals.numel() > 0:
    dets, _ = batched_nms(proposals, scores, ids, cfg.nms)
else:
    return proposals.new_zeros(0, 5)
# keep the top max_per_img proposals
return dets[:cfg.max_per_img]
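
The self.bbox_coder.decode call above is a DeltaXYWHBBoxCoder whose core is the delta2bbox function (mmdet/core/bbox/coder/delta_xywh_bbox_coder.py), which the CPU and CUDA implementations below re-implement. A simplified sketch of the decoding math, assuming means = 0, stds = 1 and omitting the wh-ratio clipping of the real function:

import torch

def delta2bbox_sketch(anchors, deltas, max_shape=None):
    # anchor center and size
    px = (anchors[:, 0] + anchors[:, 2]) * 0.5
    py = (anchors[:, 1] + anchors[:, 3]) * 0.5
    pw = anchors[:, 2] - anchors[:, 0]
    ph = anchors[:, 3] - anchors[:, 1]
    dx, dy, dw, dh = deltas.unbind(dim=1)
    # shift the center and rescale the size
    gx = px + pw * dx
    gy = py + ph * dy
    gw = pw * dw.exp()
    gh = ph * dh.exp()
    # back to corner format, then clip to the image
    bboxes = torch.stack([gx - gw * 0.5, gy - gh * 0.5,
                          gx + gw * 0.5, gy + gh * 0.5], dim=-1)
    if max_shape is not None:
        bboxes[:, 0::2] = bboxes[:, 0::2].clamp(0, max_shape[1])
        bboxes[:, 1::2] = bboxes[:, 1::2].clamp(0, max_shape[0])
    return bboxes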

4. CPU Source Code Analysis

float* score_ptr = new float[level*nms_pre];  // level is the number of levels, nms_pre is the number of boxes kept per level
memset(score_ptr, 0, sizeof(float)*level*nms_pre);  // some levels have fewer than nms_pre boxes; zero out the scores of the padding boxes
float* score_sorted_ptr = new float[level*nms_pre];       // buffer for the sorted scores
float* bbox_pred = new float[level*nms_pre*4];            // bbox_pred deltas, 4 per box
float* anchor = new float[level*nms_pre*4];               // anchor coordinates, 4 per box
float* proposal_ptr = new float[level*nms_pre*4];         // decoded proposal coordinates, 4 per box
float* proposal_sorted_ptr = new float[level*nms_pre*4];  // buffer for the sorted proposal coordinates

// step1: merge and sort the N levels of score, bbox_pred and anchor
vector<thread> vec_thread;
for(int i = 0; i < level; i++){
    float* score = score_ptr + i * nms_pre;
    float* bbox = bbox_pred + i * nms_pre * 4;
    float* anch = anchor + i * nms_pre * 4;
    vec_thread.push_back(thread(merge_input, i, nms_pre, score, bbox, anch));
}
for(int i = 0; i < level; i++){
    vec_thread[i].join();
}
void merge_input(int i, int nms_pre, ...){
    const float* input_score = Input<Tensor>(i)->template Data<float>();
    const float* input_bbox = Input<Tensor>(i + level)->template Data<float>();
    const float* input_anchor = Input<Tensor>(i + level*2)->template Data<float>();
    // sort the scores and keep the top nms_pre of them
    vector<KeyValuePair> vec_node;
    vec_node.resize(Input<Tensor>(i).Size());  // sort all scores of this level
    vector<int> sorted_id = SortedIdx(input_score, vec_node, nms_pre);
    for(int j = 0; j < nms_pre; j++){
        score[j] = input_score[sorted_id[j]];
        bbox[j*4]   = input_bbox[sorted_id[j]*4];
        bbox[j*4+1] = input_bbox[sorted_id[j]*4+1];
        bbox[j*4+2] = input_bbox[sorted_id[j]*4+2];
        bbox[j*4+3] = input_bbox[sorted_id[j]*4+3];
        anch[j*4]   = input_anchor[sorted_id[j]*4];
        anch[j*4+1] = input_anchor[sorted_id[j]*4+1];
        anch[j*4+2] = input_anchor[sorted_id[j]*4+2];
        anch[j*4+3] = input_anchor[sorted_id[j]*4+3];
    }
}
typedef struct{
    float key;
    int value;
}KeyValuePair;
bool compareNode(KeyValuePair node1, KeyValuePair node2){ return node1.key > node2.key; }
vector<int> SortedIdx(const float* input_score, vector<KeyValuePair>& vec_node, int nms_pre){
    for(int i = 0; i < (int)vec_node.size(); i++){
        vec_node[i] = {input_score[i], i};
    }
    sort(vec_node.begin(), vec_node.end(), compareNode);  // descending order: scores with their original indices
    vector<int> sorted_id(nms_pre);
    for(int i = 0; i < nms_pre; i++) sorted_id[i] = vec_node[i].value;
    return sorted_id;
}

// step2: decode anchors and box deltas (bbox_pred) into proposals
float means[4] = {0,0,0,0};
float stds[4] = {1,1,1,1};
int num_boxes = level*nms_pre;
const int* image_shape = Input<Tensor>(15)->template Data<int>();
// refer to the Python delta2bbox implementation, path: mmdetection/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py:delta2bbox
delta2bbox(bbox_pred, anchor, image_shape, means, stds, proposal_ptr, num_boxes);
// note that bbox_pred and anchor are not globally sorted here, only sorted within each level

// step3: sanitize the proposal coordinates, remove invalid values, add the per-level offset, and sort all scores and proposals globally
for(int i = 0; i < num_boxes; i++){
    float w = proposal_ptr[i*4+2] - proposal_ptr[i*4];
    float h = proposal_ptr[i*4+3] - proposal_ptr[i*4+1];
    if(w <= min_w || h <= min_h){  // min_w and min_h are usually 0
        proposal_ptr[i*4+2] = proposal_ptr[i*4] + 0.5;    // shrink it into a tiny box
        proposal_ptr[i*4+3] = proposal_ptr[i*4+1] + 0.5;
    }
    int level_id = i / nms_pre;  // add the per-level offset
    proposal_ptr[i*4]   += level_id * offset;
    proposal_ptr[i*4+1] += level_id * offset;
    proposal_ptr[i*4+2] += level_id * offset;
    proposal_ptr[i*4+3] += level_id * offset;
}
vector<KeyValuePair> vec_node;
vec_node.resize(num_boxes);  // sort all scores across levels
vector<int> sorted_id = SortedIdx(score_ptr, vec_node, num_boxes);
for(int i = 0; i < num_boxes; i++){
    score_sorted_ptr[i] = score_ptr[sorted_id[i]];
    proposal_sorted_ptr[i*4]   = proposal_ptr[sorted_id[i]*4];
    proposal_sorted_ptr[i*4+1] = proposal_ptr[sorted_id[i]*4+1];
    proposal_sorted_ptr[i*4+2] = proposal_ptr[sorted_id[i]*4+2];
    proposal_sorted_ptr[i*4+3] = proposal_ptr[sorted_id[i]*4+3];
}

// step4: NMS, following the onnxruntime CPU implementation:
// onnxruntime/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc
vector<int> selected_indices;  // the output: indices into score_sorted_ptr
vector<BoxInfo> selected_boxes_inside_classes;
selected_boxes_inside_classes.reserve(num_boxes);
nms(selected_indices, selected_boxes_inside_classes, score_sorted_ptr, proposal_sorted_ptr);
inline void MaxMin(float lhs, float rhs, float& min, float& max){
    if(lhs >= rhs){
        min = rhs;
        max = lhs;
    }else{
        min = lhs;
        max = rhs;
    }
}
struct BoxInfo{
    float score_{};
    int index_{};
    float box_[4]{};
    BoxInfo() = default;
    explicit BoxInfo(float score, int idx, int center_point_box, const float* box)
        : score_(score), index_(idx){
        if(center_point_box == 0){  // box format is [y1, x1, y2, x2]
            MaxMin(box[1], box[3], box_[1], box_[3]);  // put the smaller input coordinate on the left
            MaxMin(box[0], box[2], box_[0], box_[2]);
        }else{  // box format is [x_center, y_center, width, height]
            float box_width_half = box[2] / 2;
            float box_height_half = box[3] / 2;
            box_[1] = box[0] - box_width_half;
            box_[3] = box[0] + box_width_half;
            box_[0] = box[1] - box_height_half;
            box_[2] = box[1] + box_height_half;
        }
    }
    inline bool operator<(const BoxInfo& rhs) const {  // order by score descending, breaking ties by the smaller index
        return score_ < rhs.score_ || (score_ == rhs.score_ && index_ > rhs.index_);
    }
};

// step5: keep the top num boxes
Tensor* output = ctx->Output(0, {num, 5});  // output 0 has shape (num, 5): num is the number of boxes, the first four values are the box coordinates and the last one is the score
float* output_ptr = output->template MutableData<float>();  // get the output pointer
for(int i = 0; i < num; i++){
    int index = selected_indices[i];
    // subtract the per-level offset that was added before NMS
    output_ptr[i*5]   = proposal_sorted_ptr[index*4]   - (int)(proposal_sorted_ptr[index*4]   / offset) * offset;
    output_ptr[i*5+1] = proposal_sorted_ptr[index*4+1] - (int)(proposal_sorted_ptr[index*4+1] / offset) * offset;
    output_ptr[i*5+2] = proposal_sorted_ptr[index*4+2] - (int)(proposal_sorted_ptr[index*4+2] / offset) * offset;
    output_ptr[i*5+3] = proposal_sorted_ptr[index*4+3] - (int)(proposal_sorted_ptr[index*4+3] / offset) * offset;
    output_ptr[i*5+4] = score_sorted_ptr[index];
}
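
The subtraction at the end of the loop above undoes the per-level offset added in step 3: because offset is larger than any valid image coordinate, int(x / offset) recovers exactly the level index, and subtracting it times offset restores the original coordinate. A quick numeric check in Python (the offset value here is hypothetical; it only needs to exceed the largest image dimension):

offset = 100000.0
for level_id, coord in [(0, 123.5), (2, 640.0), (4, 0.25)]:
    shifted = coord + level_id * offset                   # what step 3 does
    restored = shifted - int(shifted / offset) * offset   # what the output loop does
    assert abs(restored - coord) < 1e-6

One practical caveat: with float32 coordinates the offset should not be made arbitrarily large, otherwise adding it costs precision in the low-order bits of the coordinates.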

5. CUDA Source Code Analysis

int threadPerBlock = 32*4;
int blockPerGrid = 0;
void* storage_ptr = nullptr;  // pointer to the temporary workspace
size_t storage_bytes = 0;     // size of the temporary workspace

// step1: merge and sort the N levels of score, bbox_pred and anchor
for(int i = 0; i < level; i++){
    int score_num = Input<Tensor>(i).Size();
    blockPerGrid = (score_num + threadPerBlock - 1) / threadPerBlock;
    Iota<int><<<blockPerGrid, threadPerBlock, 0, stream>>>(score_id, 0, score_num);  // fill score_id with the indices of the scores
    const float* input_score = Input<Tensor>(i)->template Data<float>();
    const float* input_bbox = Input<Tensor>(i + level)->template Data<float>();
    const float* input_anchor = Input<Tensor>(i + level*2)->template Data<float>();
    cub::DoubleBuffer<float> d_keys(input_score, input_score_bak);  // input_score_bak is a buffer of the same size as input_score, used as the second half of the double buffer
    cub::DoubleBuffer<int> d_values(score_id, score_id_bak);
    void* temp_storage_ptr = nullptr;
    size_t temp_storage_bytes = 0;
    // sorts d_keys and d_values together on the stream; when temp_storage_ptr is null it only computes temp_storage_bytes and does nothing else
    cub::DeviceRadixSort::SortPairsDescending(temp_storage_ptr, temp_storage_bytes, d_keys, d_values, score_num, 0, 8*sizeof(float), stream);
    RE_CUDA_MALLOC(storage_ptr, storage_bytes, temp_storage_bytes, 1);  // reallocate only when the required workspace is larger than what was allocated before
    temp_storage_ptr = storage_ptr;  // use the allocated workspace for the real sort
    cub::DeviceRadixSort::SortPairsDescending(temp_storage_ptr, temp_storage_bytes, d_keys, d_values, score_num, 0, 8*sizeof(float), stream);
    blockPerGrid = (nms_pre + threadPerBlock - 1) / threadPerBlock;
    merge_input<<<blockPerGrid, threadPerBlock, 0, stream>>>(input_score_bak, score_ptr, input_bbox, bbox_pred, input_anchor, anchor, score_id, nms_pre, i);
}
template <typename T>
__global__ void Iota(T* to_fill, const T offset, const int num){
    for(int idx = blockIdx.x*blockDim.x + threadIdx.x; idx < num; idx += blockDim.x*gridDim.x){
        to_fill[idx] = static_cast<T>(idx) + offset;
    }
}

// reallocate only when the required workspace is larger than what was allocated before
#define RE_CUDA_MALLOC(ptr, pre_size, now_size, ele_size) \
    if(pre_size < now_size){ \
        if(ptr == nullptr) cudaMalloc(&ptr, now_size*ele_size); \
        else{ \
            cudaFree(ptr); \
            cudaMalloc(&ptr, now_size*ele_size); \
        } \
        pre_size = now_size; \
    }

__global__ void merge_input(...){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < nms_pre){
        int index = score_id[idx];
        int dest = i * nms_pre + idx;
        score_ptr[dest] = input_score_bak[idx];  // input_score_bak already holds the sorted scores
        bbox_pred[dest*4]   = input_bbox[index*4];
        bbox_pred[dest*4+1] = input_bbox[index*4+1];
        bbox_pred[dest*4+2] = input_bbox[index*4+2];
        bbox_pred[dest*4+3] = input_bbox[index*4+3];
        anchor[dest*4]   = input_anchor[index*4];
        anchor[dest*4+1] = input_anchor[index*4+1];
        anchor[dest*4+2] = input_anchor[index*4+2];
        anchor[dest*4+3] = input_anchor[index*4+3];
    }
}

// step2: decode anchors and box deltas (bbox_pred) into proposals
float means[4] = {0,0,0,0};
float stds[4] = {1,1,1,1};
int num_boxes = level*nms_pre;
const int* image_shape = Input<Tensor>(15)->template Data<int>();
// refer to the Python delta2bbox implementation, path: mmdetection/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py:delta2bbox
blockPerGrid = (num_boxes + threadPerBlock - 1) / threadPerBlock;
delta2bbox<<<blockPerGrid, threadPerBlock, 0, stream>>>(bbox_pred, anchor, image_shape, means, stds, proposal_ptr, num_boxes);

// step3: sanitize the proposal coordinates, remove invalid values, add the per-level offset, and sort all scores and proposals globally
valid_w_h<<<blockPerGrid, threadPerBlock, 0, stream>>>(proposal_ptr);
__global__ void valid_w_h(...){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < num_boxes){
        float w = proposal_ptr[i*4+2] - proposal_ptr[i*4];
        float h = proposal_ptr[i*4+3] - proposal_ptr[i*4+1];
        if(w <= min_w || h <= min_h){  // min_w and min_h are usually 0
            proposal_ptr[i*4+2] = proposal_ptr[i*4] + 0.5;    // shrink it into a tiny box
            proposal_ptr[i*4+3] = proposal_ptr[i*4+1] + 0.5;
        }
        int level_id = i / nms_pre;  // add the per-level offset
        proposal_ptr[i*4]   += level_id * offset;
        proposal_ptr[i*4+1] += level_id * offset;
        proposal_ptr[i*4+2] += level_id * offset;
        proposal_ptr[i*4+3] += level_id * offset;
    }
}

// step4: NMS, following the onnxruntime CUDA implementation:
// onnxruntime/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc
struct __align__(16) Box{
    float x1, y1, x2, y2;
};
cub::DoubleBuffer<float> d_keys(score_ptr, score_ptr_bak);
cub::DoubleBuffer<Box> d_values((Box*)proposal_ptr, (Box*)proposal_ptr_bak);
void* temp_storage_ptr = nullptr;
size_t temp_storage_bytes = 0;
// sorts d_keys and d_values together on the stream; when temp_storage_ptr is null it only computes temp_storage_bytes and does nothing else
cub::DeviceRadixSort::SortPairsDescending(temp_storage_ptr, temp_storage_bytes, d_keys, d_values, num_boxes, 0, 8*sizeof(float), stream);
RE_CUDA_MALLOC(storage_ptr, storage_bytes, temp_storage_bytes, 1);  // reallocate only when the required workspace is larger than what was allocated before
temp_storage_ptr = storage_ptr;  // use the allocated workspace for the real sort
cub::DeviceRadixSort::SortPairsDescending(temp_storage_ptr, temp_storage_bytes, d_keys, d_values, num_boxes, 0, 8*sizeof(float), stream);
nms(stream, 0, score_ptr_bak, proposal_ptr_bak, proposal_sorted_ptr, selected_indices);

// step5: keep the top num boxes
blockPerGrid = (num + threadPerBlock - 1) / threadPerBlock;
select_top_n<<<blockPerGrid, threadPerBlock, 0, stream>>>(proposal_sorted_ptr, score_ptr_bak, output_ptr, selected_indices, num);
__global__ void select_top_n(...){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < num){
        int index = selected_indices[i];
        // subtract the per-level offset that was added before NMS
        output_ptr[i*5]   = proposal_sorted_ptr[index*4]   - (int)(proposal_sorted_ptr[index*4]   / offset) * offset;
        output_ptr[i*5+1] = proposal_sorted_ptr[index*4+1] - (int)(proposal_sorted_ptr[index*4+1] / offset) * offset;
        output_ptr[i*5+2] = proposal_sorted_ptr[index*4+2] - (int)(proposal_sorted_ptr[index*4+2] / offset) * offset;
        output_ptr[i*5+3] = proposal_sorted_ptr[index*4+3] - (int)(proposal_sorted_ptr[index*4+3] / offset) * offset;
        output_ptr[i*5+4] = score_ptr_bak[index];
    }
}
