CTPN-Detecting Text in Natural Image with Connectionist Text Proposal Network

CTPN,它能够准确定位自然图像中的文本行。CTPN直接在卷积特征图中的一系列细粒度文本建议中检测文本行。CTPN提出了一个垂直锚点机制,联合预测每个固定宽度提议的位置和文本/非文本分数,大大提高了定位精度。序列建议网络通过循环神经网络自然地连接起来,该网络无缝地结合到卷积网络中,从而形成端到端的可训练模型。这使得CTPN可以探索丰富的图像上下文信息,使其能够检测极其模糊的文本。CTPN在多尺度和多语言文本上可靠地工作,而不需要进一步的后处理,脱离了以前的自底向上需要多步后过滤的方法。它在ICDAR 2013和2015的基准数据集上达到了0.88和0.61的F-measure,CTPN的计算效率为0.14s每张图像。





第四,该方法在许多基准数据集上达到了新的最先进成果,显著改善了最近的结果(例如,0.88的F-measure超过了ICDAR 2013上[8]中的0.83,而0.61的F-measure超过了ICDAR 2015上[35]中的0.54)。此外,即使使用非常深的VGG16模型[27],该方法在计算上仍然高效,每张图像的运行时间为0.14s(在ICDAR 2013上)。


def vgg_ctpn(backbone='vgg16', inputs=None, modifier=None, **kwargs):
    """Construct a CTPN model on top of a VGG backbone.

    Args:
        backbone: Which backbone to use, one of ('vgg16', 'vgg19').
        inputs: The input tensor of the network. Defaults to a Keras Input of
            shape (None, None, 3), or (3, None, None) when the Keras image
            data format is 'channels_first'.
        modifier: Optional callable that receives the backbone model and
            returns a (possibly modified) backbone, e.g. to freeze layers.
        **kwargs: Forwarded unchanged to ``ctpn.ctpn``.

    Returns:
        The model returned by ``ctpn.ctpn`` built on the 'block5_conv3'
        output of the backbone.

    Raises:
        ValueError: If ``backbone`` is not 'vgg16' or 'vgg19'.
    """
    # Choose the default input layout from the Keras image data format.
    if inputs is None:
        if keras.backend.image_data_format() == 'channels_first':
            inputs = keras.layers.Input(shape=(3, None, None))
        else:
            inputs = keras.layers.Input(shape=(None, None, 3))

    # Create the VGG backbone without its classifier head and without
    # pretrained weights (weights=None).
    if backbone == 'vgg16':
        vgg = keras.applications.VGG16(input_tensor=inputs, include_top=False, weights=None)
    elif backbone == 'vgg19':
        vgg = keras.applications.VGG19(input_tensor=inputs, include_top=False, weights=None)
    else:
        raise ValueError("Backbone '{}' not recognized.".format(backbone))

    if modifier:
        vgg = modifier(vgg)

    backbone_layers = vgg.get_layer('block5_conv3').output
    # NOTE(review): this calls `ctpn` as a module attribute (`ctpn.ctpn`);
    # presumably this function lives in a different module than `ctpn` below
    # -- confirm against the real package layout.
    return ctpn.ctpn(inputs=inputs, backbone_layers=backbone_layers, **kwargs)


def ctpn(inputs, backbone_layers, config=None, name='ctpn'):
    """Construct the CTPN head on top of backbone feature maps.

    Args:
        inputs: Input tensor of the whole model.
        backbone_layers: Backbone feature map the head is attached to.
        config: Configuration object; falls back to the module-level
            ``ctpn_cfg`` when None. It is not read by the layers built here.
        name: Name of the resulting Keras model.

    Returns:
        A ``keras.models.Model`` with two outputs ``[cls, regr]``, each
        reshaped by ``_reshape3`` to (N, H * W * 10, 2).
    """
    if config is None:
        config = ctpn_cfg

    # 3x3 convolution over the backbone features.
    x = Convolution2D(512, (3, 3), strides=(1, 1), padding='same', activation='relu',
                      name='rpn_conv1')(backbone_layers)

    # Fold the rows into the batch axis so the recurrent layer runs along W.
    x1 = Lambda(_reshape, output_shape=(None, 512))(x)

    # Bidirectional GRU with 128 units per direction (256 channels out).
    # The layer name 'blstm' is kept as-is because weights may be loaded by name.
    x2 = Bidirectional(GRU(128, return_sequences=True), name='blstm')(x1)

    # Restore the (N, H, W, 256) layout using the shape of `x`.
    x3 = Lambda(_reshape2, output_shape=(None, None, 256))([x2, x])
    x3 = Convolution2D(512, (1, 1), padding='same', activation='relu', name='lstm_fc')(x3)

    # 10 anchors per location, 2 values per anchor for both heads.
    cls = Convolution2D(10 * 2, (1, 1), padding='same', activation='linear',
                        name='rpn_class_origin')(x3)
    regr = Convolution2D(10 * 2, (1, 1), padding='same', activation='linear',
                         name='rpn_regress_origin')(x3)

    cls = Lambda(_reshape3, output_shape=(None, 2), name='rpn_class')(cls)
    regr = Lambda(_reshape3, output_shape=(None, 2), name='rpn_regress')(regr)

    return keras.models.Model(inputs, [cls, regr], name=name)
def _reshape(x):
    """Merge the first two axes of a 4-D tensor: (N, H, W, C) -> (N * H, W, C)."""
    b = keras.backend.shape(x)
    x = keras.backend.reshape(x, [b[0] * b[1], b[2], b[3]])  # (N x H, W, C)
    return x


def _reshape2(x):
    """Reshape the recurrent output back to a 4-D map with 256 channels.

    Args:
        x: A pair ``(x1, x2)``; ``x1`` is the tensor to reshape and ``x2`` is
            the 4-D tensor whose first three dynamic dimensions are reused.

    Returns:
        ``x1`` reshaped to (N, H, W, 256).
    """
    x1, x2 = x
    b = keras.backend.shape(x2)
    x = keras.backend.reshape(x1, [b[0], b[1], b[2], 256])  # (N, H, W, 256)
    return x


def _reshape3(x):
    """Flatten a per-location prediction map to one row per anchor.

    The channel axis is assumed to hold 10 anchors x 2 values, so the result
    has shape (N, H * W * 10, 2).
    """
    b = keras.backend.shape(x)
    x = keras.backend.reshape(x, [b[0], b[1] * b[2] * 10, 2])  # (N, H x W x 10, 2)
    return x
  • 首先,获取VGG的conv5特征图;
  • 其次,用3*3的窗口滑动,融合3*3邻域的特征信息。
  • 将特征图【N, H, W, C】reshape为 [N*H, W, C]
  • 连接双向循环层(论文中为双向LSTM,上面的代码实际使用的是双向GRU),隐藏单元数为128,单向输出为128,双向拼接后为256
  • 将上一步的输出reshape为[N, H, W, 256]
  • 用1*1的卷积融合通道信息,并使得输出的特征为512
  • 由于每个位置产生10个anchor,分类输出为2*10,回归输出由于只考虑上下边界,因此也为2*10




1. Anchor Softmax loss:交叉熵损失,是否包含文本

2. Anchor y coord regression loss: Anchor 外接矩形的Y方向offset,Smooth L1损失,Vj为判定为正Anchor的文本,score>0.7,或者与Ground truth vertical IoU>0.5

3. side-refinement损失, 代码里没有。
固定要regression的box的宽度和水平位置会导致predict的box的水平位置不准确,所以作者引入了side-refinement,用于水平位置的regression,损失函数是smooth L1函数

作者解释,side-proposals被定义为开始和结束提议,仅使用side-proposal的偏移量来细化最终的文本行边界框。 side-refinement进一步提高了定位精度,使得SWT和 Multi-Lingual datasets的性能提高约2%。 另外,作者说在第一张图里面的模型结构中同时预测了side-refinement的偏移量,它不是通过额外的后处理步骤计算出来的,这一点体现在loss function里,但是代码中并没有这部分,并不是作者的源代码,所以会有出入,后面的loss function的补充里面有说明这一点。



def ctpn_regr_loss(sigma=3.0):
    """Build the smooth-L1 regression loss used for the vertical offsets.

    Args:
        sigma: Controls the switch point between the quadratic and the linear
            part of the loss (the switch happens at ``1 / sigma ** 2``).

    Returns:
        A function ``loss(y_true, y_pred)`` suitable for Keras.
    """
    sigma_square = sigma ** 2

    def _rpn_loss_regr(y_true, y_pred):
        """Smooth L1 loss over positive anchors.

        y_true: [1][H x W x 10][3]  (class, regr)
        y_pred: [1][H x W x 10][2]  (regr)

        Only the first element of the batch axis is read.
        """
        cls = y_true[0, :, 0]
        regr = y_true[0, :, 1:3]

        # Regress only on anchors labelled positive (class == 1).
        regr_keep = backend.where(keras.backend.equal(cls, 1))[:, 0]
        regr_true = keras.backend.gather(regr, regr_keep)
        regr_pred = keras.backend.gather(y_pred[0], regr_keep)

        diff = keras.backend.abs(regr_true - regr_pred)
        # 1.0 where the quadratic branch applies, 0.0 where the linear one does.
        less_one = backend.to_float(keras.backend.less(diff, 1.0 / sigma_square))
        loss = (less_one * 0.5 * diff ** 2 * sigma_square
                + keras.backend.abs(1 - less_one) * (diff - 0.5 / sigma_square))
        loss = keras.backend.sum(loss, axis=1)

        # With no positive anchors the mean is undefined; return 0 instead.
        return keras.backend.switch(keras.backend.size(loss) > 0,
                                    keras.backend.mean(loss),
                                    keras.backend.constant(0.0))

    return _rpn_loss_regr
def ctpn_cls_loss():
    """Build the text / non-text classification loss.

    Returns:
        A function ``loss(y_true, y_pred)`` suitable for Keras.
    """
    def _rpn_loss_cls(y_true, y_pred):
        """Softmax cross-entropy over anchors that are not ignored.

        y_true: [1][1][H x W x 10]  class label per anchor (-1 means ignore)
        y_pred: [1][H x W x 10][2]  class logits

        Only the first element of the batch axis is read.
        """
        y_true = y_true[0][0]

        # Drop anchors labelled -1 ("don't care").
        cls_keep = backend.where(keras.backend.not_equal(y_true, -1))[:, 0]
        cls_true = keras.backend.gather(y_true, cls_keep)
        cls_pred = keras.backend.gather(y_pred[0], cls_keep)
        cls_true = backend.cast(cls_true, 'int64')

        loss = backend.sparse_softmax_cross_entropy_with_logits(labels=cls_true, logits=cls_pred)

        # With no kept anchors the mean is undefined; return 0 instead.
        # The mean loss is clipped to [0, 10].
        return keras.backend.switch(keras.backend.size(loss) > 0,
                                    keras.backend.clip(keras.backend.mean(loss), 0, 10),
                                    keras.backend.constant(0.0))

    return _rpn_loss_cls


    def compute_targets_ctpn(self, imggroup, anngroup):
        """Compute the classification and regression targets for a batch.

        Args:
            imggroup: Iterable of images; each must expose a 3-element
                ``shape`` (h, w, c).
            anngroup: Iterable of annotation dicts; ``ann['bboxes']`` holds
                the ground-truth boxes passed to ``cal_rpn``.

        Returns:
            ``[batch_cls, batch_regr]`` where, per image, the class labels are
            stored as a column of shape (A, 1) and the regression target as
            (1, A, 3) with the class label prepended to the two offsets; both
            lists are concatenated along axis 0.
        """
        batch_regr = []
        batch_cls = []

        for img, ann in zip(imggroup, anngroup):
            gtbox = ann['bboxes']
            h, w, c = img.shape

            # The feature map is 16x smaller than the image (stride 16).
            [cls, regr], _ = cal_rpn((h, w), (int(h / 16), int(w / 16)), 16, gtbox, self.config)

            # The regression target carries the class label in column 0 so
            # that the regression loss can select the positive anchors.
            regr = np.hstack([cls.reshape(cls.shape[0], 1), regr])

            # NOTE(review): labels are expanded on axis 1 while the regression
            # target is expanded on axis 0 -- confirm this matches the shape
            # the classification loss expects.
            cls = np.expand_dims(cls, axis=1)
            regr = np.expand_dims(regr, axis=0)

            batch_regr.append(regr)
            batch_cls.append(cls)

        return [np.concatenate(batch_cls, axis=0), np.concatenate(batch_regr, axis=0)]
def cal_rpn(imgsize, featuresize, scale, gtboxes, config=None):
    """Assign labels and vertical regression targets to every anchor.

    Args:
        imgsize: (height, width) of the image.
        featuresize: (height, width) of the feature map.
        scale: Stride between neighbouring anchor positions, in pixels.
        gtboxes: Ground-truth boxes, (Msample, 4) as x1, y1, x2, y2. May be
            empty, in which case every in-image anchor is a negative
            candidate and all regression targets are zero.
        config: Object providing IOU_POSITIVE, IOU_NEGATIVE,
            RPN_POSITIVE_NUM and RPN_TOTAL_NUM; defaults to the module-level
            ``cfg``.

    Returns:
        ``([labels, bbox_targets], base_anchor)`` where ``labels`` is
        (Nsample,) with -1 = ignore, 0 = negative, 1 = positive,
        ``bbox_targets`` is (Nsample, 2) and ``base_anchor`` is (Nsample, 4).
    """
    if config is None:
        config = cfg
    imgh, imgw = imgsize

    # Generate all anchors in image coordinates.
    base_anchor = gen_anchor(featuresize, scale)  # (Nsample, 4)
    num_anchors = base_anchor.shape[0]

    # Start with every anchor ignored.
    labels = np.full(num_anchors, -1.0)  # (Nsample,)

    gtboxes = np.asarray(gtboxes)
    has_gt = gtboxes.size > 0

    if has_gt:
        overlaps = cal_overlaps(base_anchor, gtboxes)  # (Nsample, Msample)

        # For each GT box, the anchor with the highest IoU.
        gt_argmax_overlaps = overlaps.argmax(axis=0)  # (Msample,)
        # For each anchor, the GT box with the highest IoU and that IoU.
        anchor_argmax_overlaps = overlaps.argmax(axis=1)  # (Nsample,)
        anchor_max_overlaps = overlaps[np.arange(num_anchors), anchor_argmax_overlaps]

        labels[anchor_max_overlaps > config.IOU_POSITIVE] = 1
        labels[anchor_max_overlaps < config.IOU_NEGATIVE] = 0
        # Ensure that every GT box has at least one positive anchor.
        labels[gt_argmax_overlaps] = 1
    else:
        # Without ground truth no anchor overlaps anything: all negatives.
        labels[:] = 0

    # Only keep anchors that lie completely inside the image.
    outside_anchor = np.where(
        (base_anchor[:, 0] < 0)
        | (base_anchor[:, 1] < 0)
        | (base_anchor[:, 2] >= imgw)
        | (base_anchor[:, 3] >= imgh)
    )[0]
    labels[outside_anchor] = -1

    # Subsample positives down to RPN_POSITIVE_NUM.
    fg_index = np.where(labels == 1)[0]
    if len(fg_index) > config.RPN_POSITIVE_NUM:
        surplus = len(fg_index) - config.RPN_POSITIVE_NUM
        labels[np.random.choice(fg_index, surplus, replace=False)] = -1

    # Subsample negatives so positives + negatives <= RPN_TOTAL_NUM.
    bg_index = np.where(labels == 0)[0]
    num_bg = config.RPN_TOTAL_NUM - np.sum(labels == 1)
    if len(bg_index) > num_bg:
        labels[np.random.choice(bg_index, len(bg_index) - num_bg, replace=False)] = -1

    # Regression targets relative to the best-matching GT box of each anchor.
    if has_gt:
        bbox_targets = bbox_transfrom(base_anchor, gtboxes[anchor_argmax_overlaps, :])
    else:
        bbox_targets = np.zeros((num_anchors, 2))

    return [labels, bbox_targets], base_anchor
def gen_anchor(featuresize, scale):
    """Generate the anchors for every feature-map location.

    Ten anchors of fixed width 16 and varying height are centred on the
    16 x 16 cell at the origin and then shifted by ``scale`` pixels per
    feature-map step, so the anchors are expressed in image coordinates.

    Args:
        featuresize: (height, width) of the feature map.
        scale: Shift in pixels between neighbouring feature-map locations.

    Returns:
        Float array of shape (H * W * 10, 4) holding x1, y1, x2, y2, ordered
        row by row, then column by column, then by anchor height.
    """
    # Ten anchor heights; every anchor is 16 pixels wide.
    heights = np.array([11, 16, 23, 33, 48, 68, 97, 139, 198, 283]).reshape(-1, 1)
    widths = np.full_like(heights, 16)

    # Reference cell of 16 pixels and its centre.
    cell = np.array([0, 0, 15, 15])
    xt = (cell[0] + cell[2]) * 0.5
    yt = (cell[1] + cell[3]) * 0.5

    # One group of ten anchors around the reference centre: (10, 4).
    base_anchor = np.hstack((
        xt - widths * 0.5,
        yt - heights * 0.5,
        xt + widths * 0.5,
        yt + heights * 0.5,
    ))

    h, w = featuresize
    # shift_x[i, j] = j * scale, shift_y[i, j] = i * scale, both (h, w).
    shift_x, shift_y = np.meshgrid(np.arange(0, w) * scale, np.arange(0, h) * scale)
    shifts = np.stack((shift_x, shift_y, shift_x, shift_y), axis=-1)  # (h, w, 4)

    # Broadcast to (h, w, 10, 4) instead of looping over every location.
    anchors = base_anchor[np.newaxis, np.newaxis, :, :] + shifts[:, :, np.newaxis, :]
    return anchors.reshape((-1, 4))
def cal_iou(box1, box1_area, boxes2, boxes2_area):
    """Compute the IoU between one box and a set of boxes.

    Args:
        box1: Single box as [x1, y1, x2, y2].
        box1_area: Area of ``box1``.
        boxes2: Boxes of shape (Msample, 4) as x1, y1, x2, y2.
        boxes2_area: Areas of ``boxes2``, shape (Msample,).

    Returns:
        Float array of shape (Msample,). Pairs whose union is zero (both
        boxes degenerate) get an IoU of 0 instead of NaN.
    """
    boxes2 = np.asarray(boxes2)
    boxes2_area = np.asarray(boxes2_area)

    x1 = np.maximum(box1[0], boxes2[:, 0])
    x2 = np.minimum(box1[2], boxes2[:, 2])
    y1 = np.maximum(box1[1], boxes2[:, 1])
    y2 = np.minimum(box1[3], boxes2[:, 3])

    intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
    union = box1_area + boxes2_area - intersection

    # Divide only where the union is non-zero to avoid NaN / warnings.
    iou = np.zeros(union.shape, dtype=float)
    np.divide(intersection, union, out=iou, where=union != 0)
    return iou
def _iou_one_to_many(box, box_area, boxes, boxes_area):
    """Return the IoU of one box against many boxes; zero union yields 0."""
    x1 = np.maximum(box[0], boxes[:, 0])
    x2 = np.minimum(box[2], boxes[:, 2])
    y1 = np.maximum(box[1], boxes[:, 1])
    y2 = np.minimum(box[3], boxes[:, 3])
    intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
    union = box_area + boxes_area - intersection
    iou = np.zeros(union.shape, dtype=float)
    np.divide(intersection, union, out=iou, where=union != 0)
    return iou


def cal_overlaps(boxes1, boxes2):
    """Compute the pairwise IoU matrix between two sets of boxes.

    Args:
        boxes1: Anchors, shape (Nsample, 4) as x1, y1, x2, y2.
        boxes2: Ground-truth boxes, shape (Msample, 4) as x1, y1, x2, y2.

    Returns:
        Float array of shape (Nsample, Msample); entry [i, j] is the IoU of
        ``boxes1[i]`` and ``boxes2[j]``.
    """
    boxes1 = np.asarray(boxes1)
    boxes2 = np.asarray(boxes2)

    # (x1 - x2) * (y1 - y2) is the product of two negatives, i.e. the area.
    area1 = (boxes1[:, 0] - boxes1[:, 2]) * (boxes1[:, 1] - boxes1[:, 3])  # (Nsample,)
    area2 = (boxes2[:, 0] - boxes2[:, 2]) * (boxes2[:, 1] - boxes2[:, 3])  # (Msample,)

    num1, num2 = boxes1.shape[0], boxes2.shape[0]
    overlaps = np.zeros((num1, num2))  # (Nsample, Msample)

    # IoU is symmetric, so run the Python-level loop over the smaller set and
    # vectorise over the larger one.
    if num2 <= num1:
        for j in range(num2):
            overlaps[:, j] = _iou_one_to_many(boxes2[j], area2[j], boxes1, area1)
    else:
        for i in range(num1):
            overlaps[i, :] = _iou_one_to_many(boxes1[i], area1[i], boxes2, area2)
    return overlaps
def bbox_transfrom(anchors, gtboxes):
    """Compute the relative vertical regression targets (Vc, Vh).

    Args:
        anchors: Anchor boxes, shape (Nsample, 4) as x1, y1, x2, y2.
        gtboxes: Ground-truth box matched to each anchor, shape (Nsample, 4).

    Returns:
        Array of shape (Nsample, 2): column 0 is the centre offset
        ``(Cy - Cya) / ha`` and column 1 is the log height ratio
        ``log(h / ha)``, where heights are computed as ``y2 - y1 + 1``.
    """
    anchors = np.asarray(anchors)
    gtboxes = np.asarray(gtboxes)

    # Vertical centres of the ground truth and of the anchors.
    Cy = (gtboxes[:, 1] + gtboxes[:, 3]) * 0.5  # (Nsample,)
    Cya = (anchors[:, 1] + anchors[:, 3]) * 0.5  # (Nsample,)

    # Heights of the ground truth and of the anchors.
    h = gtboxes[:, 3] - gtboxes[:, 1] + 1.0  # (Nsample,)
    ha = anchors[:, 3] - anchors[:, 1] + 1.0  # (Nsample,)

    Vc = (Cy - Cya) / ha  # (Nsample,)
    Vh = np.log(h / ha)  # (Nsample,)

    return np.column_stack((Vc, Vh))  # (Nsample, 2)



class CTPN:
    """Inference wrapper: runs the CTPN model and groups proposals into lines."""

    def __init__(self, ctpn_weight_path, show_log=True):
        """Build the model and load its weights.

        Args:
            ctpn_weight_path: Path of the weight file, loaded by layer name.
            show_log: When true, print the model summary and timing logs.
        """
        self.ctpn_weight_path = ctpn_weight_path
        self.show_log = show_log
        self.model = VGGBackbone("vgg16").ctpn(isTest=True)
        if self.show_log:
            # summary() prints by itself and returns None, so it must not be
            # wrapped in print() (that would emit an extra "None" line).
            self.model.summary()
        self.model.load_weights(self.ctpn_weight_path, by_name=True)

    def detect_img_path(self, path):
        """Read the image at ``path`` and return the detected text lines."""
        image = read_image_bgr(path)
        # NOTE(review): the image is read with read_image_bgr but handed to
        # detect_rgbimg -- confirm which channel order preprocess_image expects.
        return self.detect_rgbimg(image)

    def detect_rgbimg(self, img):
        """Detect text lines in an image array.

        Args:
            img: Image array with a 3-element shape (h, w, c).

        Returns:
            int32 array with one row per text line and 9 columns: the four
            corner points x1, y1, x2, y2, x3, y3, x4, y4 followed by the
            score column (which is truncated by the int32 cast).
        """
        image = preprocess_image(img)
        h, w, c = img.shape

        # Run the network on a batch of one image.
        start = time.time()
        cls, regr, cls_prod = self.model.predict_on_batch(np.expand_dims(image, axis=0))
        if self.show_log:
            print("ctpn processing time: ", time.time() - start)

        # Decode the regression output against the anchors (stride 16).
        anchor = gen_anchor((int(h / 16), int(w / 16)), 16)
        bbox = bbox_transfor_inv(anchor, regr[0, ...])
        bbox = clip_boxes(bbox, [h, w])

        # Keep proposals whose text probability exceeds cfg.IOU_SELECT.
        fg = np.where(cls_prod[0, :, 1] > cfg.IOU_SELECT)[0]
        select_anchor = bbox[fg, :]
        select_score = cls_prod[0, fg, 1]
        select_anchor = select_anchor.astype('int32')

        # Filter out small boxes.
        keep_index = filter_small_boxes(select_anchor, 16)

        # Non-maximum suppression.
        select_anchor = select_anchor[keep_index]
        select_score = select_score[keep_index]
        select_score = np.reshape(select_score, (select_score.shape[0], 1))
        nmsbox = np.hstack((select_anchor, select_score))
        keep = non_max_suppression_fast_withoutsize(nmsbox[:, :4], nmsbox[:, 4], 1 - cfg.IOU_SELECT)
        select_anchor = select_anchor[keep]
        select_score = select_score[keep]

        # Connect the surviving proposals into text lines.
        textConn = TextProposalConnectorOriented()
        text = textConn.get_text_lines(select_anchor, select_score, [h, w])
        text = text.astype('int32')
        return text

    def detect_with_show(self, path):
        """Detect text in the image at ``path`` and display the outlined result."""
        image = read_image_bgr(path)
        text = self.detect_rgbimg(image)
        img = np.copy(image)
        # Draw the quadrilateral of every text line: the points are stored as
        # (0,1)=top-left, (2,3)=top-right, (4,5)=bottom-left, (6,7)=bottom-right.
        for i in text:
            cv2.line(img, (i[0], i[1]), (i[2], i[3]), (255, 0, 0), 2)
            cv2.line(img, (i[2], i[3]), (i[6], i[7]), (255, 0, 0), 2)
            cv2.line(img, (i[6], i[7]), (i[4], i[5]), (255, 0, 0), 2)
            cv2.line(img, (i[4], i[5]), (i[0], i[1]), (255, 0, 0), 2)
        plt.imshow(img)
        plt.show()


def bbox_transfor_inv(anchor, regr, anchor_width=16):
    """Decode predicted vertical offsets back into boxes.

    Args:
        anchor: Anchor boxes, shape (NSample, 4) as x1, y1, x2, y2.
        regr: Predicted offsets, shape (NSample, 2): column 0 is the
            centre-y offset, column 1 is the log height ratio.
        anchor_width: Fixed width given to every decoded box (default 16,
            the value that was previously hard-coded).

    Returns:
        Array of shape (NSample, 4) holding the decoded x1, y1, x2, y2.
    """
    # Anchor centre-y and height.
    Cya = (anchor[:, 1] + anchor[:, 3]) * 0.5
    ha = anchor[:, 3] - anchor[:, 1] + 1

    Vcx = regr[..., 0]  # centre-y offset
    Vhx = regr[..., 1]  # log height ratio

    # Decoded centre-y and height.
    Cyx = Vcx * ha + Cya
    hx = np.exp(Vhx) * ha

    # The horizontal position is not regressed: keep the anchor centre-x and
    # use the fixed proposal width.
    xt = (anchor[:, 0] + anchor[:, 2]) * 0.5
    x1 = xt - anchor_width * 0.5
    y1 = Cyx - hx * 0.5
    x2 = xt + anchor_width * 0.5
    y2 = Cyx + hx * 0.5

    return np.column_stack((x1, y1, x2, y2))






Bj->Bi条件2:Bj和Bi的vertical overlap大于0.7

class Graph:
    """Directed graph stored as a square boolean adjacency matrix."""

    def __init__(self, graph):
        """Store the adjacency matrix; ``graph[i, j]`` true means edge i -> j."""
        self.graph = graph

    def sub_graphs_connected(self):
        """Return the chains of the graph as lists of node indices.

        A chain starts at every node that has no incoming edge but at least
        one outgoing edge, and then repeatedly follows the first (lowest
        index) successor until a node without outgoing edges is reached.

        Returns:
            List of chains, each a list of int node indices in visiting order.
        """
        sub_graphs = []
        for index in range(self.graph.shape[0]):
            has_incoming = self.graph[:, index].any()
            has_outgoing = self.graph[index, :].any()
            if has_incoming or not has_outgoing:
                continue

            chain = [index]
            successors = np.flatnonzero(self.graph[index, :])
            while successors.size:
                v = int(successors[0])
                chain.append(v)
                successors = np.flatnonzero(self.graph[v, :])
            sub_graphs.append(chain)
        return sub_graphs
class TextProposalGraphBuilder:
    """Build text proposals into a graph."""

    def get_successions(self, index):
        """Return the candidate right-hand neighbours of proposal ``index``.

        Columns to the right of the proposal's x1 are scanned up to
        ``cfg.CONN.MAX_HORIZONTAL_GAP`` pixels away; the matches of the first
        column that contains any are returned.
        """
        box = self.text_proposals[index]
        results = []
        for left in range(int(box[0]) + 1,
                          min(int(box[0]) + cfg.CONN.MAX_HORIZONTAL_GAP + 1, self.im_size[1])):
            adj_box_indices = self.boxes_table[left]
            for adj_box_index in adj_box_indices:
                if self.meet_v_iou(adj_box_index, index):
                    results.append(adj_box_index)
            if len(results) != 0:
                return results
        return results

    def get_precursors(self, index):
        """Return the candidate left-hand neighbours of proposal ``index``.

        Mirror image of ``get_successions``: columns are scanned leftwards
        and the matches of the first non-empty column are returned.
        """
        box = self.text_proposals[index]
        results = []
        for left in range(int(box[0]) - 1,
                          max(int(box[0] - cfg.CONN.MAX_HORIZONTAL_GAP), 0) - 1, -1):
            adj_box_indices = self.boxes_table[left]
            for adj_box_index in adj_box_indices:
                if self.meet_v_iou(adj_box_index, index):
                    results.append(adj_box_index)
            if len(results) != 0:
                return results
        return results

    def is_succession_node(self, index, succession_index):
        """Check that ``index`` scores at least as high as every precursor of ``succession_index``."""
        precursors = self.get_precursors(succession_index)
        if self.scores[index] >= np.max(self.scores[precursors]):
            return True
        return False

    def meet_v_iou(self, index1, index2):
        """Check the vertical-overlap and height-similarity thresholds for two proposals."""
        def overlaps_v(index1, index2):
            # Vertical overlap relative to the smaller of the two heights.
            h1 = self.heights[index1]
            h2 = self.heights[index2]
            y0 = max(self.text_proposals[index2][1], self.text_proposals[index1][1])
            y1 = min(self.text_proposals[index2][3], self.text_proposals[index1][3])
            return max(0, y1 - y0 + 1) / min(h1, h2)

        def size_similarity(index1, index2):
            # Ratio of the smaller height to the larger height.
            h1 = self.heights[index1]
            h2 = self.heights[index2]
            return min(h1, h2) / max(h1, h2)

        return overlaps_v(index1, index2) >= cfg.CONN.MIN_V_OVERLAPS and \
            size_similarity(index1, index2) >= cfg.CONN.MIN_SIZE_SIM

    def build_graph(self, text_proposals, scores, im_size):
        """Build the succession graph of the proposals.

        Args:
            text_proposals: Array of boxes, one row per proposal as
                x1, y1, x2, y2.
            scores: Score per proposal, indexable by proposal index.
            im_size: Image size; ``im_size[1]`` is used as the image width.

        Returns:
            A ``Graph`` whose edge i -> j marks j as the successor of i.
        """
        self.text_proposals = text_proposals
        self.scores = scores
        self.im_size = im_size
        self.heights = text_proposals[:, 3] - text_proposals[:, 1] + 1

        # Bucket the proposals by the integer x1 of their left edge.
        boxes_table = [[] for _ in range(self.im_size[1])]
        for index, box in enumerate(text_proposals):
            boxes_table[int(box[0])].append(index)
        self.boxes_table = boxes_table

        # The builtin `bool` replaces the `np.bool` alias removed from NumPy.
        graph = np.zeros((text_proposals.shape[0], text_proposals.shape[0]), bool)

        for index, box in enumerate(text_proposals):
            successions = self.get_successions(index)
            if len(successions) == 0:
                continue
            succession_index = successions[np.argmax(scores[successions])]
            if self.is_succession_node(index, succession_index):
                # NOTE: a box can have multiple successions(precursors) if multiple successions(precursors)
                # have equal scores.
                graph[index, succession_index] = True
        return Graph(graph)


class TextProposalConnectorOriented:
    """Connect text proposals into text lines"""

    def __init__(self):
        self.graph_builder = TextProposalGraphBuilder()

    def group_text_proposals(self, text_proposals, scores, im_size):
        """Group the proposals into chains of proposal indices, one per text line."""
        graph = self.graph_builder.build_graph(text_proposals, scores, im_size)
        return graph.sub_graphs_connected()

    def fit_y(self, X, Y, x1, x2):
        """Fit a line through (X, Y) and evaluate it at ``x1`` and ``x2``.

        When every X is identical no line can be fitted, so ``Y[0]`` is
        returned for both positions. An empty ``X`` raises IndexError.
        """
        # If X only contains one distinct point, the line is y = Y[0].
        if np.sum(X == X[0]) == len(X):
            return Y[0], Y[0]
        p = np.poly1d(np.polyfit(X, Y, 1))
        return p(x1), p(x2)

    def get_text_lines(self, text_proposals, scores, im_size):
        """Merge proposals into oriented text-line quadrilaterals.

        Args:
            text_proposals: Array of boxes, one row per proposal as
                x1, y1, x2, y2.
            scores: Score per proposal.
            im_size: Image size passed through to the graph builder.

        Returns:
            Float array of shape (num_lines, 9): the corner points
            top-left, top-right, bottom-left, bottom-right as
            x1, y1, x2, y2, x3, y3, x4, y4, followed by the line score.
        """
        # Build the graph first to learn which proposals form each text line.
        tp_groups = self.group_text_proposals(text_proposals, scores, im_size)

        text_lines = np.zeros((len(tp_groups), 8), np.float32)

        for index, tp_indices in enumerate(tp_groups):
            # All proposals that belong to this text line.
            text_line_boxes = text_proposals[list(tp_indices)]

            # Centre point of every proposal.
            X = (text_line_boxes[:, 0] + text_line_boxes[:, 2]) / 2
            Y = (text_line_boxes[:, 1] + text_line_boxes[:, 3]) / 2

            # Least-squares line through the centre points.
            z1 = np.polyfit(X, Y, 1)

            x0 = np.min(text_line_boxes[:, 0])  # leftmost x of the line
            x1 = np.max(text_line_boxes[:, 2])  # rightmost x of the line

            # Half the width of a proposal.
            offset = (text_line_boxes[0, 2] - text_line_boxes[0, 0]) * 0.5

            # Fit a line through the top-left corners and evaluate it at the
            # left and right ends of the text line.
            lt_y, rt_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 1],
                                    x0 + offset, x1 - offset)
            # Same for the bottom-left corners.
            lb_y, rb_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 3],
                                    x0 + offset, x1 - offset)

            # The line score is the mean score of its proposals.
            score = scores[list(tp_indices)].sum() / float(len(tp_indices))

            text_lines[index, 0] = x0
            text_lines[index, 1] = min(lt_y, rt_y)  # smaller y of the top edge
            text_lines[index, 2] = x1
            text_lines[index, 3] = max(lb_y, rb_y)  # larger y of the bottom edge
            text_lines[index, 4] = score  # text line score
            text_lines[index, 5] = z1[0]  # slope k of the centre line
            text_lines[index, 6] = z1[1]  # intercept b of the centre line
            height = np.mean((text_line_boxes[:, 3] - text_line_boxes[:, 1]))  # mean proposal height
            text_lines[index, 7] = height + 2.5

        # The builtin `float` replaces the `np.float` alias removed from NumPy.
        text_recs = np.zeros((len(text_lines), 9), float)
        for index, line in enumerate(text_lines):
            # Intercepts of the top and bottom edges, derived from the centre
            # line and the line height.
            b1 = line[6] - line[7] / 2
            b2 = line[6] + line[7] / 2
            x1 = line[0]
            y1 = line[5] * line[0] + b1  # top-left
            x2 = line[2]
            y2 = line[5] * line[2] + b1  # top-right
            x3 = line[0]
            y3 = line[5] * line[0] + b2  # bottom-left
            x4 = line[2]
            y4 = line[5] * line[2] + b2  # bottom-right

            disX = x2 - x1
            disY = y2 - y1
            width = np.sqrt(disX * disX + disY * disY)  # length of the top edge

            fTmp0 = y3 - y1  # vertical height of the text line
            fTmp1 = fTmp0 * disY / width
            # Compensation applied to two opposite corners, depending on the
            # sign of the slope.
            x = np.fabs(fTmp1 * disX / width)
            y = np.fabs(fTmp1 * disY / width)
            if line[5] < 0:
                x1 -= x
                y1 += y
                x4 += x
                y4 -= y
            else:
                x2 += x
                y2 += y
                x3 -= x
                y3 -= y

            text_recs[index, 0] = x1
            text_recs[index, 1] = y1
            text_recs[index, 2] = x2
            text_recs[index, 3] = y2
            text_recs[index, 4] = x3
            text_recs[index, 5] = y3
            text_recs[index, 6] = x4
            text_recs[index, 7] = y4
            text_recs[index, 8] = line[4]

        return text_recs





