44
44
#include " layers_common.hpp"
45
45
#include < float.h>
46
46
#include < string>
47
- #include < caffe.pb.h>
48
47
#include " ../nms.inl.hpp"
49
48
50
49
namespace cv
@@ -55,14 +54,35 @@ namespace dnn
55
54
namespace util
56
55
{
57
56
57
// Plain bounding-box record used in place of the protobuf-generated
// caffe::NormalizedBBox, so this file no longer needs caffe.pb.h.
//
// The four corner coordinates are public and accessed directly. The box
// "size" is an optional cached value: it is only meaningful after set_size()
// has been called and until clear_size() is called, which has_size() reports.
// NOTE(review): the name suggests coordinates normalized to [0, 1], but
// nothing in this class enforces a range.
class NormalizedBBox
{
public:
    float xmin, ymin, xmax, ymax;

    // Creates an all-zero box with no cached size.
    NormalizedBBox()
        : xmin(0), ymin(0), xmax(0), ymax(0), has_size_(false), size_(0) {}

    // Returns the cached size (0 when none has been set or after clear_size()).
    float size() const { return size_; }

    // Returns true if set_size() has been called since construction or the
    // last clear_size().
    bool has_size() const { return has_size_; }

    // Stores `value` as the cached size and marks it as present.
    void set_size(float value) { size_ = value; has_size_ = true; }

    // Drops the cached size: resets it to 0 and marks it as absent.
    void clear_size() { size_ = 0; has_size_ = false; }

private:
    // Declared in the same order as they appear in the constructor's
    // initializer list.
    bool has_size_;
    float size_;
};
77
+
58
78
// Strict "greater than" comparison on the score (the pair's first member).
// Returns true when pair1's score is strictly larger than pair2's; the
// second member is ignored, so pairs with equal scores compare as equivalent.
// Usable as a comparator to order (score, payload) pairs by descending score.
template <typename T>
static inline bool SortScorePairDescend(const std::pair<float, T>& pair1,
                                        const std::pair<float, T>& pair2)
{
    return pair1.first > pair2.first;
}
64
84
65
- static inline float caffe_box_overlap (const caffe ::NormalizedBBox& a, const caffe ::NormalizedBBox& b);
85
+ static inline float caffe_box_overlap (const util ::NormalizedBBox& a, const util ::NormalizedBBox& b);
66
86
67
87
} // namespace
68
88
@@ -75,8 +95,7 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
75
95
76
96
int _backgroundLabelId;
77
97
78
- typedef caffe::PriorBoxParameter_CodeType CodeType;
79
- CodeType _codeType;
98
+ cv::String _codeType;
80
99
81
100
bool _varianceEncodedInTarget;
82
101
int _keepTopK;
@@ -90,7 +109,7 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
90
109
enum { _numAxes = 4 };
91
110
static const std::string _layerName;
92
111
93
- typedef std::map<int , std::vector<caffe ::NormalizedBBox> > LabelBBox;
112
+ typedef std::map<int , std::vector<util ::NormalizedBBox> > LabelBBox;
94
113
95
114
bool getParameterDict (const LayerParams ¶ms,
96
115
const std::string ¶meterName,
@@ -135,12 +154,10 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
135
154
void getCodeType (const LayerParams ¶ms)
136
155
{
137
156
String codeTypeString = params.get <String>(" code_type" ).toLowerCase ();
138
- if (codeTypeString == " corner" )
139
- _codeType = caffe::PriorBoxParameter_CodeType_CORNER;
140
- else if (codeTypeString == " center_size" )
141
- _codeType = caffe::PriorBoxParameter_CodeType_CENTER_SIZE;
157
+ if (codeTypeString == " center_size" )
158
+ _codeType = " CENTER_SIZE" ;
142
159
else
143
- _codeType = caffe::PriorBoxParameter_CodeType_CORNER ;
160
+ _codeType = " CORNER " ;
144
161
}
145
162
146
163
DetectionOutputLayerImpl (const LayerParams ¶ms)
@@ -229,7 +246,7 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
229
246
GetConfidenceScores (confidenceData, num, numPriors, _numClasses, allConfidenceScores);
230
247
231
248
// Retrieve all prior bboxes
232
- std::vector<caffe ::NormalizedBBox> priorBBoxes;
249
+ std::vector<util ::NormalizedBBox> priorBBoxes;
233
250
std::vector<std::vector<float > > priorVariances;
234
251
GetPriorBBoxes (priorData, numPriors, priorBBoxes, priorVariances);
235
252
@@ -310,7 +327,7 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
310
327
GetConfidenceScores (confidenceData, num, numPriors, _numClasses, allConfidenceScores);
311
328
312
329
// Retrieve all prior bboxes
313
- std::vector<caffe ::NormalizedBBox> priorBBoxes;
330
+ std::vector<util ::NormalizedBBox> priorBBoxes;
314
331
std::vector<std::vector<float > > priorVariances;
315
332
GetPriorBBoxes (priorData, numPriors, priorBBoxes, priorVariances);
316
333
@@ -370,14 +387,14 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
370
387
for (size_t j = 0 ; j < indices.size (); ++j, ++count)
371
388
{
372
389
int idx = indices[j];
373
- const caffe ::NormalizedBBox& decode_bbox = label_bboxes->second [idx];
390
+ const util ::NormalizedBBox& decode_bbox = label_bboxes->second [idx];
374
391
outputsData[count * 7 ] = i;
375
392
outputsData[count * 7 + 1 ] = label;
376
393
outputsData[count * 7 + 2 ] = scores[idx];
377
- outputsData[count * 7 + 3 ] = decode_bbox.xmin () ;
378
- outputsData[count * 7 + 4 ] = decode_bbox.ymin () ;
379
- outputsData[count * 7 + 5 ] = decode_bbox.xmax () ;
380
- outputsData[count * 7 + 6 ] = decode_bbox.ymax () ;
394
+ outputsData[count * 7 + 3 ] = decode_bbox.xmin ;
395
+ outputsData[count * 7 + 4 ] = decode_bbox.ymin ;
396
+ outputsData[count * 7 + 5 ] = decode_bbox.xmax ;
397
+ outputsData[count * 7 + 6 ] = decode_bbox.ymax ;
381
398
}
382
399
}
383
400
return count;
@@ -454,9 +471,9 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
454
471
455
472
// Compute bbox size
456
473
template <bool normalized>
457
- static float BBoxSize (const caffe ::NormalizedBBox& bbox)
474
+ static float BBoxSize (const util ::NormalizedBBox& bbox)
458
475
{
459
- if (bbox.xmax () < bbox.xmin () || bbox.ymax () < bbox.ymin () )
476
+ if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin )
460
477
{
461
478
return 0 ; // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
462
479
}
@@ -468,8 +485,8 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
468
485
}
469
486
else
470
487
{
471
- float width = bbox.xmax () - bbox.xmin () ;
472
- float height = bbox.ymax () - bbox.ymin () ;
488
+ float width = bbox.xmax - bbox.xmin ;
489
+ float height = bbox.ymax - bbox.ymin ;
473
490
if (normalized)
474
491
{
475
492
return width * height;
@@ -487,66 +504,64 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
487
504
// Decode a bbox according to a prior bbox
488
505
template <bool variance_encoded_in_target>
489
506
static void DecodeBBox (
490
- const caffe ::NormalizedBBox& prior_bbox, const std::vector<float >& prior_variance,
491
- const CodeType code_type,
492
- const bool clip_bbox, const caffe ::NormalizedBBox& bbox,
493
- caffe ::NormalizedBBox& decode_bbox)
507
+ const util ::NormalizedBBox& prior_bbox, const std::vector<float >& prior_variance,
508
+ const cv::String& code_type,
509
+ const bool clip_bbox, const util ::NormalizedBBox& bbox,
510
+ util ::NormalizedBBox& decode_bbox)
494
511
{
495
- float bbox_xmin = variance_encoded_in_target ? bbox.xmin () : prior_variance[0 ] * bbox.xmin ();
496
- float bbox_ymin = variance_encoded_in_target ? bbox.ymin () : prior_variance[1 ] * bbox.ymin ();
497
- float bbox_xmax = variance_encoded_in_target ? bbox.xmax () : prior_variance[2 ] * bbox.xmax ();
498
- float bbox_ymax = variance_encoded_in_target ? bbox.ymax () : prior_variance[3 ] * bbox.ymax ();
499
- switch (code_type)
500
- {
501
- case caffe::PriorBoxParameter_CodeType_CORNER:
502
- decode_bbox.set_xmin (prior_bbox.xmin () + bbox_xmin);
503
- decode_bbox.set_ymin (prior_bbox.ymin () + bbox_ymin);
504
- decode_bbox.set_xmax (prior_bbox.xmax () + bbox_xmax);
505
- decode_bbox.set_ymax (prior_bbox.ymax () + bbox_ymax);
506
- break ;
507
- case caffe::PriorBoxParameter_CodeType_CENTER_SIZE:
508
- {
509
- float prior_width = prior_bbox.xmax () - prior_bbox.xmin ();
510
- CV_Assert (prior_width > 0 );
511
- float prior_height = prior_bbox.ymax () - prior_bbox.ymin ();
512
- CV_Assert (prior_height > 0 );
513
- float prior_center_x = (prior_bbox.xmin () + prior_bbox.xmax ()) * .5 ;
514
- float prior_center_y = (prior_bbox.ymin () + prior_bbox.ymax ()) * .5 ;
515
-
516
- float decode_bbox_center_x, decode_bbox_center_y;
517
- float decode_bbox_width, decode_bbox_height;
518
- decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
519
- decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
520
- decode_bbox_width = exp (bbox_xmax) * prior_width;
521
- decode_bbox_height = exp (bbox_ymax) * prior_height;
522
- decode_bbox.set_xmin (decode_bbox_center_x - decode_bbox_width * .5 );
523
- decode_bbox.set_ymin (decode_bbox_center_y - decode_bbox_height * .5 );
524
- decode_bbox.set_xmax (decode_bbox_center_x + decode_bbox_width * .5 );
525
- decode_bbox.set_ymax (decode_bbox_center_y + decode_bbox_height * .5 );
526
- break ;
527
- }
528
- default :
529
- CV_ErrorNoReturn (Error::StsBadArg, " Unknown type." );
530
- };
512
+ float bbox_xmin = variance_encoded_in_target ? bbox.xmin : prior_variance[0 ] * bbox.xmin ;
513
+ float bbox_ymin = variance_encoded_in_target ? bbox.ymin : prior_variance[1 ] * bbox.ymin ;
514
+ float bbox_xmax = variance_encoded_in_target ? bbox.xmax : prior_variance[2 ] * bbox.xmax ;
515
+ float bbox_ymax = variance_encoded_in_target ? bbox.ymax : prior_variance[3 ] * bbox.ymax ;
516
+ if (code_type == " CORNER" )
517
+ {
518
+ decode_bbox.xmin = prior_bbox.xmin + bbox_xmin;
519
+ decode_bbox.ymin = prior_bbox.ymin + bbox_ymin;
520
+ decode_bbox.xmax = prior_bbox.xmax + bbox_xmax;
521
+ decode_bbox.ymax = prior_bbox.ymax + bbox_ymax;
522
+ }
523
+ else if (code_type == " CENTER_SIZE" )
524
+ {
525
+ float prior_width = prior_bbox.xmax - prior_bbox.xmin ;
526
+ CV_Assert (prior_width > 0 );
527
+ float prior_height = prior_bbox.ymax - prior_bbox.ymin ;
528
+ CV_Assert (prior_height > 0 );
529
+ float prior_center_x = (prior_bbox.xmin + prior_bbox.xmax ) * .5 ;
530
+ float prior_center_y = (prior_bbox.ymin + prior_bbox.ymax ) * .5 ;
531
+
532
+ float decode_bbox_center_x, decode_bbox_center_y;
533
+ float decode_bbox_width, decode_bbox_height;
534
+ decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
535
+ decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
536
+ decode_bbox_width = exp (bbox_xmax) * prior_width;
537
+ decode_bbox_height = exp (bbox_ymax) * prior_height;
538
+ decode_bbox.xmin = decode_bbox_center_x - decode_bbox_width * .5 ;
539
+ decode_bbox.ymin = decode_bbox_center_y - decode_bbox_height * .5 ;
540
+ decode_bbox.xmax = decode_bbox_center_x + decode_bbox_width * .5 ;
541
+ decode_bbox.ymax = decode_bbox_center_y + decode_bbox_height * .5 ;
542
+ }
543
+ else
544
+ CV_ErrorNoReturn (Error::StsBadArg, " Unknown type." );
545
+
531
546
if (clip_bbox)
532
547
{
533
- // Clip the caffe ::NormalizedBBox such that the range for each corner is [0, 1]
534
- decode_bbox.set_xmin ( std::max (std::min (decode_bbox.xmin () , 1 .f ), 0 .f ) );
535
- decode_bbox.set_ymin ( std::max (std::min (decode_bbox.ymin () , 1 .f ), 0 .f ) );
536
- decode_bbox.set_xmax ( std::max (std::min (decode_bbox.xmax () , 1 .f ), 0 .f ) );
537
- decode_bbox.set_ymax ( std::max (std::min (decode_bbox.ymax () , 1 .f ), 0 .f ) );
548
+ // Clip the util ::NormalizedBBox such that the range for each corner is [0, 1]
549
+ decode_bbox.xmin = std::max (std::min (decode_bbox.xmin , 1 .f ), 0 .f );
550
+ decode_bbox.ymin = std::max (std::min (decode_bbox.ymin , 1 .f ), 0 .f );
551
+ decode_bbox.xmax = std::max (std::min (decode_bbox.xmax , 1 .f ), 0 .f );
552
+ decode_bbox.ymax = std::max (std::min (decode_bbox.ymax , 1 .f ), 0 .f );
538
553
}
539
554
decode_bbox.clear_size ();
540
555
decode_bbox.set_size (BBoxSize<true >(decode_bbox));
541
556
}
542
557
543
558
// Decode a set of bboxes according to a set of prior bboxes
544
559
static void DecodeBBoxes (
545
- const std::vector<caffe ::NormalizedBBox>& prior_bboxes,
560
+ const std::vector<util ::NormalizedBBox>& prior_bboxes,
546
561
const std::vector<std::vector<float > >& prior_variances,
547
- const CodeType code_type, const bool variance_encoded_in_target,
548
- const bool clip_bbox, const std::vector<caffe ::NormalizedBBox>& bboxes,
549
- std::vector<caffe ::NormalizedBBox>& decode_bboxes)
562
+ const cv::String& code_type, const bool variance_encoded_in_target,
563
+ const bool clip_bbox, const std::vector<util ::NormalizedBBox>& bboxes,
564
+ std::vector<util ::NormalizedBBox>& decode_bboxes)
550
565
{
551
566
CV_Assert (prior_bboxes.size () == prior_variances.size ());
552
567
CV_Assert (prior_bboxes.size () == bboxes.size ());
@@ -569,11 +584,11 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
569
584
570
585
// Decode all bboxes in a batch
571
586
static void DecodeBBoxesAll (const std::vector<LabelBBox>& all_loc_preds,
572
- const std::vector<caffe ::NormalizedBBox>& prior_bboxes,
587
+ const std::vector<util ::NormalizedBBox>& prior_bboxes,
573
588
const std::vector<std::vector<float > >& prior_variances,
574
589
const int num, const bool share_location,
575
590
const int num_loc_classes, const int background_label_id,
576
- const CodeType code_type, const bool variance_encoded_in_target,
591
+ const cv::String& code_type, const bool variance_encoded_in_target,
577
592
const bool clip, std::vector<LabelBBox>& all_decode_bboxes)
578
593
{
579
594
CV_Assert (all_loc_preds.size () == num);
@@ -602,22 +617,22 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
602
617
// Get prior bounding boxes from prior_data
603
618
// prior_data: 1 x 2 x num_priors * 4 x 1 blob.
604
619
// num_priors: number of priors.
605
- // prior_bboxes: stores all the prior bboxes in the format of caffe ::NormalizedBBox.
620
+ // prior_bboxes: stores all the prior bboxes in the format of util ::NormalizedBBox.
606
621
// prior_variances: stores all the variances needed by prior bboxes.
607
622
static void GetPriorBBoxes (const float * priorData, const int & numPriors,
608
- std::vector<caffe ::NormalizedBBox>& priorBBoxes,
623
+ std::vector<util ::NormalizedBBox>& priorBBoxes,
609
624
std::vector<std::vector<float > >& priorVariances)
610
625
{
611
626
priorBBoxes.clear (); priorBBoxes.resize (numPriors);
612
627
priorVariances.clear (); priorVariances.resize (numPriors);
613
628
for (int i = 0 ; i < numPriors; ++i)
614
629
{
615
630
int startIdx = i * 4 ;
616
- caffe ::NormalizedBBox& bbox = priorBBoxes[i];
617
- bbox.set_xmin ( priorData[startIdx]) ;
618
- bbox.set_ymin ( priorData[startIdx + 1 ]) ;
619
- bbox.set_xmax ( priorData[startIdx + 2 ]) ;
620
- bbox.set_ymax ( priorData[startIdx + 3 ]) ;
631
+ util ::NormalizedBBox& bbox = priorBBoxes[i];
632
+ bbox.xmin = priorData[startIdx];
633
+ bbox.ymin = priorData[startIdx + 1 ];
634
+ bbox.xmax = priorData[startIdx + 2 ];
635
+ bbox.ymax = priorData[startIdx + 3 ];
621
636
bbox.set_size (BBoxSize<true >(bbox));
622
637
}
623
638
@@ -667,20 +682,20 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
667
682
{
668
683
labelBBox[label].resize (numPredsPerClass);
669
684
}
670
- caffe ::NormalizedBBox& bbox = labelBBox[label][p];
685
+ util ::NormalizedBBox& bbox = labelBBox[label][p];
671
686
if (locPredTransposed)
672
687
{
673
- bbox.set_ymin ( locData[startIdx + c * 4 ]) ;
674
- bbox.set_xmin ( locData[startIdx + c * 4 + 1 ]) ;
675
- bbox.set_ymax ( locData[startIdx + c * 4 + 2 ]) ;
676
- bbox.set_xmax ( locData[startIdx + c * 4 + 3 ]) ;
688
+ bbox.ymin = locData[startIdx + c * 4 ];
689
+ bbox.xmin = locData[startIdx + c * 4 + 1 ];
690
+ bbox.ymax = locData[startIdx + c * 4 + 2 ];
691
+ bbox.xmax = locData[startIdx + c * 4 + 3 ];
677
692
}
678
693
else
679
694
{
680
- bbox.set_xmin ( locData[startIdx + c * 4 ]) ;
681
- bbox.set_ymin ( locData[startIdx + c * 4 + 1 ]) ;
682
- bbox.set_xmax ( locData[startIdx + c * 4 + 2 ]) ;
683
- bbox.set_ymax ( locData[startIdx + c * 4 + 3 ]) ;
695
+ bbox.xmin = locData[startIdx + c * 4 ];
696
+ bbox.ymin = locData[startIdx + c * 4 + 1 ];
697
+ bbox.xmax = locData[startIdx + c * 4 + 2 ];
698
+ bbox.ymax = locData[startIdx + c * 4 + 3 ];
684
699
}
685
700
}
686
701
}
@@ -717,30 +732,30 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
717
732
718
733
// Compute the jaccard (intersection over union IoU) overlap between two bboxes.
719
734
template <bool normalized>
720
- static float JaccardOverlap (const caffe ::NormalizedBBox& bbox1,
721
- const caffe ::NormalizedBBox& bbox2)
735
+ static float JaccardOverlap (const util ::NormalizedBBox& bbox1,
736
+ const util ::NormalizedBBox& bbox2)
722
737
{
723
- caffe ::NormalizedBBox intersect_bbox;
724
- if (bbox2.xmin () > bbox1.xmax () || bbox2.xmax () < bbox1.xmin () ||
725
- bbox2.ymin () > bbox1.ymax () || bbox2.ymax () < bbox1.ymin () )
738
+ util ::NormalizedBBox intersect_bbox;
739
+ if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin ||
740
+ bbox2.ymin > bbox1.ymax || bbox2.ymax < bbox1.ymin )
726
741
{
727
742
// Return [0, 0, 0, 0] if there is no intersection.
728
- intersect_bbox.set_xmin ( 0 ) ;
729
- intersect_bbox.set_ymin ( 0 ) ;
730
- intersect_bbox.set_xmax ( 0 ) ;
731
- intersect_bbox.set_ymax ( 0 ) ;
743
+ intersect_bbox.xmin = 0 ;
744
+ intersect_bbox.ymin = 0 ;
745
+ intersect_bbox.xmax = 0 ;
746
+ intersect_bbox.ymax = 0 ;
732
747
}
733
748
else
734
749
{
735
- intersect_bbox.set_xmin ( std::max (bbox1.xmin () , bbox2.xmin ()) );
736
- intersect_bbox.set_ymin ( std::max (bbox1.ymin () , bbox2.ymin ()) );
737
- intersect_bbox.set_xmax ( std::min (bbox1.xmax () , bbox2.xmax ()) );
738
- intersect_bbox.set_ymax ( std::min (bbox1.ymax () , bbox2.ymax ()) );
750
+ intersect_bbox.xmin = std::max (bbox1.xmin , bbox2.xmin );
751
+ intersect_bbox.ymin = std::max (bbox1.ymin , bbox2.ymin );
752
+ intersect_bbox.xmax = std::min (bbox1.xmax , bbox2.xmax );
753
+ intersect_bbox.ymax = std::min (bbox1.ymax , bbox2.ymax );
739
754
}
740
755
741
756
float intersect_width, intersect_height;
742
- intersect_width = intersect_bbox.xmax () - intersect_bbox.xmin () ;
743
- intersect_height = intersect_bbox.ymax () - intersect_bbox.ymin () ;
757
+ intersect_width = intersect_bbox.xmax - intersect_bbox.xmin ;
758
+ intersect_height = intersect_bbox.ymax - intersect_bbox.ymin ;
744
759
if (intersect_width > 0 && intersect_height > 0 )
745
760
{
746
761
if (!normalized)
@@ -760,7 +775,7 @@ class DetectionOutputLayerImpl : public DetectionOutputLayer
760
775
}
761
776
};
762
777
763
- float util::caffe_box_overlap (const caffe ::NormalizedBBox& a, const caffe ::NormalizedBBox& b)
778
+ float util::caffe_box_overlap (const util ::NormalizedBBox& a, const util ::NormalizedBBox& b)
764
779
{
765
780
return DetectionOutputLayerImpl::JaccardOverlap<true >(a, b);
766
781
}
0 commit comments