Skip to content

Commit 639b800

Browse files
committed
Merge pull request opencv#9941 from catree:improve_dnn_samples
2 parents 712689e + 48e0743 commit 639b800

File tree

3 files changed

+285
-119
lines changed

3 files changed

+285
-119
lines changed

samples/dnn/ssd_mobilenet_object_detection.cpp

Lines changed: 58 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -23,23 +23,25 @@ const char* classNames[] = {"background",
2323
"motorbike", "person", "pottedplant",
2424
"sheep", "sofa", "train", "tvmonitor"};
2525

26-
const char* about = "This sample uses Single-Shot Detector "
27-
"(https://arxiv.org/abs/1512.02325)"
28-
"to detect objects on image.\n"
29-
".caffemodel model's file is avaliable here: "
30-
"https://github.com/chuanqi305/MobileNet-SSD\n";
26+
const char* about = "This sample uses MobileNet Single-Shot Detector "
27+
"(https://arxiv.org/abs/1704.04861) "
28+
"to detect objects on camera/video/image.\n"
29+
".caffemodel model's file is available here: "
30+
"https://github.com/chuanqi305/MobileNet-SSD\n"
31+
"Default network is 300x300 and 20-classes VOC.\n";
3132

3233
const char* params
3334
= "{ help | false | print usage }"
3435
"{ proto | MobileNetSSD_deploy.prototxt | model configuration }"
3536
"{ model | MobileNetSSD_deploy.caffemodel | model weights }"
36-
"{ video | | video for detection }"
37+
"{ camera_device | 0 | camera device number }"
38+
"{ video | | video or image for detection}"
3739
"{ out | | path to output video file}"
3840
"{ min_confidence | 0.2 | min confidence }";
3941

4042
int main(int argc, char** argv)
4143
{
42-
cv::CommandLineParser parser(argc, argv, params);
44+
CommandLineParser parser(argc, argv, params);
4345

4446
if (parser.get<bool>("help"))
4547
{
@@ -55,19 +57,40 @@ int main(int argc, char** argv)
5557
dnn::Net net = readNetFromCaffe(modelConfiguration, modelBinary);
5658
//! [Initialize network]
5759

58-
VideoCapture cap(parser.get<String>("video"));
59-
if(!cap.isOpened()) // check if we succeeded
60+
if (net.empty())
6061
{
61-
cap = VideoCapture(0);
62+
cerr << "Can't load network by using the following files: " << endl;
63+
cerr << "prototxt: " << modelConfiguration << endl;
64+
cerr << "caffemodel: " << modelBinary << endl;
65+
cerr << "Models can be downloaded here:" << endl;
66+
cerr << "https://github.com/chuanqi305/MobileNet-SSD" << endl;
67+
exit(-1);
68+
}
69+
70+
VideoCapture cap;
71+
if (parser.get<String>("video").empty())
72+
{
73+
int cameraDevice = parser.get<int>("camera_device");
74+
cap = VideoCapture(cameraDevice);
75+
if(!cap.isOpened())
76+
{
77+
cout << "Couldn't find camera: " << cameraDevice << endl;
78+
return -1;
79+
}
80+
}
81+
else
82+
{
83+
cap.open(parser.get<String>("video"));
6284
if(!cap.isOpened())
6385
{
64-
cout << "Couldn't find camera" << endl;
86+
cout << "Couldn't open image or video: " << parser.get<String>("video") << endl;
6587
return -1;
6688
}
6789
}
6890

69-
Size inVideoSize = Size((int) cap.get(CV_CAP_PROP_FRAME_WIDTH), //Acquire input size
70-
(int) cap.get(CV_CAP_PROP_FRAME_HEIGHT));
91+
Size inVideoSize;
92+
inVideoSize = Size((int) cap.get(CV_CAP_PROP_FRAME_WIDTH), //Acquire input size
93+
(int) cap.get(CV_CAP_PROP_FRAME_HEIGHT));
7194

7295
Size cropSize;
7396
if (inVideoSize.width / (float)inVideoSize.height > WHRatio)
@@ -93,9 +116,18 @@ int main(int argc, char** argv)
93116
for(;;)
94117
{
95118
Mat frame;
96-
cap >> frame; // get a new frame from camera
97-
//! [Prepare blob]
119+
cap >> frame; // get a new frame from camera/video or read image
120+
121+
if (frame.empty())
122+
{
123+
waitKey();
124+
break;
125+
}
126+
127+
if (frame.channels() == 4)
128+
cvtColor(frame, frame, COLOR_BGRA2BGR);
98129

130+
//! [Prepare blob]
99131
Mat inputBlob = blobFromImage(frame, inScaleFactor,
100132
Size(inWidth, inHeight), meanVal, false); //Convert Mat to batch of images
101133
//! [Prepare blob]
@@ -108,15 +140,23 @@ int main(int argc, char** argv)
108140
Mat detection = net.forward("detection_out"); //compute output
109141
//! [Make forward pass]
110142

111-
std::vector<double> layersTimings;
143+
vector<double> layersTimings;
112144
double freq = getTickFrequency() / 1000;
113145
double time = net.getPerfProfile(layersTimings) / freq;
114-
cout << "Inference time, ms: " << time << endl;
115146

116147
Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());
117148

118149
frame = frame(crop);
119150

151+
ostringstream ss;
152+
if (!outputVideo.isOpened())
153+
{
154+
ss << "FPS: " << 1000/time << " ; time: " << time << " ms";
155+
putText(frame, ss.str(), Point(20,20), 0, 0.5, Scalar(0,0,255));
156+
}
157+
else
158+
cout << "Inference time, ms: " << time << endl;
159+
120160
float confidenceThreshold = parser.get<float>("min_confidence");
121161
for(int i = 0; i < detectionMat.rows; i++)
122162
{
@@ -131,7 +171,7 @@ int main(int argc, char** argv)
131171
int xRightTop = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
132172
int yRightTop = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);
133173

134-
ostringstream ss;
174+
ss.str("");
135175
ss << confidence;
136176
String conf(ss.str());
137177

samples/dnn/ssd_object_detection.cpp

Lines changed: 98 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,26 @@ static Mat preprocess(const Mat& frame)
4040
return preprocessed;
4141
}
4242

43+
const char* classNames[] = {"background",
44+
"aeroplane", "bicycle", "bird", "boat",
45+
"bottle", "bus", "car", "cat", "chair",
46+
"cow", "diningtable", "dog", "horse",
47+
"motorbike", "person", "pottedplant",
48+
"sheep", "sofa", "train", "tvmonitor"};
49+
4350
const char* about = "This sample uses Single-Shot Detector "
44-
"(https://arxiv.org/abs/1512.02325)"
45-
"to detect objects on image\n"; // TODO: link
51+
"(https://arxiv.org/abs/1512.02325) "
52+
"to detect objects on camera/video/image.\n"
53+
".caffemodel model's file is available here: "
54+
"https://github.com/weiliu89/caffe/tree/ssd#models\n"
55+
"Default network is 300x300 and 20-classes VOC.\n";
4656

4757
const char* params
4858
= "{ help | false | print usage }"
4959
"{ proto | | model configuration }"
5060
"{ model | | model weights }"
51-
"{ image | | image for detection }"
61+
"{ camera_device | 0 | camera device number}"
62+
"{ video | | video or image for detection}"
5263
"{ min_confidence | 0.5 | min confidence }";
5364

5465
int main(int argc, char** argv)
@@ -57,7 +68,7 @@ int main(int argc, char** argv)
5768

5869
if (parser.get<bool>("help"))
5970
{
60-
std::cout << about << std::endl;
71+
cout << about << endl;
6172
parser.printMessage();
6273
return 0;
6374
}
@@ -79,58 +90,101 @@ int main(int argc, char** argv)
7990
exit(-1);
8091
}
8192

82-
cv::Mat frame = cv::imread(parser.get<string>("image"), -1);
83-
84-
if (frame.channels() == 4)
85-
cvtColor(frame, frame, COLOR_BGRA2BGR);
86-
//! [Prepare blob]
87-
Mat preprocessedFrame = preprocess(frame);
88-
89-
Mat inputBlob = blobFromImage(preprocessedFrame, 1.0f, Size(), Scalar(), false); //Convert Mat to batch of images
90-
//! [Prepare blob]
93+
VideoCapture cap;
94+
if (parser.get<String>("video").empty())
95+
{
96+
int cameraDevice = parser.get<int>("camera_device");
97+
cap = VideoCapture(cameraDevice);
98+
if(!cap.isOpened())
99+
{
100+
cout << "Couldn't find camera: " << cameraDevice << endl;
101+
return -1;
102+
}
103+
}
104+
else
105+
{
106+
cap.open(parser.get<String>("video"));
107+
if(!cap.isOpened())
108+
{
109+
cout << "Couldn't open image or video: " << parser.get<String>("video") << endl;
110+
return -1;
111+
}
112+
}
91113

92-
//! [Set input blob]
93-
net.setInput(inputBlob, "data"); //set the network input
94-
//! [Set input blob]
114+
for (;;)
115+
{
116+
cv::Mat frame;
117+
cap >> frame; // get a new frame from camera/video or read image
95118

96-
//! [Make forward pass]
97-
Mat detection = net.forward("detection_out"); //compute output
98-
//! [Make forward pass]
119+
if (frame.empty())
120+
{
121+
waitKey();
122+
break;
123+
}
99124

100-
Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());
125+
if (frame.channels() == 4)
126+
cvtColor(frame, frame, COLOR_BGRA2BGR);
101127

102-
float confidenceThreshold = parser.get<float>("min_confidence");
103-
for(int i = 0; i < detectionMat.rows; i++)
104-
{
105-
float confidence = detectionMat.at<float>(i, 2);
128+
//! [Prepare blob]
129+
Mat preprocessedFrame = preprocess(frame);
106130

107-
if(confidence > confidenceThreshold)
108-
{
109-
size_t objectClass = (size_t)(detectionMat.at<float>(i, 1));
131+
Mat inputBlob = blobFromImage(preprocessedFrame, 1.0f, Size(), Scalar(), false); //Convert Mat to batch of images
132+
//! [Prepare blob]
110133

111-
float xLeftBottom = detectionMat.at<float>(i, 3) * frame.cols;
112-
float yLeftBottom = detectionMat.at<float>(i, 4) * frame.rows;
113-
float xRightTop = detectionMat.at<float>(i, 5) * frame.cols;
114-
float yRightTop = detectionMat.at<float>(i, 6) * frame.rows;
134+
//! [Set input blob]
135+
net.setInput(inputBlob, "data"); //set the network input
136+
//! [Set input blob]
115137

116-
std::cout << "Class: " << objectClass << std::endl;
117-
std::cout << "Confidence: " << confidence << std::endl;
138+
//! [Make forward pass]
139+
Mat detection = net.forward("detection_out"); //compute output
140+
//! [Make forward pass]
118141

119-
std::cout << " " << xLeftBottom
120-
<< " " << yLeftBottom
121-
<< " " << xRightTop
122-
<< " " << yRightTop << std::endl;
142+
vector<double> layersTimings;
143+
double freq = getTickFrequency() / 1000;
144+
double time = net.getPerfProfile(layersTimings) / freq;
145+
ostringstream ss;
146+
ss << "FPS: " << 1000/time << " ; time: " << time << " ms";
147+
putText(frame, ss.str(), Point(20,20), 0, 0.5, Scalar(0,0,255));
123148

124-
Rect object((int)xLeftBottom, (int)yLeftBottom,
125-
(int)(xRightTop - xLeftBottom),
126-
(int)(yRightTop - yLeftBottom));
149+
Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());
127150

128-
rectangle(frame, object, Scalar(0, 255, 0));
151+
float confidenceThreshold = parser.get<float>("min_confidence");
152+
for(int i = 0; i < detectionMat.rows; i++)
153+
{
154+
float confidence = detectionMat.at<float>(i, 2);
155+
156+
if(confidence > confidenceThreshold)
157+
{
158+
size_t objectClass = (size_t)(detectionMat.at<float>(i, 1));
159+
160+
int xLeftBottom = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
161+
int yLeftBottom = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
162+
int xRightTop = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
163+
int yRightTop = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);
164+
165+
ss.str("");
166+
ss << confidence;
167+
String conf(ss.str());
168+
169+
Rect object(xLeftBottom, yLeftBottom,
170+
xRightTop - xLeftBottom,
171+
yRightTop - yLeftBottom);
172+
173+
rectangle(frame, object, Scalar(0, 255, 0));
174+
String label = String(classNames[objectClass]) + ": " + conf;
175+
int baseLine = 0;
176+
Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
177+
rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom - labelSize.height),
178+
Size(labelSize.width, labelSize.height + baseLine)),
179+
Scalar(255, 255, 255), CV_FILLED);
180+
putText(frame, label, Point(xLeftBottom, yLeftBottom),
181+
FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0));
182+
}
129183
}
130-
}
131184

132-
imshow("detections", frame);
133-
waitKey();
185+
imshow("detections", frame);
186+
if (waitKey(1) >= 0) break;
187+
}
134188

135189
return 0;
136190
} // main

0 commit comments

Comments (0)