#include "alpha_pose.hpp"
#include <atomic>
#include <mutex>
#include <queue>
#include <condition_variable>
#include <infer/trt_infer.hpp>
#include <common/ilogger.hpp>
#include <common/infer_controller.hpp>
#include <common/monopoly_allocator.hpp>
#include <common/preprocess_kernel.cuh>

namespace AlphaPoseOld{

    using namespace cv;
    using namespace std;

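    // AffineMatrix holds the forward (image -> network input) crop transform
    // for one person box, plus its inverse for mapping predicted keypoints
    // back into original-image coordinates.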
    struct AffineMatrix{
        float i2d[6];       // image to dst(network), 2x3 matrix
        float d2i[6];       // dst to image, 2x3 matrix

        void compute(const cv::Size& image_size, const cv::Rect& box, const cv::Size& net_size){
            Rect box_ = box;
            if(box_.width == 0 || box_.height == 0){
                box_.width  = image_size.width;
                box_.height = image_size.height;
                box_.x = 0;
                box_.y = 0;
            }

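            // Pad the box (proportionally more for small boxes), then scale the
            // padded region to fit the network input while keeping the aspect
            // ratio. i2d centers the padded box in the network input; the
            // scale * 0.5 - 0.5 terms align pixel centers.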
            float rate = box_.width > 100 ? 0.1f : 0.15f;
            float pad_width  = box_.width  * (1 + 2 * rate);
            float pad_height = box_.height * (1 + 1 * rate);
            float scale = min(net_size.width / pad_width, net_size.height / pad_height);
            i2d[0] = scale;  i2d[1] = 0;      i2d[2] = -(box_.x - box_.width * 1 * rate + pad_width * 0.5) * scale + net_size.width * 0.5 + scale * 0.5 - 0.5;
            i2d[3] = 0;      i2d[4] = scale;  i2d[5] = -(box_.y - box_.height * 1 * rate + pad_height * 0.5) * scale + net_size.height * 0.5 + scale * 0.5 - 0.5;

            cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);
            cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);
            cv::invertAffineTransform(m2x3_i2d, m2x3_d2i);
        }

        cv::Mat i2d_mat(){
            return cv::Mat(2, 3, CV_32F, i2d);
        }
    };

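    // Apply a 2x3 affine matrix to a point. Used with d2i to map keypoints
    // from network coordinates back onto the original image.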
    static tuple<float, float> affine_project(float x, float y, float* pmatrix){
        float newx = x * pmatrix[0] + y * pmatrix[1] + pmatrix[2];
        float newy = x * pmatrix[3] + y * pmatrix[4] + pmatrix[5];
        return make_tuple(newx, newy);
    }

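    // InferController provides the producer/consumer pipeline: commit()
    // preprocesses on the calling thread and enqueues a job; a dedicated
    // worker thread batches queued jobs and runs TensorRT inference.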
    using ControllerImpl = InferController
    <
        Input,               // input
        vector<Point3f>,     // output
        tuple<string, int>,  // start param
        AffineMatrix         // additional
    >;
    class InferImpl : public Infer, public ControllerImpl{
    public:
        /** stop() must be called here in InferImpl, not in the base class **/
        virtual ~InferImpl(){
            TRT::set_device(gpu_);
            stop();
        }

        bool startup(const string& file, int gpuid){
            return ControllerImpl::startup(make_tuple(file, gpuid));
        }

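        // Worker thread: loads the engine, reports startup success/failure
        // through `result`, then serves batched inference until stopped.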
        virtual void worker(promise<bool>& result) override{

            string file = get<0>(start_param_);
            int gpuid   = get<1>(start_param_);

            TRT::set_device(gpuid);
            auto engine = TRT::load_infer(file);
            if(engine == nullptr){
                INFOE("Engine %s load failed", file.c_str());
                result.set_value(false);
                return;
            }

            engine->print();

            int max_batch_size = engine->get_max_batch_size();
            auto input  = engine->input();
            auto output = engine->output();
            int stride  = input->width() / output->width();
            input_width_  = input->width();
            input_height_ = input->height();
            gpu_ = gpuid;
            tensor_allocator_ = make_shared<MonopolyAllocator<TRT::Tensor>>(max_batch_size * 2);
            stream_ = engine->get_stream();
            result.set_value(true);
            input->resize_single_dim(0, max_batch_size);

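            // Batching loop: block until jobs are queued, take up to
            // max_batch_size of them, and run them through the engine together.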
            vector<Job> fetch_jobs;
            while(get_jobs_and_wait(fetch_jobs, max_batch_size)){

                int infer_batch_size = fetch_jobs.size();
                input->resize_single_dim(0, infer_batch_size);

                for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch){
                    auto& job = fetch_jobs[ibatch];
                    input->copy_from_gpu(input->offset(ibatch), job.mono_tensor->data()->gpu(), input->count(1));
                    job.mono_tensor->release();
                }

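                // Run the batch, then decode each sample: the first 17 output
                // channels are skipped; each remaining channel is a keypoint
                // heatmap. Take the channel's argmax, scale by the network
                // stride, and project back with the inverse affine d2i.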
                engine->forward(false);
                for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch){

                    auto& job = fetch_jobs[ibatch];
                    float* image_based_output = output->cpu<float>(ibatch);
                    auto& image_based_keypoints = job.output;
                    auto& affine_matrix = job.additional;
                    int begin_channel = 17;
                    int area = output->width() * output->height();
                    image_based_keypoints.resize(output->channel() - begin_channel);

                    for(int i = begin_channel; i < output->channel(); ++i){
                        float* output_channel = output->cpu<float>(ibatch, i);
                        int location = std::max_element(output_channel, output_channel + area) - output_channel;
                        float confidence = output_channel[location];
                        float x = (location % output->width()) * stride;
                        float y = (location / output->width()) * stride;
                        auto& output_point = image_based_keypoints[i - begin_channel];

                        output_point.z = confidence;
                        tie(output_point.x, output_point.y) = affine_project(x, y, job.additional.d2i);
                    }
                    job.pro->set_value(job.output);
                }
                fetch_jobs.clear();
            }
            stream_ = nullptr;
            tensor_allocator_.reset();
            INFO("Engine destroy.");
        }

        virtual shared_future<vector<Point3f>> commit(const Input& input) override{
            return ControllerImpl::commit(input);
        }

        virtual vector<shared_future<vector<Point3f>>> commits(const vector<Input>& inputs) override{
            return ControllerImpl::commits(inputs);
        }

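        // Preprocess one input: compute the crop affine for this box, upload
        // the image and matrix, and warp/normalize into the job's tensor.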
        virtual bool preprocess(Job& job, const Input& input) override{

            if(tensor_allocator_ == nullptr){
                INFOE("tensor_allocator_ is nullptr");
                return false;
            }

            job.mono_tensor = tensor_allocator_->query();
            if(job.mono_tensor == nullptr){
                INFOE("Tensor allocator query failed.");
                return false;
            }

            CUDATools::AutoDevice auto_device(gpu_);
            auto& tensor = job.mono_tensor->data();
            if(tensor == nullptr){
                // not init
                tensor = make_shared<TRT::Tensor>();
                tensor->set_workspace(make_shared<TRT::MixMemory>());
            }

            auto& image = get<0>(input);
            auto& box   = get<1>(input);
            Size input_size(input_width_, input_height_);
            job.additional.compute(image.size(), box, input_size);

            tensor->set_stream(stream_);
            tensor->resize(1, 3, input_height_, input_width_);
            float mean[] = {0.406, 0.457, 0.480};
            float std[]  = {1, 1, 1};

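            // Single GPU workspace buffer: the 2x3 inverse affine matrix
            // (padded to a 32-byte boundary) followed by the raw BGR image.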
            size_t size_image  = image.cols * image.rows * 3;
            size_t size_matrix = iLogger::upbound(sizeof(job.additional.d2i), 32);
            auto workspace = tensor->get_workspace();
            uint8_t* gpu_workspace      = (uint8_t*)workspace->gpu(size_image + size_matrix);
            float* affine_matrix_device = (float*)gpu_workspace;
            uint8_t* image_device       = gpu_workspace + size_matrix;
            checkCudaRuntime(cudaMemcpyAsync(image_device, image.data, size_image, cudaMemcpyHostToDevice, stream_));
            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));

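            // Fused CUDA kernel: warp-affine resample (fill value 127 outside
            // the crop), scale by 1/255, subtract mean, divide by std, and
            // invert channel order (BGR -> RGB) in one pass.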
            auto normalize = CUDAKernel::Norm::mean_std(mean, std, 1 / 255.0f, CUDAKernel::ChannelType::Invert);
            CUDAKernel::warp_affine_bilinear_and_normalize_plane(
                image_device,         image.cols * 3, image.cols, image.rows,
                tensor->gpu<float>(), input_width_,   input_height_,
                affine_matrix_device, 127,
                normalize, stream_
            );
            return true;
        }

    private:
        int input_width_  = 0;
        int input_height_ = 0;
        int gpu_          = 0;
        TRT::CUStream stream_ = nullptr;
    };

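    // Factory: returns an empty shared_ptr if the engine fails to load.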
    shared_ptr<Infer> create_infer(const string& engine_file, int gpuid){
        shared_ptr<InferImpl> instance(new InferImpl());
        if(!instance->startup(engine_file, gpuid)){
            instance.reset();
        }
        return instance;
    }
}; // namespace AlphaPoseOld