Skip to content

Commit d577fba

Browse files
author
wud
committed
fix fall
1 parent f92ccb3 commit d577fba

File tree

5 files changed

+245
-4
lines changed

5 files changed

+245
-4
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ set(PythonRoot "/data/datav/newbb/lean/anaconda3/envs/torch1.8")
1212
set(PythonName "python3.9")
1313

1414
# 如果你是不同显卡,请设置为显卡对应的号码参考这里:https://developer.nvidia.com/zh-cn/cuda-gpus#compute
15-
set(CUDA_GEN_CODE "-gencode=arch=compute_75,code=sm_75")
15+
#set(CUDA_GEN_CODE "-gencode=arch=compute_75,code=sm_75")
1616

1717
# 如果你的opencv找不到,可以自己指定目录
1818
set(OpenCV_DIR "/data/datav/expstation/lean/opencv4.2.0/lib/cmake/opencv4/")

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ nvcc = ${lean_cuda}/bin/nvcc
33

44
# 如果是其他显卡,请修改-gencode=arch=compute_75,code=sm_75为对应显卡的能力
55
# 显卡对应的号码参考这里:https://developer.nvidia.com/zh-cn/cuda-gpus#compute
6-
cuda_arch := -gencode=arch=compute_75,code=sm_75
6+
cuda_arch := # -gencode=arch=compute_75,code=sm_75
77

88
cpp_srcs := $(shell find src -name "*.cpp")
99
cpp_objs := $(cpp_srcs:.cpp=.cpp.o)
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
#include "alpha_pose.hpp"
2+
#include <atomic>
3+
#include <mutex>
4+
#include <queue>
5+
#include <condition_variable>
6+
#include <infer/trt_infer.hpp>
7+
#include <common/ilogger.hpp>
8+
#include <common/infer_controller.hpp>
9+
#include <common/monopoly_allocator.hpp>
10+
#include <common/preprocess_kernel.cuh>
11+
12+
namespace AlphaPoseOld{
13+
14+
struct AffineMatrix{
15+
float i2d[6]; // image to dst(network), 2x3 matrix
16+
float d2i[6]; // dst to image, 2x3 matrix
17+
18+
void compute(const cv::Size& image_size, const cv::Rect& box, const cv::Size& net_size){
19+
Rect box_ = box;
20+
if(box_.width == 0 || box_.height == 0){
21+
box_.width = image_size.width;
22+
box_.height = image_size.height;
23+
box_.x = 0;
24+
box_.y = 0;
25+
}
26+
27+
float rate = box_.width > 100 ? 0.1f : 0.15f;
28+
float pad_width = box_.width * (1 + 2 * rate);
29+
float pad_height = box_.height * (1 + 1 * rate);
30+
float scale = min(net_size.width / pad_width, net_size.height / pad_height);
31+
i2d[0] = scale; i2d[1] = 0; i2d[2] = -(box_.x - box_.width * 1 * rate + pad_width * 0.5) * scale + net_size.width * 0.5 + scale * 0.5 - 0.5;
32+
i2d[3] = 0; i2d[4] = scale; i2d[5] = -(box_.y - box_.height * 1 * rate + pad_height * 0.5) * scale + net_size.height * 0.5 + scale * 0.5 - 0.5;
33+
34+
cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);
35+
cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);
36+
cv::invertAffineTransform(m2x3_i2d, m2x3_d2i);
37+
}
38+
39+
cv::Mat i2d_mat(){
40+
return cv::Mat(2, 3, CV_32F, i2d);
41+
}
42+
};
43+
44+
// Applies a row-major 2x3 affine matrix to the point (x, y).
//   pmatrix : six floats laid out as [a b tx; c d ty]
// Returns (a*x + b*y + tx, c*x + d*y + ty).
static tuple<float, float> affine_project(float x, float y, float* pmatrix){

    const float* row_x = pmatrix;       // coefficients producing the new x
    const float* row_y = pmatrix + 3;   // coefficients producing the new y

    float projected_x = x * row_x[0] + y * row_x[1] + row_x[2];
    float projected_y = x * row_y[0] + y * row_y[1] + row_y[2];
    return make_tuple(projected_x, projected_y);
}
50+
51+
// Concrete InferController instantiation used by InferImpl.
// Template arguments, in order:
//   Input               - what callers pass to commit()
//   vector<Point3f>     - what each job produces
//   tuple<string, int>  - start parameter (engine file, gpu id)
//   AffineMatrix        - per-job additional data
using ControllerImpl = InferController<
    Input,
    vector<Point3f>,
    tuple<string, int>,
    AffineMatrix
>;
58+
class InferImpl : public Infer, public ControllerImpl{
59+
public:
60+
/** 要求在InferImpl里面执行stop,而不是在基类执行stop **/
61+
virtual ~InferImpl(){
62+
TRT::set_device(gpu_);
63+
stop();
64+
}
65+
66+
bool startup(const string& file, int gpuid){
67+
return ControllerImpl::startup(make_tuple(file, gpuid));
68+
}
69+
70+
virtual void worker(promise<bool>& result) override{
71+
72+
string file = get<0>(start_param_);
73+
int gpuid = get<1>(start_param_);
74+
75+
TRT::set_device(gpuid);
76+
auto engine = TRT::load_infer(file);
77+
if(engine == nullptr){
78+
INFOE("Engine %s load failed", file.c_str());
79+
result.set_value(false);
80+
return;
81+
}
82+
83+
engine->print();
84+
85+
int max_batch_size = engine->get_max_batch_size();
86+
auto input = engine->input();
87+
auto output = engine->output();
88+
int stride = input->width() / output->width();
89+
input_width_ = input->width();
90+
input_height_ = input->height();
91+
gpu_ = gpuid;
92+
tensor_allocator_ = make_shared<MonopolyAllocator<TRT::Tensor>>(max_batch_size * 2);
93+
stream_ = engine->get_stream();
94+
result.set_value(true);
95+
input->resize_single_dim(0, max_batch_size);
96+
97+
int n = 0;
98+
vector<Job> fetch_jobs;
99+
while(get_jobs_and_wait(fetch_jobs, max_batch_size)){
100+
101+
int infer_batch_size = fetch_jobs.size();
102+
input->resize_single_dim(0, infer_batch_size);
103+
104+
for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch){
105+
auto& job = fetch_jobs[ibatch];
106+
input->copy_from_gpu(input->offset(ibatch), job.mono_tensor->data()->gpu(), input->count(1));
107+
job.mono_tensor->release();
108+
}
109+
110+
engine->forward(false);
111+
for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch){
112+
113+
auto& job = fetch_jobs[ibatch];
114+
float* image_based_output = output->cpu<float>(ibatch);
115+
auto& image_based_keypoints = job.output;
116+
auto& affine_matrix = job.additional;
117+
int begin_channel = 17;
118+
int area = output->width() * output->height();
119+
image_based_keypoints.resize(output->channel() - begin_channel);
120+
121+
for(int i = begin_channel; i < output->channel(); ++i){
122+
float* output_channel = output->cpu<float>(ibatch, i);
123+
int location = std::max_element(output_channel, output_channel + area) - output_channel;
124+
float confidence = output_channel[location];
125+
float x = (location % output->width()) * stride;
126+
float y = (location / output->width()) * stride;
127+
auto& output_point = image_based_keypoints[i-begin_channel];
128+
129+
output_point.z = confidence;
130+
tie(output_point.x, output_point.y) = affine_project(x, y, job.additional.d2i);
131+
}
132+
job.pro->set_value(job.output);
133+
}
134+
fetch_jobs.clear();
135+
}
136+
stream_ = nullptr;
137+
tensor_allocator_.reset();
138+
INFO("Engine destroy.");
139+
}
140+
141+
virtual shared_future<vector<Point3f>> commit(const Input& input) override{
142+
return ControllerImpl::commit(input);
143+
}
144+
145+
virtual vector<shared_future<vector<Point3f>>> commits(const vector<Input>& inputs) override{
146+
return ControllerImpl::commits(inputs);
147+
}
148+
149+
virtual bool preprocess(Job& job, const Input& input) override{
150+
151+
if(tensor_allocator_ == nullptr){
152+
INFOE("tensor_allocator_ is nullptr");
153+
return false;
154+
}
155+
156+
job.mono_tensor = tensor_allocator_->query();
157+
if(job.mono_tensor == nullptr){
158+
INFOE("Tensor allocator query failed.");
159+
return false;
160+
}
161+
162+
CUDATools::AutoDevice auto_device(gpu_);
163+
auto& tensor = job.mono_tensor->data();
164+
if(tensor == nullptr){
165+
// not init
166+
tensor = make_shared<TRT::Tensor>();
167+
tensor->set_workspace(make_shared<TRT::MixMemory>());
168+
}
169+
170+
auto& image = get<0>(input);
171+
auto& box = get<1>(input);
172+
Size input_size(input_width_, input_height_);
173+
job.additional.compute(image.size(), box, input_size);
174+
175+
tensor->set_stream(stream_);
176+
tensor->resize(1, 3, input_height_, input_width_);
177+
float mean[] = {0.406, 0.457, 0.480};
178+
float std[] = {1, 1, 1};
179+
180+
size_t size_image = image.cols * image.rows * 3;
181+
size_t size_matrix = iLogger::upbound(sizeof(job.additional.d2i), 32);
182+
auto workspace = tensor->get_workspace();
183+
uint8_t* gpu_workspace = (uint8_t*)workspace->gpu(size_image + size_matrix);
184+
float* affine_matrix_device = (float*)gpu_workspace;
185+
uint8_t* image_device = gpu_workspace + size_matrix;
186+
checkCudaRuntime(cudaMemcpyAsync(image_device, image.data, size_image, cudaMemcpyHostToDevice, stream_));
187+
checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, job.additional.d2i, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
188+
189+
auto normalize = CUDAKernel::Norm::mean_std(mean, std, 1/255.0f, CUDAKernel::ChannelType::Invert);
190+
CUDAKernel::warp_affine_bilinear_and_normalize_plane(
191+
image_device, image.cols * 3, image.cols, image.rows,
192+
tensor->gpu<float>(), input_width_, input_height_,
193+
affine_matrix_device, 127,
194+
normalize, stream_
195+
);
196+
return true;
197+
}
198+
199+
private:
200+
int input_width_ = 0;
201+
int input_height_ = 0;
202+
int gpu_ = 0;
203+
TRT::CUStream stream_ = nullptr;
204+
};
205+
206+
shared_ptr<Infer> create_infer(const string& engine_file, int gpuid){
207+
shared_ptr<InferImpl> instance(new InferImpl());
208+
if(!instance->startup(engine_file, gpuid)){
209+
instance.reset();
210+
}
211+
return instance;
212+
}
213+
};
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// Guard name is specific to this header. NOTE(review): the previous guard,
// ALPHA_POSE_HPP, looks copied from app_alphapose/alpha_pose.hpp; if that
// header uses the same macro, including both in one translation unit would
// silently drop the second one - confirm.
#ifndef ALPHA_POSE_OLD_HPP
#define ALPHA_POSE_OLD_HPP

#include <vector>
#include <memory>
#include <string>
#include <future>
#include <tuple>
#include <opencv2/opencv.hpp>

// based on https://github.com/MVIG-SJTU/AlphaPose v0.3.0 version
namespace AlphaPoseOld{

    // Kept for source compatibility: the declarations below (and the matching
    // implementation file) use unqualified std:: and cv:: names.
    using namespace std;
    using namespace cv;

    // One inference request: the source image and the box to crop from it.
    typedef tuple<Mat, Rect> Input;

    // Asynchronous pose-estimation interface. Instances are obtained from
    // create_infer().
    class Infer{
    public:
        // Virtual so that destroying an implementation through an Infer
        // pointer is well defined.
        virtual ~Infer() = default;

        // Submits one request; the future yields one Point3f per keypoint.
        virtual shared_future<vector<Point3f>> commit(const Input& input) = 0;

        // Submits several requests; returns one future per input.
        virtual vector<shared_future<vector<Point3f>>> commits(const vector<Input>& inputs) = 0;
    };

    // Creates an inference instance for `engine_file` on GPU `gpuid`.
    shared_ptr<Infer> create_infer(const string& engine_file, int gpuid);

}; // namespace AlphaPoseOld

#endif // ALPHA_POSE_OLD_HPP

src/application/app_fall_recognize.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include <common/ilogger.hpp>
88

99
#include "app_yolo/yolo.hpp"
10-
#include "app_alphapose/alpha_pose.hpp"
10+
#include "app_alphapose_old/alpha_pose.hpp"
1111
#include "app_fall_gcn/fall_gcn.hpp"
1212
#include "tools/zmq_remote_show.hpp"
1313
#include "tools/deepsort.hpp"
@@ -54,7 +54,7 @@ int app_fall_recognize(){
5454
auto detector_model_file = "yolox_m.FP32.trtmodel";
5555
auto gcn_model_file = "fall_bp.FP32.trtmodel";
5656

57-
auto pose_model = AlphaPose::create_infer(pose_model_file, 0);
57+
auto pose_model = AlphaPoseOld::create_infer(pose_model_file, 0);
5858
auto detector_model = Yolo::create_infer(detector_model_file, Yolo::Type::X, 0, 0.4f);
5959
auto gcn_model = FallGCN::create_infer(gcn_model_file, 0);
6060

0 commit comments

Comments
 (0)