Skip to content

Commit 8f5b66f

Browse files
committed
GEMM kernel optimization for Intel GEN
The optimized kernels use the cl_intel_subgroups extension for better performance. Note: These optimized kernels will become part of ISAAC, in code-generated form, under the MIT license. Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
1 parent cc7f9f5 commit 8f5b66f

File tree

5 files changed

+1298
-38
lines changed

5 files changed

+1298
-38
lines changed

modules/core/include/opencv2/core/ocl.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ class CV_EXPORTS Device
160160
uint imagePitchAlignment() const;
161161
uint imageBaseAddressAlignment() const;
162162

163+
//! Returns true when the device's extension list contains "cl_intel_subgroups".
bool intelSubgroupsSupport() const;
164+
163165
size_t image2DMaxWidth() const;
164166
size_t image2DMaxHeight() const;
165167

modules/core/src/intel_gpu_gemm.cpp

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
/*
2+
* Copyright 2015-2017 Philippe Tillet
3+
* Copyright © 2017, Intel Corporation
4+
*
5+
* Permission is hereby granted, free of charge, to any person obtaining
6+
* a copy of this software and associated documentation files
7+
* (the "Software"), to deal in the Software without restriction,
8+
* including without limitation the rights to use, copy, modify, merge,
9+
* publish, distribute, sublicense, and/or sell copies of the Software,
10+
* and to permit persons to whom the Software is furnished to do so,
11+
* subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be
14+
* included in all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18+
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19+
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20+
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21+
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22+
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23+
*/
24+
25+
#ifdef HAVE_OPENCL
26+
27+
#include <sstream>
28+
#include "precomp.hpp"
29+
#include "opencl_kernels_core.hpp"
30+
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
31+
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
32+
33+
namespace cv
34+
{
35+
36+
/**
 * Launches one of the "intelblas_gemm_buffer_{NN,TN,NT,TT}" OpenCL kernels from
 * intel_gemm_oclsrc to compute a single-precision GEMM into D.
 *
 * @param A, B     Input matrices. Offsets and steps are converted to float element
 *                 counts before being handed to the kernel.
 * @param sizeA    Size of A as stored; K is its height when atrans is set, otherwise its width.
 * @param sizeB    Not used by this function.
 * @param D        Destination matrix (already allocated by the caller).
 * @param sizeD    Size of D; M is its height and N is its width.
 * @param alpha    Scale factor, passed to the kernel as float.
 * @param beta     Scale factor, passed to the kernel as float.
 * @param atrans   Selects the "T?" kernel variants.
 * @param btrans   Selects the "?T" kernel variants.
 * @return false if the kernel could not be created or a launch failed, true otherwise.
 */
bool intel_gpu_gemm(
    UMat A, Size sizeA,
    UMat B, Size sizeB,
    UMat D, Size sizeD,
    double alpha, double beta,
    bool atrans, bool btrans)
{
    (void)sizeB; // only part of the interface; silences the unused-parameter warning

    const int M = sizeD.height;
    const int N = sizeD.width;
    const int K = atrans ? sizeA.height : sizeA.width;

    std::string kernelName;
    std::string opts;

    // Work-group shape (lx, ly) and the divisors (dx, dy) used to derive the
    // number of work-items from N and M. Kept as size_t so the NDRange arrays
    // below are initialised without narrowing conversions.
    size_t lx = 8, ly = 4;
    size_t dx = 4, dy = 8;

    if (!atrans && !btrans)
    {
        kernelName = "intelblas_gemm_buffer_NN";
        // A dedicated "_sp" variant is selected when all dimensions are suitably aligned.
        if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
        {
            kernelName += "_sp";
        }
    }
    else if (atrans && !btrans)
    {
        kernelName = "intelblas_gemm_buffer_TN";
    }
    else if (!atrans && btrans)
    {
        kernelName = "intelblas_gemm_buffer_NT";
        ly = 16;
        dx = 1;
    }
    else
    {
        kernelName = "intelblas_gemm_buffer_TT";
    }

    const size_t gx = ((size_t)N + dx - 1) / dx;
    const size_t gy = ((size_t)M + dy - 1) / dy;

    // Global sizes are rounded up to a multiple of the local sizes.
    size_t local[] = { lx, ly, 1 };
    size_t global[] = { (gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1 };

    ocl::Kernel k(kernelName.c_str(), cv::ocl::core::intel_gemm_oclsrc, opts);
    if (k.empty())
    {
        return false;
    }

    // The product is evaluated in 64 bits: M * N in int overflows for large outputs.
    const int stride = (static_cast<long long>(M) * N < 1024 * 1024) ? 10000000 : 256;

    k.args(ocl::KernelArg::PtrReadOnly(A),          // 0
           (int) (A.offset / sizeof(float)),
           ocl::KernelArg::PtrReadOnly(B),
           (int) (B.offset / sizeof(float)),
           ocl::KernelArg::PtrWriteOnly(D),
           (int) (D.offset / sizeof(float)),
           M, N, K,
           (float)alpha,
           (float)beta,
           (int)(A.step / sizeof(float)),
           (int)(B.step / sizeof(float)),
           (int)(D.step / sizeof(float)),           // 13
           (int) 0,                                 // 14 start_index
           stride);                                 // 15

    ocl::Queue q;
    if (!atrans && btrans)
    {
        // The NT kernel is launched once, with start_index left at 0.
        return k.run(2, global, local, false, q, false);
    }

    // All other variants walk K in chunks of `stride`, updating kernel argument 14
    // (start_index) before each launch.
    // NOTE(review): the last run() argument is true for every chunk except the final
    // one; its exact meaning is defined by ocl::Kernel::run, which is not visible here.
    bool ret = true;
    for (int start_index = 0; start_index < K; start_index += stride)
    {
        k.set(14, &start_index, sizeof(start_index));

        // Same test as (start_index + stride) < K, written so it cannot overflow int.
        const bool moreChunks = (K - start_index) > stride;

        ret = k.run(2, global, local, false, q, moreChunks);
        if (!ret || !moreChunks)
        {
            break; // leaving here also skips the increment, which could overflow on the last chunk
        }
    }

    return ret;
}
129+
130+
} // namespace cv
131+
132+
#endif
133+

modules/core/src/matmul.cpp

Lines changed: 64 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,11 @@
4141
//
4242
//M*/
4343

44+
#include <sstream>
4445
#include "precomp.hpp"
4546
#include "opencl_kernels_core.hpp"
4647
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
48+
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
4749

4850
namespace cv
4951
{
@@ -787,6 +789,8 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
787789
#endif
788790

789791
#ifdef HAVE_OPENCL
792+
extern bool intel_gpu_gemm(UMat A, Size sizeA, UMat B, Size sizeB, UMat D, Size sizeD,
793+
double alpha, double beta, bool atrans, bool btrans);
790794

791795
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
792796
InputArray matC, double beta, OutputArray matD, int flags )
@@ -805,63 +809,85 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
805809
bool haveC = matC.kind() != cv::_InputArray::NONE;
806810
Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
807811
bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
812+
813+
CV_Assert( !haveC || matC.type() == type );
808814

809-
if (atrans)
810-
sizeA = Size(sizeA.height, sizeA.width);
811-
if (btrans)
812-
sizeB = Size(sizeB.height, sizeB.width);
813-
if (haveC && ctrans)
814-
sizeC = Size(sizeC.height, sizeC.width);
815+
Size sizeD(((btrans)? sizeB.height : sizeB.width),
816+
((atrans)? sizeA.width : sizeA.height));
817+
matD.create(sizeD, type);
815818

816-
Size sizeD(sizeB.width, sizeA.height);
819+
UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
817820

818-
CV_Assert( !haveC || matC.type() == type );
819-
CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
820821

821-
int max_wg_size = (int)dev.maxWorkGroupSize();
822-
int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
822+
if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
823+
{
824+
String opts;
823825

824-
matD.create(sizeD, type);
826+
if (atrans)
827+
sizeA = Size(sizeA.height, sizeA.width);
828+
if (btrans)
829+
sizeB = Size(sizeB.height, sizeB.width);
830+
if (haveC && ctrans)
831+
sizeC = Size(sizeC.height, sizeC.width);
825832

826-
UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
833+
CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
827834

828-
if (atrans)
829-
A = A.t();
835+
int max_wg_size = (int)dev.maxWorkGroupSize();
836+
int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
830837

831-
if (btrans)
832-
B = B.t();
838+
if (atrans)
839+
A = A.t();
833840

834-
if (haveC)
835-
ctrans ? transpose(matC, D) : matC.copyTo(D);
841+
if (btrans)
842+
B = B.t();
843+
844+
if (haveC)
845+
ctrans ? transpose(matC, D) : matC.copyTo(D);
836846

837-
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
838-
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
847+
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
848+
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
839849

840-
String opts = format("-D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s",
850+
opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s",
841851
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
842852
cn, kercn, block_size,
843853
(sizeA.width % block_size !=0) ? "-D NO_MULT" : "",
844854
haveC ? "-D HAVE_C" : "",
845855
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
846856

847-
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
848-
if (k.empty())
849-
return false;
857+
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
858+
if (k.empty())
859+
return false;
860+
861+
if (depth == CV_64F)
862+
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
863+
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
864+
ocl::KernelArg::ReadWrite(D, cn, kercn),
865+
sizeA.width, alpha, beta);
866+
else
867+
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
868+
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
869+
ocl::KernelArg::ReadWrite(D, cn, kercn),
870+
sizeA.width, (float)alpha, (float)beta);
871+
872+
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
873+
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
850874

851-
if (depth == CV_64F)
852-
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
853-
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
854-
ocl::KernelArg::ReadWrite(D, cn, kercn),
855-
sizeA.width, alpha, beta);
875+
return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
876+
}
856877
else
857-
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
858-
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
859-
ocl::KernelArg::ReadWrite(D, cn, kercn),
860-
sizeA.width, (float)alpha, (float)beta);
861-
862-
size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
863-
size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
864-
return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
878+
{
879+
if (haveC && beta != 0.0)
880+
{
881+
ctrans ? transpose(matC, D) : matC.copyTo(D);
882+
}
883+
884+
return intel_gpu_gemm(A, sizeA,
885+
B, sizeB,
886+
D, sizeD,
887+
alpha,
888+
beta,
889+
atrans, btrans);
890+
}
865891
}
866892
#endif
867893

modules/core/src/ocl.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1812,6 +1812,8 @@ struct Device::Impl
18121812
String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
18131813
parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
18141814

1815+
intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1816+
18151817
vendorName_ = getStrProp(CL_DEVICE_VENDOR);
18161818
if (vendorName_ == "Advanced Micro Devices, Inc." ||
18171819
vendorName_ == "AMD")
@@ -1851,6 +1853,18 @@ struct Device::Impl
18511853
sz < sizeof(buf) ? String(buf) : String();
18521854
}
18531855

1856+
bool isExtensionSupported(const String& extensionName) const
1857+
{
1858+
bool ret = false;
1859+
size_t pos = getStrProp(CL_DEVICE_EXTENSIONS).find(extensionName);
1860+
if (pos != String::npos)
1861+
{
1862+
ret = true;
1863+
}
1864+
return ret;
1865+
}
1866+
1867+
18541868
IMPLEMENT_REFCOUNTABLE();
18551869
cl_device_id handle;
18561870

@@ -1866,6 +1880,7 @@ struct Device::Impl
18661880
String driverVersion_;
18671881
String vendorName_;
18681882
int vendorID_;
1883+
bool intelSubgroupsSupport_;
18691884
};
18701885

18711886

@@ -2072,6 +2087,9 @@ size_t Device::imageMaxArraySize() const
20722087
{ CV_REQUIRE_OPENCL_1_2_ERROR; }
20732088
#endif
20742089

2090+
bool Device::intelSubgroupsSupport() const
2091+
{ return p ? p->intelSubgroupsSupport_ : false; }
2092+
20752093
int Device::maxClockFrequency() const
20762094
{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
20772095

0 commit comments

Comments
 (0)