GEMM kernel optimization for Intel GEN

insoow · insoow · commit 8f5b66ff4573 · 2017-01-30T15:25:25.000-08:00
The optimized kernels use the cl_intel_subgroups extension for better
performance.

Note: These optimized kernels will be part of ISAAC, in code-generated
form, under the MIT license.

Signed-off-by: Woo, Insoo <insoo.woo@intel.com>
diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
@@ -160,6 +160,8 @@ class CV_EXPORTS Device
     uint imagePitchAlignment() const;
     uint imageBaseAddressAlignment() const;
 
+    /// Returns true when the device's extension string reports cl_intel_subgroups.
+    bool intelSubgroupsSupport() const;
+
     size_t image2DMaxWidth() const;
     size_t image2DMaxHeight() const;
 
diff --git a/modules/core/src/intel_gpu_gemm.cpp b/modules/core/src/intel_gpu_gemm.cpp
@@ -0,0 +1,133 @@
+/*
+* Copyright 2015-2017 Philippe Tillet
+* Copyright © 2017, Intel Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifdef HAVE_OPENCL
+
+#include <sstream>
+#include "precomp.hpp"
+#include "opencl_kernels_core.hpp"
+#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
+#include "opencv2/core/opencl/runtime/opencl_core.hpp"
+
+namespace cv
+{
+
+/**
+ * Launches one of the "intelblas_gemm_buffer_*" kernels from the intel_gemm
+ * OpenCL program to compute a GEMM into D.
+ *
+ * The kernel variant is chosen from the transpose flags (NN / TN / NT / TT).
+ * The NN case switches to the "_sp" variant when M and N are multiples of 32
+ * and K is a multiple of 16.
+ *
+ * Offsets and steps are converted to units of sizeof(float), so the matrices
+ * are assumed to hold single-channel float data.
+ *
+ * @param A      left operand
+ * @param sizeA  size of A as stored (before any transposition); supplies K
+ * @param B      right operand
+ * @param sizeB  size of B; currently unused
+ * @param D      output buffer
+ * @param sizeD  size of D; height is M, width is N
+ * @param alpha  scalar forwarded to the kernel as float
+ * @param beta   scalar forwarded to the kernel as float
+ * @param atrans true when A is to be used transposed
+ * @param btrans true when B is to be used transposed
+ * @return false when the kernel cannot be created or a launch fails
+ */
+bool intel_gpu_gemm(
+    UMat A, Size sizeA,
+    UMat B, Size sizeB,
+    UMat D, Size sizeD,
+    double alpha, double beta,
+    bool atrans, bool btrans)
+{
+    (void)sizeB; // not needed: M and N come from sizeD, K from sizeA
+
+    const int M = sizeD.height;
+    const int N = sizeD.width;
+    const int K = atrans ? sizeA.height : sizeA.width;
+
+    std::string kernelName;
+    std::string opts; // no extra build options are passed
+
+    // lx/ly: local work-group dimensions.
+    // dx/dy: divisors applied to N/M when deriving the global work size.
+    int lx = 8, ly = 4;
+    int dx = 4, dy = 8;
+
+    if (!atrans && !btrans)
+    {
+        kernelName = "intelblas_gemm_buffer_NN";
+        if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
+        {
+            kernelName += "_sp";
+        }
+    }
+    else if (atrans && !btrans)
+    {
+        kernelName = "intelblas_gemm_buffer_TN";
+    }
+    else if (!atrans && btrans)
+    {
+        kernelName = "intelblas_gemm_buffer_NT";
+        ly = 16;
+        dx = 1;
+    }
+    else
+    {
+        kernelName = "intelblas_gemm_buffer_TT";
+    }
+
+    const size_t gx = (size_t)(N + dx - 1) / dx;
+    const size_t gy = (size_t)(M + dy - 1) / dy;
+
+    // Explicit casts: int -> size_t inside a braced initializer is a narrowing
+    // conversion and is rejected by C++11 compilers.
+    size_t local[] = {(size_t)lx, (size_t)ly, 1};
+    // Global sizes are rounded up to a multiple of the local sizes.
+    size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1};
+
+    ocl::Kernel k(kernelName.c_str(), cv::ocl::core::intel_gemm_oclsrc, opts);
+    if (k.empty())
+    {
+        return false;
+    }
+
+    // Step used to walk K across launches. The product is evaluated in 64 bits
+    // so that large outputs cannot overflow int before the comparison.
+    // NOTE(review): presumably each launch handles K indices
+    // [start_index, start_index + stride) - confirm against intel_gemm.cl.
+    const int stride = ((int64)M * N < 1024 * 1024) ? 10000000 : 256;
+    k.args(ocl::KernelArg::PtrReadOnly(A),   // 0
+           (int)(A.offset / sizeof(float)),
+           ocl::KernelArg::PtrReadOnly(B),
+           (int)(B.offset / sizeof(float)),
+           ocl::KernelArg::PtrWriteOnly(D),
+           (int)(D.offset / sizeof(float)),
+           M, N, K,
+           (float)alpha,
+           (float)beta,
+           (int)(A.step / sizeof(float)),
+           (int)(B.step / sizeof(float)),
+           (int)(D.step / sizeof(float)),    // 13
+           (int)0,                           // 14 start_index
+           stride);                          // 15
+
+    ocl::Queue q;
+    if (!atrans && btrans)
+    {
+        // The NT kernel is launched exactly once with start_index == 0.
+        return k.run(2, global, local, false, q, false);
+    }
+
+    bool ret = true;
+    for (int start_index = 0; start_index < K; start_index += stride)
+    {
+        k.set(14, &start_index, sizeof(start_index));
+
+        // Same test as (start_index + stride) >= K, written so it cannot overflow.
+        const bool lastSlice = (K - start_index) <= stride;
+
+        // Every launch but the last passes true as the trailing argument.
+        // NOTE(review): the meaning of that flag is not visible here - confirm
+        // against the Kernel::run overload this commit relies on.
+        ret = k.run(2, global, local, false, q, !lastSlice);
+        if (!ret)
+            return false;
+        if (lastSlice)
+            break; // avoids incrementing start_index past INT_MAX for huge K
+    }
+
+    return ret;
+}
+
+} // namespace cv
+
+#endif
+
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
@@ -41,9 +41,11 @@
 //
 //M*/
 
+#include <sstream>
 #include "precomp.hpp"
 #include "opencl_kernels_core.hpp"
 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
+#include "opencv2/core/opencl/runtime/opencl_core.hpp"
 
 namespace cv
 {
@@ -787,6 +789,8 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
 #endif
 
 #ifdef HAVE_OPENCL
+extern bool intel_gpu_gemm(UMat A, Size sizeA, UMat B, Size sizeB, UMat D, Size sizeD,
+       double alpha, double beta, bool atrans, bool btrans);
 
 static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
                       InputArray matC, double beta, OutputArray matD, int flags )
@@ -805,63 +809,85 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
     bool haveC = matC.kind() != cv::_InputArray::NONE;
     Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
     bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
+    
+    CV_Assert( !haveC || matC.type() == type );
 
-    if (atrans)
-        sizeA = Size(sizeA.height, sizeA.width);
-    if (btrans)
-        sizeB = Size(sizeB.height, sizeB.width);
-    if (haveC && ctrans)
-        sizeC = Size(sizeC.height, sizeC.width);
+    Size sizeD(((btrans)? sizeB.height : sizeB.width),
+               ((atrans)? sizeA.width : sizeA.height)); 
+    matD.create(sizeD, type);
 
-    Size sizeD(sizeB.width, sizeA.height);
+    UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
 
-    CV_Assert( !haveC || matC.type() == type );
-    CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
 
-    int max_wg_size = (int)dev.maxWorkGroupSize();
-    int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
+    if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
+    {
+        String opts;
 
-    matD.create(sizeD, type);
+        if (atrans)
+            sizeA = Size(sizeA.height, sizeA.width);
+        if (btrans)
+            sizeB = Size(sizeB.height, sizeB.width);
+        if (haveC && ctrans)
+            sizeC = Size(sizeC.height, sizeC.width);
 
-    UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
+        CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
 
-    if (atrans)
-        A = A.t();
+        int max_wg_size = (int)dev.maxWorkGroupSize();
+        int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
 
-    if (btrans)
-        B = B.t();
+        if (atrans)
+            A = A.t();
 
-    if (haveC)
-        ctrans ? transpose(matC, D) : matC.copyTo(D);
+        if (btrans)
+            B = B.t();
+
+        if (haveC)
+            ctrans ? transpose(matC, D) : matC.copyTo(D);
 
-    int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
-    int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
+        int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
+        int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
 
-    String opts = format("-D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s",
+        opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s",
                           ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
                           cn, kercn, block_size,
                           (sizeA.width % block_size !=0) ? "-D NO_MULT" : "",
                           haveC ? "-D HAVE_C" : "",
                           doubleSupport ? " -D DOUBLE_SUPPORT" : "");
 
-    ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
-    if (k.empty())
-        return false;
+        ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
+        if (k.empty())
+            return false;
+
+        if (depth == CV_64F)
+            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
+                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
+                   ocl::KernelArg::ReadWrite(D, cn, kercn),
+                   sizeA.width, alpha, beta);
+        else
+            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
+                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
+                   ocl::KernelArg::ReadWrite(D, cn, kercn),
+                   sizeA.width, (float)alpha, (float)beta);
+
+        size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
+        size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
 
-    if (depth == CV_64F)
-        k.args(ocl::KernelArg::ReadOnlyNoSize(A),
-               ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
-               ocl::KernelArg::ReadWrite(D, cn, kercn),
-               sizeA.width, alpha, beta);
+        return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
+    }
     else
-        k.args(ocl::KernelArg::ReadOnlyNoSize(A),
-               ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
-               ocl::KernelArg::ReadWrite(D, cn, kercn),
-               sizeA.width, (float)alpha, (float)beta);
-
-    size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
-    size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
-    return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
+    {
+        if (haveC && beta != 0.0)
+        {
+            ctrans ? transpose(matC, D) : matC.copyTo(D);
+        }
+
+        return intel_gpu_gemm(A, sizeA,
+                              B, sizeB,
+                              D, sizeD, 
+                              alpha,
+                              beta,
+                              atrans, btrans);
+    }
 }
 #endif
 
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
@@ -1812,6 +1812,8 @@ struct Device::Impl
         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
 
+        intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
+        
         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
         if (vendorName_ == "Advanced Micro Devices, Inc." ||
             vendorName_ == "AMD")
@@ -1851,6 +1853,18 @@ struct Device::Impl
             sz < sizeof(buf) ? String(buf) : String();
     }
 
+    // Returns true when extensionName appears as a complete, space-delimited
+    // entry in the CL_DEVICE_EXTENSIONS string of this device.
+    // A plain substring search is not enough: one extension name can be a
+    // prefix of another (e.g. "cl_intel_subgroups" vs "cl_intel_subgroups_short").
+    bool isExtensionSupported(const String& extensionName) const
+    {
+        if (extensionName.empty())
+            return false;
+
+        const String extensions = getStrProp(CL_DEVICE_EXTENSIONS);
+        const size_t nameLen = extensionName.length();
+
+        size_t pos = 0;
+        while ((pos = extensions.find(extensionName, pos)) != String::npos)
+        {
+            const size_t end = pos + nameLen;
+            const bool startsEntry = (pos == 0) || (extensions[pos - 1] == ' ');
+            const bool endsEntry = (end == extensions.length()) || (extensions[end] == ' ');
+            if (startsEntry && endsEntry)
+                return true;
+            pos = end; // continue searching after this partial match
+        }
+        return false;
+    }
+
+
     IMPLEMENT_REFCOUNTABLE();
     cl_device_id handle;
 
@@ -1866,6 +1880,7 @@ struct Device::Impl
     String driverVersion_;
     String vendorName_;
     int vendorID_;
+    bool intelSubgroupsSupport_;
 };
 
 
@@ -2072,6 +2087,9 @@ size_t Device::imageMaxArraySize() const
 { CV_REQUIRE_OPENCL_1_2_ERROR; }
 #endif
 
+// False when no implementation is attached; otherwise the cl_intel_subgroups
+// flag cached by Device::Impl.
+bool Device::intelSubgroupsSupport() const
+{ return p && p->intelSubgroupsSupport_; }
+
 int Device::maxClockFrequency() const
 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
 
diff --git a/modules/core/src/opencl/intel_gemm.cl b/modules/core/src/opencl/intel_gemm.cl