41
41
//
42
42
//M*/
43
43
44
+ #include < sstream>
44
45
#include " precomp.hpp"
45
46
#include " opencl_kernels_core.hpp"
46
47
#include " opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
48
+ #include " opencv2/core/opencl/runtime/opencl_core.hpp"
47
49
48
50
namespace cv
49
51
{
@@ -787,6 +789,8 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
787
789
#endif
788
790
789
791
#ifdef HAVE_OPENCL
792
+ extern bool intel_gpu_gemm (UMat A, Size sizeA, UMat B, Size sizeB, UMat D, Size sizeD,
793
+ double alpha, double beta, bool atrans, bool btrans);
790
794
791
795
static bool ocl_gemm ( InputArray matA, InputArray matB, double alpha,
792
796
InputArray matC, double beta, OutputArray matD, int flags )
@@ -805,63 +809,85 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
805
809
bool haveC = matC.kind () != cv::_InputArray::NONE;
806
810
Size sizeA = matA.size (), sizeB = matB.size (), sizeC = haveC ? matC.size () : Size (0 , 0 );
807
811
bool atrans = (flags & GEMM_1_T) != 0 , btrans = (flags & GEMM_2_T) != 0 , ctrans = (flags & GEMM_3_T) != 0 ;
812
+
813
+ CV_Assert ( !haveC || matC.type () == type );
808
814
809
- if (atrans)
810
- sizeA = Size (sizeA.height , sizeA.width );
811
- if (btrans)
812
- sizeB = Size (sizeB.height , sizeB.width );
813
- if (haveC && ctrans)
814
- sizeC = Size (sizeC.height , sizeC.width );
815
+ Size sizeD (((btrans)? sizeB.height : sizeB.width ),
816
+ ((atrans)? sizeA.width : sizeA.height ));
817
+ matD.create (sizeD, type);
815
818
816
- Size sizeD (sizeB. width , sizeA. height );
819
+ UMat A = matA. getUMat (), B = matB. getUMat (), D = matD. getUMat ( );
817
820
818
- CV_Assert ( !haveC || matC.type () == type );
819
- CV_Assert ( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
820
821
821
- int max_wg_size = (int )dev.maxWorkGroupSize ();
822
- int block_size = (max_wg_size / (32 *cn) < 32 ) ? (max_wg_size / (16 *cn) < 16 ) ? (max_wg_size / (8 *cn) < 8 ) ? 1 : 8 : 16 : 32 ;
822
+ if (!dev.intelSubgroupsSupport () || (depth == CV_64F) || cn != 1 )
823
+ {
824
+ String opts;
823
825
824
- matD.create (sizeD, type);
826
+ if (atrans)
827
+ sizeA = Size (sizeA.height , sizeA.width );
828
+ if (btrans)
829
+ sizeB = Size (sizeB.height , sizeB.width );
830
+ if (haveC && ctrans)
831
+ sizeC = Size (sizeC.height , sizeC.width );
825
832
826
- UMat A = matA. getUMat (), B = matB. getUMat (), D = matD. getUMat ( );
833
+ CV_Assert ( sizeA. width == sizeB. height && (!haveC || sizeC == sizeD) );
827
834
828
- if (atrans)
829
- A = A. t () ;
835
+ int max_wg_size = ( int )dev. maxWorkGroupSize ();
836
+ int block_size = (max_wg_size / ( 32 *cn) < 32 ) ? (max_wg_size / ( 16 *cn) < 16 ) ? (max_wg_size / ( 8 *cn) < 8 ) ? 1 : 8 : 16 : 32 ;
830
837
831
- if (btrans )
832
- B = B .t ();
838
+ if (atrans )
839
+ A = A .t ();
833
840
834
- if (haveC)
835
- ctrans ? transpose (matC, D) : matC.copyTo (D);
841
+ if (btrans)
842
+ B = B.t ();
843
+
844
+ if (haveC)
845
+ ctrans ? transpose (matC, D) : matC.copyTo (D);
836
846
837
- int vectorWidths[] = { 4 , 4 , 2 , 2 , 1 , 4 , cn, -1 };
838
- int kercn = ocl::checkOptimalVectorWidth (vectorWidths, B, D);
847
+ int vectorWidths[] = { 4 , 4 , 2 , 2 , 1 , 4 , cn, -1 };
848
+ int kercn = ocl::checkOptimalVectorWidth (vectorWidths, B, D);
839
849
840
- String opts = format (" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s" ,
850
+ opts + = format (" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s" ,
841
851
ocl::typeToStr (type), ocl::typeToStr (depth), ocl::typeToStr (CV_MAKETYPE (depth, kercn)),
842
852
cn, kercn, block_size,
843
853
(sizeA.width % block_size !=0 ) ? " -D NO_MULT" : " " ,
844
854
haveC ? " -D HAVE_C" : " " ,
845
855
doubleSupport ? " -D DOUBLE_SUPPORT" : " " );
846
856
847
- ocl::Kernel k (" gemm" , cv::ocl::core::gemm_oclsrc, opts);
848
- if (k.empty ())
849
- return false ;
857
+ ocl::Kernel k (" gemm" , cv::ocl::core::gemm_oclsrc, opts);
858
+ if (k.empty ())
859
+ return false ;
860
+
861
+ if (depth == CV_64F)
862
+ k.args (ocl::KernelArg::ReadOnlyNoSize (A),
863
+ ocl::KernelArg::ReadOnlyNoSize (B, cn, kercn),
864
+ ocl::KernelArg::ReadWrite (D, cn, kercn),
865
+ sizeA.width , alpha, beta);
866
+ else
867
+ k.args (ocl::KernelArg::ReadOnlyNoSize (A),
868
+ ocl::KernelArg::ReadOnlyNoSize (B, cn, kercn),
869
+ ocl::KernelArg::ReadWrite (D, cn, kercn),
870
+ sizeA.width , (float )alpha, (float )beta);
871
+
872
+ size_t globalsize[2 ] = { (size_t )sizeD.width * cn / kercn, (size_t )sizeD.height };
873
+ size_t localsize[2 ] = { (size_t )block_size, (size_t )block_size};
850
874
851
- if (depth == CV_64F)
852
- k.args (ocl::KernelArg::ReadOnlyNoSize (A),
853
- ocl::KernelArg::ReadOnlyNoSize (B, cn, kercn),
854
- ocl::KernelArg::ReadWrite (D, cn, kercn),
855
- sizeA.width , alpha, beta);
875
+ return k.run (2 , globalsize, block_size!=1 ? localsize : NULL , false );
876
+ }
856
877
else
857
- k.args (ocl::KernelArg::ReadOnlyNoSize (A),
858
- ocl::KernelArg::ReadOnlyNoSize (B, cn, kercn),
859
- ocl::KernelArg::ReadWrite (D, cn, kercn),
860
- sizeA.width , (float )alpha, (float )beta);
861
-
862
- size_t globalsize[2 ] = { (size_t )sizeD.width * cn / kercn, (size_t )sizeD.height };
863
- size_t localsize[2 ] = { (size_t )block_size, (size_t )block_size};
864
- return k.run (2 , globalsize, block_size!=1 ? localsize : NULL , false );
878
+ {
879
+ if (haveC && beta != 0.0 )
880
+ {
881
+ ctrans ? transpose (matC, D) : matC.copyTo (D);
882
+ }
883
+
884
+ return intel_gpu_gemm (A, sizeA,
885
+ B, sizeB,
886
+ D, sizeD,
887
+ alpha,
888
+ beta,
889
+ atrans, btrans);
890
+ }
865
891
}
866
892
#endif
867
893
0 commit comments