webgpu: Increase work per thread for maxpool (tensorflow#2628)

qjia7 · annxingyuan · annxingyuan · commit 677d48e4ccc8 · 2020-01-06T13:14:21.000-05:00
PERF

With this change, maxPool on a [1, 131, 131, 64] input shows a 50% to 90%
speedup across different platforms.

* Add workPerThread to shaderKey

Co-authored-by: Ann Yuan <annyuan@google.com>
diff --git a/tfjs-backend-webgpu/src/benchmark_ops_test.ts b/tfjs-backend-webgpu/src/benchmark_ops_test.ts
@@ -159,7 +159,7 @@ describeWebGPU('Ops benchmarks', () => {
   it('maxPool', async () => {
     const x = tf.randomNormal<tf.Rank.R4>([1, 131, 131, 64]);
 
-    await time(() => tf.maxPool(x, 2, 1, 'same'));
+    await time(() => tf.maxPool(x, 2, 1, 'same'), null, true, 10, 10);
   });
 
   it('prelu', async () => {
diff --git a/tfjs-backend-webgpu/src/kernels/maxpool_webgpu.ts b/tfjs-backend-webgpu/src/kernels/maxpool_webgpu.ts
@@ -29,17 +29,20 @@ export class MaxPoolProgram implements WebGPUProgram {
   dispatch: [number, number, number];
   variableNames = ['x'];
   uniforms = 'ivec2 pad, stride, dilation, convDims, filterDims;';
-  workGroupSize: [number, number, number] = [4, 4, 4];
+  // TODO(jiajia.qin@intel.com): Dynamically choose different workGroupSize and
+  // workPerThread for different output shapes.
+  workGroupSize: [number, number, number] = [4, 4, 1];
+  workPerThread = 16;
 
   constructor(convInfo: backend_util.Conv2DInfo) {
     this.outputShape = convInfo.outShape;
 
-    this.dispatchLayout = {x: [2], y: [1], z: [0, 3]};
+    this.dispatchLayout = {x: [0, 1], y: [2], z: [3]};
 
     this.dispatch = computeDispatch(
-        this.dispatchLayout, this.outputShape, this.workGroupSize);
+        this.dispatchLayout, this.outputShape, this.workGroupSize,
+        [1, 1, this.workPerThread]);
 
-    // TODO: Parallelize max computation by thread and merge result.
     this.userCode = `
       float getValue(int batch, int xR, int xC, int d) {
         if (xC < 0 || xC >= convDims.x) {
@@ -50,15 +53,17 @@ export class MaxPoolProgram implements WebGPUProgram {
 
       void main() {
         ivec4 coords = getOutputCoords();
-        int batch = coords[0];
-        int d = coords[3];
-
         if (all(lessThan(coords, outShape))) {
+          int batch = coords[0];
           ivec2 xRCCorner = coords.yz * stride - pad;
           int xRCorner = xRCCorner.x;
           int xCCorner = xRCCorner.y;
 
-          float minMaxValue = 0.0;
+          float minMaxValue[${this.workPerThread}];
+          for (int i = 0; i < ${this.workPerThread}; i++)
+          {
+            minMaxValue[i] = 0.0;
+          }
 
           for (int wR = 0; wR < filterDims.y; wR += dilation.y) {
             int xR = xRCorner + wR;
@@ -69,14 +74,36 @@ export class MaxPoolProgram implements WebGPUProgram {
 
             for (int wC = 0; wC < filterDims.x; wC += dilation.x) {
               int xC = xCCorner + wC * dilation.x;
-              float value = getValue(batch, xR, xC, d);
-              minMaxValue = max(value, minMaxValue);
+              for (int i = 0; i < ${this.workPerThread}; i++)
+              {
+                int d = coords[3] * ${this.workPerThread} + i;
+                if (d < outShape[3])
+                {
+                  float value = getValue(batch, xR, xC, d);
+                  minMaxValue[i] = max(value, minMaxValue[i]);
+                }
+                else
+                {
+                  break;
+                }
+              }
+            }
+          }
+          for (int i = 0; i < ${this.workPerThread}; i++)
+          {
+            int d = coords[3] * ${this.workPerThread} + i;
+            if (d < outShape[3])
+            {
+              setOutput(batch, coords[1], coords[2], d, minMaxValue[i]);
+            }
+            else
+            {
+              break;
             }
           }
-          setOutput(batch, coords[1], coords[2], d, minMaxValue);
         }
       }
     `;
-   this.shaderKey = 'maxpool';
+    this.shaderKey = `maxpool${this.workPerThread}`;
   }
 }