Pad the input of the face landmark net to a centered square to avoid stretching non-square images

justadudewhohacks · justadudewhohacks · commit 554bbe05db13 · 2018-07-01T15:43:32.000+02:00
diff --git a/src/Rect.ts b/src/Rect.ts
@@ -18,6 +18,20 @@ export class Rect implements IRect {
     this.height = height
   }
 
+  public toSquare(): Rect { // Returns a new Rect in which the shorter side is grown to the length of the longer side; `this` is left untouched.
+    let { x, y, width, height } = this // local copies that are adjusted below
+    const diff = Math.abs(width - height) // how much the shorter side falls short of the longer one
+    if (width < height) { // taller than wide: grow horizontally
+      x -= (diff / 2) // shift x by half the difference so the added width is split evenly on both sides
+      width += diff
+    }
+    if (height < width) { // wider than tall: grow vertically
+      y -= (diff / 2) // shift y by half the difference so the added height is split evenly on both sides
+      height += diff
+    }
+    return new Rect(x, y, width, height) // x / y can become fractional or negative; no rounding or clamping happens here
+  }
+
   public floor(): Rect {
     return new Rect(
       Math.floor(this.x),
diff --git a/src/faceLandmarkNet/FaceLandmarkNet.ts b/src/faceLandmarkNet/FaceLandmarkNet.ts
@@ -4,9 +4,11 @@ import { convLayer } from '../commons/convLayer';
 import { getImageTensor } from '../commons/getImageTensor';
 import { ConvParams } from '../commons/types';
 import { NetInput } from '../NetInput';
+import { padToSquare } from '../padToSquare';
 import { Point } from '../Point';
 import { toNetInput } from '../toNetInput';
 import { Dimensions, TNetInput } from '../types';
+import { isEven } from '../utils';
 import { extractParams } from './extractParams';
 import { FaceLandmarks } from './FaceLandmarks';
 import { fullyConnectedLayer } from './fullyConnectedLayer';
@@ -41,31 +43,25 @@ export class FaceLandmarkNet {
     this._params = extractParams(weights)
   }
 
-  public async detectLandmarks(input: tf.Tensor | NetInput | TNetInput) {
-    if (!this._params) {
+  public forwardTensor(imgTensor: tf.Tensor4D): tf.Tensor2D {
+    const params = this._params
+
+    if (!params) {
       throw new Error('FaceLandmarkNet - load model before inference')
     }
 
-    const netInput = input instanceof tf.Tensor
-      ? input
-      : await toNetInput(input)
-
-    let imageDimensions: Dimensions | undefined
-
-    const outTensor = tf.tidy(() => {
-      const params = this._params
-
-      let imgTensor = getImageTensor(netInput)
-      const [height, width] = imgTensor.shape.slice(1)
-      imageDimensions = { width, height }
+    return tf.tidy(() => {
+      const [batchSize, height, width] = imgTensor.shape.slice()
 
+      let x = padToSquare(imgTensor, true)
+      const [heightAfterPadding, widthAfterPadding] = x.shape.slice(1)
 
       // work with 128 x 128 sized face images
-      if (imgTensor.shape[1] !== 128 || imgTensor.shape[2] !== 128) {
-        imgTensor = tf.image.resizeBilinear(imgTensor, [128, 128])
+      if (heightAfterPadding !== 128 || widthAfterPadding !== 128) {
+        x = tf.image.resizeBilinear(x, [128, 128])
       }
 
-      let out = conv(imgTensor, params.conv0_params)
+      let out = conv(x, params.conv0_params)
       out = maxPool(out)
       out = conv(out, params.conv1_params)
       out = conv(out, params.conv2_params)
@@ -80,14 +76,58 @@ export class FaceLandmarkNet {
       const fc0 = tf.relu(fullyConnectedLayer(out.as2D(out.shape[0], -1), params.fc0_params))
       const fc1 = fullyConnectedLayer(fc0, params.fc1_params)
 
-      return fc1
+
+      const createInterleavedTensor = (fillX: number, fillY: number) =>
+      tf.stack([
+        tf.fill([68], fillX),
+        tf.fill([68], fillY)
+      ], 1).as2D(batchSize, 136)
+
+
+      /* shift coordinates back, to undo centered padding
+        ((x * widthAfterPadding) - shiftX) / width
+        ((y * heightAfterPadding) - shiftY) / height
+      */
+      const shiftX = Math.floor(Math.abs(widthAfterPadding - width) / 2)
+      const shiftY = Math.floor(Math.abs(heightAfterPadding - height) / 2)
+      const landmarkTensor = fc1
+        .mul(createInterleavedTensor(widthAfterPadding, heightAfterPadding))
+        .sub(createInterleavedTensor(shiftX, shiftY))
+        .div(createInterleavedTensor(width, height))
+
+      return landmarkTensor as tf.Tensor2D
+    })
+  }
+
+  public async forward(input: tf.Tensor | NetInput | TNetInput): Promise<tf.Tensor2D> { // Async wrapper around forwardTensor: normalizes `input`, then resolves to forwardTensor's result (forwardTensor throws if the model has not been loaded).
+    const netInput = input instanceof tf.Tensor // tensors are used as-is
+      ? input
+      : await toNetInput(input) // every other input kind is converted with toNetInput first
+
+    return this.forwardTensor(getImageTensor(netInput)) // NOTE(review): getImageTensor is called outside tf.tidy here, whereas detectLandmarks calls it inside one; confirm no intermediate tensor is leaked
+  }
+
+  public async detectLandmarks(input: tf.Tensor | NetInput | TNetInput) {
+    const netInput = input instanceof tf.Tensor
+      ? input
+      : await toNetInput(input)
+
+    let imageDimensions: Dimensions | undefined
+
+    const outTensor = tf.tidy(() => {
+      const imgTensor = getImageTensor(netInput)
+
+      const [height, width] = imgTensor.shape.slice(1)
+      imageDimensions = { width, height }
+
+      return this.forwardTensor(imgTensor)
     })
 
     const faceLandmarksArray = Array.from(await outTensor.data())
     outTensor.dispose()
 
-    const xCoords = faceLandmarksArray.filter((c, i) => (i - 1) % 2)
-    const yCoords = faceLandmarksArray.filter((c, i) => i % 2)
+    const xCoords = faceLandmarksArray.filter((_, i) => isEven(i))
+    const yCoords = faceLandmarksArray.filter((_, i) => !isEven(i))
 
     return new FaceLandmarks(
       Array(68).fill(0).map((_, i) => new Point(xCoords[i], yCoords[i])),
diff --git a/test/tests/e2e/faceLandmarkNet.test.ts b/test/tests/e2e/faceLandmarkNet.test.ts
@@ -31,7 +31,10 @@ describe('faceLandmarkNet', () => {
       expect(result.getImageHeight()).toEqual(height)
       expect(result.getShift().x).toEqual(0)
       expect(result.getShift().y).toEqual(0)
-      expect(result.getPositions().map(({ x, y }) => ({ x, y }))).toEqual(faceLandmarkPositions)
+      result.getPositions().forEach(({ x, y }, i) => {
+        expectMaxDelta(x, faceLandmarkPositions[i].x, 0.1)
+        expectMaxDelta(y, faceLandmarkPositions[i].y, 0.1)
+      })
     })
 
   })