@@ -13,11 +13,11 @@ def __init__(self, p=0.5, batch_first=True):
         self.batch_first = batch_first
 
     def extra_repr(self):
-        info = f"p={self.p}"
+        s = f"p={self.p}"
         if self.batch_first:
-            info += f", batch_first={self.batch_first}"
+            s += f", batch_first={self.batch_first}"
 
-        return info
+        return s
 
     def forward(self, x):
         if self.training:
@@ -31,8 +31,8 @@ def forward(self, x):
 
     @staticmethod
     def get_mask(x, p):
-        mask = x.new_full(x.shape, 1 - p)
-        mask = torch.bernoulli(mask) / (1 - p)
+        mask = x.new_empty(x.shape).bernoulli_(1 - p)
+        mask = mask / (1 - p)
 
         return mask
 
@@ -47,14 +47,14 @@ def __init__(self, p=0.5):
     def extra_repr(self):
         return f"p={self.p}"
 
-    def forward(self, x, y, eps=1e-12):
+    def forward(self, *items):
         if self.training:
-            x_mask = torch.bernoulli(x.new_full(x.shape[:2], 1 - self.p))
-            y_mask = torch.bernoulli(y.new_full(y.shape[:2], 1 - self.p))
-            scale = 3.0 / (2.0 * x_mask + y_mask + eps)
-            x_mask *= scale
-            y_mask *= scale
-            x *= x_mask.unsqueeze(dim=-1)
-            y *= y_mask.unsqueeze(dim=-1)
-
-            return x, y
+            masks = [x.new_empty(x.shape[:2]).bernoulli_(1 - self.p)
+                     for x in items]
+            total = sum(masks)
+            scale = len(items) / total.max(torch.ones_like(total))
+            masks = [mask * scale for mask in masks]
+            items = [item * mask.unsqueeze(dim=-1)
+                     for item, mask in zip(items, masks)]
+
+            return items
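
For context, a minimal standalone sketch of what the rewritten mask logic computes. The tensor shapes, the choice of two inputs, and the variable names here are illustrative assumptions, not taken from the diff:

# Sketch only (not part of the commit); assumes inputs of shape [batch, seq_len, hidden].
import torch

p = 0.5
x, y = torch.ones(2, 5, 4), torch.ones(2, 5, 4)

# get_mask-style mask: in-place Bernoulli(1 - p) draw, rescaled by 1 / (1 - p)
# so the masked tensor keeps its expectation.
mask = x.new_empty(x.shape).bernoulli_(1 - p) / (1 - p)

# forward(*items)-style independent dropout over any number of inputs:
items = (x, y)
masks = [item.new_empty(item.shape[:2]).bernoulli_(1 - p) for item in items]
total = sum(masks)
# Surviving items are scaled up to compensate for dropped ones; clamping the
# divisor at 1 avoids division by zero where every item was dropped.
scale = len(items) / total.max(torch.ones_like(total))
masks = [m * scale for m in masks]
out = [item * m.unsqueeze(dim=-1) for item, m in zip(items, masks)]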