ConvBERT fix torch <> tf weights conversion (huggingface#10314)

abhishekkrthakur · patrickvonplaten · web-flow · commit 2d458b2c7d6f · 2021-02-24T14:55:34.000+03:00
* convbert conversion test

* fin

* fin

* fin

* clean up tf<->pt conversion

* remove from_pt

Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py
@@ -56,7 +56,11 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
         tf_name = tf_name[1:]  # Remove level zero
 
     # When should we transpose the weights
-    transpose = bool(tf_name[-1] == "kernel" or "emb_projs" in tf_name or "out_projs" in tf_name)
+    transpose = bool(
+        tf_name[-1] in ["kernel", "pointwise_kernel", "depthwise_kernel"]
+        or "emb_projs" in tf_name
+        or "out_projs" in tf_name
+    )
 
     # Convert standard TF2.0 names in PyTorch names
     if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma":
diff --git a/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py b/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py
@@ -16,7 +16,7 @@
 
 import argparse
 
-from transformers import ConvBertConfig, ConvBertModel, load_tf_weights_in_convbert
+from transformers import ConvBertConfig, ConvBertModel, TFConvBertModel, load_tf_weights_in_convbert
 from transformers.utils import logging
 
 
@@ -30,6 +30,9 @@ def convert_orig_tf1_checkpoint_to_pytorch(tf_checkpoint_path, convbert_config_f
     model = load_tf_weights_in_convbert(model, conf, tf_checkpoint_path)
     model.save_pretrained(pytorch_dump_path)
 
+    tf_model = TFConvBertModel.from_pretrained(pytorch_dump_path, from_pt=True)
+    tf_model.save_pretrained(pytorch_dump_path)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py
@@ -343,7 +343,7 @@ def __init__(self, input_size, output_size, num_groups, kernel_initializer, **kw
     def build(self, input_shape):
         self.kernel = self.add_weight(
             "kernel",
-            shape=[self.num_groups, self.group_in_dim, self.group_out_dim],
+            shape=[self.group_out_dim, self.group_in_dim, self.num_groups],
             initializer=self.kernel_initializer,
             trainable=True,
         )
@@ -355,7 +355,7 @@ def build(self, input_shape):
     def call(self, hidden_states):
         batch_size = shape_list(hidden_states)[0]
         x = tf.transpose(tf.reshape(hidden_states, [-1, self.num_groups, self.group_in_dim]), [1, 0, 2])
-        x = tf.matmul(x, self.kernel)
+        x = tf.matmul(x, tf.transpose(self.kernel, [2, 1, 0]))
         x = tf.transpose(x, [1, 0, 2])
         x = tf.reshape(x, [batch_size, -1, self.output_size])
         x = tf.nn.bias_add(value=x, bias=self.bias)
diff --git a/tests/test_modeling_tf_convbert.py b/tests/test_modeling_tf_convbert.py
@@ -399,14 +399,12 @@ def test_inference_masked_lm(self):
         expected_shape = [1, 6, 768]
         self.assertEqual(output.shape, expected_shape)
 
-        print(output[:, :3, :3])
-
         expected_slice = tf.constant(
             [
                 [
-                    [-0.10334751, -0.37152207, -0.2682219],
-                    [0.20078957, -0.3918426, -0.78811496],
-                    [0.08000169, -0.509474, -0.59314483],
+                    [-0.03475493, -0.4686034, -0.30638832],
+                    [0.22637248, -0.26988646, -0.7423424],
+                    [0.10324868, -0.45013508, -0.58280784],
                 ]
             ]
         )

Original file line number	Diff line number	Diff line change
`@@ -399,14 +399,12 @@ def test_inference_masked_lm(self):`
`399`	`399`	`expected_shape = [1, 6, 768]`
`400`	`400`	`self.assertEqual(output.shape, expected_shape)`
`401`	`401`
`402`		`- print(output[:, :3, :3])`
`403`		`-`
`404`	`402`	`expected_slice = tf.constant(`
`405`	`403`	`[`
`406`	`404`	`[`
`407`		`- [-0.10334751, -0.37152207, -0.2682219],`
`408`		`- [0.20078957, -0.3918426, -0.78811496],`
`409`		`- [0.08000169, -0.509474, -0.59314483],`
	`405`	`+ [-0.03475493, -0.4686034, -0.30638832],`
	`406`	`+ [0.22637248, -0.26988646, -0.7423424],`
	`407`	`+ [0.10324868, -0.45013508, -0.58280784],`
`410`	`408`	`]`
`411`	`409`	`]`
`412`	`410`	`)`