Accelerating on Apple Silicon M1 chips

hankcs · hankcs · commit c36dbaa26072 · 2021-11-04T04:50:16.000-04:00
diff --git a/docs/install.md b/docs/install.md
@@ -63,10 +63,16 @@ Installation on Windows is **perfectly** supported. The full version `hanlp[full
 ```
 ````
 
+````{margin} **Apple Silicon**
+```{note}
+HanLP also perfectly supports acceleration on Apple Silicon M1 chips; see the [tutorial](https://www.hankcs.com/nlp/hanlp-official-m1-support.html) for details.
+```
+````
+
 | Flavor  | Description                                                  |
 | ------- | ------------------------------------------------------------ |
 | default | This installs the default version which delivers the most commonly used functionalities. However, some heavy dependencies like TensorFlow are not installed. |
-| full    | For experts who seek to maximize the efficiency via TensorFlow and C++ extensions, `pip install hanlp[full]` installs every dependency HanLP will use in production. `hanlp[full]` requires `Python<=3.8` due to `tensorflow==2.3.0`, which can be easily installed through conda: `conda install python=3.8 -y` |
+| full    | For experts who seek to maximize efficiency via TensorFlow and C++ extensions, `pip install hanlp[full]` installs every dependency HanLP will use in production. |
 
 
 ## Install Models
@@ -103,4 +109,3 @@ Some TensorFlow/fastText models will ask you to install the missing TensorFlow/f
 pip install hanlp[full]
 ```
 
-
diff --git a/hanlp/components/classifiers/transformer_classifier_tf.py b/hanlp/components/classifiers/transformer_classifier_tf.py
@@ -50,11 +50,11 @@ def inputs_to_samples(self, inputs, gold=False):
             attention_mask = [1] * len(token_ids)
             diff = max_length - len(token_ids)
             if diff < 0:
-                logger.warning(
-                    f'Input tokens {tokens} exceed the max sequence length of {max_length - 2}. '
-                    f'The exceeded part will be truncated and ignored. '
-                    f'You are recommended to split your long text into several sentences within '
-                    f'{max_length - 2} tokens beforehand.')
+                # logger.warning(
+                #     f'Input tokens {tokens} exceed the max sequence length of {max_length - 2}. '
+                #     f'The exceeded part will be truncated and ignored. '
+                #     f'You are recommended to split your long text into several sentences within '
+                #     f'{max_length - 2} tokens beforehand.')
                 token_ids = token_ids[:max_length]
                 attention_mask = attention_mask[:max_length]
                 segment_ids = segment_ids[:max_length]
diff --git a/hanlp/version.py b/hanlp/version.py
@@ -2,5 +2,5 @@
 # Author: hankcs
 # Date: 2019-12-28 19:26
 
-__version__ = '2.1.0-alpha.62'
+__version__ = '2.1.0-alpha.63'
 """HanLP version"""
diff --git a/plugins/hanlp_demo/hanlp_demo/zh/tf/cws/train_ctb6_cws_albert.py b/plugins/hanlp_demo/hanlp_demo/zh/tf/cws/train_ctb6_cws_albert.py
@@ -3,7 +3,7 @@
 # Date: 2019-12-28 22:22
 
 from hanlp.components.tok_tf import TransformerTokenizerTF
-from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
+from hanlp.datasets.cws.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
 from tests import cdroot
 
 cdroot()
diff --git a/plugins/hanlp_demo/hanlp_demo/zh/tf/cws/train_ctb6_cws_bert.py b/plugins/hanlp_demo/hanlp_demo/zh/tf/cws/train_ctb6_cws_bert.py
@@ -3,14 +3,14 @@
 # Date: 2019-12-28 22:22
 
 from hanlp.components.tok_tf import TransformerTokenizerTF
-from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
+from hanlp.datasets.cws.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
 from tests import cdroot
 
 cdroot()
 tokenizer = TransformerTokenizerTF()
 save_dir = 'data/model/cws_bert_base_ctb6'
-# tagger.fit(CTB6_CWS_TRAIN, CTB6_CWS_DEV, save_dir, transformer='bert-base-chinese',
-#               metrics='f1')
+tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_DEV, save_dir, transformer='chinese_L-12_H-768_A-12',
+              metrics='f1')
 tokenizer.load(save_dir)
 print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
 tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
diff --git a/setup.py b/setup.py
@@ -49,8 +49,8 @@
     extras_require={
         'full': [
             'fasttext==0.9.1',
-            'tensorflow==2.3.0',
-            'bert-for-tf2==0.14.6',
+            'tensorflow==2.6.0',
+            'bert-for-tf2-mod==0.14.10',
             'py-params==0.9.7',
             'params-flow==0.8.2',
             'penman==0.6.2',