Skip to content

Commit ba18616

Browse files
committed
Add support for making predictions on huge files
1 parent 8decdb1 commit ba18616

File tree

5 files changed

+70
-31
lines changed

5 files changed

+70
-31
lines changed

supar/parsers/const.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def evaluate(self, data, buckets=8, workers=0, batch_size=5000, mbr=True,
9999

100100
return super().evaluate(**Config().update(locals()))
101101

102-
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, mbr=True,
102+
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, cache=False, mbr=True,
103103
verbose=True, **kwargs):
104104
r"""
105105
Args:
@@ -121,6 +121,8 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
121121
The number of tokens in each batch. Default: 5000.
122122
prob (bool):
123123
If ``True``, outputs the probabilities. Default: ``False``.
124+
cache (bool):
125+
If ``True``, caches the data first, suggested if parsing huge files (e.g., > 1M sentences). Default: ``False``.
124126
mbr (bool):
125127
If ``True``, performs MBR decoding. Default: ``True``.
126128
verbose (bool):
@@ -129,7 +131,7 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
129131
A dict holding unconsumed arguments for updating prediction configs.
130132
131133
Returns:
132-
A :class:`~supar.utils.Dataset` object that stores the predicted results.
134+
A :class:`~supar.utils.Dataset` object containing all predictions if ``cache=False``, otherwise ``None``.
133135
"""
134136

135137
return super().predict(**Config().update(locals()))
@@ -227,6 +229,7 @@ def _predict(self, loader):
227229
for tree, chart in zip(trees, chart_preds)]
228230
if self.args.prob:
229231
batch.probs = [prob[:i-1, 1:i].cpu() for i, prob in zip(lens, s_span)]
232+
yield from batch.sentences
230233

231234
@classmethod
232235
def build(cls, path, min_freq=2, fix_len=20, **kwargs):
@@ -398,7 +401,8 @@ def evaluate(self, data, buckets=8, workers=0, batch_size=5000,
398401

399402
return super().evaluate(**Config().update(locals()))
400403

401-
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, verbose=True, **kwargs):
404+
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, cache=False,
405+
verbose=True, **kwargs):
402406
r"""
403407
Args:
404408
data (str or Iterable):
@@ -419,6 +423,8 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
419423
The number of tokens in each batch. Default: 5000.
420424
prob (bool):
421425
If ``True``, outputs the probabilities. Default: ``False``.
426+
cache (bool):
427+
If ``True``, caches the data first, suggested if parsing huge files (e.g., > 1M sentences). Default: ``False``.
422428
mbr (bool):
423429
If ``True``, performs MBR decoding. Default: ``True``.
424430
verbose (bool):
@@ -427,7 +433,7 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
427433
A dict holding unconsumed arguments for updating prediction configs.
428434
429435
Returns:
430-
A :class:`~supar.utils.Dataset` object that stores the predicted results.
436+
A :class:`~supar.utils.Dataset` object containing all predictions if ``cache=False``, otherwise ``None``.
431437
"""
432438

433439
return super().predict(**Config().update(locals()))
@@ -525,3 +531,4 @@ def _predict(self, loader):
525531
for tree, chart in zip(trees, chart_preds)]
526532
if self.args.prob:
527533
batch.probs = [prob[:i-1, 1:i].cpu() for i, prob in zip(lens, s_span)]
534+
yield from batch.sentences

supar/parsers/dep.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def evaluate(self, data, buckets=8, workers=0, batch_size=5000,
9494

9595
return super().evaluate(**Config().update(locals()))
9696

97-
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False,
97+
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, cache=False,
9898
tree=True, proj=False, verbose=True, **kwargs):
9999
r"""
100100
Args:
@@ -116,6 +116,8 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
116116
The number of tokens in each batch. Default: 5000.
117117
prob (bool):
118118
If ``True``, outputs the probabilities. Default: ``False``.
119+
cache (bool):
120+
If ``True``, caches the data first, suggested if parsing huge files (e.g., > 1M sentences). Default: ``False``.
119121
tree (bool):
120122
If ``True``, ensures to output well-formed trees. Default: ``False``.
121123
proj (bool):
@@ -126,7 +128,7 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
126128
A dict holding unconsumed arguments for updating prediction configs.
127129
128130
Returns:
129-
A :class:`~supar.utils.Dataset` object that stores the predicted results.
131+
A :class:`~supar.utils.Dataset` object containing all predictions if ``cache=False``, otherwise ``None``.
130132
"""
131133

132134
return super().predict(**Config().update(locals()))
@@ -233,6 +235,7 @@ def _predict(self, loader):
233235
batch.rels = [self.REL.vocab[i.tolist()] for i in rel_preds[mask].split(lens)]
234236
if self.args.prob:
235237
batch.probs = [prob[1:i+1, :i+1].cpu() for i, prob in zip(lens, s_arc.softmax(-1).unbind())]
238+
yield from batch.sentences
236239

237240
@classmethod
238241
def build(cls, path, min_freq=2, fix_len=20, **kwargs):
@@ -408,7 +411,7 @@ def evaluate(self, data, buckets=8, workers=0, batch_size=5000, punct=False,
408411

409412
return super().evaluate(**Config().update(locals()))
410413

411-
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False,
414+
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, cache=False,
412415
mbr=True, tree=True, proj=True, verbose=True, **kwargs):
413416
r"""
414417
Args:
@@ -430,6 +433,8 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
430433
The number of tokens in each batch. Default: 5000.
431434
prob (bool):
432435
If ``True``, outputs the probabilities. Default: ``False``.
436+
cache (bool):
437+
If ``True``, caches the data first, suggested if parsing huge files (e.g., > 1M sentences). Default: ``False``.
433438
mbr (bool):
434439
If ``True``, performs MBR decoding. Default: ``True``.
435440
tree (bool):
@@ -442,7 +447,7 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
442447
A dict holding unconsumed arguments for updating prediction configs.
443448
444449
Returns:
445-
A :class:`~supar.utils.Dataset` object that stores the predicted results.
450+
A :class:`~supar.utils.Dataset` object containing all predictions if ``cache=False``, otherwise ``None``.
446451
"""
447452

448453
return super().predict(**Config().update(locals()))
@@ -553,6 +558,7 @@ def _predict(self, loader):
553558
if self.args.prob:
554559
arc_probs = s_arc if self.args.mbr else s_arc.softmax(-1)
555560
batch.probs = [prob[1:i+1, :i+1].cpu() for i, prob in zip(lens, arc_probs.unbind())]
561+
yield from batch.sentences
556562

557563

558564
class CRF2oDependencyParser(BiaffineDependencyParser):
@@ -631,7 +637,7 @@ def evaluate(self, data, buckets=8, workers=0, batch_size=5000, punct=False,
631637

632638
return super().evaluate(**Config().update(locals()))
633639

634-
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False,
640+
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, cache=False,
635641
mbr=True, tree=True, proj=True, verbose=True, **kwargs):
636642
r"""
637643
Args:
@@ -653,6 +659,8 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
653659
The number of tokens in each batch. Default: 5000.
654660
prob (bool):
655661
If ``True``, outputs the probabilities. Default: ``False``.
662+
cache (bool):
663+
If ``True``, caches the data first, suggested if parsing huge files (e.g., > 1M sentences). Default: ``False``.
656664
mbr (bool):
657665
If ``True``, performs MBR decoding. Default: ``True``.
658666
tree (bool):
@@ -665,7 +673,7 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
665673
A dict holding unconsumed arguments for updating prediction configs.
666674
667675
Returns:
668-
A :class:`~supar.utils.Dataset` object that stores the predicted results.
676+
A :class:`~supar.utils.Dataset` object containing all predictions if ``cache=False``, otherwise ``None``.
669677
"""
670678

671679
return super().predict(**Config().update(locals()))
@@ -775,6 +783,7 @@ def _predict(self, loader):
775783
if self.args.prob:
776784
arc_probs = s_arc if self.args.mbr else s_arc.softmax(-1)
777785
batch.probs = [prob[1:i+1, :i+1].cpu() for i, prob in zip(lens, arc_probs.unbind())]
786+
yield from batch.sentences
778787

779788
@classmethod
780789
def build(cls, path, min_freq=2, fix_len=20, **kwargs):
@@ -945,7 +954,7 @@ def evaluate(self, data, buckets=8, workers=0, batch_size=5000, punct=False,
945954

946955
return super().evaluate(**Config().update(locals()))
947956

948-
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False,
957+
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, cache=False,
949958
tree=True, proj=True, verbose=True, **kwargs):
950959
r"""
951960
Args:
@@ -967,6 +976,8 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
967976
The number of tokens in each batch. Default: 5000.
968977
prob (bool):
969978
If ``True``, outputs the probabilities. Default: ``False``.
979+
cache (bool):
980+
If ``True``, caches the data first, suggested if parsing huge files (e.g., > 1M sentences). Default: ``False``.
970981
tree (bool):
971982
If ``True``, ensures to output well-formed trees. Default: ``False``.
972983
proj (bool):
@@ -977,7 +988,7 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
977988
A dict holding unconsumed arguments for updating prediction configs.
978989
979990
Returns:
980-
A :class:`~supar.utils.Dataset` object that stores the predicted results.
991+
A :class:`~supar.utils.Dataset` object containing all predictions if ``cache=False``, otherwise ``None``.
981992
"""
982993

983994
return super().predict(**Config().update(locals()))
@@ -1085,3 +1096,4 @@ def _predict(self, loader):
10851096
batch.rels = [self.REL.vocab[i.tolist()] for i in rel_preds[mask].split(lens)]
10861097
if self.args.prob:
10871098
batch.probs = [prob[1:i+1, :i+1].cpu() for i, prob in zip(lens, s_arc.unbind())]
1099+
yield from batch.sentences

supar/parsers/parser.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,17 @@
22

33
import os
44
from datetime import datetime, timedelta
5+
import shutil
56

67
import dill
78
import supar
89
import torch
910
import torch.distributed as dist
1011
from supar.utils import Config, Dataset
12+
import tempfile
1113
from supar.utils.field import Field
1214
from supar.utils.fn import download, get_rng_state, set_rng_state
13-
from supar.utils.logging import init_logger, logger
15+
from supar.utils.logging import init_logger, logger, progress_bar
1416
from supar.utils.metric import Metric
1517
from supar.utils.parallel import DistributedDataParallel as DDP
1618
from supar.utils.parallel import is_master
@@ -128,7 +130,7 @@ def evaluate(self, data, buckets=8, workers=0, batch_size=5000, **kwargs):
128130

129131
return loss, metric
130132

131-
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, **kwargs):
133+
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, cache=False, **kwargs):
132134
args = self.args.update(locals())
133135
init_logger(logger, verbose=args.verbose)
134136

@@ -143,15 +145,30 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
143145

144146
logger.info("Making predictions on the dataset")
145147
start = datetime.now()
146-
self._predict(dataset.loader)
147-
elapsed = datetime.now() - start
148-
149-
if pred is not None and is_master():
150-
logger.info(f"Saving predicted results to {pred}")
151-
self.transform.save(pred, dataset)
148+
with tempfile.TemporaryDirectory() as t:
149+
# we have clustered the sentences by length here to speed up prediction,
150+
# so the order of the yielded sentences can't be guaranteed
151+
for s in self._predict(dataset.loader):
152+
if args.cache:
153+
with open(os.path.join(t, f"{s.index}"), 'w') as f:
154+
f.write(str(s) + '\n')
155+
elapsed = datetime.now() - start
156+
157+
if pred is not None and is_master():
158+
logger.info(f"Saving predicted results to {pred}")
159+
with open(pred, 'w') as f:
160+
# merge all predictions into one single file
161+
if args.cache:
162+
for s in progress_bar(sorted(os.listdir(t), key=lambda x: int(x))):
163+
with open(os.path.join(t, s)) as s:
164+
shutil.copyfileobj(s, f)
165+
else:
166+
for s in progress_bar(dataset):
167+
f.write(str(s) + '\n')
152168
logger.info(f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s")
153169

154-
return dataset
170+
if not cache:
171+
return dataset
155172

156173
def _train(self, loader):
157174
raise NotImplementedError

supar/parsers/sdp.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ def evaluate(self, data, buckets=8, workers=0, batch_size=5000, verbose=True, **
7575

7676
return super().evaluate(**Config().update(locals()))
7777

78-
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, verbose=True, **kwargs):
78+
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, cache=False,
79+
verbose=True, **kwargs):
7980
r"""
8081
Args:
8182
data (str or Iterable):
@@ -96,13 +97,15 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
9697
The number of tokens in each batch. Default: 5000.
9798
prob (bool):
9899
If ``True``, outputs the probabilities. Default: ``False``.
100+
cache (bool):
101+
If ``True``, caches the data first, suggested if parsing huge files (e.g., > 1M sentences). Default: ``False``.
99102
verbose (bool):
100103
If ``True``, increases the output verbosity. Default: ``True``.
101104
kwargs (dict):
102105
A dict holding unconsumed arguments for updating prediction configs.
103106
104107
Returns:
105-
A :class:`~supar.utils.Dataset` object that stores the predicted results.
108+
A :class:`~supar.utils.Dataset` object containing all predictions if ``cache=False``, otherwise ``None``.
106109
"""
107110

108111
return super().predict(**Config().update(locals()))
@@ -201,6 +204,7 @@ def _predict(self, loader):
201204
for i, chart in zip(lens, label_preds)]
202205
if self.args.prob:
203206
batch.probs = [prob[1:i, :i].cpu() for i, prob in zip(lens, s_edge.softmax(-1).unbind())]
207+
yield from batch.sentences
204208

205209
@classmethod
206210
def build(cls, path, min_freq=7, fix_len=20, **kwargs):
@@ -360,7 +364,8 @@ def evaluate(self, data, buckets=8, workers=0, batch_size=5000, verbose=True, **
360364

361365
return super().evaluate(**Config().update(locals()))
362366

363-
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, verbose=True, **kwargs):
367+
def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5000, prob=False, cache=False,
368+
verbose=True, **kwargs):
364369
r"""
365370
Args:
366371
data (str or Iterable):
@@ -381,13 +386,15 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
381386
The number of tokens in each batch. Default: 5000.
382387
prob (bool):
383388
If ``True``, outputs the probabilities. Default: ``False``.
389+
cache (bool):
390+
If ``True``, caches the data first, suggested if parsing huge files (e.g., > 1M sentences). Default: ``False``.
384391
verbose (bool):
385392
If ``True``, increases the output verbosity. Default: ``True``.
386393
kwargs (dict):
387394
A dict holding unconsumed arguments for updating prediction configs.
388395
389396
Returns:
390-
A :class:`~supar.utils.Dataset` object that stores the predicted results.
397+
A :class:`~supar.utils.Dataset` object containing all predictions if ``cache=False``, otherwise ``None``.
391398
"""
392399

393400
return super().predict(**Config().update(locals()))
@@ -487,3 +494,4 @@ def _predict(self, loader):
487494
for i, chart in zip(lens, label_preds)]
488495
if self.args.prob:
489496
batch.probs = [prob[1:i, :i].cpu() for i, prob in zip(lens, s_edge.unbind())]
497+
yield from batch.sentences

supar/utils/transform.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from torch.distributions.utils import lazy_property
2323

2424
if TYPE_CHECKING:
25-
from supar.utils import Dataset, Field
25+
from supar.utils import Field
2626

2727

2828
class Transform(object):
@@ -129,11 +129,6 @@ def src(self):
129129
def tgt(self):
130130
raise AttributeError
131131

132-
def save(self, path: str, data: Dataset) -> None:
133-
with open(path, 'w') as f:
134-
for i in data:
135-
f.write(str(i) + '\n')
136-
137132

138133
class CoNLL(Transform):
139134
r"""

0 commit comments

Comments (0)