Support parsing plain texts

yzhangcs · yzhangcs · commit 8decdb1aeb18 · 2022-06-17T02:41:59.000Z
diff --git a/supar/parsers/const.py b/supar/parsers/const.py
@@ -104,7 +104,9 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
         r"""
         Args:
             data (str or Iterable):
-                The data for prediction. Both a filename and a list of instances are allowed.
+                The data for prediction.
+                - a filename. If it ends with ``.txt``, the parser will make predictions line by line from plain text.
+                - a list of instances.
             pred (str):
                 If specified, the predicted results will be saved to the file. Default: ``None``.
             lang (str):
@@ -400,7 +402,9 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
         r"""
         Args:
             data (str or Iterable):
-                The data for prediction. Both a filename and a list of instances are allowed.
+                The data for prediction.
+                - a filename. If it ends with ``.txt``, the parser will make predictions line by line from plain text.
+                - a list of instances.
             pred (str):
                 If specified, the predicted results will be saved to the file. Default: ``None``.
             lang (str):
diff --git a/supar/parsers/dep.py b/supar/parsers/dep.py
@@ -99,7 +99,9 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
         r"""
         Args:
             data (str or Iterable):
-                The data for prediction. Both a filename and a list of instances are allowed.
+                The data for prediction.
+                - a filename. If it ends with ``.txt``, the parser will make predictions line by line from plain text.
+                - a list of instances.
             pred (str):
                 If specified, the predicted results will be saved to the file. Default: ``None``.
             lang (str):
@@ -411,7 +413,9 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
         r"""
         Args:
             data (str or Iterable):
-                The data for prediction. Both a filename and a list of instances are allowed.
+                The data for prediction.
+                - a filename. If it ends with ``.txt``, the parser will make predictions line by line from plain text.
+                - a list of instances.
             pred (str):
                 If specified, the predicted results will be saved to the file. Default: ``None``.
             lang (str):
@@ -632,7 +636,9 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
         r"""
         Args:
             data (str or Iterable):
-                The data for prediction. Both a filename and a list of instances are allowed.
+                The data for prediction.
+                - a filename. If it ends with ``.txt``, the parser will make predictions line by line from plain text.
+                - a list of instances.
             pred (str):
                 If specified, the predicted results will be saved to the file. Default: ``None``.
             lang (str):
@@ -944,7 +950,9 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
         r"""
         Args:
             data (str or Iterable):
-                The data for prediction. Both a filename and a list of instances are allowed.
+                The data for prediction.
+                - a filename. If it ends with ``.txt``, the parser will make predictions line by line from plain text.
+                - a list of instances.
             pred (str):
                 If specified, the predicted results will be saved to the file. Default: ``None``.
             lang (str):
diff --git a/supar/parsers/parser.py b/supar/parsers/parser.py
@@ -115,7 +115,7 @@ def evaluate(self, data, buckets=8, workers=0, batch_size=5000, **kwargs):
 
         self.transform.train()
         logger.info("Loading the data")
-        dataset = Dataset(self.transform, data)
+        dataset = Dataset(self.transform, **args)
         dataset.build(batch_size, buckets, False, False, workers)
         logger.info(f"\n{dataset}")
 
@@ -137,7 +137,7 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
             self.transform.append(Field('probs'))
 
         logger.info("Loading the data")
-        dataset = Dataset(self.transform, data, lang=lang)
+        dataset = Dataset(self.transform, **args)
         dataset.build(batch_size, buckets, False, False, workers)
         logger.info(f"\n{dataset}")
 
diff --git a/supar/parsers/sdp.py b/supar/parsers/sdp.py
@@ -79,7 +79,9 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
         r"""
         Args:
             data (str or Iterable):
-                The data for prediction. Both a filename and a list of instances are allowed.
+                The data for prediction.
+                - a filename. If it ends with ``.txt``, the parser will make predictions line by line from plain text.
+                - a list of instances.
             pred (str):
                 If specified, the predicted results will be saved to the file. Default: ``None``.
             lang (str):
@@ -362,7 +364,9 @@ def predict(self, data, pred=None, lang=None, buckets=8, workers=0, batch_size=5
         r"""
         Args:
             data (str or Iterable):
-                The data for prediction. Both a filename and a list of instances are allowed.
+                The data for prediction.
+                - a filename. If it ends with ``.txt``, the parser will make predictions line by line from plain text.
+                - a list of instances.
             pred (str):
                 If specified, the predicted results will be saved to the file. Default: ``None``.
             lang (str):
diff --git a/supar/utils/transform.py b/supar/utils/transform.py
@@ -3,10 +3,12 @@
 from __future__ import annotations
 
 import os
+import re
 import shutil
 import tempfile
 from collections.abc import Iterable
 from contextlib import contextmanager
+from io import StringIO
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
 
 import nltk
@@ -393,23 +395,31 @@ def load(
             A list of :class:`CoNLLSentence` instances.
         """
 
+        isconll = False
+        if lang is not None:
+            tokenizer = Tokenizer(lang)
         if isinstance(data, str) and os.path.exists(data):
-            with open(data, 'r') as f:
-                lines = [line.strip() for line in f]
+            f = open(data)
+            if data.endswith('.txt'):
+                lines = (i
+                         for s in f
+                         if len(s) > 1
+                         for i in StringIO(self.toconll(s.split() if lang is None else tokenizer(s)) + '\n'))
+            else:
+                lines, isconll = f, True
         else:
             if lang is not None:
-                tokenizer = Tokenizer(lang)
-                data = [tokenizer(i) for i in ([data] if isinstance(data, str) else data)]
+                data = [tokenizer(s) for s in ([data] if isinstance(data, str) else data)]
             else:
                 data = [data] if isinstance(data[0], str) else data
-            lines = '\n'.join([self.toconll(i) for i in data]).split('\n')
+            lines = (i for s in data for i in StringIO(self.toconll(s) + '\n'))
 
         index, sentence = 0, []
-        for line in lines:
+        for line in progress_bar(lines):
             line = line.strip()
             if len(line) == 0:
                 sentence = CoNLLSentence(self, sentence, index)
-                if proj and not self.isprojective(list(map(int, sentence.arcs))):
+                if isconll and proj and not self.isprojective(list(map(int, sentence.arcs))):
                     logger.warning(f"Sentence {index} is not projective. Discarding it!")
                 elif max_len is not None and len(sentence) >= max_len:
                     logger.warning(f"Sentence {index} has {len(sentence)} tokens, exceeding {max_len}. Discarding it!")
@@ -492,10 +502,11 @@ def totree(
 
         if isinstance(tokens[0], str):
             tokens = [(token, '_') for token in tokens]
-        mapped = []
+        mapped, pattern = [], re.compile(f'[{"".join(special_tokens)}]')
         for i, (word, pos) in enumerate(tokens):
-            if word in special_tokens:
-                tokens[i] = (special_tokens[word], pos)
+            match = re.search(pattern, word)
+            if match:
+                tokens[i] = (pattern.sub(lambda m: special_tokens[m[0]], word), pos)
                 mapped.append((i, word))
         tree = nltk.Tree.fromstring(f"({root} {' '.join([f'( ({pos} {word}))' for word, pos in tokens])})")
         for i, word in mapped:
@@ -690,19 +701,27 @@ def load(
             A list of :class:`TreeSentence` instances.
         """
 
+        if lang is not None:
+            tokenizer = Tokenizer(lang)
         if isinstance(data, str) and os.path.exists(data):
-            data = open(data, 'r')
+            if data.endswith('.txt'):
+                data = (s.split() if lang is None else tokenizer(s) for s in open(data) if len(s) > 1)
+            else:
+                data = open(data)
         else:
             if lang is not None:
-                tokenizer = Tokenizer(lang)
                 data = [tokenizer(i) for i in ([data] if isinstance(data, str) else data)]
             else:
                 data = [data] if isinstance(data[0], str) else data
 
         index = 0
-        for s in data:
-            tree = nltk.Tree.fromstring(s) if isinstance(s, str) else self.totree(s, self.root)
-            sentence = TreeSentence(self, tree, index)
+        for s in progress_bar(data):
+            try:
+                tree = nltk.Tree.fromstring(s) if isinstance(s, str) else self.totree(s, self.root)
+                sentence = TreeSentence(self, tree, index)
+            except ValueError:
+                logger.warning(f"Error found while converting Sentence {index} to a tree:\n{s}\nDiscarding it!")
+                continue
             if max_len is not None and len(sentence) >= max_len:
                 logger.warning(f"Sentence {index} has {len(sentence)} tokens, exceeding {max_len}. Discarding it!")
             else: