Batch object

yzhangcs · yzhangcs · commit 207d38e959e7 · 2021-06-03T20:02:43.000+08:00
diff --git a/supar/utils/data.py b/supar/utils/data.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 
-from collections import namedtuple
-
 import torch
 import torch.distributed as dist
 from supar.utils.alg import kmeans
+from supar.utils.transform import Batch
+from torch.utils.data import DataLoader
 
 
 class Dataset(torch.utils.data.Dataset):
@@ -74,32 +74,14 @@ def __getstate__(self):
     def __setstate__(self, state):
         self.__dict__.update(state)
 
-    def collate_fn(self, batch):
-        if not hasattr(self, 'fields'):
-            raise RuntimeError("The fields are not numericalized yet. Please build the dataset first.")
-        return {f: [s.transformed[f.name] for s in batch] for f in self.fields}
-
     def build(self, batch_size, n_buckets=1, shuffle=False, distributed=False):
+        r"""
+        Numericalizes the sentences, groups them into buckets and creates the data loader.
+
+        The lengths of the numericalized values of the first field are clustered with ``kmeans``;
+        the result is stored in ``self.buckets`` and the loader in ``self.loader``.
+
+        Args:
+            batch_size:
+                Forwarded to :class:`Sampler`.
+            n_buckets (int):
+                The number of buckets requested from ``kmeans``. Default: 1.
+            shuffle (bool):
+                Forwarded to :class:`Sampler`. Default: ``False``.
+            distributed (bool):
+                Forwarded to :class:`Sampler`. Default: ``False``.
+        """
         # numericalize all fields
-        self.fields = self.transform(self.sentences)
+        fields = self.transform(self.sentences)
         # NOTE: the final bucket count is roughly equal to n_buckets
-        self.buckets = dict(zip(*kmeans([len(s.transformed[self.fields[0].name]) for s in self], n_buckets)))
+        self.buckets = dict(zip(*kmeans([len(s.transformed[fields[0].name]) for s in self], n_buckets)))
         self.loader = DataLoader(dataset=self,
                                  batch_sampler=Sampler(self.buckets, batch_size, shuffle, distributed),
-                                 collate_fn=self.collate_fn)
-
-
-class DataLoader(torch.utils.data.DataLoader):
-    r"""
-    DataLoader, matching with :class:`Dataset`.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def __iter__(self):
-        for batch in super().__iter__():
-            yield namedtuple('Batch', (f.name for f in batch.keys()))(*[f.compose(d) for f, d in batch.items()])
+                                 # Batch accepts the list of sampled sentences directly, no wrapper lambda needed
+                                 collate_fn=Batch)
 
 
 class Sampler(torch.utils.data.Sampler):
diff --git a/supar/utils/transform.py b/supar/utils/transform.py
@@ -83,57 +83,6 @@ def save(self, path, sentences):
             f.write('\n'.join([str(i) for i in sentences]) + '\n')
 
 
-class Sentence(object):
-    r"""
-    A Sentence object holds a sentence with regard to specific data format.
-    """
-
-    def __init__(self, transform):
-        self.transform = transform
-
-        # mapping from each nested field to their proper position
-        self.maps = dict()
-        # names of each field
-        self.keys = set()
-        for i, field in enumerate(self.transform):
-            if not isinstance(field, Iterable):
-                field = [field]
-            for f in field:
-                if f is not None:
-                    self.maps[f.name] = i
-                    self.keys.add(f.name)
-        # original values and numericalized values of each position
-        self.values = []
-        self.transformed = {key: None for key in self.keys}
-
-    def __contains__(self, key):
-        return key in self.keys
-
-    def __getattr__(self, name):
-        if name in self.__dict__:
-            return self.__dict__[name]
-        elif name in self.maps:
-            return self.values[self.maps[name]]
-        else:
-            raise AttributeError
-
-    def __setattr__(self, name, value):
-        if 'keys' in self.__dict__ and name in self:
-            index = self.maps[name]
-            if index >= len(self.values):
-                self.__dict__[name] = value
-            else:
-                self.values[index] = value
-        else:
-            self.__dict__[name] = value
-
-    def __getstate__(self):
-        return vars(self)
-
-    def __setstate__(self, state):
-        self.__dict__.update(state)
-
-
 class CoNLL(Transform):
     r"""
     The CoNLL object holds ten fields required for CoNLL-X data format :cite:`buchholz-marsi-2006-conll`.
@@ -402,77 +351,6 @@ def load(self, data, lang=None, proj=False, max_len=None, **kwargs):
         return sentences
 
 
-class CoNLLSentence(Sentence):
-    r"""
-    Sencence in CoNLL-X format.
-
-    Args:
-        transform (CoNLL):
-            A :class:`~supar.utils.transform.CoNLL` object.
-        lines (list[str]):
-            A list of strings composing a sentence in CoNLL-X format.
-            Comments and non-integer IDs are permitted.
-
-    Examples:
-        >>> lines = ['# text = But I found the location wonderful and the neighbors very kind.',
-                     '1\tBut\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '2\tI\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '3\tfound\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '4\tthe\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '5\tlocation\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '6\twonderful\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '7\tand\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '7.1\tfound\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '8\tthe\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '9\tneighbors\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '10\tvery\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '11\tkind\t_\t_\t_\t_\t_\t_\t_\t_',
-                     '12\t.\t_\t_\t_\t_\t_\t_\t_\t_']
-        >>> sentence = CoNLLSentence(transform, lines)  # fields in transform are built from ptb.
-        >>> sentence.arcs = [3, 3, 0, 5, 6, 3, 6, 9, 11, 11, 6, 3]
-        >>> sentence.rels = ['cc', 'nsubj', 'root', 'det', 'nsubj', 'xcomp',
-                             'cc', 'det', 'dep', 'advmod', 'conj', 'punct']
-        >>> sentence
-        # text = But I found the location wonderful and the neighbors very kind.
-        1       But     _       _       _       _       3       cc      _       _
-        2       I       _       _       _       _       3       nsubj   _       _
-        3       found   _       _       _       _       0       root    _       _
-        4       the     _       _       _       _       5       det     _       _
-        5       location        _       _       _       _       6       nsubj   _       _
-        6       wonderful       _       _       _       _       3       xcomp   _       _
-        7       and     _       _       _       _       6       cc      _       _
-        7.1     found   _       _       _       _       _       _       _       _
-        8       the     _       _       _       _       9       det     _       _
-        9       neighbors       _       _       _       _       11      dep     _       _
-        10      very    _       _       _       _       11      advmod  _       _
-        11      kind    _       _       _       _       6       conj    _       _
-        12      .       _       _       _       _       3       punct   _       _
-    """
-
-    def __init__(self, transform, lines):
-        super().__init__(transform)
-
-        self.values = []
-        # record annotations for post-recovery
-        self.annotations = dict()
-
-        for i, line in enumerate(lines):
-            value = line.split('\t')
-            if value[0].startswith('#') or not value[0].isdigit():
-                self.annotations[-i-1] = line
-            else:
-                self.annotations[len(self.values)] = line
-                self.values.append(value)
-        self.values = list(zip(*self.values))
-
-    def __repr__(self):
-        # cover the raw lines
-        merged = {**self.annotations,
-                  **{i: '\t'.join(map(str, line))
-                     for i, line in enumerate(zip(*self.values))}}
-        return '\n'.join(merged.values()) + '\n'
-
-
 class Tree(Transform):
     r"""
     The Tree object factorize a constituency tree into four fields,
@@ -741,6 +619,150 @@ def load(self, data, lang=None, max_len=None, **kwargs):
         return sentences
 
 
+class Batch(object):
+    r"""
+    A batch of sentences together with the composed values of their fields.
+
+    Args:
+        sentences (list):
+            The sentences of the batch. Each one must expose ``transformed`` (indexed by field name), and the
+            first one must expose ``transform.flattened_fields``, the fields whose values are composed.
+
+    Attributes:
+        sentences:
+            The sentences passed at construction.
+        transformed (dict):
+            Maps each field name to the result of ``field.compose`` over the values of all sentences.
+        fields (list):
+            The field names, in the order of ``flattened_fields``.
+
+    Besides these, a field name used as an attribute returns its composed value, and any other attribute
+    found on the first sentence is collected from all sentences into a list.
+    """
+
+    def __init__(self, sentences):
+        self.sentences = sentences
+        self.transformed = {f.name: f.compose([s.transformed[f.name] for s in sentences])
+                            for f in sentences[0].transform.flattened_fields}
+        self.fields = list(self.transformed.keys())
+
+    def __repr__(self):
+        s = ', '.join([f"{name}" for name in self.fields])
+        return f"{self.__class__.__name__}({s})"
+
+    def __getitem__(self, index):
+        # positional access: the composed value of the ``index``-th field
+        return self.transformed[self.fields[index]]
+
+    def __getattr__(self, name):
+        # Only reached when regular attribute lookup has failed, so ``name`` is never in ``__dict__`` here.
+        # The state is read through ``__dict__`` instead of ``self.<attr>``: on an instance whose state is
+        # not set yet (pickle/copy look up ``__setstate__`` before restoring ``__dict__``), accessing
+        # ``self.transformed`` would re-enter this method until a RecursionError is raised.
+        state = self.__dict__
+        transformed = state.get('transformed', {})
+        if name in transformed:
+            return transformed[name]
+        sentences = state.get('sentences')
+        if sentences and hasattr(sentences[0], name):
+            return [getattr(s, name) for s in sentences]
+        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
+
+class Sentence(object):
+    r"""
+    A Sentence object holds a sentence with regard to specific data format.
+
+    Args:
+        transform:
+            The object describing the data format. Iterating over it yields, for each position,
+            a field, an iterable of fields, or ``None``.
+    """
+
+    def __init__(self, transform):
+        self.transform = transform
+
+        # mapping from each nested field to their proper position
+        self.maps = dict()
+        # names of each field
+        self.keys = set()
+        for i, field in enumerate(self.transform):
+            # a position may hold a single field or several nested ones; treat both uniformly
+            if not isinstance(field, Iterable):
+                field = [field]
+            for f in field:
+                if f is not None:
+                    self.maps[f.name] = i
+                    self.keys.add(f.name)
+        # original values and numericalized values of each position
+        self.values = []
+        self.transformed = {key: None for key in self.keys}
+
+    def __contains__(self, key):
+        # membership is defined over field names
+        return key in self.keys
+
+    def __getattr__(self, name):
+        # a field name resolves to the entry of ``values`` at the position of that field
+        if name in self.__dict__:
+            return self.__dict__[name]
+        elif name in self.maps:
+            return self.values[self.maps[name]]
+        else:
+            raise AttributeError
+
+    def __setattr__(self, name, value):
+        # ``keys`` is missing from ``__dict__`` while ``__init__`` is still setting up the instance,
+        # in which case everything is stored as a regular attribute
+        if 'keys' in self.__dict__ and name in self:
+            index = self.maps[name]
+            # a field whose position has no entry in ``values`` is kept as a regular attribute,
+            # otherwise the assignment overwrites the entry at that position
+            if index >= len(self.values):
+                self.__dict__[name] = value
+            else:
+                self.values[index] = value
+        else:
+            self.__dict__[name] = value
+
+    def __getstate__(self):
+        # the pickled state is the instance ``__dict__`` itself
+        return vars(self)
+
+    def __setstate__(self, state):
+        # restore through ``__dict__`` directly, bypassing ``__setattr__``
+        self.__dict__.update(state)
+
+
+class CoNLLSentence(Sentence):
+    r"""
+    Sentence in CoNLL-X format.
+
+    Args:
+        transform (CoNLL):
+            A :class:`~supar.utils.transform.CoNLL` object.
+        lines (list[str]):
+            A list of strings composing a sentence in CoNLL-X format.
+            Comments and non-integer IDs are permitted.
+
+    Examples:
+        >>> lines = ['# text = But I found the location wonderful and the neighbors very kind.',
+                     '1\tBut\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '2\tI\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '3\tfound\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '4\tthe\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '5\tlocation\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '6\twonderful\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '7\tand\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '7.1\tfound\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '8\tthe\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '9\tneighbors\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '10\tvery\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '11\tkind\t_\t_\t_\t_\t_\t_\t_\t_',
+                     '12\t.\t_\t_\t_\t_\t_\t_\t_\t_']
+        >>> sentence = CoNLLSentence(transform, lines)  # fields in transform are built from ptb.
+        >>> sentence.arcs = [3, 3, 0, 5, 6, 3, 6, 9, 11, 11, 6, 3]
+        >>> sentence.rels = ['cc', 'nsubj', 'root', 'det', 'nsubj', 'xcomp',
+                             'cc', 'det', 'dep', 'advmod', 'conj', 'punct']
+        >>> sentence
+        # text = But I found the location wonderful and the neighbors very kind.
+        1       But     _       _       _       _       3       cc      _       _
+        2       I       _       _       _       _       3       nsubj   _       _
+        3       found   _       _       _       _       0       root    _       _
+        4       the     _       _       _       _       5       det     _       _
+        5       location        _       _       _       _       6       nsubj   _       _
+        6       wonderful       _       _       _       _       3       xcomp   _       _
+        7       and     _       _       _       _       6       cc      _       _
+        7.1     found   _       _       _       _       _       _       _       _
+        8       the     _       _       _       _       9       det     _       _
+        9       neighbors       _       _       _       _       11      dep     _       _
+        10      very    _       _       _       _       11      advmod  _       _
+        11      kind    _       _       _       _       6       conj    _       _
+        12      .       _       _       _       _       3       punct   _       _
+    """
+
+    def __init__(self, transform, lines):
+        super().__init__(transform)
+
+        self.values = []
+        # record annotations for post-recovery
+        self.annotations = dict()
+
+        for i, line in enumerate(lines):
+            value = line.split('\t')
+            # comments and rows whose first column is not a plain integer (e.g. `7.1`) are kept verbatim
+            # under a negative key, so they never collide with the non-negative keys of regular rows
+            if value[0].startswith('#') or not value[0].isdigit():
+                self.annotations[-i-1] = line
+            else:
+                # regular rows are keyed by their row index in `values`
+                self.annotations[len(self.values)] = line
+                self.values.append(value)
+        # transpose the rows into columns, one per position
+        self.values = list(zip(*self.values))
+
+    def __repr__(self):
+        # cover the raw lines
+        # regular rows are regenerated from the current `values`; since updating an existing dict key
+        # keeps its insertion position, the original line order (annotations included) is preserved
+        merged = {**self.annotations,
+                  **{i: '\t'.join(map(str, line))
+                     for i, line in enumerate(zip(*self.values))}}
+        return '\n'.join(merged.values()) + '\n'
+
+
 class TreeSentence(Sentence):
     r"""
     Args: