Commit 2020dd0

Reorganize modules
1 parent d7d3e8f commit 2020dd0

File tree: 8 files changed (+108 -107 lines)


supar/utils/alg.py renamed to supar/structs/fn.py

Lines changed: 21 additions & 73 deletions
@@ -3,79 +3,7 @@
 import torch
 from supar.utils.common import MIN
 from supar.utils.fn import pad
-
-
-def kmeans(x, k, max_it=32):
-    r"""
-    KMeans algorithm for clustering sentences by length.
-
-    Args:
-        x (list[int]):
-            The list of sentence lengths.
-        k (int):
-            The number of clusters.
-            This is an approximate value; the final number of clusters can be less than or equal to `k`.
-        max_it (int):
-            The maximum number of iterations.
-            If the centroids do not converge after several iterations, the algorithm stops early.
-
-    Returns:
-        list[float], list[list[int]]:
-            The first list contains the average lengths of the sentences in each cluster.
-            The second is the list of clusters holding the indices of the datapoints.
-
-    Examples:
-        >>> x = torch.randint(10, 20, (10,)).tolist()
-        >>> x
-        [15, 10, 17, 11, 18, 13, 17, 19, 18, 14]
-        >>> centroids, clusters = kmeans(x, 3)
-        >>> centroids
-        [10.5, 14.0, 17.799999237060547]
-        >>> clusters
-        [[1, 3], [0, 5, 9], [2, 4, 6, 7, 8]]
-    """
-
-    # the number of clusters must not be greater than the number of datapoints
-    x, k = torch.tensor(x, dtype=torch.float), min(len(x), k)
-    # collect unique datapoints
-    d = x.unique()
-    # initialize k centroids randomly
-    c = d[torch.randperm(len(d))[:k]]
-    # assign each datapoint to the cluster with the closest centroid
-    dists, y = torch.abs_(x.unsqueeze(-1) - c).min(-1)
-
-    for _ in range(max_it):
-        # if an empty cluster is encountered,
-        # choose the farthest datapoint in the biggest cluster and move it to the empty one
-        mask = torch.arange(k).unsqueeze(-1).eq(y)
-        none = torch.where(~mask.any(-1))[0].tolist()
-        while len(none) > 0:
-            for i in none:
-                # the biggest cluster
-                b = torch.where(mask[mask.sum(-1).argmax()])[0]
-                # the datapoint farthest from the centroid of cluster b
-                f = dists[b].argmax()
-                # update the assigned cluster of f
-                y[b[f]] = i
-                # re-calculate the mask
-                mask = torch.arange(k).unsqueeze(-1).eq(y)
-            none = torch.where(~mask.any(-1))[0].tolist()
-        # update the centroids
-        c, old = (x * mask).sum(-1) / mask.sum(-1), c
-        # re-assign all datapoints to clusters
-        dists, y = torch.abs_(x.unsqueeze(-1) - c).min(-1)
-        # stop early if the centroids have converged
-        if c.equal(old):
-            break
-    # assign all datapoints to the newly generated clusters;
-    # the empty ones are discarded
-    assigned = y.unique().tolist()
-    # get the centroids of the assigned clusters
-    centroids = c[assigned].tolist()
-    # map all datapoints to buckets
-    clusters = [torch.where(y.eq(i))[0].tolist() for i in assigned]
-
-    return centroids, clusters
+from torch.autograd import Function
 
 
 def tarjan(sequence):
@@ -283,3 +211,23 @@ def mst(scores, mask, multiroot=False):
         preds.append(tree)
 
     return pad(preds, total_length=seq_len).to(mask.device)
+
+
+class SampledLogsumexp(Function):
+
+    @staticmethod
+    def forward(ctx, x, dim=-1):
+        # dim is plain Python state, so it lives on ctx rather than in save_for_backward
+        ctx.dim = dim
+        ctx.save_for_backward(x)
+        return x.logsumexp(dim=dim)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        from torch.distributions import OneHotCategorical
+        x, dim = ctx.saved_tensors[0], ctx.dim
+        if ctx.needs_input_grad[0]:
+            return grad_output.unsqueeze(dim).mul(OneHotCategorical(logits=x.movedim(dim, -1)).sample().movedim(-1, dim)), None
+        return None, None
+
+
+sampled_logsumexp = SampledLogsumexp.apply
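
The relocated `SampledLogsumexp` computes an ordinary `logsumexp` in the forward pass but, in the backward pass, replaces the usual softmax gradient with a one-hot sample drawn from that softmax, so gradients flowing through it pick single entries at random. A minimal sketch of the behavior after this commit (shapes and values are illustrative only):

    import torch
    from supar.structs.fn import sampled_logsumexp  # new home as of this commit

    x = torch.randn(2, 5, requires_grad=True)

    # forward: identical to torch.logsumexp
    out = sampled_logsumexp(x, -1)
    assert torch.allclose(out, x.logsumexp(-1))

    # backward: one one-hot sample per row, drawn from Categorical(logits=x);
    # in expectation this equals softmax(x), the true gradient of logsumexp
    out.sum().backward()
    print(x.grad)  # each row contains a single 1.0, zeros elsewhere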

supar/structs/semiring.py

Lines changed: 8 additions & 26 deletions
@@ -4,6 +4,7 @@
 
 import torch
 from supar.utils.common import MIN
+from supar.structs.fn import sampled_logsumexp
 
 
 class Semiring(object):
@@ -140,8 +141,8 @@ def one_(cls, x):
 
 
 class EntropySemiring(LogSemiring):
-    """
-    Entropy expectation semiring: :math:`<\oplus, +, [-\infty, 0], [0, 0]>`,
+    r"""
+    Entropy expectation semiring :math:`<\oplus, +, [-\infty, 0], [0, 0]>`,
     where :math:`\oplus` computes the log-values and the running distributional entropy :math:`H[p]`
     :cite:`li-eisner-2009-first,hwa-2000-sample,kim-etal-2019-unsupervised`.
     """
@@ -177,8 +178,8 @@ def one_(cls, x):
 
 
 class CrossEntropySemiring(LogSemiring):
-    """
-    Cross Entropy expectation semiring: :math:`<\oplus, +, [-\infty, -\infty, 0], [0, 0, 0]>`,
+    r"""
+    Cross Entropy expectation semiring :math:`<\oplus, +, [-\infty, -\infty, 0], [0, 0, 0]>`,
     where :math:`\oplus` computes the log-values and the running distributional cross entropy :math:`H[p,q]`
     of the two distributions :cite:`li-eisner-2009-first`.
     """
@@ -214,14 +215,11 @@ def one_(cls, x):
 
 
 class KLDivergenceSemiring(LogSemiring):
-    """
-    KL divergence expectation semiring: :math:`<\oplus, +, [-\infty, -\infty, 0], [0, 0, 0]>`,
+    r"""
+    KL divergence expectation semiring :math:`<\oplus, +, [-\infty, -\infty, 0], [0, 0, 0]>`,
     where :math:`\oplus` computes the log-values and the running distributional KL divergence :math:`KL[p \parallel q]`
     of the two distributions :cite:`li-eisner-2009-first`.
     """
-    """
-    KL divergence expectation semiring: `<logsumexp, +, -inf, 0>` :cite:`li-eisner-2009-first`.
-    """
 
     @classmethod
     def convert(cls, x):
@@ -261,20 +259,4 @@ class SampledSemiring(LogSemiring):
 
     @classmethod
     def sum(cls, x, dim=-1):
-        return SampledLogsumexp.apply(x, dim)
-
-
-class SampledLogsumexp(torch.autograd.Function):
-
-    @staticmethod
-    def forward(ctx, x, dim=-1):
-        ctx.save_for_backward(x, torch.tensor(dim))
-        return x.logsumexp(dim=dim)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        from torch.distributions import OneHotCategorical
-        x, dim = ctx.saved_tensors
-        if ctx.needs_input_grad[0]:
-            return grad_output.unsqueeze(dim).mul(OneHotCategorical(logits=x.movedim(dim, -1)).sample().movedim(-1, dim)), None
-        return None, None
+        return sampled_logsumexp(x, dim)
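
Note the changed bookkeeping: the old implementation smuggled `dim` through `save_for_backward` as `torch.tensor(dim)`, but `save_for_backward` is meant for tensors that participate in autograd; plain Python values are better stored as attributes on `ctx`, which is what the relocated version does. A toy sketch of that convention (`Scale` is a made-up `Function`, not part of supar):

    import torch
    from torch.autograd import Function

    class Scale(Function):

        @staticmethod
        def forward(ctx, x, factor=2.0):
            ctx.factor = factor       # non-tensor state: plain ctx attribute
            ctx.save_for_backward(x)  # tensor state: save_for_backward
            return x * factor

        @staticmethod
        def backward(ctx, grad_output):
            # d(x * factor)/dx is just factor; no gradient for factor itself
            return grad_output * ctx.factor, None

    x = torch.ones(3, requires_grad=True)
    Scale.apply(x, 3.0).sum().backward()
    assert torch.equal(x.grad, torch.full((3,), 3.0))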

supar/structs/tree.py

Lines changed: 1 addition & 1 deletion
@@ -3,8 +3,8 @@
 import torch
 import torch.nn as nn
 from supar.structs.dist import StructuredDistribution
+from supar.structs.fn import mst
 from supar.structs.semiring import LogSemiring
-from supar.utils.alg import mst
 from supar.utils.fn import stripe
 from torch.distributions.utils import lazy_property
 
supar/utils/__init__.py

Lines changed: 2 additions & 4 deletions
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 
-from . import alg, field, fn, metric, transform
-from .alg import chuliu_edmonds, kmeans, mst, tarjan
+from . import field, fn, metric, transform
 from .config import Config
 from .data import Dataset
 from .embedding import Embedding
@@ -10,5 +9,4 @@
 from .vocab import Vocab
 
 __all__ = ['ChartField', 'CoNLL', 'Config', 'Dataset', 'Embedding', 'Field',
-           'RawField', 'SubwordField', 'Transform', 'Tree', 'Vocab',
-           'alg', 'field', 'fn', 'metric', 'chuliu_edmonds', 'kmeans', 'mst', 'tarjan', 'transform']
+           'RawField', 'SubwordField', 'Transform', 'Tree', 'Vocab', 'field', 'fn', 'metric', 'transform']

supar/utils/data.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 import torch
 import torch.distributed as dist
-from supar.utils.alg import kmeans
+from supar.utils.fn import kmeans
 from supar.utils.transform import Batch
 from torch.utils.data import DataLoader
 

supar/utils/fn.py

Lines changed: 73 additions & 0 deletions
@@ -29,6 +29,79 @@ def tohalfwidth(token):
     return unicodedata.normalize('NFKC', token)
 
 
+def kmeans(x, k, max_it=32):
+    r"""
+    KMeans algorithm for clustering sentences by length.
+
+    Args:
+        x (list[int]):
+            The list of sentence lengths.
+        k (int):
+            The number of clusters.
+            This is an approximate value; the final number of clusters can be less than or equal to `k`.
+        max_it (int):
+            The maximum number of iterations.
+            If the centroids do not converge after several iterations, the algorithm stops early.
+
+    Returns:
+        list[float], list[list[int]]:
+            The first list contains the average lengths of the sentences in each cluster.
+            The second is the list of clusters holding the indices of the datapoints.
+
+    Examples:
+        >>> x = torch.randint(10, 20, (10,)).tolist()
+        >>> x
+        [15, 10, 17, 11, 18, 13, 17, 19, 18, 14]
+        >>> centroids, clusters = kmeans(x, 3)
+        >>> centroids
+        [10.5, 14.0, 17.799999237060547]
+        >>> clusters
+        [[1, 3], [0, 5, 9], [2, 4, 6, 7, 8]]
+    """
+
+    # the number of clusters must not be greater than the number of datapoints
+    x, k = torch.tensor(x, dtype=torch.float), min(len(x), k)
+    # collect unique datapoints
+    d = x.unique()
+    # initialize k centroids randomly
+    c = d[torch.randperm(len(d))[:k]]
+    # assign each datapoint to the cluster with the closest centroid
+    dists, y = torch.abs_(x.unsqueeze(-1) - c).min(-1)
+
+    for _ in range(max_it):
+        # if an empty cluster is encountered,
+        # choose the farthest datapoint in the biggest cluster and move it to the empty one
+        mask = torch.arange(k).unsqueeze(-1).eq(y)
+        none = torch.where(~mask.any(-1))[0].tolist()
+        while len(none) > 0:
+            for i in none:
+                # the biggest cluster
+                b = torch.where(mask[mask.sum(-1).argmax()])[0]
+                # the datapoint farthest from the centroid of cluster b
+                f = dists[b].argmax()
+                # update the assigned cluster of f
+                y[b[f]] = i
+                # re-calculate the mask
+                mask = torch.arange(k).unsqueeze(-1).eq(y)
+            none = torch.where(~mask.any(-1))[0].tolist()
+        # update the centroids
+        c, old = (x * mask).sum(-1) / mask.sum(-1), c
+        # re-assign all datapoints to clusters
+        dists, y = torch.abs_(x.unsqueeze(-1) - c).min(-1)
+        # stop early if the centroids have converged
+        if c.equal(old):
+            break
+    # assign all datapoints to the newly generated clusters;
+    # the empty ones are discarded
+    assigned = y.unique().tolist()
+    # get the centroids of the assigned clusters
+    centroids = c[assigned].tolist()
+    # map all datapoints to buckets
+    clusters = [torch.where(y.eq(i))[0].tolist() for i in assigned]
+
+    return centroids, clusters
+
+
 def stripe(x, n, w, offset=(0, 0), dim=1):
     r"""
     Returns a diagonal stripe of the tensor.
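
With `kmeans` now exposed from `supar.utils.fn`, callers such as `supar/utils/data.py` (shown earlier) import it from there to bucket training sentences by length. A quick usage sketch, reusing the lengths from the docstring example above:

    from supar.utils.fn import kmeans

    lengths = [15, 10, 17, 11, 18, 13, 17, 19, 18, 14]
    centroids, clusters = kmeans(lengths, 3)
    # centroids: the average sentence length of each bucket
    # clusters: the indices of the sentences assigned to each bucket
    for c, idx in zip(centroids, clusters):
        print(f"bucket ~{c:.1f} tokens: sentences {idx}")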

supar/utils/transform.py

Lines changed: 1 addition & 1 deletion
@@ -292,7 +292,7 @@ def istree(cls, sequence, proj=False, multiroot=False):
             False
         """
 
-        from supar.utils.alg import tarjan
+        from supar.structs.fn import tarjan
         if proj and not cls.isprojective(sequence):
             return False
         n_roots = sum(head == 0 for head in sequence)
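
Only the import location changes here; `CoNLL.istree` still validates a sequence of 1-based head indices (0 marks the root) by counting roots and running `tarjan` to detect cycles. A hedged sketch of typical calls (the example head sequences are illustrative, not taken from the source):

    from supar.utils.transform import CoNLL

    # token 2 heads the sentence, tokens 1 and 3 attach to it: a valid tree
    assert CoNLL.istree([2, 0, 2])
    # two tokens attach directly to the root: rejected unless multiroot=True
    assert not CoNLL.istree([3, 0, 0, 3])
    assert CoNLL.istree([3, 0, 0, 3], multiroot=True)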

tests/test_alg.py renamed to tests/test_fn.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-from supar.utils import tarjan
+from supar.structs.fn import tarjan
 
 
 def test_tarjan():
