Skip to content

Commit 0a878ad

Browse files
yanboliang authored and mengxr committed
[SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval
* Update doc for PySpark `HasCheckpointInterval` so that users can understand how to disable checkpointing.
* Update doc for PySpark `cacheNodeIds` of `DecisionTreeParams` to note the relationship between `cacheNodeIds` and `checkpointInterval`.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes apache#9856 from yanboliang/spark-11875.

(cherry picked from commit 7216f40)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
1 parent d7b3d57 commit 0a878ad

File tree

2 files changed

+11
-9
lines changed

2 files changed

+11
-9
lines changed

python/pyspark/ml/param/_shared_params_code_gen.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,8 @@ def get$Name(self):
118118
("inputCols", "input column names.", None),
119119
("outputCol", "output column name.", "self.uid + '__output'"),
120120
("numFeatures", "number of features.", None),
121-
("checkpointInterval", "checkpoint interval (>= 1).", None),
121+
("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " +
122+
"E.g. 10 means that the cache will get checkpointed every 10 iterations.", None),
122123
("seed", "random seed.", "hash(type(self).__name__)"),
123124
("tol", "the convergence tolerance for iterative algorithms.", None),
124125
("stepSize", "Step size to be used for each iteration of optimization.", None),
@@ -157,7 +158,8 @@ def get$Name(self):
157158
("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation."),
158159
("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " +
159160
"instances with nodes. If true, the algorithm will cache node IDs for each instance. " +
160-
"Caching can speed up training of deeper trees.")]
161+
"Caching can speed up training of deeper trees. Users can set how often should the " +
162+
"cache be checkpointed or disable it by setting checkpointInterval.")]
161163

162164
decisionTreeCode = '''class DecisionTreeParams(Params):
163165
"""

python/pyspark/ml/param/shared.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -325,16 +325,16 @@ def getNumFeatures(self):
325325

326326
class HasCheckpointInterval(Params):
327327
"""
328-
Mixin for param checkpointInterval: checkpoint interval (>= 1).
328+
Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
329329
"""
330330

331331
# a placeholder to make it appear in the generated doc
332-
checkpointInterval = Param(Params._dummy(), "checkpointInterval", "checkpoint interval (>= 1).")
332+
checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.")
333333

334334
def __init__(self):
335335
super(HasCheckpointInterval, self).__init__()
336-
#: param for checkpoint interval (>= 1).
337-
self.checkpointInterval = Param(self, "checkpointInterval", "checkpoint interval (>= 1).")
336+
#: param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
337+
self.checkpointInterval = Param(self, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.")
338338

339339
def setCheckpointInterval(self, value):
340340
"""
@@ -636,7 +636,7 @@ class DecisionTreeParams(Params):
636636
minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
637637
minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
638638
maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
639-
cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
639+
cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.")
640640

641641

642642
def __init__(self):
@@ -651,8 +651,8 @@ def __init__(self):
651651
self.minInfoGain = Param(self, "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
652652
#: param for Maximum memory in MB allocated to histogram aggregation.
653653
self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
654-
#: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
655-
self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
654+
#: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.
655+
self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.")
656656

657657
def setMaxDepth(self, value):
658658
"""

0 commit comments

Comments
 (0)