
Commit bfb4201

yinxusen authored and mengxr committed
[SPARK-11551][DOC] Replace example code in ml-features.md using include_example
PR on behalf of somideshmukh, thanks!

Author: Xusen Yin <yinxusen@gmail.com>
Author: somideshmukh <somilde@us.ibm.com>

Closes apache#10219 from yinxusen/SPARK-11551.

(cherry picked from commit 051c6a0)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
1 parent ee0a6e7 commit bfb4201


52 files changed (+2820 −1061 lines)

docs/ml-features.md

Lines changed: 51 additions & 1061 deletions
Large diffs are not rendered by default.
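
Since the ml-features.md diff is collapsed in this view, a sketch of the pattern the commit introduces may help: each inline code listing in the doc is replaced by Jekyll's include_example tag, which pulls in just the region between the // $example on$ and // $example off$ markers from the example source files added below. A hypothetical instance for the Binarizer section (the exact markup in this commit is hidden):

<div data-lang="java" markdown="1">
{% include_example java/org/apache/spark/examples/ml/JavaBinarizerExample.java %}
</div>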
examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;

// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.Binarizer;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// $example off$

public class JavaBinarizerExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaBinarizerExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
      RowFactory.create(0, 0.1),
      RowFactory.create(1, 0.8),
      RowFactory.create(2, 0.2)
    ));
    // Note: the labels above are Java Integers, so the schema declares
    // IntegerType to match (DoubleType would mismatch the row data).
    StructType schema = new StructType(new StructField[]{
      new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    DataFrame continuousDataFrame = jsql.createDataFrame(jrdd, schema);
    Binarizer binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(0.5);
    DataFrame binarizedDataFrame = binarizer.transform(continuousDataFrame);
    DataFrame binarizedFeatures = binarizedDataFrame.select("binarized_feature");
    for (Row r : binarizedFeatures.collect()) {
      Double binarizedValue = r.getDouble(0);
      System.out.println(binarizedValue);
    }
    // $example off$
    jsc.stop();
  }
}
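
A quick sanity check on the expected output: with the threshold at 0.5, only the 0.8 feature is binarized to 1.0, so the loop should print 0.0, 1.0, and 0.0.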
examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;

// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.Bucketizer;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// $example off$

public class JavaBucketizerExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaBucketizerExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

    JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
      RowFactory.create(-0.5),
      RowFactory.create(-0.3),
      RowFactory.create(0.0),
      RowFactory.create(0.2)
    ));
    StructType schema = new StructType(new StructField[]{
      new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
    });
    DataFrame dataFrame = jsql.createDataFrame(data, schema);

    Bucketizer bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits);

    // Transform original data into its bucket index.
    DataFrame bucketedData = bucketizer.transform(dataFrame);
    bucketedData.show();
    // $example off$
    jsc.stop();
  }
}
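
For reference, Bucketizer's buckets are half-open intervals [lower, upper) built from the splits (the last bucket also includes its upper bound), so bucketedData.show() should map the inputs -0.5, -0.3, 0.0, 0.2 to bucket indices 1.0, 1.0, 2.0, 2.0.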
examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;

// $example on$
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.DCT;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// $example off$

public class JavaDCTExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaDCTExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
      RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
      RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
      RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
    ));
    StructType schema = new StructType(new StructField[]{
      new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    DataFrame df = jsql.createDataFrame(data, schema);
    DCT dct = new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false);
    DataFrame dctDf = dct.transform(df);
    dctDf.select("featuresDCT").show(3);
    // $example off$
    jsc.stop();
  }
}
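
With setInverse(false), the transformer applies a one-dimensional DCT-II to each input vector; per the accompanying ml-features.md text, the output is scaled so that the transform matrix is unitary and no zero padding is performed on the input sequence.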
examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;

// $example on$
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.ElementwiseProduct;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// $example off$

public class JavaElementwiseProductExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaElementwiseProductExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    // $example on$
    // Create some vector data; also works for sparse vectors
    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
      RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
      RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
    ));

    List<StructField> fields = new ArrayList<StructField>(2);
    fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));

    StructType schema = DataTypes.createStructType(fields);

    DataFrame dataFrame = sqlContext.createDataFrame(jrdd, schema);

    Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);

    ElementwiseProduct transformer = new ElementwiseProduct()
      .setScalingVec(transformingVector)
      .setInputCol("vector")
      .setOutputCol("transformedVector");

    // Batch transform the vectors to create a new column.
    transformer.transform(dataFrame).show();
    // $example off$
    jsc.stop();
  }
}
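
The transformedVector column is the element-wise (Hadamard) product of each row with the scaling vector [0.0, 1.0, 2.0], so show() should display [0.0, 2.0, 6.0] for row "a" and [0.0, 5.0, 12.0] for row "b".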
examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;

// $example on$
import org.apache.spark.ml.feature.MinMaxScaler;
import org.apache.spark.ml.feature.MinMaxScalerModel;
import org.apache.spark.sql.DataFrame;
// $example off$

public class JavaMinMaxScalerExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaMinMaxScalerExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
    MinMaxScaler scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures");

    // Compute summary statistics and generate MinMaxScalerModel
    MinMaxScalerModel scalerModel = scaler.fit(dataFrame);

    // Rescale each feature to the range [min, max].
    DataFrame scaledData = scalerModel.transform(dataFrame);
    scaledData.show();
    // $example off$
    jsc.stop();
  }
}
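
As described in ml-features.md, fit() computes per-feature minima and maxima (E_min, E_max) over the dataset, and transform() then rescales each value e to (e - E_min) / (E_max - E_min) * (max - min) + min, with the default output range [min, max] = [0, 1].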
