
Commit c3784a5

Author: liudiwei
Message: add invertedIndex example
Parent: e0f0e81

File tree: 11 files changed (+98 −1 lines)


README.md

Lines changed: 4 additions & 1 deletion

```diff
@@ -26,14 +26,17 @@
 - PCA principal component analysis, [Python implementation source](https://github.com/csuldw/MachineLearning/tree/master/PCA)
+- spark-demo: Spark examples written in Scala.
+- invertedIndex, [Spark inverted-index source](https://github.com/csuldw/MachineLearning/tree/master/invertedIndex)

 ## Supplementary

 - MNIST dataset [loading method](https://github.com/csuldw/MachineLearning/tree/master/dataset/MNIST).

 ## Contributor

-- 刘帝伟, Master's student at Central South University, [Homepage](http://csuldw.github.io).
+- 刘帝伟, Master's student at Central South University, [HomePage](http://www.csuldw.com).

 ## Contact
```

doc/i0200bh53yr.p202.1.mp4.egt (deleted, 1.26 MB)

Binary file not shown.

spark-demo/README.md

Lines changed: 4 additions & 0 deletions

```diff
@@ -0,0 +1,4 @@
+### Notes
+
+- invertedIndex: a Spark example of an inverted index.
```

spark-demo/invertedIndex/build.sbt

Lines changed: 5 additions & 0 deletions

```scala
name := "invertedIndex"

version := "1.0.0"

scalaVersion := "2.10.4"

libraryDependencies += "org.apache.spark" %% "spark-core" % "1.3.1"

resolvers += "Akka Repository" at "http://repo.akka.io/releases/"
```
Lines changed: 1 addition & 0 deletions

```
inputfile=/home/hadoop-news/liudiwei/test/input.data
```
Lines changed: 5 additions & 0 deletions

```
doc1	Apache Spark Scala Hadoop Java C Python Do And Will KNN
doc2	SVM Scala News Play Akka Yes GBDT
doc3	LDA SVM RF GBDT Adaboost Kmeans KNN
doc4	QQ BAT I Great All LDA
doc5	Apache Hadoop MapReduce Git SVN SVM
```
Binary file not shown.
Lines changed: 29 additions & 0 deletions

```
(Akka,doc2)
(Python,doc1)
(QQ,doc4)
(RF,doc3)
(Apache,doc1|doc5)
(Will,doc1)
(Java,doc1)
(MapReduce,doc5)
(SVM,doc2|doc3|doc5)
(Scala,doc1|doc2)
(Git,doc5)
(Play,doc2)
(And,doc1)
(SVN,doc5)
(GBDT,doc2|doc3)
(News,doc2)
(Spark,doc1)
(Kmeans,doc3)
(Do,doc1)
(KNN,doc1|doc3)
(I,doc4)
(All,doc4)
(LDA,doc4|doc3)
(BAT,doc4)
(Great,doc4)
(C,doc1)
(Adaboost,doc3)
(Yes,doc2)
(Hadoop,doc5|doc1)
```

spark-demo/invertedIndex/run.sh

Lines changed: 21 additions & 0 deletions

```shell
hadoop fs -put data/input.data /home/hadoop-news/liudiwei/test

exe_cores=2
exe_num=2
tmp_dir=/home/hdp-guanggao/old_jobs/tmp
exe_mem=2G
drv_mem=3G

# Note: both JVM flags are passed in a single --driver-java-options;
# repeating the flag would make the second value override the first.
spark-submit \
  --master yarn-client \
  --driver-memory $drv_mem \
  --executor-memory $exe_mem \
  --num-executors $exe_num \
  --executor-cores $exe_cores \
  --driver-java-options "-Dsun.io.serialization.extendedDebugInfo=true -Djava.io.tmpdir=$tmp_dir" \
  --conf spark.eventLog.enabled=true \
  --conf spark.storage.memoryFraction=0.1 \
  --jars "deps/json4s-native_2.10-3.2.10.jar" \
  --class "InvertedIndex" \
  ./target/scala-2.10/invertedindex_2.10-1.0.0.jar \
  conf/base.conf
```
Lines changed: 29 additions & 0 deletions

```scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast
import org.apache.commons.configuration.{PropertiesConfiguration => HierConf}

import scala.collection.mutable._

object InvertedIndex {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("invertedIndex")
      .set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
      .set("spark.akka.frameSize", "256")
      .set("spark.ui.port", "4071")
    val sc = new SparkContext(conf)
    // The properties file passed on the command line supplies the input path.
    val cfg = new HierConf(args(0))
    val inputfile = cfg.getString("inputfile")
    val result = sc.textFile(inputfile)
      .map(x => x.split("\t"))                 // each line: "<docId>\t<word word ...>"
      .map(x => (x(0), x(1)))
      .map(x => x._2.split(" ").map(y => (y, x._1)))  // emit (word, docId) pairs
      .flatMap(x => x)
      .reduceByKey((x, y) => x + "|" + y)      // join the doc ids for each word with "|"
    result.collect.foreach(println)
    sc.stop()
  }
}
```
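The transformation above can be sanity-checked without a cluster: the same split/flatMap/reduce logic works on plain Scala collections. This is a minimal sketch (names like `InvertedIndexLocal` are mine, not from the commit); `groupBy` plus `mkString` stands in for `reduceByKey`, so within each group the doc ids appear in input order rather than whatever order the Spark reduction happens to produce.

```scala
// Local sketch of the inverted-index pipeline on plain Scala collections.
object InvertedIndexLocal extends App {
  // Same line format as data/input.data: "<docId>\t<word word ...>"
  val lines = Seq(
    "doc1\tApache Spark Scala",
    "doc2\tSVM Scala News",
    "doc3\tLDA SVM"
  )
  val index = lines
    .map(_.split("\t"))
    .map(x => (x(0), x(1)))
    .flatMap { case (doc, words) => words.split(" ").map(w => (w, doc)) }
    .groupBy(_._1)                              // stands in for reduceByKey
    .mapValues(_.map(_._2).mkString("|"))
  // Sorted for deterministic output; e.g. the SVM entry is (SVM,doc2|doc3)
  index.toSeq.sortBy(_._1).foreach(println)
}
```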
