SlideShare a Scribd company logo
R      MapReduce

    @holidayworking


    2010   8   28
)
        Twitter: @holidayworking
        :
        :
        :                   F1
        :
        Java, PL/SQL:
        Python, Ruby, R:




@holidayworking ()          R    MapReduce   2010   8   28   2 / 18
MapReduce



   Google

   map          reduce   2




  @holidayworking ()         R   MapReduce   2010   8   28   3 / 18
MapReduce


 1   Map


 2   Shuffle

 3   Reduce




     @holidayworking ()   R   MapReduce   2010   8   28   4 / 18
MapReduce




        [1]


  @holidayworking ()   R   MapReduce   2010   8   28   5 / 18
MapReduce




           Grep




  @holidayworking ()   R   MapReduce   2010   8   28   6 / 18
Hadoop


   Google File System   MapReduce




  @holidayworking ()     R   MapReduce   2010   8   28   7 / 18
Hadoop


   Google File System   MapReduce
   Hadoop      Java




  @holidayworking ()     R   MapReduce   2010   8   28   7 / 18
Hadoop


   Google File System   MapReduce
   Hadoop      Java
          MapReduce                      Java




  @holidayworking ()     R   MapReduce      2010   8   28   7 / 18
Hadoop


   Google File System   MapReduce
   Hadoop      Java
          MapReduce                      Java

   Hadoop Streaming




  @holidayworking ()     R   MapReduce      2010   8   28   7 / 18
Hadoop


   Google File System   MapReduce
   Hadoop      Java
          MapReduce                            Java

   Hadoop Streaming
                                         MapReduce




  @holidayworking ()     R   MapReduce               2010   8   28   7 / 18
Hadoop


   Google File System   MapReduce
   Hadoop      Java
          MapReduce                            Java

   Hadoop Streaming
                                         MapReduce

          R




  @holidayworking ()     R   MapReduce               2010   8   28   7 / 18
R       MapReduce




                    Ardbeg 10 Years Old
                    Bowmore 12 Years Old
                    Talisker 10 Years Old
                    The Glenlivet 12 Year Old
                    The Macallan 12 Years

                    Ballantine 12 Years Old
                    Ballantine 17 Years Old
                    Johnnie Walker Gold Label 18 Years Old
                    Johnnie Walker Swing



    @holidayworking ()               R   MapReduce           2010   8   28   8 / 18
iWork           Numbers
                  250

 2010/07/01          The Macallan 12 Years                    single malt       10
 2010/07/01          Ballantine 12 Years Old                    blended         3
 2010/07/01          Ballantine 17 Years Old                    blended         6
 2010/07/01          Johnnie Walker Gold Label 18 Years Old     blended         6
 2010/07/02          The Glenlivet 12 Year Old                single malt       4
 2010/07/02          Ardbeg 10 Years Old                      single malt       2
 2010/07/02          Ballantine 12 Years Old                    blended         8
 2010/07/02          Ballantine 17 Years Old                    blended         7
 2010/07/02          Johnnie Walker Swing                       blended         3
                                           (   )
 2010/07/31          Johnnie Walker Swing                       blended         4
 2010/07/31          Johnnie Walker Gold Label 18 Years Old     blended         2
 2010/07/31          Bowmore 12 Years Old                     single malt       4
 2010/07/31          Talisker 10 Years Old                    single malt       7



@holidayworking ()                    R   MapReduce                  2010   8   28   9 / 18
@holidayworking ()   R   MapReduce   2010   8   28   10 / 18
MapReduce

 1   Mapper
 2   Reducer
 3   Hadoop Streaming
     $ hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-0.20.2-streaming.jar \
        -input scotch.tsv \
        -output output \
        -mapper mapper.r \
        -reducer reducer.r


 4


     $ cat output/part-00000
     blended 592
     single malt 783




     @holidayworking ()           R   MapReduce             2010   8   28   11 / 18
Reducer
#!/usr/bin/env Rscript

# Hadoop Streaming reducer.
#
# Reads "key<TAB>value" records from standard input, adds up the integer
# values seen for each key, and writes one "key<TAB>total" line per key to
# standard output. Keys are emitted in the order returned by ls().

# An environment is used as a mutable hash map from key to running total.
totals <- new.env(hash = TRUE)

con <- file("stdin", open = "r")
# Read one record at a time so memory use does not grow with the input size.
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
  # Split on a real tab character; fixed = TRUE avoids regex interpretation.
  fields <- unlist(strsplit(line, "\t", fixed = TRUE))
  key <- fields[1]
  value <- as.integer(fields[2])

  # Skip blank or malformed records: an NA or empty key cannot be used as a
  # variable name in the environment, and an NA value would poison the total.
  if (is.na(key) || !nzchar(key) || is.na(value)) {
    next
  }

  if (exists(key, envir = totals, inherits = FALSE)) {
    running <- get(key, envir = totals, inherits = FALSE)
    assign(key, running + value, envir = totals)
  } else {
    assign(key, value, envir = totals)
  }
}
close(con)

# Look each total up by its key (not by the last parsed value) and emit
# exactly "key<TAB>total<NEWLINE>" with no padding between the pieces.
for (key in ls(totals, all.names = TRUE)) {
  cat(key, "\t", get(key, envir = totals, inherits = FALSE), "\n", sep = "")
}



     @holidayworking ()           R   MapReduce              2010   8   28   12 / 18
Mapper
#!/usr/bin/env Rscript

# Hadoop Streaming mapper: orders per date.
#
# For every tab-separated record on standard input, writes
# "<field 1><TAB><field 4>" to standard output. In the sample data shown in
# this deck, field 1 is the date and field 4 is the order count.

con <- file("stdin", open = "r")
# Read one record at a time so memory use does not grow with the input size.
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
  # Split on a real tab character; fixed = TRUE avoids regex interpretation.
  fields <- unlist(strsplit(line, "\t", fixed = TRUE))
  # Skip blank or short records instead of emitting "NA" keys or values.
  if (length(fields) < 4) {
    next
  }
  # Local names avoid shadowing base::date() and base::order().
  order_date <- fields[1]
  order_count <- fields[4]
  cat(sprintf("%s\t%s\n", order_date, order_count), sep = "")
}
close(con)



$ cat output/part-00000
2010/07/01 25
2010/07/02 42
2010/07/03 39

2010/07/29 17
2010/07/30 45
2010/07/31 47

@holidayworking ()            R   MapReduce            2010   8   28   13 / 18
Mapper
#!/usr/bin/env Rscript

# Hadoop Streaming mapper: orders per brand.
#
# For every tab-separated record on standard input, writes
# "<field 2><TAB><field 4>" to standard output. In the sample data shown in
# this deck, field 2 is the brand name and field 4 is the order count.

con <- file("stdin", open = "r")
# Read one record at a time so memory use does not grow with the input size.
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
  # Split on a real tab character; fixed = TRUE avoids regex interpretation.
  fields <- unlist(strsplit(line, "\t", fixed = TRUE))
  # Skip blank or short records instead of emitting "NA" keys or values.
  if (length(fields) < 4) {
    next
  }
  brand <- fields[2]
  # Local name avoids shadowing base::order().
  order_count <- fields[4]
  cat(sprintf("%s\t%s\n", brand, order_count), sep = "")
}
close(con)



$ cat output/part-00000
Ardbeg 10 Years Old 166
Ballantine 12 Years Old 142
Ballantine 17 Years Old 150
Bowmore 12 Years Old 149
Johnnie Walker Gold Label 18 Years Old 176
Johnnie Walker Swing 124
Talisker 10 Years Old 176
The Glenlivet 12 Year Old 164
The Macallan 12 Years 128
@holidayworking ()           R    MapReduce            2010   8   28   14 / 18
Mapper
#!/usr/bin/env Rscript

# Hadoop Streaming mapper: orders per whisky type.
#
# For every tab-separated record on standard input, writes
# "<field 3><TAB><field 4>" to standard output. In the sample data shown in
# this deck, field 3 is the type ("single malt" or "blended") and field 4 is
# the order count.

con <- file("stdin", open = "r")
# Read one record at a time so memory use does not grow with the input size.
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
  # Split on a real tab character; fixed = TRUE avoids regex interpretation.
  fields <- unlist(strsplit(line, "\t", fixed = TRUE))
  # Skip blank or short records instead of emitting "NA" keys or values.
  if (length(fields) < 4) {
    next
  }
  whisky_type <- fields[3]
  # Local name avoids shadowing base::order().
  order_count <- fields[4]
  cat(sprintf("%s\t%s\n", whisky_type, order_count), sep = "")
}
close(con)




$ cat output/part-00000
blended 592
single malt 783




@holidayworking ()           R   MapReduce             2010   8   28   15 / 18
MapReduce :




@holidayworking ()   R   MapReduce   2010   8   28   16 / 18
MapReduce :

Hadoop : Google File System       MapReduce




@holidayworking ()     R   MapReduce          2010   8   28   16 / 18
MapReduce :

Hadoop : Google File System        MapReduce

Hadoop Streaming               R       MapReduce




@holidayworking ()     R   MapReduce           2010   8   28   16 / 18
@holidayworking ()   R   MapReduce   2010   8   28   17 / 18
Jeffrey Dean and Sanjay Ghemawat.
Mapreduce: Simplified data processing on large clusters.
OSDI’04: Sixth Symposium on Operating System Design and Implementation, 2004.
Tom White.
Hadoop.
                     .




@holidayworking ()              R   MapReduce                2010   8   28      18 / 18

More Related Content

RでMapreduce

  • 1. R MapReduce @holidayworking 2010 8 28
  • 2. ) Twitter: @holidayworking : : : F1 : Java, PL/SQL: Python, Ruby, R: @holidayworking () R MapReduce 2010 8 28 2 / 18
  • 3. MapReduce Google map reduce 2 @holidayworking () R MapReduce 2010 8 28 3 / 18
  • 4. MapReduce 1 Map 2 Shuffle 3 Reduce @holidayworking () R MapReduce 2010 8 28 4 / 18
  • 5. MapReduce [1] @holidayworking () R MapReduce 2010 8 28 5 / 18
  • 6. MapReduce Grep @holidayworking () R MapReduce 2010 8 28 6 / 18
  • 7. Hadoop Google File System MapReduce @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 8. Hadoop Google File System MapReduce Hadoop Java @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 9. Hadoop Google File System MapReduce Hadoop Java MapReduce Java @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 10. Hadoop Google File System MapReduce Hadoop Java MapReduce Java Hadoop Streaming @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 11. Hadoop Google File System MapReduce Hadoop Java MapReduce Java Hadoop Streaming MapReduce @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 12. Hadoop Google File System MapReduce Hadoop Java MapReduce Java Hadoop Streaming MapReduce R @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 13. R MapReduce Ardbeg 10 Years Old Bowmore 12 Years Old Talisker 10 Years Old The Glenlivet 12 Year Old The Macallan 12 Years Ballantine 12 Years Old Ballantine 17 Years Old Johnnie Walker Gold Label 18 Years Old Johnnie Walker Swing @holidayworking () R MapReduce 2010 8 28 8 / 18
  • 14. iWork Numbers 250 2010/07/01 The Macallan 12 Years single malt 10 2010/07/01 Ballantine 12 Years Old blended 3 2010/07/01 Ballantine 17 Years Old blended 6 2010/07/01 Johnnie Walker Gold Label 18 Years Old blended 6 2010/07/02 The Glenlivet 12 Year Old single malt 4 2010/07/02 Ardbeg 10 Years Old single malt 2 2010/07/02 Ballantine 12 Years Old blended 8 2010/07/02 Ballantine 17 Years Old blended 7 2010/07/02 Johnnie Walker Swing blended 3 ( ) 2010/07/31 Johnnie Walker Swing blended 4 2010/07/31 Johnnie Walker Gold Label 18 Years Old blended 2 2010/07/31 Bowmore 12 Years Old single malt 4 2010/07/31 Talisker 10 Years Old single malt 7 @holidayworking () R MapReduce 2010 8 28 9 / 18
  • 15. @holidayworking () R MapReduce 2010 8 28 10 / 18
  • 16. MapReduce 1 Mapper 2 Reducer 3 Hadoop Streaming $ hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-0.20.2-streaming.jar -input scotch.tsv -output output -mapper mapper.r -reducer reducer.r 4 $ cat output/part-00000 blended 592 single malt 783 @holidayworking () R MapReduce 2010 8 28 11 / 18
  • 17. Reducer #!/usr/bin/env Rscript env <- new.env(hash = TRUE) con <- file("stdin", open = "r") while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) { line <- unlist(strsplit(line, "\t")) key <- line[1] value <- as.integer(line[2]) if (exists(key, envir = env, inherits = FALSE)) { oldcount <- get(key, envir = env) assign(key, oldcount + value, envir = env) } else { assign(key, value, envir = env) } } close(con) for (key in ls(env, all = TRUE)) { cat(key, "\t", get(key, envir = env), "\n", sep = "") } @holidayworking () R MapReduce 2010 8 28 12 / 18
  • 18. Mapper #!/usr/bin/env Rscript con <- file("stdin", open = "r") while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) { line <- unlist(strsplit(line, "\t")) date <- line[1] order <- line[4] cat(sprintf("%s\t%s\n", date, order), sep = "") } close(con) $ cat output/part-00000 2010/07/01 25 2010/07/02 42 2010/07/03 39 2010/07/29 17 2010/07/30 45 2010/07/31 47 @holidayworking () R MapReduce 2010 8 28 13 / 18
  • 19. Mapper #!/usr/bin/env Rscript con <- file("stdin", open = "r") while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) { line <- unlist(strsplit(line, "\t")) brand <- line[2] order <- line[4] cat(sprintf("%s\t%s\n", brand, order), sep = "") } close(con) $ cat output/part-00000 Ardbeg 10 Years Old 166 Ballantine 12 Years Old 142 Ballantine 17 Years Old 150 Bowmore 12 Years Old 149 Johnnie Walker Gold Label 18 Years Old 176 Johnnie Walker Swing 124 Talisker 10 Years Old 176 The Glenlivet 12 Year Old 164 The Macallan 12 Years 128 @holidayworking () R MapReduce 2010 8 28 14 / 18
  • 20. Mapper #!/usr/bin/env Rscript con <- file("stdin", open = "r") while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) { line <- unlist(strsplit(line, "\t")) type <- line[3] order <- line[4] cat(sprintf("%s\t%s\n", type, order), sep = "") } close(con) $ cat output/part-00000 blended 592 single malt 783 @holidayworking () R MapReduce 2010 8 28 15 / 18
  • 21. MapReduce : @holidayworking () R MapReduce 2010 8 28 16 / 18
  • 22. MapReduce : Hadoop : Google File System MapReduce @holidayworking () R MapReduce 2010 8 28 16 / 18
  • 23. MapReduce : Hadoop : Google File System MapReduce Hadoop Streaming R MapReduce @holidayworking () R MapReduce 2010 8 28 16 / 18
  • 24. @holidayworking () R MapReduce 2010 8 28 17 / 18
  • 25. Jeffrey Dean and Sanjay Ghemawat. Mapreduce: Simplified data processing on large clusters. OSDI’04: Sixth Symposium on Operating System Design and Implementation, 2004. Tom White. Hadoop. . @holidayworking () R MapReduce 2010 8 28 18 / 18