If you want to implement MapReduce (using Hadoop) in a language other than Java, you use a feature called streaming. The data is fed to the mapper via STDIN (readLines()), written back to Hadoop via STDOUT (cat()), then fed to the reducer through STDIN (readLines()), and finally emitted through STDOUT (cat()).
The following code is taken from an article I wrote about writing a MapReduce job using R for Hadoop. The code is supposed to count 2-grams, but I would say it is simple enough to see what happens with MapReduce.
library(stringdist, quietly=TRUE)
# Mapper: reads tab-separated records from stdin one line at a time and, for
# every record, writes one line per distinct 2-gram found in the lower-cased
# fourth field: <field 2> \t <2-gram> \t <count within this record>.
#
# Note: cat() joins its arguments with its default separator (a single space),
# so every emitted tab is surrounded by spaces. The output format is left
# unchanged on purpose.
input <- file("stdin", "r")
while (length(line <- readLines(input, n = 1, warn = FALSE)) > 0) {
  # An empty line stops processing altogether; later lines are not read.
  if (nchar(line) == 0) break

  fields <- unlist(strsplit(line, "\t"))

  # qgrams() comes from the stringdist package loaded at the top of the
  # script; the loop below reads the 2-grams from the column names of `d`
  # and their counts from its first row.
  d <- qgrams(tolower(fields[4]), q = 2)

  # seq_len() yields an empty sequence when `d` has no columns, whereas
  # 1:ncol(d) would iterate over c(1, 0) and index a non-existent column.
  for (i in seq_len(ncol(d))) {
    cat(fields[2], "\t", colnames(d)[i], "\t", d[1, i], "\n")
  }
}
close(input)
-
# Reducer: reads tab-separated lines of the form
#   <language> \t <2-gram> \t <count>
# from stdin and writes one line per run of consecutive lines sharing the same
# (language, 2-gram) pair, with the counts of that run added up.
#
# Only adjacent lines with an identical key are merged, so the input has to
# arrive grouped by key for the totals to be complete.
#
# Note: cat() joins its arguments with its default separator (a single space),
# so every emitted tab is surrounded by spaces. The output format is left
# unchanged on purpose.
input <- file("stdin", "r")
is_first_line <- TRUE
while (length(line <- readLines(input, n = 1, warn = FALSE)) > 0) {
  line <- unlist(strsplit(line, "\t"))
  if (!is_first_line &&
        prev_lang == line[1] &&
        prev_2gram == line[2]) {
    # Same key as the previous line: keep accumulating.
    total <- total + as.integer(line[3])
  } else {
    # Key changed: flush the finished run (if any) and start a new one.
    if (!is_first_line) {
      cat(prev_lang, "\t", prev_2gram, "\t", total, "\n")
    }
    prev_lang <- line[1]
    prev_2gram <- line[2]
    total <- as.integer(line[3])
    is_first_line <- FALSE
  }
}
# Flush the last run. Guarded so that empty input produces no output instead
# of failing on variables that were never assigned.
if (!is_first_line) {
  cat(prev_lang, "\t", prev_2gram, "\t", total, "\n")
}
close(input)
http://www.joyofdata.de/blog/mapreduce-r-hadoop-amazon-emr/