The data is a bunch of super secret Eve Online killmails. The first part of the script handles the json to data.frame transformation. After that I introduce the R core merging in a for-loop, data.table variant, reduce function variants, and finally the fast.merging. There's actually alot more data then we are using in the bench. Totalling 166106 killmails, merging only 3000 of those. That is because as noted in previous posts the merge for-looping seems to work in exponential time so going much higher then 3000 lines seems really excessive. Even now it took over an hour to run the whole bench on intel i5 3570k (not overclocked). So if you want faster results choose 1000ish frames and set rbenchmark replications to 1. With low frame count (under 100) the results are fairly similiar.
If you are going to test are the results from different functions the same, note that the order of rows and columns will be different. I'm eagerly waiting for you results and ideas.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Using 64bit R 2.15.3 | |
#Packages we are going to use | |
library(rjson) | |
library(data.table) | |
library(rbenchmark) | |
#set your WD where your unpackaged file is, or add path. | |
#Reading only partial data for this test, you can naturally do as you wish | |
eve = readLines("kms-2013-01-06-s.json", n=3001) | |
#json => data.frames (data lines) | |
eve = matrix(eve, ncol=1) | |
eve.list = apply(eve, 1, list) | |
#First and last lines are useless | |
eve.list = eve.list[-1] | |
#eve.list = eve.list[-length(eve.list)] #didn't read the last line in this case | |
#Handling json... | |
eve.list = lapply(eve.list, function(x){ | |
x = unlist(x) | |
return(fromJSON(x)) | |
}) | |
#Handling data.frame | |
eve.data.frame = function(eve.list){ | |
framo = unlist(eve.list) | |
dups = unique(names(framo)[duplicated(names(framo))]) | |
if(length(dups) > 0){ | |
for(i in seq(dups)){ | |
bol = names(framo) %in% dups[i] | |
framo[unique(dups[i])] = paste(framo[bol], collapse= ",") | |
} | |
framo = framo[!duplicated(names(framo))] | |
} | |
return(as.matrix(t(framo))) | |
} | |
#This takes awhile if you read the whole data | |
eve.frames = lapply(eve.list, eve.data.frame) | |
#For example | |
eve.frames[[1]] | |
eve.frames[[100]] | |
#The amount of variables in each data line | |
N.variables = sapply(eve.frames, ncol) | |
table(N.variables) | |
#Converting data.frames to data.tables | |
eve.data.table = lapply(eve.frames, as.data.table) | |
#Is this correct? | |
#The functions used | |
core.forLoop <- function(data.list) { | |
complete = data.list[[1]] | |
for(i in 2:length(data.list)){ | |
part = data.list[[i]] | |
complete = merge(complete, part, | |
all=TRUE, | |
sort=FALSE) | |
} | |
return(complete) | |
} | |
data.table.forLoop <- function(data.list){ | |
complete = data.list[[1]] | |
for(i in 2:length(data.list)){ | |
part = data.list[[i]] | |
complete = merge(complete, part, | |
by=intersect( names(complete), names(part) ), | |
all=TRUE, | |
sort=FALSE) | |
} | |
return(complete) | |
} | |
core.Reduce.merge = function(data.list){ | |
Reduce(function(a, b) merge(a, b, all=TRUE, sort=FALSE), data.list) | |
} | |
data.table.Reduce.merge = function(data.list){ | |
Reduce(function(a, b) merge(a, b, by=intersect(names(a), names(b)), all=TRUE, sort=FALSE), data.list) | |
} | |
fast.merging = function(data.list, nparts){ | |
if(!is.list(data.list)) stop("data.list isn't a list") | |
while(length(data.list) != 1){ #Loop until everything is merged | |
if(length(data.list) > nparts){ | |
starts = seq(1, length(data.list), nparts) | |
ends = seq(nparts, length(data.list), nparts) #starts and ends are of equal size if length(data.list) divides nparts. | |
if(length(ends) < length(starts)) ends = c(ends, length(data.list)) #making sure things are even | |
sections = matrix(c(starts, ends), ncol=2, byrow=FALSE) | |
sections = apply(sections, 1, list) | |
}else{ | |
sections = list(c(1, length(data.list))) | |
} | |
#We have the standard way inside lapply | |
data.list = lapply(sections, function(x, data.list){ | |
if(is.list(x)) x = x[[1]] | |
#the standard way starts -> | |
part = data.list[[x[1]]] | |
for(i in x[1]:x[2]){ | |
part = merge(part, data.list[[i]], all=TRUE, sort=FALSE) | |
} | |
#<- standard way ends | |
return(part) | |
}, data.list = data.list) | |
} | |
return(data.list[[1]]) #returning the merged data frame | |
} | |
#This will take awhile, go to lunch or something like that. | |
benchmark(core.forLoop(eve.frames), | |
data.table.forLoop(eve.data.table), | |
core.Reduce.merge(eve.frames), | |
data.table.Reduce.merge(eve.data.table), | |
fast.merging(eve.frames, 10), | |
replications=5) | |
# test replications elapsed relative user.self sys.self user.child sys.child | |
#1 core.forLoop(eve.frames) 5 1398.29 4.932 1393.98 0.06 NA NA | |
#3 core.Reduce.merge(eve.frames) 5 1405.77 4.958 1401.02 0.31 NA NA | |
#2 data.table.forLoop(eve.data.table) 5 797.76 2.814 794.61 0.09 NA NA | |
#4 data.table.Reduce.merge(eve.data.table) 5 801.93 2.829 799.91 0.16 NA NA | |
#5 fast.merging(eve.frames, 10) 5 283.51 1.000 282.95 0.00 NA NA | |
#If you are checking are going to check are the results the same, remember to sort your rows and columns | |
#basically data[order(data[, 1]), order(colnames(data))] for all data. |