123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280 |
- #
- # Test if one batch of simulations produced statistically the same result as another batch,
- # by looking at the recorded output scalars (histograms and vectors are ignored).
- #
- # Purpose: as regression test for cases where simpler methods such as fingerprint checks fail.
- # (Certain code changes, such as rearranging modules, changing random number usage or
- # self-message scheduling alter the fingerprint but do not change model behavior,
- # i.e. simulations should produce statistically the same results as before. After such
- # changes, this routine can be used to check that model behavior was not affected.)
- #
- # The routine looks at each scalar, and performs Student t-test to check that the sample
- # means from the two simulation batches match, then F-test to check that the variances
- # also match. If both tests accept for all scalars, then the two simulations match.
- #
- # Both t-test and F-test assume that the variables are normally distributed. Therefore,
- # first we perform a normality test, and skip t-test and F-test if normality test rejects.
- #
- # In our experience, about 50 runs per batch are enough to satisfy the test.
- # If there are less, e.g. 10 runs, then mismatch (reject) is likely to be reported
- # for some of the scalars. The test can be made more permissive by setting meantest.alpha
- # (the p-value threshold for the t-test) to be smaller, e.g. 0.025 instead of 0.05,
- # of course on the cost of decreasing the reliability of the test.
- #
- # TO TRY:
- #
- # $ cd ~/inet/examples/inet/nclients
- # $ ./run -f basicHTTP.ini -u Cmdenv --repeat=100 -r0..49 --result-dir=results1 --vector-recording=false
- # $ ./run -f basicHTTP.ini -u Cmdenv --repeat=100 -r50..99 --result-dir=results2 --vector-recording=false
- # $ Rscript ../../../tests/misc/statistical/test.R results1 results2
- #
- require(optparse, quietly = TRUE)
- require(reshape, quietly = TRUE)
- require(omnetpp, quietly = TRUE)
- testScalars <- function(description,
- scalars1, scalars2, # scalars to compare
- exclude = NULL, # excluded scalars
- normalitytest.alpha = 0.05, meantest.alpha = 0.05, variancetest.alpha = 0.05, # statistical test parameters
- logLevel = 3)
- {
- numExclude <- 0
- numExactMatch <- 0
- numAccept <- 0
- numReject <- 0
- numNReject <- 0
- numTReject <- 0
- numFReject <- 0
- pass <- TRUE
- if (logLevel > 0)
- cat("Testing '", description, "'\n", sep="")
- # extract module-name pairs (they are the variables we're going to test one by one)
- scalarnames1 <- unique(data.frame(module = scalars1$module, name = scalars1$name))
- scalarnames2 <- unique(data.frame(module = scalars2$module, name = scalars2$name))
-
- if (logLevel > 0) cat("Processing ", length(scalarnames1$name), " scalars from '", resultsdir1, "'\n", sep="")
- if (logLevel > 1) print(scalarnames1)
- if (logLevel > 0) cat("Processing ", length(scalarnames2$name), " scalars from '", resultsdir2, "'\n", sep="")
- if (logLevel > 1) print(scalarnames2)
- # remove excluded scalars
- if (!is.null(exclude)) {
- excludedScalars1 <- subset(scalarnames1, grepl(exclude, name))
- excludedScalars2 <- subset(scalarnames2, grepl(exclude, name))
- numExclude1 <- length(excludedScalars1$name)
- numExclude2 <- length(excludedScalars2$name)
- if (numExclude1 > 0) {
- if (logLevel > 0) cat("Excluding ", numExclude1, " scalars from '", resultsdir1, "'\n", sep="")
- if (logLevel > 1) print(excludedScalars1)
- }
- if (numExclude2 > 0) {
- if (logLevel > 0) cat("Excluding ", numExclude2, " scalars from '", resultsdir2, "'\n", sep="")
- if (logLevel > 1) print(excludedScalars2)
- }
- numExclude <- numExclude1 + numExclude2
- scalarnames1 <- subset(scalarnames1, !grepl(exclude, name))
- scalarnames2 <- subset(scalarnames2, !grepl(exclude, name))
- }
- # check if the two datasets have the exact same set of scalars
- a1 <- paste(scalarnames1$module, scalarnames1$name)
- a2 <- paste(scalarnames2$module, scalarnames2$name)
- onlyInScalars1 <- setdiff(a1, a2)
- onlyInScalars2 <- setdiff(a2, a1)
- numAdditional1 <- length(onlyInScalars1)
- numAdditional2 <- length(onlyInScalars2)
- numAdditional <- numAdditional1 + numAdditional2
- if (numAdditional1 > 0) {
- if (logLevel > 0) cat("Found ", numAdditional1, " additional scalars from '", resultsdir1, "'\n", sep="")
- if (logLevel > 1) print(onlyInScalars1)
- }
- if (numAdditional2 > 0) {
- if (logLevel > 0) cat("Ignoring ", numAdditional2, " additional scalars from '", resultsdir2, "'\n", sep="")
- if (logLevel > 1) print(onlyInScalars2)
- }
- scalarnames <- merge(scalarnames1, scalarnames2, all = FALSE)
- if (logLevel > 0) cat("Comparing", length(scalarnames$name), "scalars found in both resultsets\n")
- if (logLevel > 1) print(scalarnames)
- rejectedScalars <- list()
- # loop through the list of scalars, and test each scalar one by one
- for (i in 1:nrow(scalarnames)) {
-
- module <- scalarnames$module[i]
- name <- scalarnames$name[i]
- values1 <- scalars1[scalars1$name == as.character(name) & scalars1$module == as.character(module),]$value
- values2 <- scalars2[scalars2$name == as.character(name) & scalars2$module == as.character(module),]$value
- if (logLevel > 2) {
- cat("\nProcessing scalar:", as.character(module), as.character(name), "\n")
- print(values1)
- print(values2)
- }
- nameAndValuesString <- paste(as.character(module), " ", as.character(name), " VALUES: [", paste(values1,collapse=', '), "] AND [", paste(values2,collapse=', '), "]")
-
-
- # remove NaNs
- if (sum(is.na(values1)) != sum(is.na(values2))) {
- if (logLevel > 2) cat(' DIFFERENT NUMBER OF NANS, REJECT\n')
- numReject <- numReject + 1
- next
- }
- else {
- values1 <- values1[!is.na(values1)]
- values2 <- values2[!is.na(values2)]
- }
-
- # same numbers, in maybe different order?
- if (length(values1) == length(values2) && all(sort(values1) == sort(values2))) {
- if (logLevel > 2) cat(' EXACT MATCH, ACCEPT\n')
- numExactMatch <- numExactMatch + 1
- next
- }
- # same constant number in both? (must test it here, as t.test() throws "data are
- # essentially constant" error for this case)
- if (min(values1) == max(values1) && min(values2) == max(values2)) {
- if (values1[1]==values2[1]) {
- if (logLevel > 2) cat(' SAME CONSTANTS, ACCEPT\n')
- numExactMatch <- numExactMatch + 1
- } else {
- if (logLevel > 2) cat(' DIFFERENT CONSTANTS, REJECT\n')
- numReject <- numReject + 1
- rejectedScalars <- append(rejectedScalars, nameAndValuesString)
- }
- next
- }
- # t-test requires that variables be normally distributed, so we must test that first
- # Note: it is not required that their variances be the same, as t.test() has a var.equal=FALSE argument
-
- # step 1: normality test (not needed [in fact causes an error] if all values are the same)
- if (min(values1) != max(values1)) {
- t1 <- stats::shapiro.test(values1)
- if (logLevel > 2) cat("normality test sample1 p-value:", t1$p.value, "\n")
- if (t1$p.value < normalitytest.alpha) {
- if (logLevel > 2) cat(' REJECT\n') # t-test and F-test only work if values are normally distributed
- numNReject <- numNReject + 1
- next
- }
- }
- if (min(values2) != max(values2)) {
- t2 <- stats::shapiro.test(values2)
- if (logLevel > 2) cat("normality test sample2 p-value:", t2$p.value, "\n")
- if (t2$p.value < normalitytest.alpha) {
- if (logLevel > 2) cat(' REJECT\n') # t-test and F-test only work if values are normally distributed
- numNReject <- numNReject + 1
- next
- }
- }
- # step 2: t-test for mean
- t <- t.test(values1, values2)
- if (logLevel > 2) cat("mean test p-value:", t$p.value)
- if (t$p.value < meantest.alpha) {
- if (logLevel > 2) cat(' REJECT\n')
- numTReject <- numTReject + 1
- rejectedScalars <- append(rejectedScalars, nameAndValuesString)
- next
- }
- # step 3: F-test for variance (t.test() only checks the mean)
- v <- var.test(values1, values2)
- if (logLevel > 2) cat("variance test p-value:", v$p.value, "\n")
- if (v$p.value < variancetest.alpha) {
- if (logLevel > 2) cat(' REJECT\n')
- numFReject <- numFReject + 1
- rejectedScalars <- append(rejectedScalars, nameAndValuesString)
- next
- }
- if (logLevel > 2) cat(' ACCEPT\n')
- numAccept <- numAccept + 1
- }
- if (logLevel > 2) cat("\n")
- if (logLevel > 0 && length(rejectedScalars) > 0) {
- cat("Found", length(rejectedScalars), "different scalars:\n");
- if (logLevel > 1) print(rejectedScalars)
- }
- pass <- (numAdditional == 0 && numReject == 0 && numNReject == 0 && numTReject == 0 && numFReject == 0 && numExclude + numExactMatch + numAccept > 0)
- if (logLevel > 0)
- cat("Success summary:", numExclude, "excluded,", numExactMatch, "exactly matched,", numAccept, "accepted", "\n")
- if (logLevel > 0 || !pass)
- cat("Failure summary:", numAdditional, "additional,", numReject, "differing constants,", numNReject, "rejected normality tests,", numTReject, "rejected t-tests,", numFReject, "rejected F-tests", "\n")
- cat("Test result for '", description, "' : ", sep="")
- if (pass) cat("PASS\n") else cat("FAIL\n")
- pass
- }
- # parse options and print help when necessary
- optionSpecifications <- list(
- make_option(c("-v", "--verbose"), action = "store", default = 0, help = "Set output level of detail"),
- make_option(c("-n", "--normality-test-alpha"), action = "store", default = 0.05, help = "Set normality test threshold"),
- make_option(c("-m", "--mean-test-alpha"), action = "store", default = 0.05, help = "Set mean test (t test) threshold"),
- make_option(c("-a", "--variance-test-alpha"), action = "store", default = 0.05, help = "Set variance test (f test) threshold"),
- make_option(c("-t", "--test"), action = "store", default = NULL, help = "Test matching simulation results only"),
- make_option(c("-x", "--exclude-scalars"), action = "store", default = NULL, help = "Exclude matching scalars from testing"))
- optionParser <- OptionParser(usage = "%prog [options] <results-1> <results-2>", option_list = optionSpecifications)
- parsedArgs <- parse_args(optionParser, args = commandArgs(trailingOnly = TRUE), print_help_and_exit = TRUE, positional_arguments = TRUE)
- options <- parsedArgs$options
- args <- parsedArgs$args
- if (length(args) != 2) {
- print_help(optionParser)
- q()
- }
- options(width=120)
- # results directories
- resultsdir1 <- args[1]
- resultsdir2 <- args[2]
- if (options$verbose > 0) cat("Statistical scalar comparison test started\n", sep="")
- # read results
- if (options$verbose > 1) cat("Loading results from '", resultsdir1, "'\n", sep="")
- results1 <- loadDataset(paste(resultsdir1, "*.sca", sep="/"))
- if (options$verbose > 1) cat("Loading results from '", resultsdir2, "'\n", sep="")
- results2 <- loadDataset(paste(resultsdir2, "*.sca", sep="/"))
- # preparing results
- wideattributes1 <- cast(results1$runattrs, runid~attrname, value='attrvalue')
- wideattributes2 <- cast(results2$runattrs, runid~attrname, value='attrvalue')
- scalars1 <- results1$scalars
- scalars2 <- results2$scalars
- widescalars1 <- merge(scalars1, wideattributes1)
- widescalars2 <- merge(scalars2, wideattributes2)
- keycolumns <- c('experiment', 'measurement', 'inifile', 'configname')
- tests1 <- widescalars1[!duplicated(widescalars1[,keycolumns]), keycolumns]
- tests2 <- widescalars2[!duplicated(widescalars2[,keycolumns]), keycolumns]
- tests <- merge(tests1, tests2, all = TRUE)
- if (!is.null(options$test))
- tests <- subset(tests, grepl(options$test, experiment))
- invisible(by(tests, 1:nrow(tests), function(key) {
- cat("\n");
- selectedscalars1 <- subset(widescalars1, as.character(experiment) == key$experiment & as.character(measurement) == key$measurement & as.character(inifile) == key$inifile & as.character(configname) == key$configname)
- selectedscalars2 <- subset(widescalars2, as.character(experiment) == key$experiment & as.character(measurement) == key$measurement & as.character(inifile) == key$inifile & as.character(configname) == key$configname)
- description <- paste(key$experiment, key$inifile, "[", key$configname, "]", key$measurement, sep="")
- testScalars(description, selectedscalars1, selectedscalars2,
- exclude = options$exclude,
- normalitytest.alpha = options$`normality-test-alpha`,
- meantest.alpha = options$`mean-test-alpha`,
- variancetest.alpha = options$`variance-test-alpha`,
- logLevel = options$verbose)
- }))
- if (options$verbose > 0) cat("Statistical scalar comparison test finished\n", sep="")
|