#
# Test whether one batch of simulations produced statistically the same results as another
# batch, by looking at the recorded output scalars (histograms and vectors are ignored).
#
# Purpose: to serve as a regression test for cases where simpler methods such as
# fingerprint checks fail. (Certain code changes, such as rearranging modules, changing
# random number usage or self-message scheduling, alter the fingerprint but do not change
# model behavior, i.e. simulations should produce statistically the same results as
# before. After such changes, this routine can be used to check that model behavior
# was not affected.)
#
# The routine looks at each scalar and performs Student's t-test to check that the
# sample means from the two simulation batches match, then an F-test to check that
# the variances also match. If both tests accept (i.e. fail to reject) for all
# scalars, the two batches are considered to match.
#
# Both the t-test and the F-test assume that the variables are normally distributed.
# Therefore, we first perform a normality test, and skip the t-test and F-test for a
# scalar if the normality test rejects.
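#
# As a sketch, the per-scalar decision chain boils down to the following (made-up
# sample data; the variables here are illustrative only):
#
#   values1 <- rnorm(50, mean = 1, sd = 0.1)   # one scalar's values from batch 1
#   values2 <- rnorm(50, mean = 1, sd = 0.1)   # the same scalar's values from batch 2
#   shapiro.test(values1)$p.value  # < normalitytest.alpha: skip the t-test and F-test
#   shapiro.test(values2)$p.value
#   t.test(values1, values2)$p.value    # < meantest.alpha: reject the scalar
#   var.test(values1, values2)$p.value  # < variancetest.alpha: reject the scalar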
#
# In our experience, about 50 runs per batch are enough to satisfy the test.
# With fewer runs, e.g. 10, a mismatch (reject) is likely to be reported for some
# of the scalars. The test can be made more permissive by setting meantest.alpha
# (the p-value threshold for the t-test) to a smaller value, e.g. 0.025 instead
# of 0.05, at the cost of making the test less reliable.
#
# TO TRY:
#
# $ cd ~/inet/examples/inet/nclients
# $ ./run -f basicHTTP.ini -u Cmdenv --repeat=100 -r0..49 --result-dir=results1 --vector-recording=false
# $ ./run -f basicHTTP.ini -u Cmdenv --repeat=100 -r50..99 --result-dir=results2 --vector-recording=false
# $ Rscript ../../../tests/misc/statistical/test.R results1 results2
#
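# The command-line options defined below can tune the test; e.g., to raise verbosity
# and exclude scalars whose names match a regex (the pattern here is just an example):
#
# $ Rscript ../../../tests/misc/statistical/test.R -v 2 -x 'Lifetime' results1 results2
#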
require(optparse, quietly = TRUE)
require(reshape, quietly = TRUE)
require(omnetpp, quietly = TRUE)

testScalars <- function(description,
                        scalars1, scalars2,  # scalars to compare
                        exclude = NULL,      # scalars to exclude (regex on the scalar name)
                        normalitytest.alpha = 0.05, meantest.alpha = 0.05, variancetest.alpha = 0.05, # statistical test parameters
                        logLevel = 3)
{
    numExclude <- 0
    numExactMatch <- 0
    numAccept <- 0
    numReject <- 0
    numNReject <- 0
    numTReject <- 0
    numFReject <- 0
    pass <- TRUE

    if (logLevel > 0)
        cat("Testing '", description, "'\n", sep="")

    # extract module-name pairs (they are the variables we're going to test one by one)
    scalarnames1 <- unique(data.frame(module = scalars1$module, name = scalars1$name))
    scalarnames2 <- unique(data.frame(module = scalars2$module, name = scalars2$name))
    if (logLevel > 0) cat("Processing ", length(scalarnames1$name), " scalars from '", resultsdir1, "'\n", sep="")
    if (logLevel > 1) print(scalarnames1)
    if (logLevel > 0) cat("Processing ", length(scalarnames2$name), " scalars from '", resultsdir2, "'\n", sep="")
    if (logLevel > 1) print(scalarnames2)

    # remove excluded scalars
    if (!is.null(exclude)) {
        excludedScalars1 <- subset(scalarnames1, grepl(exclude, name))
        excludedScalars2 <- subset(scalarnames2, grepl(exclude, name))
        numExclude1 <- length(excludedScalars1$name)
        numExclude2 <- length(excludedScalars2$name)
        if (numExclude1 > 0) {
            if (logLevel > 0) cat("Excluding ", numExclude1, " scalars from '", resultsdir1, "'\n", sep="")
            if (logLevel > 1) print(excludedScalars1)
        }
        if (numExclude2 > 0) {
            if (logLevel > 0) cat("Excluding ", numExclude2, " scalars from '", resultsdir2, "'\n", sep="")
            if (logLevel > 1) print(excludedScalars2)
        }
        numExclude <- numExclude1 + numExclude2
        scalarnames1 <- subset(scalarnames1, !grepl(exclude, name))
        scalarnames2 <- subset(scalarnames2, !grepl(exclude, name))
    }

    # check if the two datasets have the exact same set of scalars
    a1 <- paste(scalarnames1$module, scalarnames1$name)
    a2 <- paste(scalarnames2$module, scalarnames2$name)
    onlyInScalars1 <- setdiff(a1, a2)
    onlyInScalars2 <- setdiff(a2, a1)
    numAdditional1 <- length(onlyInScalars1)
    numAdditional2 <- length(onlyInScalars2)
    numAdditional <- numAdditional1 + numAdditional2
    if (numAdditional1 > 0) {
        if (logLevel > 0) cat("Found ", numAdditional1, " additional scalars in '", resultsdir1, "'\n", sep="")
        if (logLevel > 1) print(onlyInScalars1)
    }
    if (numAdditional2 > 0) {
        if (logLevel > 0) cat("Ignoring ", numAdditional2, " additional scalars in '", resultsdir2, "'\n", sep="")
        if (logLevel > 1) print(onlyInScalars2)
    }
    scalarnames <- merge(scalarnames1, scalarnames2, all = FALSE)
    if (logLevel > 0) cat("Comparing", length(scalarnames$name), "scalars found in both result sets\n")
    if (logLevel > 1) print(scalarnames)

    rejectedScalars <- list()

    # loop through the list of scalars, and test each scalar one by one
    for (i in seq_len(nrow(scalarnames))) {
        module <- scalarnames$module[i]
        name <- scalarnames$name[i]
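        # gather this scalar's value from every run in each batch (one element per run)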
        values1 <- scalars1[scalars1$name == as.character(name) & scalars1$module == as.character(module),]$value
        values2 <- scalars2[scalars2$name == as.character(name) & scalars2$module == as.character(module),]$value
        if (logLevel > 2) {
            cat("\nProcessing scalar:", as.character(module), as.character(name), "\n")
            print(values1)
            print(values2)
        }
        nameAndValuesString <- paste(as.character(module), " ", as.character(name), " VALUES: [", paste(values1, collapse=', '), "] AND [", paste(values2, collapse=', '), "]", sep="")
        # remove NAs (reject right away if the two samples contain different numbers of them)
        if (sum(is.na(values1)) != sum(is.na(values2))) {
            if (logLevel > 2) cat(' DIFFERENT NUMBER OF NANS, REJECT\n')
            numReject <- numReject + 1
            next
        }
        else {
            values1 <- values1[!is.na(values1)]
            values2 <- values2[!is.na(values2)]
        }
        # same numbers, in maybe different order?
        if (length(values1) == length(values2) && all(sort(values1) == sort(values2))) {
            if (logLevel > 2) cat(' EXACT MATCH, ACCEPT\n')
            numExactMatch <- numExactMatch + 1
            next
        }

        # same constant number in both? (must test it here, as t.test() throws "data are
        # essentially constant" error for this case)
        if (min(values1) == max(values1) && min(values2) == max(values2)) {
            if (values1[1] == values2[1]) {
                if (logLevel > 2) cat(' SAME CONSTANTS, ACCEPT\n')
                numExactMatch <- numExactMatch + 1
            } else {
                if (logLevel > 2) cat(' DIFFERENT CONSTANTS, REJECT\n')
                numReject <- numReject + 1
                rejectedScalars <- append(rejectedScalars, nameAndValuesString)
            }
            next
        }
        # the t-test requires that the variables be normally distributed, so we must test that first.
        # Note: the variances need not be equal, as t.test() defaults to var.equal=FALSE (Welch's t-test)

        # step 1: normality test (not needed [in fact causes an error] if all values are the same)
        if (min(values1) != max(values1)) {
            t1 <- stats::shapiro.test(values1)
            if (logLevel > 2) cat("normality test sample1 p-value:", t1$p.value, "\n")
            if (t1$p.value < normalitytest.alpha) {
                if (logLevel > 2) cat(' REJECT\n')  # t-test and F-test only work if values are normally distributed
                numNReject <- numNReject + 1
                next
            }
        }
        if (min(values2) != max(values2)) {
            t2 <- stats::shapiro.test(values2)
            if (logLevel > 2) cat("normality test sample2 p-value:", t2$p.value, "\n")
            if (t2$p.value < normalitytest.alpha) {
                if (logLevel > 2) cat(' REJECT\n')  # t-test and F-test only work if values are normally distributed
                numNReject <- numNReject + 1
                next
            }
        }
        # step 2: t-test for the mean
        t <- t.test(values1, values2)
        if (logLevel > 2) cat("mean test p-value:", t$p.value, "\n")
        if (t$p.value < meantest.alpha) {
            if (logLevel > 2) cat(' REJECT\n')
            numTReject <- numTReject + 1
            rejectedScalars <- append(rejectedScalars, nameAndValuesString)
            next
        }

        # step 3: F-test for the variance (t.test() only checks the mean)
        v <- var.test(values1, values2)
        if (logLevel > 2) cat("variance test p-value:", v$p.value, "\n")
        if (v$p.value < variancetest.alpha) {
            if (logLevel > 2) cat(' REJECT\n')
            numFReject <- numFReject + 1
            rejectedScalars <- append(rejectedScalars, nameAndValuesString)
            next
        }

        if (logLevel > 2) cat(' ACCEPT\n')
        numAccept <- numAccept + 1
    }

    if (logLevel > 2) cat("\n")
    if (logLevel > 0 && length(rejectedScalars) > 0) {
        cat("Found", length(rejectedScalars), "different scalars:\n")
        if (logLevel > 1) print(rejectedScalars)
    }
    pass <- (numAdditional == 0 && numReject == 0 && numNReject == 0 && numTReject == 0 && numFReject == 0 && numExclude + numExactMatch + numAccept > 0)
    if (logLevel > 0)
        cat("Success summary:", numExclude, "excluded,", numExactMatch, "exactly matched,", numAccept, "accepted", "\n")
    if (logLevel > 0 || !pass)
        cat("Failure summary:", numAdditional, "additional,", numReject, "rejected (differing constants or NaN counts),", numNReject, "rejected normality tests,", numTReject, "rejected t-tests,", numFReject, "rejected F-tests", "\n")
    cat("Test result for '", description, "' : ", sep="")
    if (pass) cat("PASS\n") else cat("FAIL\n")
    pass
}
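
# Example call (hypothetical: scalarsA and scalarsB would be data frames with 'module',
# 'name' and 'value' columns, such as the loadDataset(...)$scalars objects built below):
#
#   ok <- testScalars("demo", scalarsA, scalarsB, exclude = "Lifetime", logLevel = 1)
#
# Note: at logLevel > 0 the log messages read the globals resultsdir1/resultsdir2
# (assigned further below), so those must exist when the function is called.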

# parse options and print help when necessary
optionSpecifications <- list(
    make_option(c("-v", "--verbose"), action = "store", default = 0, help = "Set output level of detail"),
    make_option(c("-n", "--normality-test-alpha"), action = "store", default = 0.05, help = "Set normality test threshold"),
    make_option(c("-m", "--mean-test-alpha"), action = "store", default = 0.05, help = "Set mean test (t-test) threshold"),
    make_option(c("-a", "--variance-test-alpha"), action = "store", default = 0.05, help = "Set variance test (F-test) threshold"),
    make_option(c("-t", "--test"), action = "store", default = NULL, help = "Test only experiments matching this regex"),
    make_option(c("-x", "--exclude-scalars"), action = "store", default = NULL, help = "Exclude scalars matching this regex from testing"))

optionParser <- OptionParser(usage = "%prog [options] <results-1> <results-2>", option_list = optionSpecifications)
parsedArgs <- parse_args(optionParser, args = commandArgs(trailingOnly = TRUE), print_help_and_exit = TRUE, positional_arguments = TRUE)
options <- parsedArgs$options
args <- parsedArgs$args

if (length(args) != 2) {
    print_help(optionParser)
    q()
}
options(width=120)

# results directories
resultsdir1 <- args[1]
resultsdir2 <- args[2]

if (options$verbose > 0) cat("Statistical scalar comparison test started\n")

# read results
if (options$verbose > 1) cat("Loading results from '", resultsdir1, "'\n", sep="")
results1 <- loadDataset(paste(resultsdir1, "*.sca", sep="/"))
if (options$verbose > 1) cat("Loading results from '", resultsdir2, "'\n", sep="")
results2 <- loadDataset(paste(resultsdir2, "*.sca", sep="/"))

# prepare results
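# cast() (from the reshape package) pivots the long runid/attrname/attrvalue table into
# one row per run with one column per run attribute, so that run attributes such as
# 'experiment' and 'configname' can be merged onto each scalar row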
wideattributes1 <- cast(results1$runattrs, runid~attrname, value='attrvalue')
wideattributes2 <- cast(results2$runattrs, runid~attrname, value='attrvalue')
scalars1 <- results1$scalars
scalars2 <- results2$scalars
widescalars1 <- merge(scalars1, wideattributes1)
widescalars2 <- merge(scalars2, wideattributes2)

keycolumns <- c('experiment', 'measurement', 'inifile', 'configname')
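# collect the unique (experiment, measurement, inifile, configname) combinations found
# in each result set; merging with all = TRUE takes their union, so each combination
# that appears in either result set becomes one comparison to run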
tests1 <- widescalars1[!duplicated(widescalars1[,keycolumns]), keycolumns]
tests2 <- widescalars2[!duplicated(widescalars2[,keycolumns]), keycolumns]
tests <- merge(tests1, tests2, all = TRUE)
if (!is.null(options$test))
    tests <- subset(tests, grepl(options$test, experiment))
invisible(by(tests, seq_len(nrow(tests)), function(key) {
    cat("\n")
    selectedscalars1 <- subset(widescalars1, as.character(experiment) == key$experiment & as.character(measurement) == key$measurement & as.character(inifile) == key$inifile & as.character(configname) == key$configname)
    selectedscalars2 <- subset(widescalars2, as.character(experiment) == key$experiment & as.character(measurement) == key$measurement & as.character(inifile) == key$inifile & as.character(configname) == key$configname)
    description <- paste(key$experiment, " ", key$inifile, " [", key$configname, "] ", key$measurement, sep="")
    testScalars(description, selectedscalars1, selectedscalars2,
                exclude = options$`exclude-scalars`,
                normalitytest.alpha = options$`normality-test-alpha`,
                meantest.alpha = options$`mean-test-alpha`,
                variancetest.alpha = options$`variance-test-alpha`,
                logLevel = options$verbose)
}))

if (options$verbose > 0) cat("Statistical scalar comparison test finished\n")