#
# Test whether one batch of simulations produced statistically the same results as another
# batch, by looking at the recorded output scalars (histograms and vectors are ignored).
#
# Purpose: to serve as a regression test for cases where simpler methods such as
# fingerprint checks fail. (Certain code changes, such as rearranging modules, changing
# random number usage or self-message scheduling, alter the fingerprint but do not change
# model behavior, i.e. simulations should produce statistically the same results as
# before. After such changes, this routine can be used to check that model behavior
# was not affected.)
#
# The routine looks at each scalar and performs Student's t-test to check that the
# sample means from the two simulation batches match, then an F-test to check that
# the variances also match. If both tests accept (i.e. fail to reject) for all
# scalars, the two batches are considered to match.
#
# Both the t-test and the F-test assume that the variables are normally distributed.
# Therefore, we first perform a normality test, and skip the t-test and F-test for a
# scalar if the normality test rejects.
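#
# As a sketch, the per-scalar decision chain boils down to the following (made-up
# sample data; the variables here are illustrative only):
#
#   values1 <- rnorm(50, mean = 1, sd = 0.1)   # one scalar's values from batch 1
#   values2 <- rnorm(50, mean = 1, sd = 0.1)   # the same scalar's values from batch 2
#   shapiro.test(values1)$p.value  # < normalitytest.alpha: skip the t-test and F-test
#   shapiro.test(values2)$p.value
#   t.test(values1, values2)$p.value    # < meantest.alpha: reject the scalar
#   var.test(values1, values2)$p.value  # < variancetest.alpha: reject the scalar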
#
# In our experience, about 50 runs per batch are enough to satisfy the test.
# With fewer runs, e.g. 10, a mismatch (reject) is likely to be reported for some
# of the scalars. The test can be made more permissive by setting meantest.alpha
# (the p-value threshold for the t-test) to a smaller value, e.g. 0.025 instead
# of 0.05, at the cost of making the test less reliable.
#
# TO TRY:
#
# $ cd ~/inet/examples/inet/nclients
# $ ./run -f basicHTTP.ini -u Cmdenv --repeat=100 -r0..49 --result-dir=results1 --vector-recording=false
# $ ./run -f basicHTTP.ini -u Cmdenv --repeat=100 -r50..99 --result-dir=results2 --vector-recording=false
# $ Rscript ../../../tests/misc/statistical/test.R results1 results2
#
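# The command-line options defined below can tune the test; e.g., to raise verbosity
# and exclude scalars whose names match a regex (the pattern here is just an example):
#
# $ Rscript ../../../tests/misc/statistical/test.R -v 2 -x 'Lifetime' results1 results2
#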
require(optparse, quietly = TRUE)
require(reshape, quietly = TRUE)
require(omnetpp, quietly = TRUE)

testScalars <- function(description,
                        scalars1, scalars2,  # scalars to compare
                        exclude = NULL,      # scalars to exclude (regex on the scalar name)
                        normalitytest.alpha = 0.05, meantest.alpha = 0.05, variancetest.alpha = 0.05, # statistical test parameters
                        logLevel = 3)
{
    numExclude <- 0
    numExactMatch <- 0
    numAccept <- 0
    numReject <- 0
    numNReject <- 0
    numTReject <- 0
    numFReject <- 0
    pass <- TRUE

    if (logLevel > 0)
        cat("Testing '", description, "'\n", sep="")

    # extract module-name pairs (they are the variables we're going to test one by one)
    scalarnames1 <- unique(data.frame(module = scalars1$module, name = scalars1$name))
    scalarnames2 <- unique(data.frame(module = scalars2$module, name = scalars2$name))
    if (logLevel > 0) cat("Processing ", length(scalarnames1$name), " scalars from '", resultsdir1, "'\n", sep="")
    if (logLevel > 1) print(scalarnames1)
    if (logLevel > 0) cat("Processing ", length(scalarnames2$name), " scalars from '", resultsdir2, "'\n", sep="")
    if (logLevel > 1) print(scalarnames2)

    # remove excluded scalars
    if (!is.null(exclude)) {
        excludedScalars1 <- subset(scalarnames1, grepl(exclude, name))
        excludedScalars2 <- subset(scalarnames2, grepl(exclude, name))
        numExclude1 <- length(excludedScalars1$name)
        numExclude2 <- length(excludedScalars2$name)
        if (numExclude1 > 0) {
            if (logLevel > 0) cat("Excluding ", numExclude1, " scalars from '", resultsdir1, "'\n", sep="")
            if (logLevel > 1) print(excludedScalars1)
        }
        if (numExclude2 > 0) {
            if (logLevel > 0) cat("Excluding ", numExclude2, " scalars from '", resultsdir2, "'\n", sep="")
            if (logLevel > 1) print(excludedScalars2)
        }
        numExclude <- numExclude1 + numExclude2
        scalarnames1 <- subset(scalarnames1, !grepl(exclude, name))
        scalarnames2 <- subset(scalarnames2, !grepl(exclude, name))
    }

    # check if the two datasets have the exact same set of scalars
    a1 <- paste(scalarnames1$module, scalarnames1$name)
    a2 <- paste(scalarnames2$module, scalarnames2$name)
    onlyInScalars1 <- setdiff(a1, a2)
    onlyInScalars2 <- setdiff(a2, a1)
    numAdditional1 <- length(onlyInScalars1)
    numAdditional2 <- length(onlyInScalars2)
    numAdditional <- numAdditional1 + numAdditional2
    if (numAdditional1 > 0) {
        if (logLevel > 0) cat("Found ", numAdditional1, " additional scalars in '", resultsdir1, "'\n", sep="")
        if (logLevel > 1) print(onlyInScalars1)
    }
    if (numAdditional2 > 0) {
        if (logLevel > 0) cat("Ignoring ", numAdditional2, " additional scalars in '", resultsdir2, "'\n", sep="")
        if (logLevel > 1) print(onlyInScalars2)
    }
    scalarnames <- merge(scalarnames1, scalarnames2, all = FALSE)
    if (logLevel > 0) cat("Comparing", length(scalarnames$name), "scalars found in both result sets\n")
    if (logLevel > 1) print(scalarnames)

    rejectedScalars <- list()

    # loop through the list of scalars, and test each scalar one by one
    for (i in seq_len(nrow(scalarnames))) {
        module <- scalarnames$module[i]
        name <- scalarnames$name[i]
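        # gather this scalar's value from every run in each batch (one element per run)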
        values1 <- scalars1[scalars1$name == as.character(name) & scalars1$module == as.character(module),]$value
        values2 <- scalars2[scalars2$name == as.character(name) & scalars2$module == as.character(module),]$value
        if (logLevel > 2) {
            cat("\nProcessing scalar:", as.character(module), as.character(name), "\n")
            print(values1)
            print(values2)
        }
        nameAndValuesString <- paste(as.character(module), " ", as.character(name), " VALUES: [", paste(values1, collapse=', '), "] AND [", paste(values2, collapse=', '), "]", sep="")
        # remove NAs (reject right away if the two samples contain different numbers of them)
        if (sum(is.na(values1)) != sum(is.na(values2))) {
            if (logLevel > 2) cat(' DIFFERENT NUMBER OF NANS, REJECT\n')
            numReject <- numReject + 1
            next
        }
        else {
            values1 <- values1[!is.na(values1)]
            values2 <- values2[!is.na(values2)]
        }
        # same numbers, in maybe different order?
        if (length(values1) == length(values2) && all(sort(values1) == sort(values2))) {
            if (logLevel > 2) cat(' EXACT MATCH, ACCEPT\n')
            numExactMatch <- numExactMatch + 1
            next
        }

        # same constant number in both? (must test it here, as t.test() throws "data are
        # essentially constant" error for this case)
        if (min(values1) == max(values1) && min(values2) == max(values2)) {
            if (values1[1] == values2[1]) {
                if (logLevel > 2) cat(' SAME CONSTANTS, ACCEPT\n')
                numExactMatch <- numExactMatch + 1
            } else {
                if (logLevel > 2) cat(' DIFFERENT CONSTANTS, REJECT\n')
                numReject <- numReject + 1
                rejectedScalars <- append(rejectedScalars, nameAndValuesString)
            }
            next
        }
        # the t-test requires that the variables be normally distributed, so we must test that first.
        # Note: the variances need not be equal, as t.test() defaults to var.equal=FALSE (Welch's t-test)

        # step 1: normality test (not needed [in fact causes an error] if all values are the same)
        if (min(values1) != max(values1)) {
            t1 <- stats::shapiro.test(values1)
            if (logLevel > 2) cat("normality test sample1 p-value:", t1$p.value, "\n")
            if (t1$p.value < normalitytest.alpha) {
                if (logLevel > 2) cat(' REJECT\n')  # t-test and F-test only work if values are normally distributed
                numNReject <- numNReject + 1
                next
            }
        }
        if (min(values2) != max(values2)) {
            t2 <- stats::shapiro.test(values2)
            if (logLevel > 2) cat("normality test sample2 p-value:", t2$p.value, "\n")
            if (t2$p.value < normalitytest.alpha) {
                if (logLevel > 2) cat(' REJECT\n')  # t-test and F-test only work if values are normally distributed
                numNReject <- numNReject + 1
                next
            }
        }
        # step 2: t-test for the mean
        t <- t.test(values1, values2)
        if (logLevel > 2) cat("mean test p-value:", t$p.value, "\n")
        if (t$p.value < meantest.alpha) {
            if (logLevel > 2) cat(' REJECT\n')
            numTReject <- numTReject + 1
            rejectedScalars <- append(rejectedScalars, nameAndValuesString)
            next
        }

        # step 3: F-test for the variance (t.test() only checks the mean)
        v <- var.test(values1, values2)
        if (logLevel > 2) cat("variance test p-value:", v$p.value, "\n")
        if (v$p.value < variancetest.alpha) {
            if (logLevel > 2) cat(' REJECT\n')
            numFReject <- numFReject + 1
            rejectedScalars <- append(rejectedScalars, nameAndValuesString)
            next
        }

        if (logLevel > 2) cat(' ACCEPT\n')
        numAccept <- numAccept + 1
    }

    if (logLevel > 2) cat("\n")
    if (logLevel > 0 && length(rejectedScalars) > 0) {
        cat("Found", length(rejectedScalars), "different scalars:\n")
        if (logLevel > 1) print(rejectedScalars)
    }
    pass <- (numAdditional == 0 && numReject == 0 && numNReject == 0 && numTReject == 0 && numFReject == 0 && numExclude + numExactMatch + numAccept > 0)
    if (logLevel > 0)
        cat("Success summary:", numExclude, "excluded,", numExactMatch, "exactly matched,", numAccept, "accepted", "\n")
    if (logLevel > 0 || !pass)
        cat("Failure summary:", numAdditional, "additional,", numReject, "rejected (differing constants or NaN counts),", numNReject, "rejected normality tests,", numTReject, "rejected t-tests,", numFReject, "rejected F-tests", "\n")
    cat("Test result for '", description, "' : ", sep="")
    if (pass) cat("PASS\n") else cat("FAIL\n")
    pass
}
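
# Example call (hypothetical: scalarsA and scalarsB would be data frames with 'module',
# 'name' and 'value' columns, such as the loadDataset(...)$scalars objects built below):
#
#   ok <- testScalars("demo", scalarsA, scalarsB, exclude = "Lifetime", logLevel = 1)
#
# Note: at logLevel > 0 the log messages read the globals resultsdir1/resultsdir2
# (assigned further below), so those must exist when the function is called.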

# parse options and print help when necessary
optionSpecifications <- list(
    make_option(c("-v", "--verbose"), action = "store", default = 0, help = "Set output level of detail"),
    make_option(c("-n", "--normality-test-alpha"), action = "store", default = 0.05, help = "Set normality test threshold"),
    make_option(c("-m", "--mean-test-alpha"), action = "store", default = 0.05, help = "Set mean test (t-test) threshold"),
    make_option(c("-a", "--variance-test-alpha"), action = "store", default = 0.05, help = "Set variance test (F-test) threshold"),
    make_option(c("-t", "--test"), action = "store", default = NULL, help = "Test only experiments matching this regex"),
    make_option(c("-x", "--exclude-scalars"), action = "store", default = NULL, help = "Exclude scalars matching this regex from testing"))

optionParser <- OptionParser(usage = "%prog [options] <results-1> <results-2>", option_list = optionSpecifications)
parsedArgs <- parse_args(optionParser, args = commandArgs(trailingOnly = TRUE), print_help_and_exit = TRUE, positional_arguments = TRUE)
options <- parsedArgs$options
args <- parsedArgs$args

if (length(args) != 2) {
    print_help(optionParser)
    q()
}
options(width=120)

# results directories
resultsdir1 <- args[1]
resultsdir2 <- args[2]

if (options$verbose > 0) cat("Statistical scalar comparison test started\n")

# read results
if (options$verbose > 1) cat("Loading results from '", resultsdir1, "'\n", sep="")
results1 <- loadDataset(paste(resultsdir1, "*.sca", sep="/"))
if (options$verbose > 1) cat("Loading results from '", resultsdir2, "'\n", sep="")
results2 <- loadDataset(paste(resultsdir2, "*.sca", sep="/"))

# prepare results
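# cast() (from the reshape package) pivots the long runid/attrname/attrvalue table into
# one row per run with one column per run attribute, so that run attributes such as
# 'experiment' and 'configname' can be merged onto each scalar row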
wideattributes1 <- cast(results1$runattrs, runid~attrname, value='attrvalue')
wideattributes2 <- cast(results2$runattrs, runid~attrname, value='attrvalue')
scalars1 <- results1$scalars
scalars2 <- results2$scalars
widescalars1 <- merge(scalars1, wideattributes1)
widescalars2 <- merge(scalars2, wideattributes2)

keycolumns <- c('experiment', 'measurement', 'inifile', 'configname')
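# collect the unique (experiment, measurement, inifile, configname) combinations found
# in each result set; merging with all = TRUE takes their union, so each combination
# that appears in either result set becomes one comparison to run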
tests1 <- widescalars1[!duplicated(widescalars1[,keycolumns]), keycolumns]
tests2 <- widescalars2[!duplicated(widescalars2[,keycolumns]), keycolumns]
tests <- merge(tests1, tests2, all = TRUE)
if (!is.null(options$test))
    tests <- subset(tests, grepl(options$test, experiment))
invisible(by(tests, seq_len(nrow(tests)), function(key) {
    cat("\n")
    selectedscalars1 <- subset(widescalars1, as.character(experiment) == key$experiment & as.character(measurement) == key$measurement & as.character(inifile) == key$inifile & as.character(configname) == key$configname)
    selectedscalars2 <- subset(widescalars2, as.character(experiment) == key$experiment & as.character(measurement) == key$measurement & as.character(inifile) == key$inifile & as.character(configname) == key$configname)
    description <- paste(key$experiment, " ", key$inifile, " [", key$configname, "] ", key$measurement, sep="")
    testScalars(description, selectedscalars1, selectedscalars2,
                exclude = options$`exclude-scalars`,
                normalitytest.alpha = options$`normality-test-alpha`,
                meantest.alpha = options$`mean-test-alpha`,
                variancetest.alpha = options$`variance-test-alpha`,
                logLevel = options$verbose)
}))

if (options$verbose > 0) cat("Statistical scalar comparison test finished\n")