Merge pull request #414 from stan-dev/time_in_read_cmdstan_csv

rok-cesnovar · web-flow · commit d7190ba09275 · 2020-12-20T07:02:16.000+01:00
Return time in read_cmdstan_csv for MCMC
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: cmdstanr
 Title: R Interface to 'CmdStan'
-Version: 0.3.0
+Version: 0.3.0.9000
 Date: 2020-12-17
 Authors@R: 
     c(person(given = "Jonah", family = "Gabry", role = c("aut", "cre"),
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,11 @@
+# Items for next tagged release
+
+### Bug fixes
+
+### New features
+
+* `read_cmdstan_csv()` now also returns per-chain run times (`time`) for MCMC sampling CSV files.
+
 # cmdstanr 0.3.0
 
 ### Bug fixes
diff --git a/R/fit.R b/R/fit.R
@@ -514,8 +514,9 @@ CmdStanFit$set("public", name = "data_file", value = data_file)
 #'
 #' @return
 #' A list with elements
-#' * `total`: (scalar) the total run time.
-#' * `chains`: (data frame) for MCMC only, timing info for the individual
+#' * `total`: (scalar) The total run time. For MCMC this may be different than
+#' the sum of the chain run times if parallelization was used.
+#' * `chains`: (data frame) For MCMC only, timing info for the individual
 #' chains. The data frame has columns `"chain_id"`, `"warmup"`, `"sampling"`,
 #' and `"total"`.
 #'
diff --git a/R/read_csv.R b/R/read_csv.R
@@ -28,6 +28,10 @@
 #' For [sampling][model-method-sample] the returned list also includes the
 #' following components:
 #'
+#' * `time`: Run time information for the individual chains. The returned object
+#' is the same as for the [$time()][fit-method-time] method except the total run
+#' time can't be inferred from the CSV files (the chains may have been run in
+#' parallel) and is therefore `NA`.
 #' * `inv_metric`: A list (one element per chain) of inverse mass matrices
 #' or their diagonals, depending on the type of metric used.
 #' * `step_size`: A list (one element per chain) of the step sizes used.
@@ -120,6 +124,8 @@ read_cmdstan_csv <- function(files,
   step_size <- list()
   col_types <- NULL
   col_select <- NULL
+  metadata <- NULL
+  time <- data.frame()
   not_matching <- c()
   for (output_file in files) {
     if (is.null(metadata)) {
@@ -130,7 +136,9 @@ read_cmdstan_csv <- function(files,
       if (!is.null(metadata$step_size_adaptation)) {
         step_size[[as.character(metadata$id)]] <- metadata$step_size_adaptation
       }
-      id <- metadata$id
+      if (!is.null(metadata$time)) {
+        time <- rbind(time, metadata$time)
+      }
     } else {
       csv_file_info <- read_csv_metadata(output_file)
       check <- check_csv_metadata_matches(metadata, csv_file_info)
@@ -151,7 +159,9 @@ read_cmdstan_csv <- function(files,
       if (!is.null(csv_file_info$step_size_adaptation)) {
         step_size[[as.character(csv_file_info$id)]] <- csv_file_info$step_size_adaptation
       }
-      id <- csv_file_info$id
+      if (!is.null(csv_file_info$time)) {
+        time <- rbind(time, csv_file_info$time)
+      }
     }
     if (is.null(col_select)) {
       if (is.null(variables)) { # variables = NULL returns all
@@ -321,6 +331,7 @@ read_cmdstan_csv <- function(files,
     }
     list(
       metadata = metadata,
+      time = list(total = NA_integer_, chains = time),
       inv_metric = inv_metric,
       step_size = step_size,
       warmup_draws = warmup_draws,
@@ -393,6 +404,9 @@ read_csv_metadata <- function(csv_file) {
   inv_metric_rows <- -1
   parsing_done <- FALSE
   dense_inv_metric <- FALSE
+  warmup_time <- 0
+  sampling_time <-0
+  total_time <- 0
   if (os_is_windows()) {
     grep_path <- repair_path(Sys.which("grep.exe"))
     fread_cmd <- paste0(grep_path, " '^[#a-zA-Z]' --color=never ", csv_file)
@@ -467,6 +481,16 @@ read_csv_metadata <- function(csv_file) {
               csv_file_info[[key_val[1]]] <- key_val[2]
             }
           }
+        } else if (grepl("(Warm-up)", tmp, fixed = TRUE)) {
+          tmp <- gsub("Elapsed Time:", "", tmp, fixed = TRUE)
+          tmp <- gsub("seconds (Warm-up)", "", tmp, fixed = TRUE)
+          warmup_time <- as.numeric(tmp)
+        } else if (grepl("(Sampling)", tmp, fixed = TRUE)) {
+          tmp <- gsub("seconds (Sampling)", "", tmp, fixed = TRUE)
+          sampling_time <- as.numeric(tmp)
+        } else if (grepl("(Total)", tmp, fixed = TRUE)) {
+          tmp <- gsub("seconds (Total)", "", tmp, fixed = TRUE)
+          total_time <- as.numeric(tmp)
         }
       }
     }
@@ -493,6 +517,14 @@ read_csv_metadata <- function(csv_file) {
   } else {
     csv_file_info$threads_per_chain <- csv_file_info$num_threads
   }
+  if (csv_file_info$method == "sample") {
+    csv_file_info$time <- data.frame(
+      chain_id = csv_file_info$id,
+      warmup = warmup_time,
+      sampling = sampling_time,
+      total = total_time
+    )
+  }
   csv_file_info$model <- NULL
   csv_file_info$engaged <- NULL
   csv_file_info$delta <- NULL
diff --git a/man/fit-method-time.Rd b/man/fit-method-time.Rd
diff --git a/man/read_cmdstan_csv.Rd b/man/read_cmdstan_csv.Rd
diff --git a/tests/testthat/test-csv.R b/tests/testthat/test-csv.R
@@ -471,3 +471,50 @@ test_that("stan_variables and stan_variable_dims works in read_cdmstan_csv()", {
   expect_equal(gq$metadata$stan_variable_dims, list(y_rep = 10, sum_y = 1))
 })
 
+test_that("returning time works for read_cmdstan_csv", {
+  csv_files <- test_path("resources", "csv", "model1-2-no-warmup.csv")
+  csv_data <- read_cmdstan_csv(csv_files)
+  expect_equal(csv_data$time$total, NA_integer_)
+  expect_equal(csv_data$time$chains, data.frame(
+    chain_id = 2,
+    warmup = 0.017041,
+    sampling = 0.022068,
+    total = 0.039109
+  ))
+
+  csv_files <- test_path("resources", "csv", "model1-3-diff_args.csv")
+  csv_data <- read_cmdstan_csv(csv_files)
+  expect_equal(csv_data$time$total, NA_integer_)
+  expect_equal(csv_data$time$chains, data.frame(
+    chain_id = 1,
+    warmup = 0.038029,
+    sampling = 0.030711,
+    total = 0.06874
+  ))
+
+  csv_files <- c(
+    test_path("resources", "csv", "model1-1-warmup.csv"),
+    test_path("resources", "csv", "model1-2-warmup.csv")
+  )
+  csv_data <- read_cmdstan_csv(csv_files)
+  expect_equal(csv_data$time$total, NA_integer_)
+  expect_equal(csv_data$time$chains, data.frame(
+    chain_id = c(1,2),
+    warmup = c(0.038029, 0.017041),
+    sampling = c(0.030711, 0.022068),
+    total = c(0.06874, 0.039109)
+  ))
+  csv_files <- c(
+    test_path("resources", "csv", "bernoulli-1-optimize.csv")
+  )
+  csv_data <- read_cmdstan_csv(csv_files)
+  expect_null(csv_data$time$chains)
+})
+
+test_that("time from read_cmdstan_csv matches time from fit$time()", {
+  fit <- fit_bernoulli_thin_1
+  expect_equivalent(
+    read_cmdstan_csv(fit$output_files())$time$chains,
+    fit$time()$chains
+  )
+})