diff --git a/R/api.R b/R/api.R index 402ec90..f280796 100644 --- a/R/api.R +++ b/R/api.R @@ -64,18 +64,19 @@ validate_profile <- function(x) { stopifnot(undotted(names(x$sample_types)) == c("type", "unit")) stopifnot(is.character(x$sample_types$type)) stopifnot(is.character(x$sample_types$unit)) - #' It is currently restricted to one row with values `"samples"` and `"count"`, - #' respectively. - stopifnot(nrow(x$sample_types) == 1) - stopifnot(x$sample_types$type == "samples") - stopifnot(x$sample_types$unit == "count") + #' It always has five rows describing the sample count and memory profiling + #' data types. + stopifnot(nrow(x$sample_types) == 5) + stopifnot(x$sample_types$type == c("samples", "small_v", "big_v", "nodes", "dup_count")) + stopifnot(x$sample_types$unit == c("count", "cells", "cells", "bytes", "count")) #' - #' The `samples` table has two columns, `value` (integer) and `locations` - #' (list). + #' The `samples` table has six columns: `value` (integer), `locations` + #' (list), and integer columns `small_v`, `big_v`, `nodes`, and `dup_count` + #' for memory profiling data. #' Additional columns with a leading dot in the name are allowed #' after the required columns. - stopifnot(undotted(names(x$samples)) == c("value", "locations")) + stopifnot(undotted(names(x$samples)) == c("value", "locations", "small_v", "big_v", "nodes", "dup_count")) stopifnot(is.integer(x$samples$value)) stopifnot(is.list(x$samples$locations)) #' The `value` column describes the number of consecutive samples for the @@ -91,6 +92,18 @@ validate_profile <- function(x) { stopifnot(unlist(map(x$samples$locations, "[[", "location_id")) %in% x$locations$location_id) #' The locations are listed in inner-first order, i.e., the first location #' corresponds to the innermost entry of the stack trace. + #' The `small_v`, `big_v`, `nodes`, and `dup_count` columns contain integer + #' memory statistics per sample. When memory profiling data is not available, + #' these columns are all `NA`. When present, all memory values must be + #' nonnegative. + stopifnot(is.integer(x$samples$small_v)) + stopifnot(is.integer(x$samples$big_v)) + stopifnot(is.integer(x$samples$nodes)) + stopifnot(is.integer(x$samples$dup_count)) + stopifnot(is.na(x$samples$small_v) | x$samples$small_v >= 0L) + stopifnot(is.na(x$samples$big_v) | x$samples$big_v >= 0L) + stopifnot(is.na(x$samples$nodes) | x$samples$nodes >= 0L) + stopifnot(is.na(x$samples$dup_count) | x$samples$dup_count >= 0L) #' #' The `locations` table has three integer columns, `location_id`, diff --git a/R/pprof-from-ds.R b/R/pprof-from-ds.R index 8f3d105..71c71b2 100644 --- a/R/pprof-from-ds.R +++ b/R/pprof-from-ds.R @@ -2,6 +2,8 @@ ds_to_msg <- function(ds) { validate_profile(ds) provide_proto() + has_memory <- !all(is.na(ds$samples$small_v)) + msg <- RProtoBuf::new(perftools.profiles.Profile) msg$string_table <- unique(c( @@ -13,14 +15,17 @@ ds_to_msg <- function(ds) { ds$functions$filename )) - add_sample_types_to_msg(ds$sample_types, msg) + add_sample_types_to_msg(ds$sample_types, msg, has_memory) add_samples_to_msg(ds$samples, msg) add_locations_to_msg(ds$locations, msg) add_functions_to_msg(ds$functions, msg) msg } -add_sample_types_to_msg <- function(sample_types, msg) { +add_sample_types_to_msg <- function(sample_types, msg, has_memory) { + if (!has_memory) { + sample_types <- sample_types[1, , drop = FALSE] + } sample_types$type <- match(sample_types$type, msg$string_table) - 1L sample_types$unit <- match(sample_types$unit, msg$string_table) - 1L @@ -33,9 +38,14 @@ add_sample_types_to_msg <- function(sample_types, msg) { } add_samples_to_msg <- function(samples, msg) { + has_memory <- !all(is.na(samples$small_v)) msg$sample <- lapply(split_rows(samples), function(s) { s_msg <- RProtoBuf::new(perftools.profiles.Sample) - s_msg$value <- s$value + if (has_memory) { + s_msg$value <- c(s$value, s$small_v, s$big_v, s$nodes, s$dup_count) + } else { + s_msg$value <- s$value + } s_msg$location_id <- s$locations[[1]]$location_id s_msg }) diff --git a/R/pprof-to-ds.R b/R/pprof-to-ds.R index 14c28fd..c52400d 100644 --- a/R/pprof-to-ds.R +++ b/R/pprof-to-ds.R @@ -16,26 +16,45 @@ msg_to_ds <- function(msg) { } get_sample_types_from_msg <- function(msg) { - sample_types <- map(msg$sample_type, function(st) { - tibble::tibble( - type = as.integer(st$type), - unit = as.integer(st$unit) - ) + tibble::tibble( + type = c("samples", "small_v", "big_v", "nodes", "dup_count"), + unit = c("count", "cells", "cells", "bytes", "count") + ) +} + +get_samples_from_msg <- function(msg) { + # Determine which value indices correspond to memory types + all_types <- map(msg$sample_type, function(st) { + msg$string_table[as.integer(st$type) + 1] }) - sample_types <- merge_rows(sample_types) + all_types <- unlist(all_types) - sample_types$type <- msg$string_table[sample_types$type + 1] - sample_types$unit <- msg$string_table[sample_types$unit + 1] + mem_types <- c("small_v", "big_v", "nodes", "dup_count") + has_memory <- all(mem_types %in% all_types) - sample_types[1, ] -} + mem_indices <- NULL + if (has_memory) { + mem_indices <- match(mem_types, all_types) + } -get_samples_from_msg <- function(msg) { samples <- map(msg$sample, function(s) { - tibble::tibble( - value = as.integer(s$value[[1]]), + values <- as.integer(s$value) + row <- tibble::tibble( + value = values[[1]], locations = list(tibble::tibble(location_id = as.integer(s$location_id))) ) + if (has_memory) { + row$small_v <- values[[mem_indices[[1]]]] + row$big_v <- values[[mem_indices[[2]]]] + row$nodes <- values[[mem_indices[[3]]]] + row$dup_count <- values[[mem_indices[[4]]]] + } else { + row$small_v <- NA_integer_ + row$big_v <- NA_integer_ + row$nodes <- NA_integer_ + row$dup_count <- NA_integer_ + } + row }) samples <- tibble::as_tibble(do.call(rbind, samples)) samples diff --git a/R/rprof-from-ds.R b/R/rprof-from-ds.R index 1ed8442..d799b75 100644 --- a/R/rprof-from-ds.R +++ b/R/rprof-from-ds.R @@ -1,6 +1,8 @@ ds_to_rprof <- function(ds) { validate_profile(ds) + has_memory <- !all(is.na(ds$samples$small_v)) + . <- ds$locations . <- merge(., ds$functions[c("function_id", "system_name", "filename")], by = "function_id", sort = FALSE, all.x = TRUE) . <- .[-1L] @@ -18,9 +20,15 @@ ds_to_rprof <- function(ds) { flat_locations <- . files <- paste0("#File ", unique_files$file_id, ": ", unique_files$filename) + + # Expand samples by value (repeat count) + sample_idx <- rep(seq_len(nrow(ds$samples)), ds$samples$value) + traces <- map_chr( - rep(ds$samples$locations, ds$samples$value), - function(loc) { + seq_along(sample_idx), + function(i) { + si <- sample_idx[[i]] + loc <- ds$samples$locations[[si]] . <- flat_locations[match(loc$location_id, flat_locations$location_id), ] stopifnot(.$location_id == loc$location_id) funs <- paste0( @@ -31,9 +39,27 @@ ds_to_rprof <- function(ds) { } ) + header <- if (has_memory) { + "memory profiling: line profiling: sample.interval=20000" + } else { + "line profiling: sample.interval=20000" + } + + # Build memory data for roundtrip compatibility + memory <- NULL + if (has_memory) { + memory <- tibble::tibble( + small_v = ds$samples$small_v[sample_idx], + big_v = ds$samples$big_v[sample_idx], + nodes = ds$samples$nodes[sample_idx], + dup_count = ds$samples$dup_count[sample_idx] + ) + } + tibble::lst( - header = "line profiling: sample.interval=20000", + header, files, - traces + traces, + memory ) } diff --git a/R/rprof-read.R b/R/rprof-read.R index 9821131..42552be 100644 --- a/R/rprof-read.R +++ b/R/rprof-read.R @@ -35,9 +35,32 @@ read_rprof_ll <- function(path) { header <- 1L files <- grep("^#File ", lines) traces <- setdiff(seq_along(lines), c(header, files)) + + memory_profiling <- startsWith(lines[header], "memory profiling:") + + # Strip memory data prefix from trace lines + memory <- NULL + trace_lines <- lines[traces] + if (memory_profiling && length(trace_lines) > 0) { + mem_rx <- "^:([0-9]+):([0-9]+):([0-9]+):([0-9]+):" + mem_matches <- regmatches(trace_lines, regexec(mem_rx, trace_lines)) + has_mem <- vapply(mem_matches, length, integer(1)) > 0 + if (any(has_mem)) { + memory <- tibble::tibble( + small_v = as.integer(vapply(mem_matches[has_mem], "[[", character(1), 2L)), + big_v = as.integer(vapply(mem_matches[has_mem], "[[", character(1), 3L)), + nodes = as.integer(vapply(mem_matches[has_mem], "[[", character(1), 4L)), + dup_count = as.integer(vapply(mem_matches[has_mem], "[[", character(1), 5L)) + ) + # Strip the memory prefix from trace lines + trace_lines <- sub(mem_rx, "", trace_lines) + } + } + list( header = lines[header], files = lines[files], - traces = lines[traces] + traces = trace_lines, + memory = memory ) } diff --git a/R/rprof-to-ds.R b/R/rprof-to-ds.R index eec30cd..2064589 100644 --- a/R/rprof-to-ds.R +++ b/R/rprof-to-ds.R @@ -20,8 +20,8 @@ rprof_to_ds <- function(rprof) { get_sample_types_from_rprof <- function(rprof) { tibble::tibble( - type = "samples", - unit = "count" + type = c("samples", "small_v", "big_v", "nodes", "dup_count"), + unit = c("count", "cells", "cells", "bytes", "count") ) } @@ -150,7 +150,22 @@ add_samples_to_flat_rprof <- function(flat_rprof) { .$locations <- map(.$locations, tibble::as_tibble, rownames = NULL) .$value <- 1L - . <- .[c("value", "locations")] + + memory <- flat_rprof$rprof$memory + if (!is.null(memory)) { + # Memory data is indexed by trace line (sample), match by sample_id + mem <- memory[.$sample_id, , drop = FALSE] + .$small_v <- mem$small_v + .$big_v <- mem$big_v + .$nodes <- mem$nodes + .$dup_count <- mem$dup_count + } else { + .$small_v <- NA_integer_ + .$big_v <- NA_integer_ + .$nodes <- NA_integer_ + .$dup_count <- NA_integer_ + } + . <- .[c("value", "locations", "small_v", "big_v", "nodes", "dup_count")] flat_rprof$samples <- . diff --git a/R/rprof-write.R b/R/rprof-write.R index 8e2432b..e0de9a3 100644 --- a/R/rprof-write.R +++ b/R/rprof-write.R @@ -2,6 +2,17 @@ #' @export write_rprof <- function(x, path) { rprof <- ds_to_rprof(x) + # Add memory prefix to traces when writing to file + if (!is.null(rprof$memory)) { + rprof$traces <- paste0( + ":", rprof$memory$small_v, + ":", rprof$memory$big_v, + ":", rprof$memory$nodes, + ":", rprof$memory$dup_count, ":", + rprof$traces + ) + } + rprof$memory <- NULL writeLines(unlist(rprof, use.names = FALSE), path) invisible(x) } diff --git a/inst/samples/rprof/memory.out b/inst/samples/rprof/memory.out new file mode 100644 index 0000000..3830a0f --- /dev/null +++ b/inst/samples/rprof/memory.out @@ -0,0 +1,14 @@ +memory profiling: line profiling: sample.interval=20000 +:341536:1289015:34667640:1635:"" "do.call" "as.data.frame.list" "as.data.frame" "data.frame" "Ops.data.frame" +:293305:449462:23670472:6514:"anyDuplicated" "[.data.frame" "[" +:369435:615462:36216432:10044:"get" "Ops.data.frame" +:327669:523462:29319360:9780:"[<-.data.frame" "[<-" +:285807:435462:22434496:9769:"any" ".deparseOpts" "deparse" "paste" "deparse1" "force" "as.data.frame.numeric" "as.data.frame" "" "do.call" "as.data.frame.list" "as.data.frame" "data.frame" "Ops.data.frame" +:363324:605462:35221424:10234:"as.data.frame.list" "as.data.frame" "data.frame" "Ops.data.frame" +:322360:515462:28440888:9864:"" "do.call" "as.data.frame.list" "as.data.frame" "data.frame" "Ops.data.frame" +:281229:429462:21686672:9852:"Ops.data.frame" +:358636:597462:34433280:10212:"deparse" "paste" "deparse1" "force" "as.data.frame.numeric" "as.data.frame" "" "do.call" "as.data.frame.list" "as.data.frame" "data.frame" "Ops.data.frame" +:318714:513462:27870752:9993:"%in%" "Ops.data.frame" +:279206:429462:21346248:10022:"[.data.frame" "[" +:356880:597462:34140176:10248:".deparseOpts" "deparse" "paste" "deparse1" "force" "as.data.frame.numeric" "as.data.frame" "" "do.call" "as.data.frame.list" "as.data.frame" "data.frame" "Ops.data.frame" +:317207:513462:27611920:9989:"do.call" "as.data.frame.list" "as.data.frame" "data.frame" "Ops.data.frame" diff --git a/man/validate_profile.Rd b/man/validate_profile.Rd index ec6115b..2e0436e 100644 --- a/man/validate_profile.Rd +++ b/man/validate_profile.Rd @@ -48,11 +48,14 @@ that is accepted by \code{\link[=package_version]{package_version()}}. The \code{sample_types} table has two character columns, \code{type} and \code{unit}. Additional columns with a leading dot in the name are allowed after the required columns. -It is currently restricted to one row with values \code{"samples"} and \code{"count"}, +The first row must have values \code{"samples"} and \code{"count"}, respectively. +Additional rows may describe memory profiling data. The \code{samples} table has two columns, \code{value} (integer) and \code{locations} (list). +When memory profiling data is present, the table also has integer columns +\code{small_v}, \code{big_v}, \code{nodes}, and \code{dup_count}. Additional columns with a leading dot in the name are allowed after the required columns. The \code{value} column describes the number of consecutive samples for the @@ -63,6 +66,9 @@ For each \code{location_id} value a corresponding observation in the \code{locat table must exist. The locations are listed in inner-first order, i.e., the first location corresponds to the innermost entry of the stack trace. +When memory profiling data is present, the \code{small_v}, \code{big_v}, \code{nodes}, +and \code{dup_count} columns contain integer memory statistics per sample. +All memory values must be nonnegative. The \code{locations} table has three integer columns, \code{location_id}, \code{function_id}, and \code{line}. diff --git a/tests/testthat/test-rprof-memory.R b/tests/testthat/test-rprof-memory.R new file mode 100644 index 0000000..54d266b --- /dev/null +++ b/tests/testthat/test-rprof-memory.R @@ -0,0 +1,69 @@ +test_that("read memory profiling data", { + ds <- read_inst_rprof("rprof/memory.out") + expect_error(validate_profile(ds), NA) + + # sample_types always has 5 rows + expect_equal(nrow(ds$sample_types), 5) + expect_equal(ds$sample_types$type, c("samples", "small_v", "big_v", "nodes", "dup_count")) + expect_equal(ds$sample_types$unit, c("count", "cells", "cells", "bytes", "count")) + + # samples always have memory columns + expect_true("small_v" %in% names(ds$samples)) + expect_true("big_v" %in% names(ds$samples)) + expect_true("nodes" %in% names(ds$samples)) + expect_true("dup_count" %in% names(ds$samples)) + + # memory values should be nonnegative integers (not NA for memory profiles) + expect_true(is.integer(ds$samples$small_v)) + expect_true(is.integer(ds$samples$big_v)) + expect_true(is.integer(ds$samples$nodes)) + expect_true(is.integer(ds$samples$dup_count)) + expect_true(all(ds$samples$small_v >= 0)) + expect_true(all(ds$samples$big_v >= 0)) + expect_true(all(ds$samples$nodes >= 0)) + expect_true(all(ds$samples$dup_count >= 0)) +}) + +test_that("non-memory profiles have NA memory columns", { + ds <- read_inst_rprof("rprof/1.out") + expect_error(validate_profile(ds), NA) + + # sample_types always has 5 rows + expect_equal(nrow(ds$sample_types), 5) + + # memory columns are always present but all NA + expect_true("small_v" %in% names(ds$samples)) + expect_true("big_v" %in% names(ds$samples)) + expect_true("nodes" %in% names(ds$samples)) + expect_true("dup_count" %in% names(ds$samples)) + expect_true(all(is.na(ds$samples$small_v))) + expect_true(all(is.na(ds$samples$big_v))) + expect_true(all(is.na(ds$samples$nodes))) + expect_true(all(is.na(ds$samples$dup_count))) +}) + +test_that("memory profiling roundtrip via rprof", { + ds <- read_inst_rprof("rprof/memory.out") + + # Convert to rprof and back + ds1 <- rprof_to_ds(ds_to_rprof(ds)) + expect_error(validate_profile(ds1), NA) + + # roundtrip again + ds2 <- rprof_to_ds(ds_to_rprof(ds1)) + expect_error(validate_profile(ds2), NA) + + expect_identical(strip_dots(ds1), strip_dots(ds2)) +}) + +test_that("memory profiling write and read back", { + ds <- read_inst_rprof("rprof/memory.out") + + path <- tempfile("profiler_mem", fileext = ".out") + write_rprof(ds, path) + + ds1 <- read_rprof(path) + expect_error(validate_profile(ds1), NA) + + expect_identical(strip_dots(ds), strip_dots(ds1)) +})