How can you summarise a tibble?
Initial object
Let’s assume that we have the tibble `my_tib`, whose variables are `my_chars`, `my_years`, `my_ints` and `my_nums`.
# Example tibble used throughout: five rows with a letter, a year,
# an integer column and a double column.
my_tib <- tibble::tibble(
  my_chars = c(rep(LETTERS[1:2], 2), LETTERS[1]),
  # Integer literals take the `L` suffix with no space before it.
  my_years = rep(c(2021L, 2025L), 2:3),
  my_ints = 1L:5L,
  # Same values as `1.5:5.5`, with the step of 1 made explicit.
  my_nums = seq(1.5, 5.5, by = 1)
)

# Print code that recreates the tibble.
constructive::construct(my_tib)
tibble::tibble(
my_chars = c("A", "B", "A", "B", "A"),
my_years = rep(c(2021L, 2025L), 2:3),
my_ints = 1:5,
my_nums = seq(1.5, 5.5, by = 1),
)
Use summarise()
from dplyr
To obtain the total sum of each of the variables for each of the letters `A` and `B` and for each year, we can use `dplyr::summarise()` (or `dplyr::summarize()`) together with `dplyr::group_by()` before it and `dplyr::ungroup()` at the end, to ensure we have an ungrouped tibble to work with.
# Sum both numeric columns for every (letter, year) pair. No `.groups`
# argument is given, so the result stays grouped by `my_chars`.
my_summarised_tib_grouped <- my_tib |>
  dplyr::group_by(my_chars, my_years) |>
  dplyr::summarise(
    my_ints_sum = sum(my_ints),
    my_nums_sum = sum(my_nums)
  )
`summarise()` has grouped output by 'my_chars'. You can override using the
`.groups` argument.
# Print code that recreates the (still grouped) summary.
constructive::construct(my_summarised_tib_grouped)
tibble::tibble(
my_chars = rep(c("A", "B"), each = 2L),
my_years = rep(c(2021L, 2025L), 2),
my_ints_sum = c(1L, 8L, 2L, 4L),
my_nums_sum = c(1.5, 9, 2.5, 4.5),
) |>
dplyr::group_by(my_chars)
To remove the groups, use `dplyr::ungroup()`.
# Drop the remaining grouping, then sort the rows by letter.
my_summarised_tib <- my_summarised_tib_grouped |>
  dplyr::ungroup() |>
  dplyr::arrange(my_chars)

constructive::construct(my_summarised_tib)
tibble::tibble(
my_chars = rep(c("A", "B"), each = 2L),
my_years = rep(c(2021L, 2025L), 2),
my_ints_sum = c(1L, 8L, 2L, 4L),
my_nums_sum = c(1.5, 9, 2.5, 4.5),
)
Please note that using `.by` or `.groups = "drop"` will allow you to return an ungrouped tibble automatically.
# Per-call grouping with `.by`: no group_by() step is needed.
my_summarised_tib_alt <- my_tib |>
  dplyr::summarise(
    my_ints_sum = sum(my_ints),
    my_nums_sum = sum(my_nums),
    .by = c(my_chars, my_years)
  )

constructive::construct(my_summarised_tib_alt)
tibble::tibble(
my_chars = rep(c("A", "B"), 2),
my_years = rep(c(2021L, 2025L), each = 2L),
my_ints_sum = c(1L, 2L, 8L, 4L),
my_nums_sum = c(1.5, 2.5, 9, 4.5),
)
# Same summary via group_by(), asking summarise() to drop all groups.
my_summarised_tib_alt_drop <- my_tib |>
  dplyr::group_by(my_chars, my_years) |>
  dplyr::summarise(
    my_ints_sum = sum(my_ints),
    my_nums_sum = sum(my_nums),
    .groups = "drop"
  )

constructive::construct(my_summarised_tib_alt_drop)
tibble::tibble(
my_chars = rep(c("A", "B"), each = 2L),
my_years = rep(c(2021L, 2025L), 2),
my_ints_sum = c(1L, 8L, 2L, 4L),
my_nums_sum = c(1.5, 9, 2.5, 4.5),
)
Use reframe()
from dplyr
We can also use `dplyr::reframe()`, which directly returns an ungrouped tibble.
# Sum per (letter, year) pair with reframe() and `.by`, then sort by letter.
my_reframed_tib <- my_tib |>
  dplyr::reframe(
    my_ints_sum = sum(my_ints),
    my_nums_sum = sum(my_nums),
    .by = c(my_chars, my_years)
  ) |>
  dplyr::arrange(my_chars)

constructive::construct(my_reframed_tib)
tibble::tibble(
my_chars = rep(c("A", "B"), each = 2L),
my_years = rep(c(2021L, 2025L), 2),
my_ints_sum = c(1L, 8L, 2L, 4L),
my_nums_sum = c(1.5, 9, 2.5, 4.5),
)
An alternative using `dplyr::group_by()` also exists. Note that this solution also returns an ungrouped tibble automatically thanks to `dplyr::reframe()`.
# Same sums with an explicit group_by() ahead of reframe(), sorted by letter.
my_reframed_tib_alt <- my_tib |>
  dplyr::group_by(my_chars, my_years) |>
  dplyr::reframe(
    my_ints_sum = sum(my_ints),
    my_nums_sum = sum(my_nums)
  ) |>
  dplyr::arrange(my_chars)

constructive::construct(my_reframed_tib_alt)
tibble::tibble(
my_chars = rep(c("A", "B"), each = 2L),
my_years = rep(c(2021L, 2025L), 2),
my_ints_sum = c(1L, 8L, 2L, 4L),
my_nums_sum = c(1.5, 9, 2.5, 4.5),
)
Use count()
from dplyr
Finally, we can run `dplyr::count()` by specifying the frequency weights with the argument `wt`. If `wt` is not `NULL`, the sum of the specified variable is returned for each group given in the first arguments. We directly obtain an ungrouped tibble.
# Each count() call takes one `wt` column, so the two sums are computed
# separately and joined on the grouping columns.
my_counted_tib <- my_tib |>
  dplyr::count(my_chars, my_years, wt = my_ints, name = "my_ints_sum") |>
  dplyr::left_join(
    dplyr::count(
      my_tib, my_chars, my_years,
      wt = my_nums, name = "my_nums_sum"
    ),
    by = dplyr::join_by(my_chars, my_years)
  ) |>
  dplyr::arrange(my_chars)

constructive::construct(my_counted_tib)
tibble::tibble(
my_chars = rep(c("A", "B"), each = 2L),
my_years = rep(c(2021L, 2025L), 2),
my_ints_sum = c(1L, 8L, 2L, 4L),
my_nums_sum = c(1.5, 9, 2.5, 4.5),
)
Comparison of the three solutions
In my opinion, the `reframe()` solution is the best and the safest one because you do not need to worry about ungrouping the tibble at the end of your pipe (as can be necessary with `summarise()`) or about joining another summarised tibble (as with `count()`) to obtain the sums of the variables you want.
# Compare the reframe() result with the count() and summarise() results.
waldo::compare(my_reframed_tib, my_counted_tib)
waldo::compare(my_reframed_tib, my_summarised_tib)
Citation BibTeX citation:
@online{lettry2024,
author = {Lettry, Layal Christine},
title = {What Are the Different Ways of Summing a Variable?},
date = {2024-08-23},
url = {https://rdiscovery.netlify.app/posts/2024-08-23_sums/},
langid = {en}
}
For attribution, please cite this work as:
Lettry, Layal Christine. 2024.
“What Are the Different Ways of
Summing a Variable?” August 23, 2024.
https://rdiscovery.netlify.app/posts/2024-08-23_sums/ .