An interim workaround is to use rowAny():
# Row-wise "any": for each row of `x` (a logical data frame or matrix),
# rowSums() counts the TRUE cells, so the result is TRUE where a row has
# at least one TRUE.
rowAny <- function(x) rowSums(x) > 0
# Keep the rows in which at least one numeric column is greater than 0.
df %>% filter(rowAny(across(where(is.numeric), ~ .x > 0)))
Something like rowAny (and a rowAll) would be great also for using inside case_when when one wants to build the same condition for multiple variables, instead of having to use multiple | or multiple &. (not sure if I should open a new issue).
Thank you!
A few notes:
Notionally, I don't like rowAny() because it doesn't really do what its name indicates. IMHO a slightly more intuitive alternative:
# Row-wise any(): purrr::pmap_lgl() walks `x` in parallel across its columns
# and collapses each row's values with any().
rowwise_any <- function(x) {
  purrr::pmap_lgl(.l = x, .f = any)
}
df %>% filter(rowwise_any(across(where(is.numeric), ~ .x > 0)))
@courtiol suggests to use lay::lay() to achieve the same:
df %>% filter(lay::lay(across(where(is.numeric)), ~ any(.x > 0)))
Neither of the above is really intuitive in my eyes. Spontaneously, I would wish for an "any-of" version of the existing "all-of" across(), e.g. across_any(), so we could just write:
df %>% filter(across_any(where(is.numeric), ~ .x > 0))
Or maybe instead introduce a flag like .all:
df %>% filter(across(.all = FALSE, where(is.numeric), ~ .x > 0))
But I'm not deep enough into dplyr semantics/internals to be able to assess if either of the above would really be a good API...
Performance-wise it seems that rowAny() is by far the best of the above three workarounds.
Benchmark reprex:
# Benchmark of the three row-wise "any" workarounds on a 500,000-row
# resample (with replacement) of iris.
rowAny <- function(x) rowSums(x) > 0
rowwise_any <- function(x) purrr::pmap_lgl(x, any)
row_ids <- sample(seq_len(nrow(iris)), 5e+5, replace = TRUE)
iris_big <- tibble::as_tibble(iris[row_ids, ])
bench::mark(
  rowAny = iris_big %>%
    dplyr::filter(rowAny(dplyr::across(where(is.numeric), ~ .x == 1))),
  rowwise_any = iris_big %>%
    dplyr::filter(rowwise_any(dplyr::across(where(is.numeric), ~ .x == 1))),
  lay = iris_big %>%
    dplyr::filter(lay::lay(dplyr::across(where(is.numeric)), ~ any(.x == 1)))
)
#> # A tibble: 3 x 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
#> 1 rowAny 30.11ms 45.36ms 22.0 27.7MB 12.0 11 6 500.99ms <tibble [26… <Rprofmem[,3]… <bch:tm… <tibble […
#> 2 rowwise_any 1.93s 1.93s 0.519 16.3MB 5.19 1 10 1.93s <tibble [26… <Rprofmem[,3]… <bch:tm… <tibble […
#> 3 lay 8.74s 8.74s 0.114 18.2MB 2.74 1 24 8.74s <tibble [26… <Rprofmem[,3]… <bch:tm… <tibble […
An additional alternative to the ones given by @salim-b would be the function any_cols introduced by @lionel- in issue #5536.
I ran in https://rstudio.cloud the same benchmark as above just adding this function:
# Benchmark of four row-wise "any" workarounds on a 500,000-row resample
# (with replacement) of iris.
# any_cols() ORs the columns of `df` element-wise; `.init = FALSE` seeds the
# reduction. Calls are namespaced throughout because the snippet loads no
# packages.
any_cols <- function(df) purrr::reduce(df, `|`, .init = FALSE)
rowAny <- function(x) rowSums(x) > 0
rowwise_any <- function(x) purrr::pmap_lgl(x, any)
iris_big <- tibble::as_tibble(iris[sample(seq_len(nrow(iris)), 5e+5, replace = TRUE), ])
bench::mark(
  rowAny = iris_big %>%
    dplyr::filter(rowAny(dplyr::across(where(is.numeric), ~ .x == 1))),
  rowwise_any = iris_big %>%
    dplyr::filter(rowwise_any(dplyr::across(where(is.numeric), ~ .x == 1))),
  lay = iris_big %>%
    dplyr::filter(lay::lay(dplyr::across(where(is.numeric)), ~ any(.x == 1))),
  any_cols = iris_big %>%
    dplyr::filter(any_cols(dplyr::across(where(is.numeric), ~ .x == 1)))
)
#> # A tibble: 4 x 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
#> 1 rowAny 21.75ms 33.59ms 24.9 27.7MB 15.6 16 10 641.8ms <tibble [26,677 × 5]> <Rprofmem[,3] [30 × 3]> <bch:tm [16]> <tibble [16 × 3]>
#> 2 rowwise_any 1.5s 1.5s 0.669 16.3MB 16.1 1 24 1.5s <tibble [26,677 × 5]> <Rprofmem[,3] [1,678 × 3]> <bch:tm [1]> <tibble [1 × 3]>
#> 3 lay 5.27s 5.27s 0.190 18.2MB 11.0 1 58 5.27s <tibble [26,677 × 5]> <Rprofmem[,3] [1,920 × 3]> <bch:tm [1]> <tibble [1 × 3]>
#> 4 any_cols 15.09ms 15.8ms 49.1 22MB 17.7 25 9 509.37ms <tibble [26,677 × 5]> <Rprofmem[,3] [56 × 3]> <bch:tm [25]> <tibble [25 × 3]>
Currently filter() does its own reducing internally with & when multiple columns are involved:
// Evaluates every element of `quos` and folds the results into a single
// logical vector of length `n`, which is returned.
//
// quos       - list whose elements are evaluated one by one with
//              rlang::eval_tidy()
// mask       - passed to rlang::eval_tidy() for each element
// caller     - passed to rlang::eval_tidy() for each element
// n          - length of the returned vector; also handed to
//              filter_check_size() (presumably the expected result size —
//              verify against that helper)
// env_filter - environment in which the 1-based index of the element
//              currently being evaluated is defined
//
// NOTE(review): reduce_lgl() is not shown here; per the surrounding
// discussion it presumably combines with `&` — confirm against its definition.
SEXP eval_filter_one(SEXP quos, SEXP mask, SEXP caller, R_xlen_t n, SEXP env_filter) {
// then reduce to a single logical vector of size n
SEXP reduced = PROTECT(Rf_allocVector(LGLSXP, n));
// init with TRUE
int* p_reduced = LOGICAL(reduced);
for (R_xlen_t i = 0; i < n ; i++, ++p_reduced) {
*p_reduced = TRUE;
}
// reduce
R_xlen_t nquos = XLENGTH(quos);
for (R_xlen_t i=0; i < nquos; i++) {
// publish the 1-based index of the current element in env_filter
SEXP current_expression = PROTECT(Rf_ScalarInteger(i+1));
Rf_defineVar(dplyr::symbols::current_expression, current_expression, env_filter);
SEXP res = PROTECT(rlang::eval_tidy(VECTOR_ELT(quos, i), mask, caller));
// validate the result before folding it in (helper behaviour not shown here)
filter_check_size(res, i, n, quos);
filter_check_type(res, i, quos);
if (TYPEOF(res) == LGLSXP) {
// a plain logical vector is folded in directly
reduce_lgl(reduced, res, n);
} else if(Rf_inherits(res, "data.frame")) {
// a data frame result contributes each of its columns in turn
R_xlen_t ncol = XLENGTH(res);
for (R_xlen_t j=0; j<ncol; j++) {
reduce_lgl(reduced, VECTOR_ELT(res, j), n);
}
}
// results of any other type are not folded in by this loop
// release current_expression and res
UNPROTECT(2);
}
// release reduced
UNPROTECT(1);
return reduced;
}
Perhaps this could be an argument of filter() to control the reducing, instead of calling a function on the result of across() ?
In terms of syntax, it looks weird IMO to wrap the result of across() in some reducer. What about something like if_any() and if_all():
library(palmerpenguins)
library(dplyr, warn.conflicts = FALSE)
library(purrr)
# Row-wise OR over the columns produced by across(...).
# `...` is forwarded unchanged to across().
# `.init = FALSE` is the identity for `|`, so an empty column selection
# yields FALSE (as any() does on empty input) instead of making reduce() fail.
if_any <- function(...) {
  reduce(across(...), `|`, .init = FALSE)
}
# Row-wise AND over the columns produced by across(...).
# `...` is forwarded unchanged to across().
# `.init = TRUE` is the identity for `&`, so an empty column selection
# yields TRUE (as all() does on empty input) instead of making reduce() fail.
if_all <- function(...) {
  reduce(across(...), `&`, .init = TRUE)
}
penguins %>%
filter(if_any(starts_with("bill"), ~ . > 42))
#> # A tibble: 203 x 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g
#> <fct> <fct> <dbl> <dbl> <int> <int>
#> 1 Adelie Torge… 42.5 20.7 197 4500
#> 2 Adelie Torge… 46 21.5 194 4200
#> 3 Adelie Dream 42.2 18.5 180 3550
#> 4 Adelie Dream 44.1 19.7 196 4400
#> 5 Adelie Dream 42.3 21.2 191 4150
#> 6 Adelie Torge… 45.8 18.9 197 4150
#> 7 Adelie Torge… 42.8 18.5 195 4250
#> 8 Adelie Torge… 42.1 19.1 195 4000
#> 9 Adelie Torge… 42.9 17.6 196 4700
#> 10 Adelie Dream 43.2 18.5 192 4100
#> # … with 193 more rows, and 2 more variables: sex <fct>, year <int>
penguins %>%
filter(if_all(starts_with("bill"), ~ . > 42))
#> # A tibble: 0 x 8
#> # … with 8 variables: species <fct>, island <fct>, bill_length_mm <dbl>,
#> # bill_depth_mm <dbl>, flipper_length_mm <int>, body_mass_g <int>, sex <fct>,
#> # year <int>
Created on 2020-11-06 by the reprex package (v0.3.0.9001)
What about your previous idea (@romainfrancois) of doing this internally and having an argument in filter() (e.g. .combine = c("&", "|"))?
For the if_all idea, some may get confused by looking for a similarity between it and the superseded suffixed functions such as mutate_all... (but perhaps not).
Would turning any() and all() as S3 methods be an option to just have those functions, while keeping base compatibility via dispatch? I guess you would rather not mask more base fns though...
I'd like this to be partial, i.e. apply to part of the filter(), in the same vein as results of across() participate in a mutate() or summarise() result.
Back on the approach of using something around across(), perhaps either() would work:
library(palmerpenguins)
library(dplyr, warn.conflicts = FALSE)
# Generalised OR: the arguments in `...` are gathered into one list of
# columns by vctrs::df_list() (name-repair messages are silenced), and the
# columns are then combined element-wise with `|`.
either <- function(...) {
  columns <- suppressMessages(
    vctrs::df_list(..., .name_repair = "universal")
  )
  purrr::reduce(columns, `|`)
}
penguins %>%
filter(
either(across(starts_with("bill"), ~ . > 42), flipper_length_mm > 180)
)
#> # A tibble: 331 x 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g
#> <fct> <fct> <dbl> <dbl> <int> <int>
#> 1 Adelie Torge… 39.1 18.7 181 3750
#> 2 Adelie Torge… 39.5 17.4 186 3800
#> 3 Adelie Torge… 40.3 18 195 3250
#> 4 Adelie Torge… 36.7 19.3 193 3450
#> 5 Adelie Torge… 39.3 20.6 190 3650
#> 6 Adelie Torge… 38.9 17.8 181 3625
#> 7 Adelie Torge… 39.2 19.6 195 4675
#> 8 Adelie Torge… 34.1 18.1 193 3475
#> 9 Adelie Torge… 42 20.2 190 4250
#> 10 Adelie Torge… 37.8 17.1 186 3300
#> # … with 321 more rows, and 2 more variables: sex <fct>, year <int>
either() would be some sort of generalized or() that understands auto-splicing:
either(
c(TRUE, FALSE),
c(FALSE, TRUE)
)
#> [1] TRUE TRUE
either(
data.frame(x = c(TRUE, FALSE), y = c(FALSE, TRUE)),
c(FALSE, FALSE)
)
#> [1] TRUE TRUE
Created on 2020-11-09 by the reprex package (v0.3.0.9001)
The examples from ?filter_if would become:
# examples from ?filter_if():
mtcars %>% filter_all(all_vars(. > 150))
mtcars %>% filter(across(everything(), ~ .x > 150))
# Or the union:
mtcars %>% filter_all(any_vars(. > 150))
mtcars %>% filter(if_any(everything(), ~ .x > 150))
mtcars %>% filter(either(across(everything(), ~ .x > 150)))
# You can vary the selection of columns on which to apply the
# predicate. filter_at() takes a vars() specification:
mtcars %>% filter_at(vars(starts_with("d")), any_vars((. %% 2) == 0))
mtcars %>% filter(if_any(starts_with("d"), ~ (. %% 2) == 0))
mtcars %>% filter(either(across(starts_with("d"), ~ (. %% 2) == 0)))
# And filter_if() selects variables with a predicate function:
is_int <- function(x) all(floor(x) == x)
mtcars %>% filter_if(is_int, all_vars(. != 0))
mtcars %>% filter(across(where(is_int), ~ .x != 0))
# filter_if + any_vars
is_int <- function(x) all(floor(x) == x)
mtcars %>% filter_if(is_int, any_vars(. != 0))
mtcars %>% filter(if_any(where(is_int), ~ .x != 0))
mtcars %>% filter(either(across(where(is_int), ~ .x != 0)))
either() applies an in-row operation, and there is a syntax for that already, so perhaps all we need is already here:
library(dplyr, warn.conflicts = FALSE)
is_int <- function(x) all(floor(x) == x)
mtcars %>%
rowwise() %>%
filter(any(c_across(where(is_int)) == 6)) ## filter rows containing at least one 6 at integer cols
#> # A tibble: 7 x 11
#> # Rowwise:
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 4 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 5 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> 6 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
#> 7 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
mtcars %>%
rowwise() %>%
filter(all(c_across(where(is_int)) != 0)) ## filter rows containing no 0 at integer cols
#> # A tibble: 7 x 11
#> # Rowwise:
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 2 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1
#> 3 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2
#> 4 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
#> 5 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
#> 6 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
#> 7 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
Created on 2020-11-09 by the reprex package (v0.3.0)
Most helpful comment
In terms of syntax, it looks weird IMO to wrap the result of
across() in some reducer. What about something like if_any() and if_all():
Created on 2020-11-06 by the reprex package (v0.3.0.9001)