Tidyr: Implement multi-spread

Created on 31 Dec 2015  ·  15Comments  ·  Source: tidyverse/tidyr

library(dplyr)
library(tidyr)

# From Jenny Bryan --------------------------------------------------------

input <- frame_data(
  ~hw,   ~name,  ~mark,   ~pr,
  "hw1", "anna",    95,  "ok",
  "hw1", "alan",    90, "meh",
  "hw1", "carl",    85,  "ok",
  "hw2", "alan",    70, "meh",
  "hw2", "carl",    80,  "ok"
)

# Want:
input %>%
  gather(key = element, value = score, mark, pr) %>%
  unite(thing, hw, element, remove = TRUE) %>%
  spread(thing, score, convert = TRUE)

# With multispread - still have to go through untidy/molten form,
# which loses variable names
input %>%
  gather(mark, pr, key = element, value = score) %>%
  spread(c(hw, element), score, convert = TRUE)

# http://stackoverflow.com/questions/27247078 -----------------------------

df <- frame_data(
  ~id, ~type,     ~transactions, ~amount,
  20,  "income",  20,            100,
  20,  "expense", 25,            95,
  30,  "income",  50,            300,
  30,  "expense", 45,            250
)

df %>%
  gather(var, val, transactions:amount) %>%
  unite(var2, type, var) %>%
  spread(var2, val)

# With multispread - still have to go through untidy/molten form
df %>%
  gather(var, val, transactions:amount) %>%
  spread(c(type, var), val)

# http://stackoverflow.com/questions/24929954 -----------------------------

df <- expand.grid(Year = 2000:2014, Product = c("A", "B"), Country = c("AI", "EI")) %>%
  tbl_df() %>%
  select(Product, Country, Year) %>%
  mutate(value = rnorm(nrow(.))) %>%
  filter((Product == "A" & Country == "AI") | (Product == "B" & Country == "EI"))

df %>%
  unite(Prod_Count, Product, Country) %>%
  spread(Prod_Count, value)

# If we had multi-spread:
df %>%
  spread(c(Product, Country), value)
feature pivoting

Most helpful comment

All of these problems are now solvable with pivot, so I'm closing this issue although I'll continue to think about more compact forms.

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

# From Jenny Bryan --------------------------------------------------------

input <- tribble(
  ~hw,   ~name,  ~mark,   ~pr,
  "hw1", "anna",    95,  "ok",
  "hw1", "alan",    90, "meh",
  "hw1", "carl",    85,  "ok",
  "hw2", "alan",    70, "meh",
  "hw2", "carl",    80,  "ok"
)

spec <- input %>% 
  expand(hw, measure = c("mark", "pr")) %>% 
  mutate(
    col_name = paste0(hw, "_", measure)
  )
spec
#> # A tibble: 4 x 3
#>   hw    measure col_name
#>   <chr> <chr>   <chr>   
#> 1 hw1   mark    hw1_mark
#> 2 hw1   pr      hw1_pr  
#> 3 hw2   mark    hw2_mark
#> 4 hw2   pr      hw2_pr

input %>% pivot(spec)
#> # A tibble: 3 x 5
#>   name  hw1_mark hw2_mark hw1_pr hw2_pr
#>   <chr>    <dbl>    <dbl> <chr>  <chr> 
#> 1 anna        95       NA ok     <NA>  
#> 2 alan        90       70 meh    meh   
#> 3 carl        85       80 ok     ok

# http://stackoverflow.com/questions/27247078 -----------------------------

df <- tribble(
  ~id, ~type,     ~transactions, ~amount,
  20,  "income",  20,            100,
  20,  "expense", 25,            95,
  30,  "income",  50,            300,
  30,  "expense", 45,            250
)

spec <- df %>% 
  expand(type, measure = c("transactions", "amount")) %>% 
  mutate(col_name = paste0(type, "_", measure))
spec
#> # A tibble: 4 x 3
#>   type    measure      col_name            
#>   <chr>   <chr>        <chr>               
#> 1 expense amount       expense_amount      
#> 2 expense transactions expense_transactions
#> 3 income  amount       income_amount       
#> 4 income  transactions income_transactions

df %>% pivot(spec)
#> # A tibble: 2 x 5
#>      id expense_amount income_amount expense_transactio… income_transactio…
#>   <dbl>          <dbl>         <dbl>               <dbl>              <dbl>
#> 1    20             95           100                  25                 20
#> 2    30            250           300                  45                 50


# http://stackoverflow.com/questions/24929954 -----------------------------

df <- expand_grid(
    Product = c("A", "B"), 
    Country = c("AI", "EI"), 
    Year = 2000:2014
  ) %>%
  mutate(value = rnorm(nrow(.))) %>%
  filter((Product == "A" & Country == "AI") | (Product == "B" & Country == "EI"))

spec <- df %>% 
  expand(Product, Country, measure = "value") %>% 
  mutate(
    col_name = paste0(Product, "_", Country)
  )
spec
#> # A tibble: 4 x 4
#>   Product Country measure col_name
#>   <chr>   <chr>   <chr>   <chr>   
#> 1 A       AI      value   A_AI    
#> 2 A       EI      value   A_EI    
#> 3 B       AI      value   B_AI    
#> 4 B       EI      value   B_EI

df %>% pivot(spec) %>% head()
#> # A tibble: 6 x 5
#>    Year    A_AI  A_EI  B_AI    B_EI
#>   <int>   <dbl> <dbl> <dbl>   <dbl>
#> 1  2000  0.299     NA    NA  2.22  
#> 2  2001 -0.0589    NA    NA -1.03  
#> 3  2002 -0.911     NA    NA  0.344 
#> 4  2003 -2.60      NA    NA -1.47  
#> 5  2004 -0.354     NA    NA -0.0561
#> 6  2005 -0.699     NA    NA  0.560

Created on 2019-03-03 by the reprex package (v0.2.1.9000)

All 15 comments

Maybe spread should take a set of columns for the rows and a set for the columns. If multiple variables for the columns, join them together with a separate. Default rows to not-cols, and cols to not-rows, but also make it possible to reduce the number of variables, eliminating the need for an intermediate select()

spread() already has a sep argument. Do you want to support vars() calls for the key and value arguments?

df %>%
  spread(vars(Product, Country), value)

# for multiple value columns
df %>%
  spread(vars(Product, Country), vars(value_1:value_x))

This will probably be a new verb - and yes, it will probably use vars().

We can also a nested data frame as an intermediate step:

library(tidyverse)

data <- tribble(
  ~hw,   ~name,  ~mark,   ~pr,
  "hw1", "anna",    95,  "ok",
  "hw1", "alan",    90, "meh",
  "hw1", "carl",    85,  "ok",
  "hw2", "alan",    70, "meh",
  "hw2", "carl",    80,  "ok"
)

fill_empty <- function(x) {
  map(x, ~if (is.null(.)) tibble(.rows = 1) else .)
}

data %>% 
  nest(-hw, -name) %>% 
  spread(name, data) %>%
  mutate_at(vars(-hw), list(fill_empty)) %>%
  unnest(.sep = "_")
#> Warning: The `.name_repair` argument to `as_tibble()` takes precedence over
#> the deprecated `validate` argument.
#> # A tibble: 2 x 7
#>   hw    alan_mark alan_pr anna_mark anna_pr carl_mark carl_pr
#>   <chr>     <dbl> <chr>       <dbl> <chr>       <dbl> <chr>  
#> 1 hw1          90 meh            95 ok             85 ok     
#> 2 hw2          70 meh            NA <NA>           80 ok

Created on 2019-01-08 by the reprex package (v0.2.1.9000)

It would be much faster if spread() supported hierarchical columns, and if we had a tie() + untie() pair of verbs equivalent to nest() + unnest() :+1:

library(tidyverse)

data <- tribble(
  ~hw,   ~name,  ~mark,   ~pr,
  "hw1", "anna",    95,  "ok",
  "hw1", "alan",    90, "meh",
  "hw1", "carl",    85,  "ok",
  "hw2", "alan",    70, "meh",
  "hw2", "carl",    80,  "ok"
)

# tie
tied <- data
tied$data <- data %>% select(-hw, -name)
tied <- tied[c("hw", "name", "data")]
tied
#> # A tibble: 5 x 3
#>   hw    name  data$mark $pr  
#>   <chr> <chr>     <dbl> <chr>
#> 1 hw1   anna         95 ok   
#> 2 hw1   alan         90 meh  
#> 3 hw1   carl         85 ok   
#> 4 hw2   alan         70 meh  
#> 5 hw2   carl         80 ok

# spread
spread <- tibble(
  hw = unique(tied$hw),
  anna = tied$data[tied$name == "anna", ][1:2, ],
  alan = tied$data[tied$name == "alan", ][1:2, ],
  carl = tied$data[tied$name == "carl", ][1:2, ]
)
spread
#> # A tibble: 2 x 4
#>   hw    anna$mark $pr   alan$mark $pr   carl$mark $pr  
#>   <chr>     <dbl> <chr>     <dbl> <chr>     <dbl> <chr>
#> 1 hw1          95 ok           90 meh          85 ok   
#> 2 hw2          NA <NA>         70 meh          80 ok

# untie: omitted, straightforward

Created on 2019-01-08 by the reprex package (v0.2.1.9000)

Happy to work on fleshing out the details.

A very preliminary version of implementation:
https://gist.github.com/yutannihilation/958d2f2eb8b2fcddf3391a32a1740d6d

Things I learned today about the problem:

  • @jeroen uses "nested" to describe both list columns ("nested tables") and data frame columns ("nested records"). I like the way records are consistent with what _vctrs_ describes as "records"
  • The implementation of nest_column() and unnest_column() works for the cases I tested, but we need to think about:

    • Ordering of columns in the result

    • Naming: Maybe tie() / untie() or nest_record() and unnest_record()

  • We need a more robust implementation of spread(), perhaps using group_by() %>% group_data() and repeated vec_slice()
  • For the related problem of multi-gather, we need more helpers and perhaps a better gather()
  • A blog post describing the current limitations, our options and the challenges would be a good way to tackle the problem

@krlmlr I think the notion of nesting should refer to disaggregating operations. A data frame column might be nested within groups or rows (thus is really a list column of data frames), but not necessarily. See https://github.com/tidyverse/dplyr/issues/3967 for some examples.

Note to self: multi-spread is about forming the key from multiple columns, which is also a natural fit to the packed data frame.

All of these problems are now solvable with pivot, so I'm closing this issue although I'll continue to think about more compact forms.

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

# From Jenny Bryan --------------------------------------------------------

input <- tribble(
  ~hw,   ~name,  ~mark,   ~pr,
  "hw1", "anna",    95,  "ok",
  "hw1", "alan",    90, "meh",
  "hw1", "carl",    85,  "ok",
  "hw2", "alan",    70, "meh",
  "hw2", "carl",    80,  "ok"
)

spec <- input %>% 
  expand(hw, measure = c("mark", "pr")) %>% 
  mutate(
    col_name = paste0(hw, "_", measure)
  )
spec
#> # A tibble: 4 x 3
#>   hw    measure col_name
#>   <chr> <chr>   <chr>   
#> 1 hw1   mark    hw1_mark
#> 2 hw1   pr      hw1_pr  
#> 3 hw2   mark    hw2_mark
#> 4 hw2   pr      hw2_pr

input %>% pivot(spec)
#> # A tibble: 3 x 5
#>   name  hw1_mark hw2_mark hw1_pr hw2_pr
#>   <chr>    <dbl>    <dbl> <chr>  <chr> 
#> 1 anna        95       NA ok     <NA>  
#> 2 alan        90       70 meh    meh   
#> 3 carl        85       80 ok     ok

# http://stackoverflow.com/questions/27247078 -----------------------------

df <- tribble(
  ~id, ~type,     ~transactions, ~amount,
  20,  "income",  20,            100,
  20,  "expense", 25,            95,
  30,  "income",  50,            300,
  30,  "expense", 45,            250
)

spec <- df %>% 
  expand(type, measure = c("transactions", "amount")) %>% 
  mutate(col_name = paste0(type, "_", measure))
spec
#> # A tibble: 4 x 3
#>   type    measure      col_name            
#>   <chr>   <chr>        <chr>               
#> 1 expense amount       expense_amount      
#> 2 expense transactions expense_transactions
#> 3 income  amount       income_amount       
#> 4 income  transactions income_transactions

df %>% pivot(spec)
#> # A tibble: 2 x 5
#>      id expense_amount income_amount expense_transactio… income_transactio…
#>   <dbl>          <dbl>         <dbl>               <dbl>              <dbl>
#> 1    20             95           100                  25                 20
#> 2    30            250           300                  45                 50


# http://stackoverflow.com/questions/24929954 -----------------------------

df <- expand_grid(
    Product = c("A", "B"), 
    Country = c("AI", "EI"), 
    Year = 2000:2014
  ) %>%
  mutate(value = rnorm(nrow(.))) %>%
  filter((Product == "A" & Country == "AI") | (Product == "B" & Country == "EI"))

spec <- df %>% 
  expand(Product, Country, measure = "value") %>% 
  mutate(
    col_name = paste0(Product, "_", Country)
  )
spec
#> # A tibble: 4 x 4
#>   Product Country measure col_name
#>   <chr>   <chr>   <chr>   <chr>   
#> 1 A       AI      value   A_AI    
#> 2 A       EI      value   A_EI    
#> 3 B       AI      value   B_AI    
#> 4 B       EI      value   B_EI

df %>% pivot(spec) %>% head()
#> # A tibble: 6 x 5
#>    Year    A_AI  A_EI  B_AI    B_EI
#>   <int>   <dbl> <dbl> <dbl>   <dbl>
#> 1  2000  0.299     NA    NA  2.22  
#> 2  2001 -0.0589    NA    NA -1.03  
#> 3  2002 -0.911     NA    NA  0.344 
#> 4  2003 -2.60      NA    NA -1.47  
#> 5  2004 -0.354     NA    NA -0.0561
#> 6  2005 -0.699     NA    NA  0.560

Created on 2019-03-03 by the reprex package (v0.2.1.9000)

@hadley
pivot() seems exciting! 🎉

A quick question. pivot() is currently marked as internal. Is this just because pivot() is still in an experimental stage? Or, will there be some more higher interface for the users?

https://github.com/tidyverse/tidyr/blob/438dac6e6525d4bbd3b395c91ba5ffe7cef484ef/R/pivot.R#L17

It will possibly have a different interface, and possibly a different name, and possibly different arguments. Nothing is sure yet :smile:

Got it, thanks!

@hadley any plans to allow multiple columns in the current pivot_wide()s value_from? Such that the example from Jenny Bryan here could just become:

pivot_wide(input, names_from = hw, values_from = c(mark, pr))

@mikmart I can't see how that would would - you'd end up with two values in each cell? Can you please create a new issue with reprex explaining in a bit more detail?

Elaborated in #586. I was thinking about adding a suffix to the new columns based on value_from, which is not without issues.

Was this page helpful?
0 / 5 - 0 ratings

Related issues

uhlitz picture uhlitz  ·  9Comments

voxnonecho picture voxnonecho  ·  30Comments

strengejacke picture strengejacke  ·  8Comments

PMassicotte picture PMassicotte  ·  10Comments

dpprdan picture dpprdan  ·  10Comments