I have the following want table, which are generated with rowwise() and mutate().
The main issue is that this solution is too slow.
The actual dataset contains around 15,000,000 rows, and took around 6 hours to complete.
What I hope to do is to convert the rowwise() and mutate() operations into tidytable so that it can leverage on data.table speed. Or maybe I just need a base R solution?
set.seed(1990)
mydf <- tibble(id = as.vector(outer(letters, letters, paste0))[1:10]
, open_week = rep(1:5,2)) %>%
mutate(close_week = open_week + sample(1:5,10, replace = T)) %>%
arrange(open_week)
mydf
# some are closed, some are not closed # if not closed, set to NA
mydf$close_week[sample(c(TRUE, FALSE),10, replace = T, prob = c(0.1,0.9))] <- NA
mydf
# A tibble: 10 x 3
id open_week close_week
<chr> <int> <int>
1 aa 1 2
2 fa 1 4
3 ba 2 4
4 ga 2 NA
5 ca 3 7
6 ha 3 6
7 da 4 6
8 ia 4 5
9 ea 5 7
10 ja 5 9
# calculate up to the last week
week_last <- max(mydf$close_week, na.rm = T)
# create complete week grid
df <- as_tibble(data.frame(week = seq(from = min(mydf$open_week, na.rm = T)
, to = max(mydf$close_week, na.rm = T), by = 1)))
have <- df %>%
rowwise() %>%
mutate( # which IDs are active - for the rowwise week?
active_id_list = list(mydf$id[week >= mydf$open_week &
week < ifelse(is.na(mydf$close_week),
week_last +1,
mydf$close_week)]),
# what are the ages of the IDs - for the rowwise week?
active_id_age_list = list(week - mydf$open_week[week >= mydf$open_week &
week < ifelse(is.na(mydf$close_week),
week +1,
mydf$close_week)]),
# which IDs have age less than 1 week, more than 1 week - for the rowwise week?
active_id_less_1_week_list = list(active_id_list[active_id_age_list < 1]),
active_id_above_1_week_list = list(active_id_list[active_id_age_list >= 1]),
# how many active IDs based on age less than 1 week, age more than 1 week - for the rowwise week?
active_id_less_1_week = sum(active_id_age_list < 1, na.rm = T),
active_id_above_1_week = sum(active_id_age_list >= 1, na.rm = T),
# how many active IDs in total?
active_id_count = length(active_id_age_list)) %>%
ungroup() %>%
dplyr::select(!where(is.list)) # remove the list object, unless want to inspect the actual ID list
have
# A tibble: 9 x 4
week active_id_less_1_week active_id_above_1_week active_id_count
<dbl> <int> <int> <int>
1 1 2 0 2
2 2 2 1 3
3 3 2 3 5
4 4 2 3 5
5 5 2 4 6
6 6 0 4 4
7 7 0 2 2
8 8 0 2 2
9 9 0 1 1
I tried replacing the rowwise() and mutate() with tidytable::mutate_rowwise.()
as per https://markfairbanks.github.io/tidytable/reference/mutate_rowwise..html
But I'm not sure how to interpret the following error
have <- df %>%
tidytable::mutate_rowwise.( # which IDs are active - for the rowwise week?
active_id_list = list(mydf$id[week >= mydf$open_week &
week < ifelse(is.na(mydf$close_week),
week_last +1,
mydf$close_week)]),
# what are the ages of the IDs - for the rowwise week?
active_id_age_list = list(week - mydf$open_week[week >= mydf$open_week &
week < ifelse(is.na(mydf$close_week),
week +1,
mydf$close_week)]),
# which IDs have age less than 1 week, more than 1 week - for the rowwise week?
active_id_less_1_week_list = list(active_id_list[active_id_age_list < 1]),
active_id_above_1_week_list = list(active_id_list[active_id_age_list >= 1]),
# how many active IDs based on age less than 1 week, age more than 1 week - for the rowwise week?
active_id_less_1_week = sum(active_id_age_list < 1, na.rm = T),
active_id_above_1_week = sum(active_id_age_list >= 1, na.rm = T),
# how many active IDs in total?
active_id_count = length(active_id_age_list)) %>%
ungroup() %>%
dplyr::select(!where(is.list)) # remove the list object, unless want to inspect the actual ID list
Error in `[.data.table`(list(week = c(1, 2, 3, 4, 5, 6, 7, 8, 9), .rowwise_id = 1:9), :
'list' object cannot be coerced to type 'double'