Sum amount last 6 month prior to the date of transaction
This is simply a non-equi join in data.table. You can create a variable of date - 180
and limit the join between the current date and that variable. This should be fairly quick
library(data.table)setDT(dt)[, date_minus_180 := date - 180]dt[, amnt_6_m := .SD[dt, sum(amount, na.rm = TRUE), on = .(to = from, date <= date, date >= date_minus_180), by = .EACHI]$V1]head(dt, 10)# id from to date amount date_minus_180 amnt_6_m# 1: 18529 5370 9356 2005-05-31 24.4 2004-12-02 0.0# 2: 13742 5370 5605 2005-08-05 7618.0 2005-02-06 0.0# 3: 9913 5370 8567 2005-09-12 21971.0 2005-03-16 0.0# 4: 956 8605 5370 2005-10-05 5245.0 2005-04-08 0.0# 5: 2557 5370 5636 2005-11-12 2921.0 2005-05-16 5245.0# 6: 1602 6390 5370 2005-11-26 8000.0 2005-05-30 0.0# 7: 18669 5370 8933 2005-11-30 169.2 2005-06-03 13245.0# 8: 35900 5370 8483 2006-01-31 71.5 2005-08-04 13245.0# 9: 48667 8934 5370 2006-03-31 14.6 2005-10-02 0.0# 10: 51341 5370 7626 2006-04-11 4214.0 2005-10-13 8014.6
Here is one option using data.table
:
library(data.table)setDT(df)setkey(df, to, date)# Unique combination of from and dateaf <- df[, unique(.SD), .SDcols = c("from", "date")]# For each combination check sum of incoming in the last 6 monthsfor (i in 1:nrow(af)) { set( af, i = i, j = "am6m", value = df[(date) %between% (af$date[[i]] - c(180, 0)) & to == af$from[[i]], sum(amount)] )}# Join the results into the main data.framedf[, am6m := af[.SD, on = .(from, date), am6m]]> tail(df)# id from to date amount am6m# 1: 18529 5370 9356 2005-05-31 24.4 0.0# 2: 258484 5370 9499 2008-01-09 720.0 74543.5# 3: 251611 5370 9533 2007-12-31 14.6 46143.5# 4: 83324 5370 9676 2006-08-31 261.1 40203.8# 5: 203763 5370 9689 2007-08-31 14.6 92353.1# 6: 103444 5370 9772 2006-11-08 16927.0 82671.2
Here's an option using window functions.
However, they require complete daily data in order to work, so the amount of memory required can be large (you have to have a row for every day for each from).
Also note that this method is only useful for large datasets or for performing calculations directly on a database. It takes a lot of set-up time to get the original data into a form that has no gaps. And it takes time to join the data at the end.
However, the slide function is relatively consistent in how fast it is regardless of size of data. As opposed to subsetting which increases in time as the size of the data being subset increases.
library(tidyverse)library(tsibble)# Calculate the 6 month windowsix_mo_rollup <- data %>% ## NOTE: You have to deal with duplicates somehow...either remove ## false duplicates or make them not duplicates... # We can get a unique from/date combo by summing since we need # to sum anyway. group_by(from,date) %>% summarise(amount = sum(amount), .groups = "keep") %>% ungroup() %>% # Now that each from/date is unique # convert data to a tsibble object as_tsibble(key = c(from),index = date) %>% # window functions can't have any missing time periods...so fill gaps # window functions grab 180 rows...not 180 days from the date group_by_key() %>% fill_gaps(.full = TRUE) %>% ungroup() %>% # arrange data from lowest to highest so slide can work right. arrange(date) %>% group_by(from) %>% mutate( six_mo_sum = slide_dbl( amount, sum, na.rm = TRUE, .size = 180, .align = "right" ) ) %>% ungroup() %>% # any row without amount was created by fill_gaps in the example # so we can drop those rows to save space filter(!is.na(amount))six_mo_rollup %>% filter(from == "5370")# # A tsibble: 41 x 4 [1D]# # Key: from [1]# from date amount six_mo_sum# <chr> <date> <dbl> <dbl># 1 5370 2005-05-31 24.4 NA # 2 5370 2005-08-05 7618 NA # 3 5370 2005-09-12 21971 NA # 4 5370 2005-11-12 2921 NA # 5 5370 2005-11-30 169. 32679.# 6 5370 2006-01-31 71.5 32751.# 7 5370 2006-04-11 4214 7376.# 8 5370 2006-08-31 261. 4475.# 9 5370 2006-10-31 182 443.# 10 5370 2006-11-08 16927 17370.# # ... with 31 more rows# Join the windowed data to the original datasetdata <- data %>% left_join( six_mo_rollup %>% select(from,date,six_mo_sum), by = c("from","date") )
UPDATE:
In the comments it became apparent that you wanted to sum up the to values for each for. I didn't understand that originally. The update to the code is to change all of the rollup to to
instead of for
.
Also, you wanted values that didn't have 6 months of complete data. So you add .partial = TRUE
.
# Calculate the 6 month windowsix_mo_rollup <- data %>% ## NOTE: You have to deal with duplicates somehow...either remove ## false duplicates or make them not duplicates... # We can get a unique from/date combo by summing since we need # to sum anyway. group_by(to,date) %>% summarise(amount = sum(amount), .groups = "keep") %>% ungroup() %>% # Now that each from/date is unique # convert data to a tsibble object as_tsibble(key = c(to),index = date) %>% # window functions can't have any missing time periods...so fill gaps # window functions grab 180 rows...not 180 days from the date group_by_key() %>% fill_gaps(.full = TRUE) %>% ungroup() %>% # arrange data from lowest to highest so slide can work right. arrange(date) %>% group_by(to) %>% mutate( six_mo_sum = slide_dbl( amount, sum, na.rm = TRUE, .size = 180, .align = "right", .partial = TRUE ) ) %>% ungroup() %>% # any row without amount was created by fill_gaps in the example # so we can drop those rows to save space filter(!is.na(amount))six_mo_rollup %>% filter(to == "5370")# # A tsibble: 50 x 4 [1D]# # Key: to [1]# to date amount six_mo_sum# <chr> <date> <dbl> <dbl># 1 5370 2005-10-05 5245 5245 # 2 5370 2005-11-26 8000 13245 # 3 5370 2006-03-31 14.6 13260.# 4 5370 2006-04-30 14.6 8029.# 5 5370 2006-05-28 13920 13949.# 6 5370 2006-05-31 14.6 13964.# 7 5370 2006-06-10 24640 38604.# 8 5370 2006-06-15 1600 40204.# 9 5370 2006-09-09 16400 56604.# 10 5370 2006-09-13 3500 60104.# # ... with 40 more rows# Join the windowed data to the original datasetdata <- data %>% left_join( six_mo_rollup %>% select(to,date,six_mo_sum), by = c("from" = "to","date" = "date") )