dplyr: inner_join with a partial string match dplyr: inner_join with a partial string match r r

dplyr: inner_join with a partial string match


The fuzzyjoin library has two functions regex_inner_join and fuzzy_inner_join that allow you to match partial strings:

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))y <- data.frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))x$string = as.character(x$string)y$seed = as.character(y$seed)library(fuzzyjoin)x %>% regex_inner_join(y, by = c(string = "seed"))  idX         string idY   seed1   1     Motorcycle   b otorcy2   2 TractorTrailer   a ractorlibrary(stringr)x %>% fuzzy_inner_join(y, by = c("string" = "seed"), match_fun = str_detect)  idX         string idY   seed1   1     Motorcycle   b otorcy2   2 TractorTrailer   a ractor


You can also use base-r with this function (slightly adapted from this answer here: https://stackoverflow.com/a/34723496/3048453, it uses dplyr to bind the columns together, use cbind if you don't want to use dplyr):

partial_join <- function(x, y, by_x, pattern_y) idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]]) idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]]))) df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F],                        y[unlist(idx_y), , drop = F]) return(df)}

With your example

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))df_merged <- partial_join(x, y, by_x = "string", pattern_y = "seed")df_merged# # A tibble: 2 × 4#     idX         string   idY   seed#   <int>          <chr> <chr>  <chr># 1     1     Motorcycle     b otorcy# 2     2 TractorTrailer     a ractor

Speed Benchmarks:

Functions


library(dplyr)x <- data_frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))partial_join <- function(x, y, by_x, pattern_y) { idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]]) idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]]))) df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F],                        y[unlist(idx_y), , drop = F]) return(df)}partial_join(x, y, by_x = "string", pattern_y = "seed")#> # A tibble: 2 × 4#>     idX         string   idY   seed#>   <int>          <chr> <chr>  <chr>#> 1     1     Motorcycle     b otorcy#> 2     2 TractorTrailer     a ractorjoran <- function(x, y, by_x, pattern_y) { library(dplyr) my_db <- src_sqlite(path = tempfile(), create= TRUE) x_tbl <- copy_to(dest = my_db, df = x) y_tbl <- copy_to(dest = my_db, df = y) result <- tbl(my_db,                sql(sprintf("select * from x, y where x.%s like '%%' || y.%s || '%%'", by_x, pattern_y))) collect(result, n = Inf)}joran(x, y, "string", "seed")#> # A tibble: 2 × 4#>     idX         string   idY   seed#>   <int>          <chr> <chr>  <chr>#> 1     1     Motorcycle     b otorcy#> 2     2 TractorTrailer     a ractorstephen <- function(x, y, by_x, pattern_y) { library(dplyr) d <- full_join(mutate(x, i=1),                 mutate(y, i=1), by = "i") # quoting issue here, defaulting to base-r d$take <- stringr::str_detect(d[[by_x]], d[[pattern_y]]) d %>%   filter(take == T) %>%   select(-i, -take)}stephen(x, y, "string", "seed")#> # A tibble: 2 × 4#>     idX         string   idY   seed#>   <int>          <chr> <chr>  <chr>#> 1     1     Motorcycle     b otorcy#> 2     2 TractorTrailer     a ractorfeng <- function(x, y, by_x, pattern_y) { library(fuzzyjoin) by_string <- pattern_y names(by_string) <- by_x regex_inner_join(x, y, by = by_string)}feng(x, y, "string", "seed")#> # A tibble: 2 × 4#>     idX         string   idY   seed#>   <int>          <chr> <chr>  <chr>#> 1     1     Motorcycle     b otorcy#> 2     2 TractorTrailer     a ractor

Benchmark

library(microbenchmark)res <- microbenchmark( joran(x, y, "string", "seed"), stephen(x, y, "string", "seed"), feng(x, y, "string", "seed"), partial_join(x, y, "string", "seed"))res#> Unit: microseconds#>                                  expr       min         lq       mean#>         joran(x, y, "string", "seed") 18953.008 20099.0540 21641.6646#>       stephen(x, y, "string", "seed")  1320.161  1456.9415  1704.9218#>          feng(x, y, "string", "seed")  5187.366  5625.8825  6926.2336#>  partial_join(x, y, "string", "seed")   190.264   222.0055   257.7906#>      median        uq        max neval cld#>  20675.5855 21827.764  70707.324   100   c#>   1579.8925  1670.719   9676.176   100 a  #>   5842.8150  6065.530 107961.805   100  b #>    242.0735   283.870    523.649   100 aset.seed(123123)x_large <- x %>% sample_n(1000, replace = T)y_large <- y %>% sample_n(1000, replace = T)res_large <- microbenchmark( joran(x_large, y_large, "string", "seed"), # stephen(x_large, y_large, "string", "seed"), feng(x_large, y_large, "string", "seed"), partial_join(x_large, y_large, "string", "seed"))res_large#> Unit: milliseconds#>                                              expr       min        lq     mean    median        uq      max neval cld#>         joran(x_large, y_large, "string", "seed") 321.03631 324.49262 334.2760 329.13991 335.30185 368.1153    10   c#>          feng(x_large, y_large, "string", "seed")  88.00369  89.85744 103.8686  93.84477  97.69121 200.0473    10 a  #>  partial_join(x_large, y_large, "string", "seed") 286.01533 286.78024 290.6295 288.89405 291.79887 303.4524    10  b 


I don't know how this will perform for larger data, but it (or a variant of it) might be worth a try:

library(dplyr)x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))my_db <- src_sqlite(path = tempfile(),create= TRUE)x_tbl <- copy_to(dest = my_db,df = x)y_tbl <- copy_to(dest = my_db,df = y)result <- tbl(my_db,sql("select * from x,y where x.string like '%' || y.seed || '%'"))> collect(result)Source: local data frame [2 x 4]    idX         string   idY   seed  (int)          (chr) (chr)  (chr)1     1     Motorcycle     b otorcy2     2 TractorTrailer     a ractor

I also can't speak to how the performance of this might vary across DBs. postgres or mysql might be better or worse at this sort of query.