dplyr: inner_join with a partial string match
The fuzzyjoin
library has two functions regex_inner_join
and fuzzy_inner_join
that allow you to match partial strings:
x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))y <- data.frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))x$string = as.character(x$string)y$seed = as.character(y$seed)library(fuzzyjoin)x %>% regex_inner_join(y, by = c(string = "seed")) idX string idY seed1 1 Motorcycle b otorcy2 2 TractorTrailer a ractorlibrary(stringr)x %>% fuzzy_inner_join(y, by = c("string" = "seed"), match_fun = str_detect) idX string idY seed1 1 Motorcycle b otorcy2 2 TractorTrailer a ractor
You can also use base-r with this function (slightly adapted from this answer here: https://stackoverflow.com/a/34723496/3048453, it uses dplyr to bind the columns together, use cbind
if you don't want to use dplyr):
partial_join <- function(x, y, by_x, pattern_y) idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]]) idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]]))) df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F], y[unlist(idx_y), , drop = F]) return(df)}
With your example
x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))df_merged <- partial_join(x, y, by_x = "string", pattern_y = "seed")df_merged# # A tibble: 2 × 4# idX string idY seed# <int> <chr> <chr> <chr># 1 1 Motorcycle b otorcy# 2 2 TractorTrailer a ractor
Speed Benchmarks:
Functions
library(dplyr)x <- data_frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))partial_join <- function(x, y, by_x, pattern_y) { idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]]) idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]]))) df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F], y[unlist(idx_y), , drop = F]) return(df)}partial_join(x, y, by_x = "string", pattern_y = "seed")#> # A tibble: 2 × 4#> idX string idY seed#> <int> <chr> <chr> <chr>#> 1 1 Motorcycle b otorcy#> 2 2 TractorTrailer a ractorjoran <- function(x, y, by_x, pattern_y) { library(dplyr) my_db <- src_sqlite(path = tempfile(), create= TRUE) x_tbl <- copy_to(dest = my_db, df = x) y_tbl <- copy_to(dest = my_db, df = y) result <- tbl(my_db, sql(sprintf("select * from x, y where x.%s like '%%' || y.%s || '%%'", by_x, pattern_y))) collect(result, n = Inf)}joran(x, y, "string", "seed")#> # A tibble: 2 × 4#> idX string idY seed#> <int> <chr> <chr> <chr>#> 1 1 Motorcycle b otorcy#> 2 2 TractorTrailer a ractorstephen <- function(x, y, by_x, pattern_y) { library(dplyr) d <- full_join(mutate(x, i=1), mutate(y, i=1), by = "i") # quoting issue here, defaulting to base-r d$take <- stringr::str_detect(d[[by_x]], d[[pattern_y]]) d %>% filter(take == T) %>% select(-i, -take)}stephen(x, y, "string", "seed")#> # A tibble: 2 × 4#> idX string idY seed#> <int> <chr> <chr> <chr>#> 1 1 Motorcycle b otorcy#> 2 2 TractorTrailer a ractorfeng <- function(x, y, by_x, pattern_y) { library(fuzzyjoin) by_string <- pattern_y names(by_string) <- by_x regex_inner_join(x, y, by = by_string)}feng(x, y, "string", "seed")#> # A tibble: 2 × 4#> idX string idY seed#> <int> <chr> <chr> <chr>#> 1 1 Motorcycle b otorcy#> 2 2 TractorTrailer a ractor
Benchmark
library(microbenchmark)res <- microbenchmark( joran(x, y, "string", "seed"), stephen(x, y, "string", "seed"), feng(x, y, "string", "seed"), partial_join(x, y, "string", "seed"))res#> Unit: microseconds#> expr min lq mean#> joran(x, y, "string", "seed") 18953.008 20099.0540 21641.6646#> stephen(x, y, "string", "seed") 1320.161 1456.9415 1704.9218#> feng(x, y, "string", "seed") 5187.366 5625.8825 6926.2336#> partial_join(x, y, "string", "seed") 190.264 222.0055 257.7906#> median uq max neval cld#> 20675.5855 21827.764 70707.324 100 c#> 1579.8925 1670.719 9676.176 100 a #> 5842.8150 6065.530 107961.805 100 b #> 242.0735 283.870 523.649 100 aset.seed(123123)x_large <- x %>% sample_n(1000, replace = T)y_large <- y %>% sample_n(1000, replace = T)res_large <- microbenchmark( joran(x_large, y_large, "string", "seed"), # stephen(x_large, y_large, "string", "seed"), feng(x_large, y_large, "string", "seed"), partial_join(x_large, y_large, "string", "seed"))res_large#> Unit: milliseconds#> expr min lq mean median uq max neval cld#> joran(x_large, y_large, "string", "seed") 321.03631 324.49262 334.2760 329.13991 335.30185 368.1153 10 c#> feng(x_large, y_large, "string", "seed") 88.00369 89.85744 103.8686 93.84477 97.69121 200.0473 10 a #> partial_join(x_large, y_large, "string", "seed") 286.01533 286.78024 290.6295 288.89405 291.79887 303.4524 10 b
I don't know how this will perform for larger data, but it (or a variant of it) might be worth a try:
library(dplyr)x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat"))y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan"))my_db <- src_sqlite(path = tempfile(),create= TRUE)x_tbl <- copy_to(dest = my_db,df = x)y_tbl <- copy_to(dest = my_db,df = y)result <- tbl(my_db,sql("select * from x,y where x.string like '%' || y.seed || '%'"))> collect(result)Source: local data frame [2 x 4] idX string idY seed (int) (chr) (chr) (chr)1 1 Motorcycle b otorcy2 2 TractorTrailer a ractor
I also can't speak to how the performance of this might vary across DBs. postgres or mysql might be better or worse at this sort of query.