dplyr：部分的に文字列が一致するinner_join

Question

データフレームseedのy列がstringのx列の部分一致である場合、2つのデータフレームを結合したいと思います。この例は、次のことを示しています。

# What I have x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat")) y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan")) x idX string 1 1 Motorcycle 2 2 TractorTrailer 3 3 Sailboat y Source: local data frame [3 x 2] idY seed (chr) (chr) 1 a ractor 2 b otorcy 3 c irplan # What I want want <- data.frame(idX=c(1,2), idY=c("b", "a"), string=c("Motorcycle", "TractorTrailer"), seed=c("otorcy", "ractor")) want idX idY string seed 1 1 b Motorcycle otorcy 2 2 a TractorTrailer ractor

つまり、

inner_join(x, y, by=stringr::str_detect(x$string, y$seed))

Feng Mai · Answer

fuzzyjoinライブラリには2つの関数regex_inner_joinおよびfuzzy_inner_joinを使用すると、部分的な文字列を照合できます。

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat")) y <- data.frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan")) x$string = as.character(x$string) y$seed = as.character(y$seed) library(fuzzyjoin) x %>% regex_inner_join(y, by = c(string = "seed")) idX string idY seed 1 1 Motorcycle b otorcy 2 2 TractorTrailer a ractor library(stringr) x %>% fuzzy_inner_join(y, by = c("string" = "seed"), match_fun = str_detect) idX string idY seed 1 1 Motorcycle b otorcy 2 2 TractorTrailer a ractor

David · Answer

また、この関数でbase-rを使用することもできます（ここでのこの回答からわずかに変更： https://stackoverflow.com/a/34723496/304845 、それはdplyrを使用して列をバインドし、cbindを使用しますdplyrを使用したくない場合）：

partial_join <- function(x, y, by_x, pattern_y) idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]]) idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]]))) df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F], y[unlist(idx_y), , drop = F]) return(df) }

あなたの例で

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat")) y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan")) df_merged <- partial_join(x, y, by_x = "string", pattern_y = "seed") df_merged # # A tibble: 2 × 4 # idX string idY seed # <int> <chr> <chr> <chr> # 1 1 Motorcycle b otorcy # 2 2 TractorTrailer a ractor

速度のベンチマーク：

関数

library(dplyr) x <- data_frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat")) y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan")) partial_join <- function(x, y, by_x, pattern_y) { idx_x <- sapply(y[[pattern_y]], grep, x[[by_x]]) idx_y <- sapply(seq_along(idx_x), function(i) rep(i, length(idx_x[[i]]))) df <- dplyr::bind_cols(x[unlist(idx_x), , drop = F], y[unlist(idx_y), , drop = F]) return(df) } partial_join(x, y, by_x = "string", pattern_y = "seed") #> # A tibble: 2 × 4 #> idX string idY seed #> <int> <chr> <chr> <chr> #> 1 1 Motorcycle b otorcy #> 2 2 TractorTrailer a ractor joran <- function(x, y, by_x, pattern_y) { library(dplyr) my_db <- src_sqlite(path = tempfile(), create= TRUE) x_tbl <- copy_to(dest = my_db, df = x) y_tbl <- copy_to(dest = my_db, df = y) result <- tbl(my_db, sql(sprintf("select * from x, y where x.%s like '%%' || y.%s || '%%'", by_x, pattern_y))) collect(result, n = Inf) } joran(x, y, "string", "seed") #> # A tibble: 2 × 4 #> idX string idY seed #> <int> <chr> <chr> <chr> #> 1 1 Motorcycle b otorcy #> 2 2 TractorTrailer a ractor stephen <- function(x, y, by_x, pattern_y) { library(dplyr) d <- full_join(mutate(x, i=1), mutate(y, i=1), by = "i") # quoting issue here, defaulting to base-r d$take <- stringr::str_detect(d[[by_x]], d[[pattern_y]]) d %>% filter(take == T) %>% select(-i, -take) } stephen(x, y, "string", "seed") #> # A tibble: 2 × 4 #> idX string idY seed #> <int> <chr> <chr> <chr> #> 1 1 Motorcycle b otorcy #> 2 2 TractorTrailer a ractor feng <- function(x, y, by_x, pattern_y) { library(fuzzyjoin) by_string <- pattern_y names(by_string) <- by_x regex_inner_join(x, y, by = by_string) } feng(x, y, "string", "seed") #> # A tibble: 2 × 4 #> idX string idY seed #> <int> <chr> <chr> <chr> #> 1 1 Motorcycle b otorcy #> 2 2 TractorTrailer a ractor

基準

library(microbenchmark) res <- microbenchmark( joran(x, y, "string", "seed"), stephen(x, y, "string", "seed"), feng(x, y, "string", "seed"), partial_join(x, y, "string", "seed") ) res #> Unit: microseconds #> expr min lq mean #> joran(x, y, "string", "seed") 18953.008 20099.0540 21641.6646 #> stephen(x, y, "string", "seed") 1320.161 1456.9415 1704.9218 #> feng(x, y, "string", "seed") 5187.366 5625.8825 6926.2336 #> partial_join(x, y, "string", "seed") 190.264 222.0055 257.7906 #> median uq max neval cld #> 20675.5855 21827.764 70707.324 100 c #> 1579.8925 1670.719 9676.176 100 a #> 5842.8150 6065.530 107961.805 100 b #> 242.0735 283.870 523.649 100 a set.seed(123123) x_large <- x %>% sample_n(1000, replace = T) y_large <- y %>% sample_n(1000, replace = T) res_large <- microbenchmark( joran(x_large, y_large, "string", "seed"), # stephen(x_large, y_large, "string", "seed"), feng(x_large, y_large, "string", "seed"), partial_join(x_large, y_large, "string", "seed") ) res_large #> Unit: milliseconds #> expr min lq mean median uq max neval cld #> joran(x_large, y_large, "string", "seed") 321.03631 324.49262 334.2760 329.13991 335.30185 368.1153 10 c #> feng(x_large, y_large, "string", "seed") 88.00369 89.85744 103.8686 93.84477 97.69121 200.0473 10 a #> partial_join(x_large, y_large, "string", "seed") 286.01533 286.78024 290.6295 288.89405 291.79887 303.4524 10 b

joran · Answer

これがより大きなデータに対してどのように機能するかはわかりませんが、それ（またはその変形）は試してみる価値があります。

library(dplyr) x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat")) y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan")) my_db <- src_sqlite(path = tempfile(),create= TRUE) x_tbl <- copy_to(dest = my_db,df = x) y_tbl <- copy_to(dest = my_db,df = y) result <- tbl(my_db,sql("select * from x,y where x.string like '%' || y.seed || '%'")) > collect(result) Source: local data frame [2 x 4] idX string idY seed (int) (chr) (chr) (chr) 1 1 Motorcycle b otorcy 2 2 TractorTrailer a ractor

このパフォーマンスがDB間でどのように異なるかについても説明できません。この種のクエリでは、postgresまたはmysqlの方が良い場合と悪い場合があります。

Stephen Turner · Answer

これは機能しますが、巨大なデータセットでは信じられないほど遅くなります。

x <- data.frame(idX=1:3, string=c("Motorcycle", "TractorTrailer", "Sailboat")) y <- data_frame(idY=letters[1:3], seed=c("ractor", "otorcy", "irplan")) library(dplyr) full_join(mutate(x, i=1), mutate(y, i=1)) %>% select(-i) %>% filter(str_detect(string, seed))