UweとGKiの回答はどちらも正しいです。Uweの回答は投稿が遅かったため賞金(バウンティ)はGKiが受け取りましたが、Uweのソリューションのほうが約15倍速く実行されます。
# Example data, first table: one row per patient visit
# (ID = patient, Days = day of the measurement, Score = measured value).
df1 <- data.frame(
  ID = c(rep("patient1", 4L), "patient2", "patient3"),
  Days = c(0, 25, 235, 353, 100, 538),
  Score = c(NA, 2, 3, 4, 5, 6),
  stringsAsFactors = FALSE
)

# Example data, second table: same layout, measured on partly different days.
df2 <- data.frame(
  ID = c(rep("patient1", 4L), rep("patient2", 2L), "patient3"),
  Days = c(0, 25, 248, 353, 100, 150, 503),
  Score = c(1, 10, 3, 4, 5, 7, 6),
  stringsAsFactors = FALSE
)
> df1
ID Days Score
1 patient1 0 NA
2 patient1 25 2
3 patient1 235 3
4 patient1 353 4
5 patient2 100 5
6 patient3 538 6
> df2
ID Days Score
1 patient1 0 1
2 patient1 25 10
3 patient1 248 3
4 patient1 353 4
5 patient2 100 5
6 patient2 150 7
7 patient3 503 6
> full_join(df1, df2, by=c("ID","Days")) %>%
+ arrange(.[[1]], as.numeric(.[[2]]))
ID Days Score.x Score.y
1 patient1 0 NA 1
2 patient1 25 2 10
3 patient1 235 3 NA
4 patient1 248 NA 3
5 patient1 353 4 4
6 patient2 100 5 5
7 patient2 150 NA 7
8 patient3 503 NA 6
9 patient3 538 6 NA
> threshold <- 30
> *** insert join code ***
ID Days Score.x Score.y
1 patient1 0 NA 1
2 patient1 25 2 10
3 patient1 248 3 3
4 patient1 353 4 4
5 patient2 100 5 5
6 patient2 150 NA 7
7 patient3 503 NA 6
8 patient3 538 6 NA
この出力では、前の出力の3行目と4行目がマージされ(248 - 235 < 30 であるため)、Days には2番目のデータフレーム(df2)の値(248)が採用されています。
場合によっては、Days 変数の値が最大4つ、同じデータフレーム内に近接して存在することがありますが、同じデータフレーム内の値どうしはマージしないでください。ただし、そのうちの1つが他方のデータフレームの値のしきい値内にある場合は、それらをマージする必要があります。以下の例の3行目を参照してください。
> df1
ID Days Score
1 patient1 0 1
2 patient1 5 2
3 patient1 10 3
4 patient1 15 4
5 patient1 50 5
> df2
ID Days Score
1 patient1 0 1
2 patient1 5 2
3 patient1 12 3
4 patient1 15 4
5 patient1 50 5
> df_combined
ID Days Score.x Score.y
1 patient1 0 1 1
2 patient1 5 2 2
3 patient1 12 3 3
4 patient1 15 4 4
5 patient1 50 5 5
> df1
ID Days Score
1: patient1 0 1
2: patient1 116 2
3: patient1 225 3
4: patient1 309 4
5: patient1 351 5
6: patient2 0 6
7: patient2 49 7
> df2
ID Days Score
1: patient1 0 11
2: patient1 86 12
3: patient1 195 13
4: patient1 279 14
5: patient1 315 15
6: patient2 0 16
7: patient2 91 17
8: patient2 117 18
# Merge two measurement tables per ID, pairing rows whose Days overlap within
# +/- `threshold` (interval overlap via data.table::foverlaps(), run once in
# each direction and then unioned).
#
# DT1, DT2:  three-column tables; the columns are renamed by position to
#            ID/Days/X (DT1) and ID/Days/Y (DT2). Days is coerced to numeric.
# threshold: half-width of the matching window, in the unit of Days.
#            Defaults to 30, the value that used to be hard-coded.
# Returns a data.table with the columns ID, Days, X and Y, keyed by ID.
#
# NOTE(review): the tail of this function was missing from the source (the
# `else` branch, the `j` block and the function body were never closed). The
# closing braces, `unique(s)` and the final `outcome` are a reconstruction --
# confirm against the original answer.
# Known limitation: as the example output following this function shows, one
# score can still end up paired with two rows of the other table.
testSO2 <- function(DT1, DT2, threshold = 30L) {
  # Work on data.table copies: `:=` and setkeyv() below modify by reference
  # and need data.tables, while callers may pass plain data.frames.
  DT1 <- setDT(copy(DT1))
  DT2 <- setDT(copy(DT2))
  setnames(DT1, c("ID", "Days", "X"))
  setnames(DT2, c("ID", "Days", "Y"))
  DT1[, Days := as.numeric(Days)]
  DT2[, Days := as.numeric(Days)]

  # s1/e1: DT1 days widened to a window, DT2 days as points.
  # s2/e2: the mirror image (DT1 as points, DT2 widened).
  DT1[, c("s1", "e1", "s2", "e2") :=
        .(Days - threshold, Days + threshold, Days, Days)]
  DT2[, c("s1", "e1", "s2", "e2") :=
        .(Days, Days, Days - threshold, Days + threshold)]

  # Pass 1: overlap join on the s1/e1 intervals.
  byk <- c("ID", "s1", "e1")
  setkeyv(DT1, byk)
  setkeyv(DT2, byk)
  o1 <- foverlaps(DT1, DT2)

  # Pass 2: overlap join on the s2/e2 intervals, with the roles swapped.
  byk <- c("ID", "s2", "e2")
  setkeyv(DT1, byk)
  setkeyv(DT2, byk)
  o2 <- foverlaps(DT2, DT1)

  # Union of both passes; rows without a partner take the day from the
  # other side.
  olaps <- funion(o1, setcolorder(o2, names(o1)))[
    is.na(Days), Days := i.Days]

  # Collapse per ID and per later-of-the-two days (md).
  outcome <- olaps[, {
    if (all(!is.na(Days)) && any(Days == i.Days)) {
      # An exact same-day pair exists in this group: keep only that pair.
      s <- .SD[Days == i.Days, .(Days = Days[1L],
                                 X = X[1L],
                                 Y = Y[1L])]
    } else {
      s <- .SD[, .(Days = max(Days, i.Days), X, Y)]
    }
    unique(s)
  },
  keyby = .(ID, md = pmax(Days, i.Days))][, md := NULL][]

  outcome
}
> testSO2(df1,df2)
ID Days X Y
1: patient1 0 1 11
2: patient1 116 2 12
3: patient1 225 3 13
4: patient1 309 4 14
5: patient1 315 4 15
6: patient1 351 5 NA
7: patient2 0 6 16
8: patient2 49 7 NA
9: patient2 91 NA 17
10: patient2 117 NA 18
ご覧のとおり、4行目と5行目が間違っています。df1のScore(4)が2回使われています。各スコアは一度しか使用できないため、これらの行の正しい出力は次のようになるはずです。
ID Days X Y
4: patient1 309 4 14
5: patient1 315 NA 15
6: patient1 351 5 NA
> dput(df1)
structure(list(ID = c("patient1", "patient1", "patient1", "patient1",
"patient1", "patient2", "patient2"), Days = c("0", "116", "225",
"309", "351", "0", "49"), Score = 1:7), row.names = c(NA, 7L), class = "data.frame")
> dput(df2)
structure(list(ID = c("patient1", "patient1", "patient1", "patient1",
"patient1", "patient2", "patient2", "patient2"), Days = c("0",
"86", "195", "279", "315", "0", "91", "117"), Score = 11:18), row.names = c(NA,
8L), class = "data.frame")
library(data.table)

# Convert df1 and df2 to data.table (by reference); the `[`-joins with
# `on =` / `roll =` further down only work on data.tables.
setDT(df1)
setDT(df2)

# Set column names for later on: add a .df1/.df2 suffix to the Days and Score
# column names so both tables can be row-bound by name afterwards.
cols <- c("Days", "Score")
setnames(df1, cols, paste0(cols, ".df1"))
setnames(df2, cols, paste0(cols, ".df2"))

# Update df1 with the measurements from df2 (and df2 with those from df1)
# using rolling update joins; 30 is the matching threshold in days
# (roll = 30 in one direction, roll = -30 in the other).
# Copies are made to prevent changes in df1 and df2.
dt1 <- copy(df1)[
  df2,
  `:=`(Days.df2 = i.Days.df2, Score.df2 = i.Score.df2),
  on = .(ID, Days.df1 = Days.df2),
  roll = 30
]
dt2 <- copy(df2)[
  df1,
  `:=`(Days.df1 = i.Days.df1, Score.df1 = i.Score.df1),
  on = .(ID, Days.df2 = Days.df1),
  roll = -30
]

# Row-bind by column name (this is why the .df1/.df2 suffixes are needed) and
# keep only unique rows.
ans <- unique(rbindlist(list(dt1, dt2), use.names = TRUE))

# Wrangle the data into the desired output: prefer the df2 day when present.
ans[, Days := ifelse(is.na(Days.df2), Days.df1, Days.df2)]
ans <- ans[, .(Days, Score.x = Score.df1, Score.y = Score.df2), by = .(ID)]
setkey(ans, ID, Days) # for sorting; setorder() can also be used.
# ID Days Score.x Score.y
# 1: patient1 0 NA 1
# 2: patient1 25 2 10
# 3: patient1 248 3 3
# 4: patient1 353 4 4
# 5: patient2 100 5 5
# 6: patient2 150 NA 7
# 7: patient3 503 NA 6
# 8: patient3 538 6 NA
このコードを使用すると、しきい値を指定して、df1のスコアを新しい列(Score1)としてdf2にマージできます。df2のDays ± しきい値の範囲のうち、ちょうど1つだけに該当するスコアのみが追加されます。すべてのスコアが一意に一致するようなしきい値は存在しないため、すべてのスコアを結合することはできません。
# Matching window in days.
threshold <- 40

# For every row of df1: the indices of the df2 rows that belong to the same
# patient and whose Days lie within +/- threshold of that row's Days.
# lapply() always yields a list; apply(..., 1, which) would collapse to a
# vector or matrix whenever every row has the same number of hits. The
# distance test also works for non-integer Days, and comparing ID keeps a
# score from being attached to a different patient.
WhereDF1inDF2 <- lapply(seq_len(nrow(df1)), function(i) {
  which(df2$ID == df1$ID[i] & abs(df2$Days - df1$Days[i]) <= threshold)
})

# Only unambiguous matches (exactly one df2 candidate) are used.
useable <- vapply(WhereDF1inDF2, function(x) length(x) == 1L, logical(1))

# Add the df1 scores to df2 as a new column; rows without a unique match
# stay NA.
df2$Score1 <- NA
df2$Score1[unlist(WhereDF1inDF2[useable])] <- df1$Score[useable]
> df2
ID Days Score Score1
1 patient1 0 1 NA
2 patient1 25 10 NA
3 patient1 248 3 3
4 patient1 353 4 4
5 patient2 100 5 5
6 patient2 150 7 NA
7 patient3 503 6 6
# Example data, first table; the day and score columns carry a "1" suffix so
# they stay distinguishable after joining with the second table.
df1 <- data.frame(
  ID = c(rep("patient1", 4L), "patient2", "patient3"),
  Days1 = c(0, 25, 235, 353, 100, 538),
  Score1 = c(NA, 2, 3, 4, 5, 6),
  stringsAsFactors = FALSE
)

# Example data, second table, with a "2" suffix on the day and score columns.
df2 <- data.frame(
  ID = c(rep("patient1", 4L), rep("patient2", 2L), "patient3"),
  Days2 = c(0, 25, 248, 353, 100, 150, 503),
  Score2 = c(1, 10, 3, 4, 5, 7, 6),
  stringsAsFactors = FALSE
)
## Define a dummy sequence for each patient (1st visit, 2nd visit, ...) so
## the two tables can be paired visit by visit.
df11 <- df1 %>%
  group_by(ID) %>%
  mutate(ptseq = row_number()) %>%
  ungroup()
df21 <- df2 %>%
  group_by(ID) %>%
  mutate(ptseq = row_number()) %>%
  ungroup()

## Pair the n-th visit of df1 with the n-th visit of df2 for each patient.
df3 <- dplyr::full_join(df11, df21, by = c("ID", "ptseq")) %>%
  arrange(ID, Days1)

## df2 side of every pair: keep the df2 day and drop the df1 score when the
## two days are more than 30 apart.
df31 <- df3 %>%
  mutate(Days = Days2, diff = Days1 - Days2) %>%
  mutate(Score1 = ifelse(abs(diff) > 30, NA, Score1))

## df1 side of the pairs that were too far apart: these become rows of their
## own, carrying only the df1 day and score. Pairs within the threshold get
## Days = NA from case_when() and are dropped again by the filter (this step
## was missing in the source, which ended in a dangling pipe).
df32 <- df3 %>%
  mutate(diff = Days1 - Days2) %>%
  mutate(
    Days = case_when(abs(diff) > 30 ~ Days1),
    Score2 = c(NA),
    Days2 = c(NA)
  ) %>%
  filter(!is.na(Days))

## Stack both parts and sort by patient and visit number; rows with the same
## visit number keep the df31-before-df32 order.
df <- bind_rows(df31, df32) %>%
  select(ID, ptseq, Days, Score1, Score2) %>%
  arrange(ID, ptseq) %>%
  select(-ptseq)
ID Days Score1 Score2
<chr> <dbl> <dbl> <dbl>
1 patient1 0 NA 1
2 patient1 25 2 10
3 patient1 248 3 3
4 patient1 353 4 4
5 patient2 100 5 5
6 patient2 150 NA 7
7 patient3 503 NA 6
8 patient3 538 6 NA