jimmyoentung / RunsCorrelation

Compute pairwise Kendall's Tau-b correlation between two runs

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

RunsCorrelation

Compute pairwise Kendall's Tau-b correlation between two runs

getScoreRanksCorrelation <- function(runA, runB){
  # generate rank for scores with ties
  runA$scoreRank = unlist(with(runA, tapply(-score, queryNum, rank, ties.method='min')))
  runB$scoreRank = unlist(with(runB, tapply(-score, queryNum, rank, ties.method='min')))
  
  # get list of queryNum and Document Id pairs from the first run
  dfQueryDocs = unique(runA[, c("queryNum", "docId")])
  
  # append list of queryNum and Document Id pairs from the second run to the first
  dfQueryDocs = rbind(dfQueryDocs, runB[, c("queryNum", "docId")])
  
  
  # remove duplicates query-docId pairs
  dfQueryDocs = unique(dfQueryDocs)
  
  # generate document score rank for each run by LEFT JOINing the QueryDocs with the run
  docScoreRank_A = merge(x = dfQueryDocs, y = runA[,c("queryNum","docId", "scoreRank")], by = c("queryNum", "docId"), all.x = TRUE)
  docScoreRank_B = merge(x = dfQueryDocs, y = runB[,c("queryNum","docId", "scoreRank")], by = c("queryNum", "docId"), all.x = TRUE)
  
  # Sort both docScoreRank lists to ensure both are in the same comparable order
  docScoreRank_A = docScoreRank_A[order(docScoreRank_A$queryNum, docScoreRank_A$docId),]
  docScoreRank_B = docScoreRank_B[order(docScoreRank_B$queryNum, docScoreRank_B$docId),]
  
  # refresh the row names according to the new order
  rownames(docScoreRank_A) = NULL
  rownames(docScoreRank_B) = NULL
  
  # replace NA with 0
  docScoreRank_A[is.na(docScoreRank_A)] <- 1001
  docScoreRank_B[is.na(docScoreRank_B)] <- 1001
  
  # calculate correlaction between the two document score rank
  return(cor(docScoreRank_A$scoreRank, docScoreRank_B$scoreRank, method="kendall", use="pairwise"))
}

About

Compute pairwise Kendall's Tau-b correlation between two runs