################################################################################ ## ## ## R-scripts for the manuscript ## ## ## ## DISMS2: A flexible algorithm for direct proteome-wide distance ## ## calculation of LC-MS/MS runs ## ## ## ## ## ## by ## ## ## ## Vera Rieder, Bernhard Blank-Landeshammer, Marleen Stuhr, Tilman Schell, ## ## Karsten Biß, Laxmikanth Kollipara, Achim Meyer, Markus Pfenninger, ## ## Hildegard Westphal, Albert Sickmann and Jörg Rahnenführer ## ## ## ################################################################################ library(readMzXmlData) library(mzR) library(tools) library(gtools) ## D_DISMS2 returns a distance matrix for N MS/MS runs (DISMS2). Runs ## in mzxml format are saved as single workspaces previously with read_ms. ## Functions binning and topn_fun are needed for filtering (Step 1). Three ## distance measures for mass spectra are implemented, i.e. cosine distance ## cos_dist, angle distance angle_dist and parametrized Hausdorff distance ## PH_dist. Directed distances for two MS/MS runs are computed by ## dstar_DISMS2 and used for calculation of distance matrix (see D_DISMS2). ################################################################################ # read_ms - Read MS/MS runs in mzxml format. # Input: # mzxml_file - File name of mzxml. # name - Name of file to save. # Output: # List saved as paste0(name, ".RData") with elements: # mzxml - A list of spectra and metadata (see ?readMzXmlFile) # ms2_index - Index of scans ms2 spectra reported along retention time # order. # precMZ - Precursor mass-to-charge ratio of all scans reported along # retention time order. # precursorCharge - Precursor Charge of all scans reported along retention # time order. read_ms <- function(mzxml_file, name) { mzxml <- readMzXmlFile(mzxml_file, removeMetaData = FALSE, verbose = FALSE) mzR <- openMSfile(mzxml_file, backend="Ramp", verbose = FALSE) ms2_index <- which(header(mzR)$msLevel == 2) precMZ <- header(mzR)$precursorMZ precursorCharge <- header(mzR)$precursorCharge result <- list(mzxml = mzxml, ms2_index = ms2_index, precMZ = precMZ, precursorCharge = precursorCharge) assign(name, result) save(list = name, file = paste0(name, ".RData")) } ################################################################################ # binning - Bin mass spectra with binsize bin # Input: # spec - Mass spectrum; list with entries mass and intensity. # bin - Binsize; default 0.2. # Output: # Binned mass spectrum; list with entries mass and intensity. binning <- function(spec, bin = 0.2) { mass <- floor(spec$mass/bin)*bin + bin/2 int <- spec$intensity if(any(duplicated(mass))) { int <- as.vector(tapply(int, mass, max)) mass <- unique(mass) } return(list(mass = mass, intensity = int)) } ################################################################################ # topn_fun - Filter peaks with topn highest intensities # Input: # spec - Mass spectrum; list with entries mass and intensity. # n - Number of peaks with highest intensities; default 100. # Output: # Filtered mass spectrum; list with entries mass and intensity. topn_fun <- function(spec, n = 100){ if(length(spec$mass) <= n) return(spec) if(length(spec$mass) > n){ y <- data.frame(spec) y <- y[rev(order(y$intensity))[1:n],] # n top int. and related masses y <- y[order(y$mass),] # spectrum with ascending order of mass return(as.list(y)) } } ################################################################################ # cos_dist - Cosine distance # Input: # x - mass spectrum; list with entries mass and intensity. # y - mass spectrum; list with entries mass and intensity. # Output: # Cosine distance of two mass spectra. cos_dist <- function(x, y) { x_in_y <- x$mass %in% y$mass y_in_x <- y$mass %in% x$mass if(any(x_in_y)) { arg1 <- x$intensity[x_in_y]/sqrt(sum(x$intensity^2)) arg2 <- y$intensity[y_in_x]/sqrt(sum(y$intensity^2)) result <- 1 - sum(arg1 * arg2) } else result <- 1 return(result) } ############################################################################### # angle_dist - Angle distance # Input: # x - mass spectrum; list with entries mass and intensity. # y - mass spectrum; list with entries mass and intensity. # eps - Parameter epsilon in angle distance. # Output: # Angle distance of two mass spectra. angle_dist <- function (x, y, eps = 0.05) { n1 <- length(x$mass) n2 <- length(y$mass) d <- abs(rep(x$mass, each = n2) - rep(y$mass, n1)) <= eps dangle <- acos(sum(apply(matrix(d, ncol = n2), 2, max))/sqrt(n1*n2)) return(dangle) } ############################################################################### # PH_dist - Parametrized Hausdorff distance # Input: # x - mass spectrum; list with entries mass and intensity. # y - mass spectrum; list with entries mass and intensity. # del - Parameter delta in Parametrized Hausdorff distance; default 0.05. # k - Parameter k in Parametrized Hausdorff distance; default 50. # Output: # Parametrized Hausdorff distance of two mass spectra. PH_dist <- function (x, y, k = 50, del = 0.05) { n1 <- length(x$mass) n2 <- length(y$mass) d <- abs (rep(x$mass, each = n2) - rep (y$mass, n1)) d[d <= del] <- 0 dH <- matrix (d, nrow = n1, byrow = TRUE) dHab <- sum((apply(dH, 1, min))^(1/k))/n1 dHba <- sum((apply(dH, 2, min))^(1/k))/n2 dH <- max (dHab, dHba) return(dH) } ############################################################################### # dstar_DISMS2 - Directed distance of two MS/MS runs # Input: # data1 - MS/MS run 1. # data2 - MS/MS run 2. # bint - TRUE/FALSE to decide if spectra shall be binned; default TRUE; # FALSE implies to keep original spectra. # bin - Bin spectra with binsize bin, if binning == TRUE; default 0.2. # rtol - Size of retention time window using ranks of scan numbers; # default 1000. # prec - Accepted precursor mass shift (ppm); default 10. # topnt - TRUE/FALSE to state filtering peaks with topn highest intensities; # default TRUE; FALSE stands for 'topn = inf'. # topn - Number of peaks with highest intensities, if topnt == TRUE; # default 50. # d_dist - Distance measure for the pairwise comparison of mass spectra; # default "cos", cosine distance. Alternative "angle" for angle # distance or "PH" for parametrized Hausdorff distance. Other # distance measures possible. # cdis - Cutoff for distance. Default 0.3. # Output: # Directed distance dstar for two MS/MS runs (DISMS2 Step 3). dstar_DISMS2 <- function(data1, data2, bint = T, bin = 0.2, rtol = 1000, prec = 10, topnt = T, topn = 50, d_dist = "cos", cdis = 0.3) { if(d_dist == "cos") .dist <- cos_dist if(d_dist == "angle") .dist <- angle_dist if(d_dist == "PH") .dist <- PH_dist precCh1 <- data1$precursorCharge[data1$ms2_index] precCh2 <- data2$precursorCharge[data2$ms2_index] ix1 <- data1$ms2_index ix2 <- data2$ms2_index prec1 <- data1$precMZ[ix1] prec2 <- data2$precMZ[ix2] n1 <- length(ix1) n2 <- length(ix2) spec1 <- lapply(data1$mzxml[ix1], function(x) x$spectrum) spec2 <- lapply(data2$mzxml[ix2], function(x) x$spectrum) ## Step 1 ## Filter spectra (topn selection and binning). if(topnt == T) { spec1 <- lapply(spec1, function(x) topn_fun(x, n = topn)) spec2 <- lapply(spec2, function(x) topn_fun(x, n = topn)) } if(bint == T) { spec1 <- lapply(spec1, function(x) binning(x, bin = bin)) spec2 <- lapply(spec2, function(x) binning(x, bin = bin)) } ## Step 2 ## Check constraints. matches <- sapply(1:n1, function(x) { # 2.1 retention time rtolok <- seq(max(1, x - rtol), min(n1, x + rtol)) # 2.2 precursor charge chok <- rtolok[precCh2[rtolok] == precCh1[x]] # 2.3 precursor mass ok <- chok[which(prec2[chok] >= prec1[x]*(1-(10^(-6)*prec)) & prec2[chok] <= prec1[x]*(1+(10^(-6)*prec)))] if(length(ok) > 0) { sca <- sapply(spec2[ok], function(y) .dist(spec1[[x]], y)) result <- min(sca) } else result <- NA return(result) }) ## Step 3 dstar <- (sum(na.omit(matches) > cdis) + sum(is.na(matches)))/length(matches) return(dstar) } ################################################################################ # D_DISMS2 - Distance matrix of N MS/MS runs. # Input: # msmsdata - list with file names of N workspaces (.RData) each containing a # list of MS/MS run (see function read_ms above). # bint - TRUE/FALSE to decide if spectra shall be binned; default TRUE; # FALSE implies to keep original spectra. # bin - Bin spectra with binsize bin, if binning == TRUE; default 0.2. # rtol - Size of retention time window using ranks of scan numbers; # default 1000. # prec - Accepted precursor mass shift (ppm); default 10. # topnt - TRUE/FALSE to state filtering peaks with topn highest intensities; # default TRUE; FALSE stands for 'topn = inf'. # topn - Number of peaks with highest intensities, if topnt == TRUE; # default 50. # d_dist - distance measure for the pairwise comparison of mass spectra; # default "cos", cosine distance. Alternative "angle" for angle # distance or "PH" for parametrized Hausdorff distance. Other # distance measures possible. # cdis - Cutoff for distance. Default 0.3. # Output: # "dist" object; Distance matrix of N MS/MS runs (DISMS2 Output) D_DISMS2 <- function(msmsdata, bint = T, bin = 0.2, rtol = 1000, prec = 10, topnt = T, topn = 50, d_dist = "cos", cdis = 0.3){ N <- length(msmsdata) Dmat <- matrix(nrow = N, ncol = N) comb <- combinations(N, 2) for(i in 1:nrow(comb)) { files <- msmsdata[comb[i,]] obj <- file_path_sans_ext(msmsdata[comb[i,]]) invisible(lapply(files, load, .GlobalEnv)) dstar12 <- dstar_DISMS2(data1 = get(obj[1]), data2 = get(obj[2]), bin = bin, rtol = rtol, prec = prec, topnt = topnt, topn = topn, d_dist = d_dist) dstar21 <- dstar_DISMS2(data1 = get(obj[2]), data2 = get(obj[1]), bin = bin, rtol = rtol, prec = prec, topnt = topnt, topn = topn, d_dist = d_dist) Dmat[comb[i,2],comb[i,1]] <- mean(c(dstar12,dstar21)) rm(list = obj, envir=.GlobalEnv) } return(as.dist(Dmat)) } ################################################################################ ## Example #library("rpx") #id <- "PXD002193" #px <- PXDataset(id) #pxget(px, "OTT0650-S68-A-Heart_1.mzML") #pxget(px, "OTT0650-S69-A-Heart_2.mzML") # msconvert: mzML -> mzXML #read_ms("OTT0650-S68-A-Heart_1.mzXML", "ex1") #read_ms("OTT0650-S69-A-Heart_2.mzXML", "ex2") #invisible(lapply(c("ex1.RData", "ex2.RData"), load, .GlobalEnv)) #dstar1 <- dstar_DISMS2(ex1, ex2) #dstar2 <- dstar_DISMS2(ex2, ex1) #Dex <- D_DISMS2(c("ex1.RData", "ex2.RData")) ################################################################################