Hi,
A while back, Tony Larson posted code on the old Google groups forum. The code went through each mzRT feature and checked for a Gaussian fit for each peak. I have found it is stricter that the 'fitgauss=TRUE' in centWave and using XCMS < 3.0, I implement the code after I do the peakPicking. I am trying to implement this in XCMS > 3.0, but have run into one final hiccup when I try to reinsert my results back into the XCMSnExp object.
The code follows. I have left the 'old code' in the file for now, but commented out. I have managed to figure out how to get the data I want out of the mzML files, but I am stuck at the end (comment says 'STUCK HERE'). It seems like I should be able to use chromPeaks <- to set my answer. It will allow me to do this, but then I lose all the chromatography information that is needed in downstream processing.
Any thoughts? This is the final step keeping me from switching to XCMS > 3.0, so I hope to get it working.
Cheers,
Krista
#xdata <- peakShape_XCMS3(my_data,cor.val=0.9, useNoise = setNoise)
#and then proceed with retention time correction and correspondence/grouping
# Decreasing cor.val will allow more non-gaussian peaks through the filter
#original file version from the Google Groups for xcms from Tony Larson
#KL corrected this version 8/23/2011 for XCMS < 3
#KL updating to using XCMS3, 10/30/2018
#original peakShape function to remove non-gaussian peaks from an xcmsSet
#code originally had cor.val = 0.9; 0.5 is too low (not doing enough pruning)
#object is updated to be an XCMSnExp class
peakShape_XCMS3 <- function(object, cor.val=0.9, useNoise = setNoise)
{
require(xcms)
#files <- object@filepaths #old code
files <- fileNames(object)
#peakmat <- object@peaks #old code
peakmat <- chromPeaks(object) #extract everything
peakmat.new <- matrix(-1,1,ncol(peakmat))
colnames(peakmat.new) <- colnames(peakmat)
for(f in 1:length(files))
{
#xraw <- xcmsRaw(files[f], profstep=0) #old code
raw_data <- readMSData(files[f],msLevel = 1, mode = "onDisk") #use 'onDisk' to make the next step faster
sub.peakmat <- peakmat[which(peakmat[,"sample"]==f),,drop=F]
corr <- numeric()
for (p in 1:nrow(sub.peakmat))
{
#old code
#tempEIC <-
#as.integer(rawEIC(xraw,mzrange=c(sub.peakmat[p,"mzmin"]-0.001,sub.peakmat[p,"mzmax"]+0.001))$intensity)
#minrt.scan <- which.min(abs(xraw@scantime-sub.peakmat[p,"rtmin"]))[1]
#maxrt.scan <- which.min(abs(xraw@scantime-sub.peakmat[p,"rtmax"]))[1]
#eics <- tempEIC[minrt.scan:maxrt.scan]
mzRange = c(sub.peakmat[p,"mzmin"]-0.001,sub.peakmat[p,"mzmax"]+0.001)
subsetOnMZ <- filterMz(raw_data, mz = mzRange)
#now set the Rt range...use sub.peakmat min and max RT
rtRange = c(sub.peakmat[p,"rtmin"],sub.peakmat[p,"rtmax"])
subsetOnMZandRT <- filterRt(subsetOnMZ, rt = rtRange)
eics <- intensity(subsetOnMZandRT) #get the intensity values
eics[sapply(eics, function(x) length(x)==0)] <- 0 #if empty in a scan, convert to 0
eics <- as.integer(unlist(eics)) #use as.double for Lumos
#filter out features that are less than the noise level I have already set...
setThreshold <- which(eics < useNoise)
eics <- eics[-setThreshold]
rm(setThreshold)
#remove any NA (easier bc downstream leaving it in causes issues)
eics <- eics[!is.na(eics)]
getIdx <- which(eics == min(eics))[1] #if multiple values, just need the first match
#set min to 0 and normalise
eics <- eics-eics[getIdx]
if(max(eics,na.rm=TRUE)>0)
{
eics <- eics/max(eics, na.rm=TRUE)
}
#fit gauss and let failures to fit through as corr=1
fit <- try(nls(y ~ SSgauss(x, mu, sigma, h),
data.frame(x = 1:length(eics), y = eics)),silent=T)
if(class(fit) == "try-error")
{
corr[p] <- 1
} else {
#calculate correlation of eics against gaussian fit
if (length(which(!is.na(eics - fitted(fit)))) > 4 &&
length(!is.na(unique(eics)))>4 &&
length(!is.na(unique(fitted(fit))))>4)
{
cor <- NULL
options(show.error.messages = FALSE)
cor <- try(cor.test(eics,fitted(fit),method="pearson",use="complete"))
options(show.error.messages = TRUE)
if (!is.null(cor))
{
if(cor$p.value <= 0.05) {
corr[p] <- cor$estimate
} else {
corr[p] <- 0 }
}
else corr[p] <- 0
} else corr[p] <- 0
}
} #this ends to the 'p' loop (going through one mzRT feature at a time)
filt.peakmat <- sub.peakmat[which(corr >= cor.val),]
peakmat.new <- rbind(peakmat.new, filt.peakmat)
n.rmpeaks <- nrow(sub.peakmat)-nrow(filt.peakmat)
cat("Peakshape evaluation: sample ",
basename(files[f]),"
",n.rmpeaks,"/",nrow(sub.peakmat)," peaks removed","\n")
if (.Platform$OS.type == "windows") flush.console()
} #this ends the 'f' loop (going through one file at a time())
peakmat.new <- peakmat.new[-1,] #all but the first row that is all -1
object.new <- object #copy to a new object
#object.new@peaks <- peakmat.new #old code
chromPeaks(object.new) <- peakmat.new #STUCK HERE...why doesn't this work?
return(object.new)
} #this ends the function itself
Some example data would probably make it a lot easier to help.
What is the error?
If you post the output of
chromPeaks(object.new)
and
peakmat.new
it might give a hint about the problem.
One comment: With XCMS3 you make an object with the raw data before the XCMS object. Instead of re-reading the raw files in your function you could reuse this object.
Hi Jan,
Sorry...sample code would be helpful. In the section below where I source 'peakShape_XCMS3.r' that is the code in the chunk from my first post (if I could figure out how to post an attachment, I would just attach it...).
library(xcms)
library(faahKO)
faahko_3_files <- c(system.file('cdf/KO/ko15.CDF', package = "faahKO"),
system.file('cdf/KO/ko16.CDF', package = "faahKO"),
system.file('cdf/KO/ko18.CDF', package = "faahKO"))
## Do a quick and dirty preprocessing
od <- readMSData(faahko_3_files, mode = "onDisk")
od <- findChromPeaks(od, param = CentWaveParam(peakwidth = c(30, 80),
noise = 1000))
source("peakShape_XCMS3.r")
xdata <- peakShape_XCMS3(od,cor.val = 0.9,useNoise = 1000)
Then, if I see what the output from od is :
Compare to the output from xdata which is:
Hence the details on the chromatographic peak detection seem to be lost.
Thanks Krista for providing the example!
It's also nice to see that you used the filterRt/filterMz in your code.
I'll have a look at it and post soon.
cheers, jo
In fact you did everything correctly. The problem was that replacing the identified peaks with chromPeaks(object) <- removes also the processing history (i.e. the information with which algorithm the peaks were detected). The "show" method for the XCMSnExp object had a bug that caused the error that you've seen. So nothing you did wrong, it was a bug in the function that ouputs the information on the object (I'll fix that in xcms).
The simple solution is to add the line
object.new@.processHistory <- object@.processHistory
to your function right before you return the result
I did also tweak your function a little bit. This version works also with objects after alignment:
peakShape_XCMS3v2 <- function(object, cor.val = 0.9, useNoise = 100) {
require(xcms)
## Ensure we use adjusted retention times!
xdata <- applyAdjustedRtime(object)
peakmat <- chromPeaks(xdata) #extract everything
res <- lapply(split.data.frame(peakmat, peakmat[, "sample"]), function(z) {
## Subset the data to the present file keeping only spectra that
## are in the peak range.
currentSample <- filterRt(filterFile(xdata, z[1, "sample"]),
rt = range(z[, c("rtmin", "rtmax")]))
## Loading the data into memory - could be faster for some settings
## can however also be removed
currentSample <- as(as(currentSample, "OnDiskMSnExp"), "MSnExp")
corr <- numeric(nrow(z))
for (i in seq_len(nrow(z))) {
mzRange <- z[i, c("mzmin", "mzmax")] + c(-0.001, 0.001)
rtRange <- z[i, c("rtmin", "rtmax")]
suppressWarnings(
ints <- intensity(filterMz(filterRt(currentSample, rtRange),
mzRange))
)
ints[lengths(ints) == 0] <- 0
ints <- as.integer(unlist(ints))
ints <- ints[!is.na(ints)]
ints <- ints[ints > useNoise]
## Only proceed if we have values left
if (length(ints)) {
## Normalize/baseline subtract
ints <- ints - min(ints)
if (max(ints) > 0)
ints <- ints / max(ints)
fit <- try(nls(y ~ SSgauss(x, mu, sigma, h),
data.frame(x = 1:length(ints), y = ints)),
silent = TRUE)
if (class(fit) == "try-error") {
corr[i] <- 1 # Shouldn't that be 0???
} else {
## calculate correlation of eics against gaussian fit
if (sum(!is.na(ints - fitted(fit))) > 4 &&
sum(!is.na(unique(ints))) > 4 &&
sum(!is.na(unique(fitted(fit)))) > 4) {
cor <- NULL
options(show.error.messages = FALSE)
cor <- try(cor.test(ints, fitted(fit),
method = "pearson",
use = "complete"))
options(show.error.messages = TRUE)
if (!is.null(cor) && cor$p.value <= 0.05)
corr[i] <- cor$estimate
}
}
}
} # End for loop
message("Peakshape evaluation: sample ",
basename(fileNames(currentSample)), ": ", sum(corr < cor.val),
"/", nrow(z), " peaks removed")
z[corr >= cor.val, , drop = FALSE]
})
## Need to "remember" the process history.
ph <- processHistory(object)
chromPeaks(object) <- do.call(rbind, res)
## Have to add also the process history
object@.processHistory <- ph
object
}
Excellent. I am glad this was such an easy fix.
Johannes - thanks for taking the time to go through and find the error, and for tweaking my code. I have learned a few more short cuts in looking at how you changed the function.
With this, I am not ready to move into XCMS > 3. I look forward to seeing future developments in the program.
I really appreciate all the help.
Cheers,
Krista