## ----install, eval=FALSE------------------------------------------------------ # if (!requireNamespace('BiocManager', quietly=TRUE)) # install.packages('BiocManager') # # BiocManager::install('AWAggregator') # BiocManager::install('AWAggregatorData') ## ----load package------------------------------------------------------------- library(AWAggregator) library(AWAggregatorData) ## ----aggregate PSMs from FragPipe--------------------------------------------- # Load the pre-trained random forest model that does not include the average CV # as a feature, which indicates the average CV in percentage for processed PSM # reporter ion intensities across different replicate groups. It is recommended # to load the pre-trained model with average CV when replicates are available; # otherwise, use the model without the average CV data(sample.PSM.FP) regr <- loadQuantInaccuracyModel(useAvgCV=FALSE) # Load sample names (Sample 1 ~ Sample 9) samples <- colnames(sample.PSM.FP)[grep('Sample', colnames(sample.PSM.FP))] groups <- samples df <- getPSMAttributes( PSM=sample.PSM.FP, # TMT tag (229.1629) and carbamidomethylation (57.0214) are applied as # fixed post-translational modifications (PTMs) fixedPTMs=c('229.1629', '57.0214'), colOfReporterIonInt=samples, groups=groups, setProgressBar=TRUE ) aggregated_results <- aggregateByAttributes( PSM=df, colOfReporterIonInt=samples, ranger=regr, ratioCalc=FALSE ) ## ----aggregate PSMs from Proteome Discoverer---------------------------------- # Load the pre-trained random forest model that does not include the average CV # as a feature, which indicates the average CV in percentage for processed PSM # reporter ion intensities across different replicate groups. It is recommended # to load the pre-trained model with average CV when replicates are available; # otherwise, use the model without the average CV data(sample.PSM.PD) data(sample.prot.PD) regr <- loadQuantInaccuracyModel(useAvgCV=FALSE) # Load sample names (Sample 1 ~ Sample 9) samples <- colnames(sample.PSM.PD)[grep('Sample', colnames(sample.PSM.PD))] groups <- samples df <- convertPDFormat( PSM=sample.PSM.PD, protein=sample.prot.PD, colOfReporterIonInt=samples ) df <- getPSMAttributes( PSM=df, # TMT tag and carbamidomethylation are applied as static PTMs fixedPTMs=c('TMT6plex', 'Carbamidomethyl'), colOfReporterIonInt=samples, groups=groups, setProgressBar=TRUE ) aggregated_results <- aggregateByAttributes( PSM=df, colOfReporterIonInt=samples, ranger=regr, ratioCalc=FALSE ) ## ----load spike-in datasets--------------------------------------------------- library(ExperimentHub) eh <- ExperimentHub() benchmarkSet1 <- eh[['EH9637']] # Benchmark Set 1 benchmarkSet2 <- eh[['EH9638']] # Benchmark Set 2 benchmarkSet3 <- eh[['EH9639']] # Benchmark Set 3 ## ----calculate X and y for benchmark.set.1------------------------------------ library(stringr) # Load sample names (Sample 'H1+E1_1' ~ Sample 'H1+E6_3') samples <- colnames(benchmarkSet1)[ grep('H1[+]E[0-9]+_[1-4]', colnames(benchmarkSet1)) ] groups <- str_match(samples, 'H1[+]E[0-9]+')[, 1] PSM1 <- getPSMAttributes( PSM=benchmarkSet1, # TMT tag (229.1629) and carbamidomethylation (57.0214) are applied as # fixed PTMs fixedPTM=c('229.1629', '57.0214'), colOfReporterIonInt=samples, groups=groups ) PSM1 <- getAvgScaledErrorOfLog2FC( PSM=PSM1, colOfReporterIonInt=samples, groups=groups, # The actual protein fold change may be deviated from the intended values # after TMT labelling as the original work indicates when H1+Y6 is # involved, and therefore, H1+Y6 is not used in the calculation of Average # of Scaled Error of log2FC expectedRelativeAbundance=list(`H1+E1`=1, `H1+E2`=2, `H1+E6`=NA), speciesAtConstLevel='HUMAN' ) ## ----calculate X and y for benchmark.set.2------------------------------------ library(dplyr) # Load sample names (Sample 'H1+Y1_1' ~ Sample 'H1+Y10_3') samples <- colnames(benchmarkSet2)[ grep('H1[+]Y[0-9]+_[1-3]', colnames(benchmarkSet2)) ] groups <- str_match(samples, 'H1[+]Y[0-9]+')[, 1] # Process each replicate separately using lapply() # lapply() loops over all unique replicate IDs in benchmarkSet2. # 'X' is the current replicate ID. tmp <- lapply(unique(benchmarkSet2$Replicate), FUN=function(X){ # Select PSMs from the current replicate X df <- benchmarkSet2[benchmarkSet2$Replicate == X, ] df <- getPSMAttributes( PSM=df, fixedPTM=c('229.1629', '57.0214'), colOfReporterIonInt=samples, groups=groups, setProgressBar=FALSE ) df <- getAvgScaledErrorOfLog2FC( PSM=df, colOfReporterIonInt=samples, groups=groups, expectedRelativeAbundance=list(`H1+Y1`=1, `H1+Y4`=4, `H1+Y10`=10), speciesAtConstLevel='HUMAN' ) # Return the processed PSMs from the current replicate return(df) }) # Combine results from all replicates into one dataframe PSM2 <- bind_rows(tmp) ## ----calculate X and y for benchmark.set.3------------------------------------ # Load sample names (Sample 'H1+Y1_1' ~ Sample 'H1+Y10_2') samples <- colnames(benchmarkSet3)[ grep('H1[+]Y[0-9]+_[1-2]', colnames(benchmarkSet3)) ] groups <- str_match(samples, 'H1[+]Y[0-9]+')[, 1] PSM3 <- getPSMAttributes( PSM=benchmarkSet3, fixedPTM=c('304.2071', '125.0476'), colOfReporterIonInt=samples, groups=groups, # The signals for yeast PSMs in group H1+Y0 is completely from noise, so # they are not used for calculating Average CV groupsExcludedFromCV='H1+Y0' ) PSM3 <- getAvgScaledErrorOfLog2FC( PSM=PSM3, colOfReporterIonInt=samples, groups=groups, expectedRelativeAbundance=list( `H1+Y0`=0, `H1+Y1`=1, `H1+Y5`=5, `H1+Y10`=10 ), speciesAtConstLevel='HUMAN' ) ## ----merge spike-in datasets-------------------------------------------------- set.seed(1000) PSM <- mergeTrainingSets( PSMList=list( `Benchmark Set 1`=PSM1, `Benchmark Set 2`=PSM2, `Benchmark Set 3`=PSM3 ), numPSMs=min(nrow(PSM1), nrow(PSM2), nrow(PSM3)) ) ## ----train new model with average CV------------------------------------------ regr <- fitQuantInaccuracyModel(PSM, useAvgCV=TRUE, seed=3979) ## ----session info------------------------------------------------------------- sessionInfo()