Vince Carey
June 14, 2017
Road map:
Mystery functions?
ts_zpm = function(d) sqrt(length(d))*mean(d)/sd(d)contam1 = function(x, slip=5) {x[1] = x[1]+slip; x}set.seed(12345)
X = rnorm(50)
ts_zpm(X)## [1] 1.157889
ts_zpm(contam1(X))## [1] 1.479476
ts_zpm(contam1(X,100))## [1] 1.082075
set.seed(12345)
X = rnorm(50)
tst = t.test(X)
tst## 
##  One Sample t-test
## 
## data:  X
## t = 1.1579, df = 49, p-value = 0.2525
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.1320800  0.4912126
## sample estimates:
## mean of x 
## 0.1795663
str(tst)## List of 9
##  $ statistic  : Named num 1.16
##   ..- attr(*, "names")= chr "t"
##  $ parameter  : Named num 49
##   ..- attr(*, "names")= chr "df"
##  $ p.value    : num 0.253
##  $ conf.int   : atomic [1:2] -0.132 0.491
##   ..- attr(*, "conf.level")= num 0.95
##  $ estimate   : Named num 0.18
##   ..- attr(*, "names")= chr "mean of x"
##  $ null.value : Named num 0
##   ..- attr(*, "names")= chr "mean"
##  $ alternative: chr "two.sided"
##  $ method     : chr "One Sample t-test"
##  $ data.name  : chr "X"
##  - attr(*, "class")= chr "htest"
tst$statistic  # from R##        t 
## 1.157889
sqrt(50)*mean(X)/sd(X) # by hand## [1] 1.157889
set.seed(12345)
simdist = replicate(10000,  ts_zpm( rnorm(50) ))
head(simdist)## [1]  1.15788926  1.92818028 -0.08173422  0.82143100  0.39880144 -1.21706025
hist(simdist, freq=FALSE)
lines(seq(-3,3,.01), dt(seq(-3,3,.01), 49))simulated:
mean(simdist > 1.1579)## [1] 0.123
theoretical (exact):
integrate(function(x) dt(x,49), 1.1579, Inf)$value## [1] 0.1262589
contsim = replicate(10000, 
    ts_zpm( contam1( rnorm(50) ) ) )
critval_1sided = qt(.95, 49)
mean(simdist > critval_1sided) # uncontaminated## [1] 0.0483
mean(contsim > critval_1sided) # contaminated## [1] 0.0946
library(parody)## Loading required package: tools
robust_t = function(x) {
 outchk = calout.detect(x, method="GESD")
 if (!is.na(outchk$ind[1])) x = x[-outchk$ind]
 sqrt(length(x))*mean(x)/sd(x)
}
set.seed(12345)
contsim_r = replicate(10000, 
    robust_t( contam1( rnorm(50) ) ) )
mean(contsim_r > critval_1sided) # robust test on contaminated## [1] 0.0531
power.t.test(n=50, type="one.sample", 
   alt="one.sided", delta=.4)## 
##      One-sample t test power calculation 
## 
##               n = 50
##           delta = 0.4
##              sd = 1
##       sig.level = 0.05
##           power = 0.8737242
##     alternative = one.sided
# empirical
mean( replicate(10000, 
    ts_zpm(rnorm(50, .4))>qt(.95, 49)) )## [1] 0.8764
mean( replicate(10000, ts_zpm(
     contam1(rnorm(50, .4), slip=25))>qt(.95, 49)) )## [1] 0.5834
Exercises:
We’ll use a series of outlier magnitudes (0:10) and summarize distributions of estimators
set.seed(12345)
mns <- sapply(0:10, function(o) 
  median(replicate(5000, mean(contam1(rnorm(50),o))))) 
set.seed(12345)
meds <- sapply(0:10, function(o) 
  median(replicate(5000, median(contam1(rnorm(50),o))))) plot(0:10, mns, xlab="outlier magnitude", ylab="median of stat",
  pch=19)
points(0:10, meds, pch=19, col="blue")
legend(0, .1, pch=19, col=c("black", "blue"), legend=c("mean", "median"))set.seed(12345)
sds <- sapply(0:10, function(o) 
  median(replicate(5000, sd(contam1(rnorm(50),o))))) 
set.seed(12345)
mads <- sapply(0:10, function(o) 
  median(replicate(5000, mad(contam1(rnorm(50),o))))) plot(0:10, sds, xlab="outlier magnitude", ylab="median of stat",
  pch=19)
points(0:10, mads, pch=19, col="blue")
legend(0, 1.2, pch=19, col=c("black", "blue"), legend=c("SD", "MAD"))contam1() to help demonstrate the breakdown conceptqsignrank(.95, 50)## [1] 808
set.seed(12345)
wilcox.test(rnorm(50, .4))$statistic##   V 
## 977
Show that the (x,y) pairs have identical
- marginal means
- marginal SDs
- correlation coefficients
- linear regressions of y on x
Use MASS::rlm to get a model for y3, x3 that fits the majority of points exactly
suppressMessages({set.seed(12345)
library(DESeq2)
S1 = makeExampleDESeqDataSet(betaSD=.75)
D1 = DESeq(S1)
R1 = results(D1)})summary(R1)## 
## out of 997 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)     : 99, 9.9% 
## LFC < 0 (down)   : 95, 9.5% 
## outliers [1]     : 2, 0.2% 
## low counts [2]   : 211, 21% 
## (mean count < 6)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results