Horvath 2013 Analysis

This is a re-analysis of Horvath 2013

library(WGCNA)
Loading required package: dynamicTreeCut
Loading required package: fastcluster

Attaching package: 'fastcluster'
The following object is masked from 'package:stats':

    hclust

Attaching package: 'WGCNA'
The following object is masked from 'package:stats':

    cor
library(sqldf)
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
library(impute)

Age transformation

Define transformation function for age and its inverse

trafo <- function(x, adult.age = 20) {
  x <- (x + 1) / (1 + adult.age)
  y <- ifelse(x <= 1, log(x), x - 1)
  y
}
anti.trafo <- function(x, adult.age = 20) {
  ifelse(x < 0, (1 + adult.age) * exp(x) - 1, (1 + adult.age) * x + adult.age)
}
source("Horvath2013_scripts/AdditionalFile24_Normalization.R")
Loading required package: RPMM
Loading required package: cluster

Let’s look at the annotation of the 21k “probes”:

probeAnnotation21kdatMethUsed <- read.csv("Horvath2013_scripts/AdditionalFile22_ProbeAnnotation21kDatMethUsed.csv")
head(probeAnnotation21kdatMethUsed[1:5,1:5])
        Name Gene_ID GenomeBuild Chr   MapInfo
1 cg00000292     487          36  16  28797601
2 cg00002426    7871          36   3  57718583
3 cg00003994    4223          36   7  15692387
4 cg00005847    3232          36   2 176737319
5 cg00007981   24145          36  11  93502242

Annotation of all the 27k probes:

probeAnnotation27k <- read.csv("Horvath2013_scripts/AdditionalFile21_datMiniAnnotation27k.csv")
head(probeAnnotation27k[1:5,1:5])
        Name Gene_ID GenomeBuild Chr   Accession
1 cg00000292     487          36  16 NM_173201.2
2 cg00002426    7871          36   3 NM_007159.2
3 cg00003994    4223          36   7 NM_005924.3
4 cg00005847    3232          36   2 NM_006898.4
5 cg00006414   57541          36   7 NM_020781.2

Coefficients of the Horvath clock:

datClock <- read.csv("Horvath2013_scripts/AdditionalFile23_predictor.csv")
head(datClock[1:5, 1:5])
    CpGmarker CoefficientTraining CoefficientTrainingShrunk varByCpG minByCpG
1 (Intercept)         0.695507258                 0.8869198       NA       NA
2  cg00075967         0.129336610                 0.1080961  0.02600   0.0160
3  cg00374717         0.005017857                        NA  0.01200   0.0031
4  cg00864867         1.599764050                 1.8639686  0.00087   0.0000
5  cg00945507         0.056852418                        NA  0.01600   0.0000

Step 1: Read methylation data

dat0 <- read.csv.sql("Horvath2013_scripts/AdditionalFile26_MethylationDataExample55.csv")
nSamples <- dim(dat0)[[2]] - 1
nProbes <- dim(dat0)[[1]]
# the following command may not be needed. But it is sometimes useful when you use read.csv.sql
dat0[, 1] <- gsub(x = dat0[, 1], pattern = "\"", replacement = "")
dat0[1:5,1:5]
     ProbeID  GSM946048  GSM946049  GSM946052  GSM946054
1 cg00000292 0.70586143 0.72979037 0.70458701 0.75085162
2 cg00002426 0.27244344 0.27398577 0.31064866 0.27864884
3 cg00003994 0.03703247 0.01469238 0.01711572 0.02896054
4 cg00005847 0.13324682 0.12048361 0.12086005 0.10693940
5 cg00006414 0.03093906 0.01923675 0.02171565 0.01316270
# the following command may not be needed. But it is sometimes useful when
# you use read.csv.sql
dat0[, 1] <- gsub(x = dat0[, 1], pattern = "\"", replacement = "")

# Create a log file which will be output into your directory
# The code looks a bit complicated because it serves to create a log file (for error checks etc).
# It will automatically create a log file.

if (file.exists("LogFile.txt")) { 
  file.remove("LogFile.txt")
}
[1] TRUE
file.create("LogFile.txt")
[1] TRUE
DoNotProceed <- FALSE
cat(paste(
  "The methylation data set contains", nSamples,
  "samples (e.g. arrays) and ", nProbes, " probes."
), file = "LogFile.txt")

if (nSamples == 0) {
  DoNotProceed <- TRUE
  cat(paste("\n ERROR: There must be a data input error since there seem to be no samples.\n Make sure that you input a comma delimited file (.csv file)\n that can be read using the R command read.csv.sql . Samples correspond to columns in that file  ."), file = "LogFile.txt", append = TRUE)
}

if (nProbes == 0) {
  DoNotProceed <- TRUE
  cat(paste("\n ERROR: There must be a data input error since there seem to be zero probes.\n Make sure that you input a comma delimited file (.csv file)\n that can be read using the R command read.csv.sql  CpGs correspond to rows."), file = "LogFile.txt", append = TRUE)
}

if (nSamples > nProbes) {
  cat(paste("\n MAJOR WARNING: It worries me a lot that there are more samples than CpG probes.\n Make sure that probes correspond to rows and samples to columns.\n I wonder whether you want to first transpose the data and then resubmit them? In any event, I will proceed with the analysis."), file = "LogFile.txt", append = TRUE)
}

if (is.numeric(dat0[, 1])) {
  DoNotProceed <- TRUE
  cat(paste("\n Error: The first column does not seem to contain probe identifiers (cg numbers from Illumina) since these entries are numeric values. Make sure that the first column of the file contains probe identifiers such as cg00000292. Instead it contains ", dat0[1:3, 1]), file = "LogFile.txt", append = TRUE)
}

if (!is.character(dat0[, 1])) {
  cat(paste("\n Major Warning: The first column does not seem to contain probe identifiers (cg numbers from Illumina) since these entries are numeric values. Make sure that the first column of the file contains CpG probe identifiers such as cg00000292. Instead it contains ", dat0[1:3, 1]), file = "LogFile.txt", append = TRUE)
}

datout <- data.frame(
  Error = c("Input error. Please check the log file for details", "Please read the instructions carefully."),
  Comment = c("", "email Steve Horvath.")
)

Steps 2-4: Analysis

There are three main steps in the analysis:

  1. Restrict the data to 21k probes and ensure they are numeric
  2. Normalize the data
  3. Run the methylation clock
if (!DoNotProceed) {
  nonNumericColumn <- rep(FALSE, dim(dat0)[[2]] - 1)
  for (i in 2:dim(dat0)[[2]]) {
    nonNumericColumn[i - 1] <- !is.numeric(dat0[, i])
  }
  if (sum(nonNumericColumn) > 0) {
    cat(paste("\n MAJOR WARNING: Possible input error. The following samples contain non-numeric beta values: ", colnames(dat0)[-1][nonNumericColumn], "\n Hint: Maybe you use the wrong symbols for missing data. Make sure to code missing values as NA in the Excel file. To proceed, I will force the entries into numeric values but make sure this makes sense.\n"), file = "LogFile.txt", append = TRUE)
  }

  XchromosomalCpGs <- as.character(probeAnnotation27k$Name[probeAnnotation27k$Chr == "X"])
  selectXchromosome <- is.element(dat0[, 1], XchromosomalCpGs)
  selectXchromosome[is.na(selectXchromosome)] <- FALSE
  if (sum(selectXchromosome) >= 500) {
    meanXchromosome <- rep(NA, dim(dat0)[[2]] - 1)
    meanXchromosome <- as.numeric(apply(as.matrix(dat0[selectXchromosome, -1]), 2, mean, na.rm = TRUE))
  }

  if (sum(is.na(meanXchromosome)) > 0) {
    cat(paste("\n \n Comment: There are lots of missing values for X chromosomal probes for some of the samples. This is not a problem when it comes to estimating age but I cannot predict the gender of these samples.\n "), file = "LogFile.txt", append = TRUE)
  }

  match1 <- match(probeAnnotation21kdatMethUsed$Name, dat0[, 1])
  if (sum(is.na(match1)) > 0) {
    missingProbes <- probeAnnotation21kdatMethUsed$Name[!is.element(probeAnnotation21kdatMethUsed$Name, dat0[, 1])]
    DoNotProceed <- TRUE
    cat(paste("\n \n Input error: You forgot to include the following ", length(missingProbes), " CpG probes (or probe names):\n ", paste(missingProbes, sep = "", collapse = ", ")), file = "LogFile.txt", append = TRUE)
  }

  # STEP 2: Restrict the data to 21k probes and ensure they are numeric
  match1 <- match(probeAnnotation21kdatMethUsed$Name, dat0[, 1])

  if (sum(is.na(match1)) > 0) {
    stop(paste(sum(is.na(match1)), "CpG probes cannot be matched"))
  }

  dat1 <- dat0[match1, ]
  asnumeric1 <- function(x) {
    as.numeric(as.character(x))
  }

  dat1[, -1] <- apply(as.matrix(dat1[, -1]), 2, asnumeric1)

  # STEP 3: Create the output file called datout
  set.seed(1)

  # Do you want to normalize the data (recommended)?
  normalizeData <- TRUE
  source("Horvath2013_scripts/AdditionalFile25_StepWiseAnalysis.R")

  # STEP 4: Output the results
  if (sum(datout$Comment != "") == 0) {
    cat(paste("\n The individual samples appear to be fine. "), file = "LogFile.txt", append = TRUE)
  }

  if (sum(datout$Comment != "") > 0) {
    cat(paste("\n Warnings were generated for the following samples.\n", datout[, 1][datout$Comment != ""], "\n Hint: Check the output file for more details."), file = "LogFile.txt", append = TRUE)
  }
}
[1] "Fitting EM beta mixture to goldstandard probes"
[1] Inf
[1] 0.008562084
[1] 0.007243956
[1] 0.009607635
[1] 0.01066752
[1] "Done"
ii= 1
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.01337954
[1] 0.01360036
[1] 0.01287579
[1] 0.01171657
[1] "Done"
[1] "Start normalising input probes"
ii= 2
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.01289601
[1] 0.01350655
[1] 0.01281407
[1] 0.01162541
[1] "Done"
[1] "Start normalising input probes"
ii= 3
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.00977992
[1] 0.0112908
[1] 0.0110434
[1] 0.01014714
[1] "Done"
[1] "Start normalising input probes"
ii= 4
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.009314652
[1] 0.01058022
[1] 0.01028964
[1] 0.00937324
[1] "Done"
[1] "Start normalising input probes"
ii= 5
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.0105508
[1] 0.01165802
[1] 0.01119981
[1] 0.01017059
[1] "Done"
[1] "Start normalising input probes"
ii= 6
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.01043823
[1] 0.01195218
[1] 0.01168271
[1] 0.01072515
[1] "Done"
[1] "Start normalising input probes"
ii= 7
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.009234892
[1] 0.01118355
[1] 0.01112528
[1] 0.01029523
[1] "Done"
[1] "Start normalising input probes"
ii= 8
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.009543425
[1] 0.01113342
[1] 0.01102735
[1] 0.01016235
[1] "Done"
[1] "Start normalising input probes"
ii= 9
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.01095459
[1] 0.01203559
[1] 0.01150238
[1] 0.01040007
[1] "Done"
[1] "Start normalising input probes"
ii= 10
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.009200094
[1] 0.01105969
[1] 0.01095622
[1] 0.01013558
[1] "Done"
[1] "Start normalising input probes"
ii= 11
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.01223449
[1] 0.01272071
[1] 0.01201
[1] 0.01085055
[1] "Done"
[1] "Start normalising input probes"
ii= 12
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.008109912
[1] 0.01015761
[1] 0.01022228
[1] 0.009495642
[1] "Done"
[1] "Start normalising input probes"
ii= 13
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.01129534
[1] 0.01199846
[1] 0.01151383
[1] 0.01054727
[1] "Done"
[1] "Start normalising input probes"
ii= 14
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.01079093
[1] 0.01194793
[1] 0.01163652
[1] 0.01069982
[1] "Done"
[1] "Start normalising input probes"
ii= 15
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.009180497
[1] 0.01071575
[1] 0.01047845
[1] 0.009613194
[1] "Done"
[1] "Start normalising input probes"
ii= 16
[1] "Fitting EM beta mixture to input probes"
[1] Inf
[1] 0.01141164
[1] 0.01223564
[1] 0.01175005
[1] 0.01072489
[1] "Done"
[1] "Start normalising input probes"
# output the results into the directory
write.table(datout, "Horvath2013_scripts/Example55_Output.csv", row.names = F, sep = ",")

Compare samples

sample_anotation <- read.csv("Horvath2013_scripts/AdditionalFile27_SampleAnnotation.csv")

rownames(sample_anotation) <- sample_anotation$id
sample_anotation
          OriginalOrder        id               title geo_accession
GSM946048             3 GSM946048  Autism_occ_AN09714     GSM946048
GSM946049             4 GSM946049 Control_occ_AN05475     GSM946049
GSM946052             7 GSM946052  Autism_occ_AN08166     GSM946052
GSM946054             9 GSM946054  Autism_occ_AN06420     GSM946054
GSM946055            10 GSM946055  Autism_occ_AN19511     GSM946055
GSM946056            11 GSM946056  Autism_occ_AN09730     GSM946056
GSM946059            14 GSM946059 Control_occ_UMB4670     GSM946059
GSM946062            17 GSM946062 Control_occ_UMB4543     GSM946062
GSM946064            19 GSM946064  Autism_occ_AN08873     GSM946064
GSM946065            20 GSM946065  Autism_occ_AN03345     GSM946065
GSM946066            21 GSM946066  Autism_occ_AN11989     GSM946066
GSM946067            22 GSM946067 Control_occ_BTB1453     GSM946067
GSM946073            28 GSM946073 Control_occ_AN10723     GSM946073
GSM946074            29 GSM946074 Control_occ_AN10833     GSM946074
GSM946075            30 GSM946075 Control_occ_UMB1860     GSM946075
GSM946076            31 GSM946076 Control_occ_AN15622     GSM946076
                     TissueDetailed          Tissue diseaseStatus Age
GSM946048 Fresh frozen brain tissue occipitalcortex             1  60
GSM946049 Fresh frozen brain tissue occipitalcortex             0  39
GSM946052 Fresh frozen brain tissue occipitalcortex             1  28
GSM946054 Fresh frozen brain tissue occipitalcortex             1  39
GSM946055 Fresh frozen brain tissue occipitalcortex             1   8
GSM946056 Fresh frozen brain tissue occipitalcortex             1  22
GSM946059 Fresh frozen brain tissue occipitalcortex             0   4
GSM946062 Fresh frozen brain tissue occipitalcortex             0  28
GSM946064 Fresh frozen brain tissue occipitalcortex             1   5
GSM946065 Fresh frozen brain tissue occipitalcortex             1   2
GSM946066 Fresh frozen brain tissue occipitalcortex             1  30
GSM946067 Fresh frozen brain tissue occipitalcortex             0   1
GSM946073 Fresh frozen brain tissue occipitalcortex             0  60
GSM946074 Fresh frozen brain tissue occipitalcortex             0  22
GSM946075 Fresh frozen brain tissue occipitalcortex             0   8
GSM946076 Fresh frozen brain tissue occipitalcortex             0  30
          PostMortemInterval CauseofDeath individual Female Caucasian
GSM946048               26.5       cancer         18      0        NA
GSM946049                 NA      cardiac          2      0        NA
GSM946052               43.0       cancer          3      0        NA
GSM946054               14.0      cardiac          6      0        NA
GSM946055               22.2       cancer          4      0        NA
GSM946056               25.0      hypoxia          8      0        NA
GSM946059               17.0      cardiac         28      0        NA
GSM946062               13.0        other         35      0        NA
GSM946064               25.5      hypoxia         21      0        NA
GSM946065                4.0      hypoxia         14      0        NA
GSM946066               16.0      cardiac         17      0        NA
GSM946067               19.0      unknown         30      0        NA
GSM946073               24.2      unknown         13      0        NA
GSM946074               21.5      unknown         24      0        NA
GSM946075                5.0      cardiac         33      0        NA
GSM946076               15.0      hypoxia         11      0        NA
          FemaleOriginal
GSM946048             NA
GSM946049             NA
GSM946052             NA
GSM946054             NA
GSM946055             NA
GSM946056             NA
GSM946059             NA
GSM946062             NA
GSM946064             NA
GSM946065             NA
GSM946066             NA
GSM946067             NA
GSM946073             NA
GSM946074             NA
GSM946075             NA
GSM946076             NA
combined.data <- cbind(datout[rownames(sample_anotation),], sample_anotation)
library(ggplot2)
ggplot(combined.data, aes(x = Age, y = DNAmAge)) + 
    geom_point() + 
    geom_abline() + 
    theme_minimal()