27k platform - how to identify probes that overlap with known SNPs and repetitive regions

According to these publications here and here when analyzing Illumina Human Methylation 27k arrays probes should be removed from consideration if they:

Contain SNPs within the 5 base pairs from the interrogated CpG site
Overlap with a repetitive element
Not uniquely mapped to the human genome (which is because they overlap with the repetitive region?)
Overlap with the known regions of insertions and deletions
Are on X,Y chromosomes
Have P values => 0.05

While 5 and 6 are very easy to do 1-4 are not very easy without any experience. I found that following code snippets from the vignette for R/Bioconductor package IlluminaHumanMethylation450kprobe makes it very easy. Also, Tim Triche is a very helpful person (author of the package)! Since some of the code is a little outdated I am posting here an updated version of the key functions.

Note: All analyses were done on my work computer.

Identification of SNPs

Packages needed:

library(IlluminaHumanMethylation27k.db)
library(parallel)
library(GenomicRanges)
library(Biostrings)
library(SNPlocs.Hsapiens.dbSNP.20110815)

Get the genomic locations of each interrogated CpG probe. Add 5 bp around the position. I will look for SNPs within this region since it will affect probe hybridization.

chr <- toTable(IlluminaHumanMethylation27kCHR)
cpg <- toTable(IlluminaHumanMethylation27kCPGCOORDINATE)
start <- cpg$CpG_Coordinate - 5
end <- cpg$CpG_Coordinate + 5

I put strand as "+" because CpGs are symmetrical.

cpg <-cbind(cpg,strand="+")
head(cpg)
Probe_ID CpG_Coordinate     start       end strand
1 cg24054653      119647857 119647852 119647862      +
  2 cg24621042       93927028  93927023  93927033      +
  3 cg20969242      154093668 154093663 154093673      +
  4 cg12832649      136862762 136862757 136862767      +
  5 cg15789095       32054053  32054048  32054058      +
  6 cg25748127         833228    833223    833233      +

Add the chromosome number to each position:

meth27k<-merge(chr,cpg,by.x=1,by.y=1)
dim(meth27k)
[1] 25809     6
head(meth27k)
probe_id chromosome CpG_Coordinate     start       end strand
1 cg00000292         16       28797601  28797596  28797606      +
  2 cg00002426          3       57718583  57718578  57718588      +
  3 cg00003994          7       15692387  15692382  15692392      +
  4 cg00005847          2      176737319 176737314 176737324      +
  5 cg00006414          7      148453770 148453765 148453775      +
  6 cg00007981         11       93502242  93502237  93502247      +
  > class(meth27k$chromosome)
[1] "character"
> meth27k$chromosome<-as.factor(meth27k$chromosome)
> class(meth27k$chromosome)
[1] "factor"

In the vignette I found that the chromosome number should be a factor.

> chs = levels(meth27k$chromosome)
> chs
[1] "1"  "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "2"  "20" "21" "22" "3"  "4"  "5" 
[19] "6"  "7"  "8"  "9"  "X"  "Y" 
> names(chs) = paste('chr',chs,sep="")
> chs
chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19  chr2 chr20 chr21 chr22 
"1"  "10"  "11"  "12"  "13"  "14"  "15"  "16"  "17"  "18"  "19"   "2"  "20"  "21"  "22" 
chr3  chr4  chr5  chr6  chr7  chr8  chr9  chrX  chrY 
"3"   "4"   "5"   "6"   "7"   "8"   "9"   "X"   "Y"

Create GRanges object (Genomic Ranges library):

CpGs.27k = with(meth27k,GRanges(paste('chr',chromosome,sep=""),IRanges(start=start,width=10,names=probe_id),strand=strand))

Here is the parallelized version of identifying the SNPs using library parallel. It returns CpG ID and associated SNP ID.

CpG.snps.by.chr = mclapply(chs, function(ch) { 
  snps = getSNPlocs(paste('ch', ch, sep=''), as.GRanges=TRUE)
  seqlevels(snps) <- gsub('ch','chr',seqlevels(snps))
  names(snps) = paste('rs',elementMetadata(snps)$RefSNP_id,sep='')
  message(paste('Scanning for CpG SNPs in probes on chromosome', ch))
  overlapping = findOverlaps(CpGs.27k, snps)
  overlap = as.matrix(overlapping) #Originally I used matchMatrix but this function seems to be deprecated
  results = data.frame(Probe_ID=as.character(names(CpGs.27k)[overlap[,1]]), rsID=as.character(names(snps)[overlap[,2]]))
  return(results)
})
 
source("getSNPCpGprobeOverlap.R")

Combine results into a nice looking data frame. Total number of CpGs identified is 2894, some of them are repetitive = 239. So the total to be removed because of the SNPs is ~2500.

> SNPs = do.call(rbind,CpG.snps.by.chr)
> SNPs$rsID = levels(SNPs$rsID)[SNPs$rsID]
> SNPs$Probe_ID = levels(SNPs$Probe_ID)[SNPs$Probe_ID]
> head(SNPs)
Probe_ID        rsID
chr1.1 cg00128766 rs114678855
chr1.2 cg00262415  rs56153290
chr1.3 cg00265415   rs4649238
chr1.4 cg00282347 rs148818341
chr1.5 cg00692549  rs12029869
chr1.6 cg00750606 rs115904164
> dim(SNPs)
[1] 2894    2
> length(which(duplicated(SNPs$Probe_ID)))
[1] 239

Identification of the repetitive regions

Install the library (hg18 annotation, I chose this one because everything for 27k is hg18, including the library IlluminaHumanMethylation27k.db):

source("http://bioconductor.org/biocLite.R")
biocLite("BSgenome.Hsapiens.UCSC.hg18")
library(BSgenome.Hsapiens.UCSC.hg18)

Run the following function. RM = repeat masker track, TRF = tandem repeats finder

CpG.rpts.by.chr = mclapply(chs, function(ch){ 
  chr = Hsapiens[[paste('chr',ch,sep='')]]
  rpts = union(masks(chr)$RM, masks(chr)$TRF)
  probes = which(seqnames(CpGs.27k)==paste('chr',ch,sep=''))
  # note how we have to use RangedData instead!!
  CpGs.chr = ranges(CpGs.27k[probes])
  #message(paste('Scanning for repeats in CpG sites on chromosome', ch))
  overlapping = findOverlaps(CpGs.chr, rpts)
  overlap = as.matrix(overlapping)
  results = data.frame(Probe_ID=as.character(names(CpGs.chr)[overlap[,1]]))
  return(results)
})
source("getRepetitiveREgionCpGProbeOverlap.R")

Note: I am looking only within 5 bp of the CpG site, not the entire probe. It might be wrong. The function returns the list of CpGs that were mapped to the repetitive regions on each chromosome. Combine everything into a nice data frame. Total number of CpGs identified: 2716. Seems that there is one duplicate:

RPT<-do.call(rbind,CpG.rpts.by.chr)
> head(RPT)
Probe_ID
chr1.1 cg00128766
chr1.2 cg00208830
chr1.3 cg00410895
chr1.4 cg00613344
chr1.5 cg00689010
chr1.6 cg00712898
> length(which(duplicated(RPT[,1])))
[1] 1 
> dim(RPT)
[1] 2716    1

Total number of probes that contain either SNPs or repetitive regions or both:

x<-union(RPT$Probe_ID,SNPs$Probe_ID)
> length(x)
[1] 5107

When I was reanalyzing Hinoue, 2012 dataset I found that they masked total of 5060 probes but this includes all categories of bad probes listed above.