Reading microarray data from the Gene Expression Omnibus

Hi Caitlin,

For this study, the uploaded data is normalised via GC-RMA, but is not log [base 2] (log2) transformed.

To retrieve it, you need to do:

library(GEOquery)
gset <- getGEO('GSE12657', GSEMatrix = TRUE, getGPL= FALSE)
if (length(gset) > 1) idx <- grep('GPL8300', attr(gset, 'names')) else idx <- 1
gset <- gset[[idx]]
gset <- log2(exprs(gset))

We can see some of the probe names:

head(rownames(gset))
[1] "1000_at"   "1001_at"   "1002_f_at" "1003_s_at" "1004_at"   "1005_at"

We can get the equivalent Ensembl and HGNC IDs for these:

require(hgu95av2.db)
annotLookup <- select(hgu95av2.db, keys = rownames(gset),
  columns = c('PROBEID', 'ENSEMBL', 'SYMBOL'))

# remove probes with any NA mapping, and duplicates
  annotLookup <- annotLookup[!is.na(annotLookup$ENSEMBL) & !is.na(annotLookup$SYMBOL),]
  annotLookup <- annotLookup[!duplicated(annotLookup$PROBEID),]

head(annotLookup)
    PROBEID         ENSEMBL  SYMBOL
1   1000_at ENSG00000102882   MAPK3
2   1001_at ENSG00000066056    TIE1
3 1002_f_at ENSG00000165841 CYP2C19
4 1003_s_at ENSG00000160683   CXCR5
5   1004_at ENSG00000160683   CXCR5
6   1005_at ENSG00000120129   DUSP1

rownames(gset) will not be perfectly aligned with annotLookup; so, watch out for that.

If you want a few lines to align these:

# look up the ensembl ID and gene symbol
  gset <- gset[which(rownames(gset) %in% annotLookup$PROBEID),]
  annotLookup <- annotLookup[which(annotLookup$PROBEID %in% rownames(gset)),]

  annotLookup <- annotLookup[match(rownames(gset), annotLookup$PROBEID),]

  all(rownames(gset) == annotLookup$PROBEID)

Kevin

Read more here: Source link