Hi Caitlin,
For this study, the uploaded data is normalised via GC-RMA, but is not log [base 2] (log2) transformed.
To retrieve it, you need to do:
library(GEOquery)
gset <- getGEO('GSE12657', GSEMatrix = TRUE, getGPL= FALSE)
if (length(gset) > 1) idx <- grep('GPL8300', attr(gset, 'names')) else idx <- 1
gset <- gset[[idx]]
gset <- log2(exprs(gset))
We can see some of the probe names:
head(rownames(gset))
[1] "1000_at" "1001_at" "1002_f_at" "1003_s_at" "1004_at" "1005_at"
We can get the equivalent Ensembl and HGNC IDs for these:
require(hgu95av2.db)
annotLookup <- select(hgu95av2.db, keys = rownames(gset),
columns = c('PROBEID', 'ENSEMBL', 'SYMBOL'))
# remove probes with any NA mapping, and duplicates
annotLookup <- annotLookup[!is.na(annotLookup$ENSEMBL) & !is.na(annotLookup$SYMBOL),]
annotLookup <- annotLookup[!duplicated(annotLookup$PROBEID),]
head(annotLookup)
PROBEID ENSEMBL SYMBOL
1 1000_at ENSG00000102882 MAPK3
2 1001_at ENSG00000066056 TIE1
3 1002_f_at ENSG00000165841 CYP2C19
4 1003_s_at ENSG00000160683 CXCR5
5 1004_at ENSG00000160683 CXCR5
6 1005_at ENSG00000120129 DUSP1
rownames(gset)
will not be perfectly aligned with annotLookup
; so, watch out for that.
If you want a few lines to align these:
# look up the ensembl ID and gene symbol
gset <- gset[which(rownames(gset) %in% annotLookup$PROBEID),]
annotLookup <- annotLookup[which(annotLookup$PROBEID %in% rownames(gset)),]
annotLookup <- annotLookup[match(rownames(gset), annotLookup$PROBEID),]
all(rownames(gset) == annotLookup$PROBEID)
Kevin
Read more here: Source link