Network plot from expression data in R using igraph

[last update: January 27, 2020]

igraph allows you to generate a graph object and search for communities (clusters or modules) of related nodes / vertices. igraph is utilised in the R implementation of the popular Phenograph cluster and community detection algorithm (used in scRNA-seq and mass cytometry), and also in the popular scRNA-seq package Seurat.

The cluster detection algorithm in these packages is default to Louvain. To utilise this with this tutorial (Step 3), use cluster_louvain() in place of edge.betweenness.community()).

This tutorial will allow you to:

  1. create graph and spanning tree objects from a data-frame or -matrix of
    numerical data
  2. identify community structure (clusters or modules) in your graph and tree objects
  3. perform some basic downstream analyses on the tree objects

For simple testing, we’ll use the estrogen microarray data-set in R and follow the tutorial from R Gentleman / W Huber. This dataset was looking at genes that respond to estrogen stimulation.

NB – the method can be applied to any type of numerical data, be it microarray, RNA-seq, metabolomics, qPCR, etc.




NB – IF YOU ALREADY HAVE YOUR OWN DATA, PROCEED TO STEP 2

library(affy)
library(estrogen)
library(vsn)
library(genefilter)

datadir <- system.file("extdata", package="estrogen")
dir(datadir)
setwd(datadir)

# Read in phenotype data and the raw CEL files
pd <- read.AnnotatedDataFrame("estrogen.txt", header=TRUE, sep="", row.names=1)
a <- ReadAffy(filenames=rownames(pData(pd)), phenoData=pd, verbose=TRUE)

# Normalise the data
x <- expresso(
  a,
  bgcorrect.method="rma",
  normalize.method="constant",
  pmcorrect.method="pmonly",
  summary.method="avgdiff")

# Remove control probes
controlProbeIdx <- grep("^AFFX", rownames(x))
x <- x[-controlProbeIdx,]

# Identify genes of significant effect
lm.coef <- function(y) lm(y ~ estrogen * time.h)$coefficients
eff <- esApply(x, 1, lm.coef)
effectUp <- names(sort(eff[2,], decreasing=TRUE)[1:25])
effectDown <- names(sort(eff[2,], decreasing=FALSE)[1:25])
main.effects <- c(effectUp, effectDown)

# Filter the expression set object to include only genes of significant effect
estrogenMainEffects <- exprs(x)[main.effects,]

We now have the top 25 genes showing statistically significant effects in both directions and have filtered our expression object to include only these genes. The object that we’ll use for further analysis is estrogenMainEffects

head(estrogenMainEffects)
          low10-1.cel low10-2.cel high10-1.cel high10-2.cel low48-1.cel
36785_at    4257.2678   4127.1452     5783.745     4888.527   5936.3714
39755_at     699.1178    658.3222     2026.508     1869.705    468.5581
151_s_at    1517.0178   1378.5044     2892.287     2852.310   1372.7407
32174_at     441.2678    416.9485     1442.827     1253.596    257.8186
39781_at     536.2678    508.1147     1590.378     1340.251    472.4352
1985_s_at    750.1178    643.6604     1612.051     1406.858    306.4513



In this example, we now have our data-frame/-matrix stored as estrogenMainEffects and will construct a graph adjacency object from this. estrogenMainEffects can be any numerical data-frame/-matrix from you own experiment(s).

library(igraph)

# Create a graph adjacency based on correlation distances between genes in  pairwise fashion.
g <- graph.adjacency(
  as.matrix(as.dist(cor(t(estrogenMainEffects), method="pearson"))),
  mode="undirected",
  weighted=TRUE,
  diag=FALSE
)

NB – if you have trouble creating the correlation matrix due to its size, take a look at 2 (b), here: Parallel processing in R

NB – Euclidean distances can also be used instead of correlation, but this changes the tutorial slightly:

#g <- graph.adjacency(
#  as.matrix(dist(estrogenMainEffects, method="euclidean")),
#  mode="undirected",
#  weighted=TRUE,
#  diag=FALSE
#)

# Simplfy the adjacency object
g <- simplify(g, remove.multiple=TRUE, remove.loops=TRUE)

# Colour negative correlation edges as blue
E(g)[which(E(g)$weight<0)]$color <- "darkblue"

# Colour positive correlation edges as red
E(g)[which(E(g)$weight>0)]$color <- "darkred"

# Convert edge weights to absolute values
E(g)$weight <- abs(E(g)$weight)

# Change arrow size
# For directed graphs only
#E(g)$arrow.size <- 1.0

# Remove edges below absolute Pearson correlation 0.8
g <- delete_edges(g, E(g)[which(E(g)$weight<0.8)])

# Remove any vertices remaining that have no edges
g <- delete.vertices(g, degree(g)==0)

# Assign names to the graph vertices (optional)
V(g)$name <- V(g)$name

# Change shape of graph vertices
V(g)$shape <- "sphere"

# Change colour of graph vertices
V(g)$color <- "skyblue"

# Change colour of vertex frames
V(g)$vertex.frame.color <- "white"

# Scale the size of the vertices to be proportional to the level of expression of each gene represented by each vertex
# Multiply scaled vales by a factor of 10
scale01 <- function(x){(x-min(x))/(max(x)-min(x))}
vSizes <- (scale01(apply(estrogenMainEffects, 1, mean)) + 1.0) * 10

# Amplify or decrease the width of the edges
edgeweights <- E(g)$weight * 2.0

# Convert the graph adjacency object into a minimum spanning tree based on Prim's algorithm
mst <- mst(g, algorithm="prim")

# Plot the tree object
plot(
  mst,
  layout=layout.fruchterman.reingold,
  edge.curved=TRUE,
  vertex.size=vSizes,
  vertex.label.dist=-0.5,
  vertex.label.color="black",
  asp=FALSE,
  vertex.label.cex=0.6,
  edge.width=edgeweights,
  edge.arrow.mode=0,
  main="My first graph")

j

Note that layout can be modified to various other values, including:

  • layout.fruchterman.reingold
  • layout.kamada.kawai
  • layout.lgl
  • layout.circle
  • layout.graphopt

For further information, see Graph layouts




mst.communities <- edge.betweenness.community(mst, weights=NULL, directed=FALSE)
mst.clustering <- make_clusters(mst, membership=mst.communities$membership)
V(mst)$color <- mst.communities$membership + 1

par(mfrow=c(1,2))
plot(
  mst.clustering, mst,
  layout=layout.fruchterman.reingold,
  edge.curved=TRUE,
  vertex.size=vSizes,
  vertex.label.dist=-0.5,
  vertex.label.color="black",
  asp=FALSE,
  vertex.label.cex=0.6,
  edge.width=edgeweights,
  edge.arrow.mode=0,
  main="My first graph")

plot(
  mst,
  layout=layout.fruchterman.reingold,
  edge.curved=TRUE,
  vertex.size=vSizes,
  vertex.label.dist=-0.5,
  vertex.label.color="black",
  asp=FALSE,
  vertex.label.cex=0.6,
  edge.width=edgeweights,
  edge.arrow.mode=0,
  main="My first graph")

aaa

Other community identification methods, including fastgreedy.community, can be found at Detecting Community Structure.




# Check the vertex degree, i.e., number of connections to each vertex
degree(mst)

# Output information for each community, including vertex-to-community assignments and modularity
commSummary <- data.frame(
  mst.communities$names,
  mst.communities$membership,
  mst.communities$modularity)
colnames(commSummary) <- c("Gene", "Community", "Modularity")
options(scipen=999)
commSummary
         Gene Community   Modularity
1    39755_at         2 -0.027161612
2    151_s_at         3 -0.006790403
3    32174_at         4  0.014033499
4    39781_at         2  0.034631055
5   1985_s_at         5  0.055454957
....

# Compare community structures using the variance of information (vi) metric (not relevant here)
# Community structures that are identical will have a vi=0
compare(mst.communities, mst.communities, method="vi")
[1] 0

Other metrics that one can observe include (check the igraph documentation):

  • hub score
  • closeness centrality
  • betweenness centrality

In a case-control study, a useful approach is to generate separate networks for cases and controls and then compare the genes in these based on these metrics.

Read more here: Source link