Network analysis of liver expression data in female mice¶

Tutorial for Module 6 DUBII 2019

Costas Bouyioukos Universite Paris Diderot and Anais Baudot CNRS¶

1. Preliminaries and data input¶

# Code chunk 1
# Session setup: report the working directory, attach WGCNA and read the
# female liver expression table.
getwd()
# Directory holding the data files; "." keeps the current directory.
# Edit this path if the CSV files live elsewhere.
workingDir <- "."
setwd(workingDir)
# Attach WGCNA, which provides the network-analysis functions used below.
library(WGCNA)
# Keep character columns as character when data frames are created
# (do not omit this setting).
options(stringsAsFactors = FALSE)
# Female liver expression data set.
femData <- read.csv("LiverFemale3600.csv")
# Quick inspection: dimensions, column names and first rows.
dim(femData)
names(femData)
head(femData)

Keep only the columns that contain gene expression values, transpose them so that rows are samples and columns are probes, and use the probe identifiers as column names.¶

# Code chunk 2
# Drop the first 8 (non-expression) columns of femData and transpose, so that
# each row of datExpr0 is a sample and each column is a probe.
datExpr0 <- as.data.frame(t(femData[, -(1:8)]))
# Columns are labelled with the probe identifiers ...
names(datExpr0) <- femData$substanceBXH
# ... and rows with the sample names taken from the original column headers.
rownames(datExpr0) <- names(femData)[-(1:8)]

Check if there are genes with missing values.¶

# Code chunk 3
# Flag genes and samples with too many missing values.
gsg <- goodSamplesGenes(datExpr0, verbose = 3)
# TRUE when every gene and sample passed the check.
gsg$allOK

 Flagging genes and samples with too many missing values...
  ..step 1

All genes are OK.¶

Cluster the transposed matrix to identify sample outliers.¶

# Code chunk 4
# Average-linkage hierarchical clustering of the samples (rows of datExpr0),
# used to spot outlying samples.
sampleTree <- hclust(dist(datExpr0), method = "average")
# Shrink the text and the margins of the current graphics device so that the
# sample labels fit; adjust these values if the plot looks cramped.
par(cex = 0.5)
par(mar = c(0, 2, 1, 0))
plot(
  sampleTree,
  main = "Sample clustering to detect outliers",
  sub = "",
  xlab = "",
  cex.lab = 1.5,
  cex.axis = 1.5,
  cex.main = 2
)
# Red horizontal line at the height used below to cut off the outlier.
abline(h = 15, col = "red")

Identify the outlier.¶

# Code chunk 5
# Cut the sample tree at height 15 (the red line in the previous plot),
# requesting clusters of at least 10 samples.
clust <- cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)
# Number of samples per cluster label.
table(clust)

clust
  0   1 
  1 134

Remove the outlier and construct the main data frame.¶

# Code chunk 6
# Cluster label 1 holds the samples to keep; everything else is dropped.
keepSamples <- clust == 1
datExpr <- datExpr0[keepSamples, ]
# Dimensions of the cleaned expression data: columns are genes, rows are samples.
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)

Introduce the clinical data, then prepare and clean it.¶

# Code chunk 7
# Clinical trait table.
traitData <- read.csv("ClinicalTraits.csv")
dim(traitData)
names(traitData)
# Discard columns that are not needed: first drop columns 31 and 16, then keep
# column 2 and columns 11 to 36 of what remains.
allTraits <- traitData[, -c(31, 16)][, c(2, 11:36)]
dim(allTraits)
names(allTraits)
# Build a trait data frame whose rows line up with the rows of datExpr:
# look up each expression sample in the Mice column of the trait table.
femaleSamples <- rownames(datExpr)
traitRows <- match(femaleSamples, allTraits$Mice)
# First column (the sample identifier) becomes the row names, the rest the traits.
datTraits <- allTraits[traitRows, -1]
rownames(datTraits) <- allTraits[traitRows, 1]

Repeat the sample clustering together with a heat map of the phenotypic data.¶

# Code chunk 8
# Cluster the retained samples again, now without the outlier.
sampleTree2 <- hclust(dist(datExpr), method = "average")
# Map trait values to colours: white = low, red = high, grey = missing entry.
traitColors <- numbers2colors(datTraits, signed = FALSE)
# Sample dendrogram with one colour row per trait drawn underneath.
plotDendroAndColors(
  sampleTree2,
  traitColors,
  groupLabels = names(datTraits),
  main = "Sample dendrogram and trait heatmap"
)

Save the analysis to an RData file.¶

# Code chunk 9
# Persist the cleaned expression and trait data for the following sections.
save(
  datExpr,
  datTraits,
  file = "FemaleLiver-01-dataInput.RData"
)

2. Automatic network construction and module detection¶

# Code chunk 10
# Enable multi-threading inside WGCNA to speed up some calculations.
# An error here may be ignored, but consider updating WGCNA if you see one.
allowWGCNAThreads()
# Restore the objects saved in the first part; load() returns their names.
lnames <- load(file = "FemaleLiver-01-dataInput.RData")
# Show which variables were loaded.
lnames

Allowing multi-threading with up to 8 threads.

This is the most convenient and automatic way to detect modules and construct a network with WGCNA.¶

Here the developers of WGCNA propose a "soft thresholding" approach. This method identifies a power (to which the correlation matrix is raised in order to calculate the network adjacency matrix) based on the criterion of approximate scale-free topology.

# Code chunk 11
# Candidate soft-thresholding powers: 1 to 10, then the even values 12 to 20.
powers <- c(1:10, seq(from = 12, to = 20, by = 2))
# Network topology analysis for every candidate power.
sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Two panels side by side: scale-free fit (left), mean connectivity (right).
par(mfrow = c(1, 2))
# Character expansion for the power labels drawn in both panels.
# It must be defined before the text() calls below, which use it.
cex1 <- 0.9
# Columns of sft$fitIndices used below: 1 = Power, 2 = SFT.R.sq, 3 = slope,
# 5 = mean.k.
# Left panel: signed R^2 of the scale-free topology fit versus power.
plot(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit,signed R^2",
     type = "n",
     main = paste("Scale independence"))
text(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers, cex = cex1, col = "red")
# Reference line at the R^2 cut-off of 0.90.
abline(h = 0.90, col = "red")
# Right panel: mean connectivity versus power.
plot(sft$fitIndices[, 1], sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)",
     ylab = "Mean Connectivity",
     type = "n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[, 1], sft$fitIndices[, 5],
     labels = powers, cex = cex1, col = "red")

pickSoftThreshold: will use block size 3600.
 pickSoftThreshold: calculating connectivity for given powers...
   ..working on genes 1 through 3600 of 3600
   Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
1      1   0.0278  0.345          0.456  747.00  762.0000 1210.0
2      2   0.1260 -0.597          0.843  254.00  251.0000  574.0
3      3   0.3400 -1.030          0.972  111.00  102.0000  324.0
4      4   0.5060 -1.420          0.973   56.50   47.2000  202.0
5      5   0.6810 -1.720          0.940   32.20   25.1000  134.0
6      6   0.9020 -1.500          0.962   19.90   14.5000   94.8
7      7   0.9210 -1.670          0.917   13.20    8.6800   84.1
8      8   0.9040 -1.720          0.876    9.25    5.3900   76.3
9      9   0.8590 -1.700          0.836    6.80    3.5600   70.5
10    10   0.8330 -1.660          0.831    5.19    2.3800   65.8
11    12   0.8530 -1.480          0.911    3.33    1.1500   58.1
12    14   0.8760 -1.380          0.949    2.35    0.5740   51.9
13    16   0.9070 -1.300          0.970    1.77    0.3090   46.8
14    18   0.9120 -1.240          0.973    1.39    0.1670   42.5
15    20   0.9310 -1.210          0.977    1.14    0.0951   38.7

This is the actual network construction step.¶

We choose 6 as the lowest power for which the network approximates a scale-free topology (R^2 above 0.90). We then instruct the function to detect modules with a minimum size of 30 genes, to merge modules whose eigengene distance is less than 0.25 (i.e. whose eigengenes are correlated above 0.75), and to save the Topological Overlap Matrix to a file.

# Code chunk 12
# One-step network construction and module detection with soft-thresholding
# power 6. Modules are labelled by numbers (numericLabels = TRUE) and the
# topological overlap matrix is written to files starting with "femaleMouseTOM".
net <- blockwiseModules(
  datExpr,
  power = 6,
  TOMType = "unsigned",
  minModuleSize = 30,
  reassignThreshold = 0,
  mergeCutHeight = 0.25,
  numericLabels = TRUE,
  pamRespectsDendro = FALSE,
  saveTOMs = TRUE,
  saveTOMFileBase = "femaleMouseTOM",
  verbose = 3
)

 Calculating module eigengenes block-wise from all genes
   Flagging genes and samples with too many missing values...
    ..step 1
Cluster size 3600 broken into 2133 1467 
Cluster size 2133 broken into 1221 912 
Done cluster 1221 
Done cluster 912 
Done cluster 2133 
Done cluster 1467 
 ..Working on block 1 .
    TOM calculation: adjacency..
    ..will use 8 parallel threads.
     Fraction of slow calculations: 0.362314
    ..connectivity..
    ..matrix multiplication (system BLAS)..
    ..normalization..
    ..done.
   ..saving TOM for block 1 into file femaleMouseTOM-block.1.RData
 ....clustering..
 ....detecting modules..
 ....calculating module eigengenes..
 ....checking kME in modules..
     ..removing 1 genes from module 1 because their KME is too low.
     ..removing 1 genes from module 7 because their KME is too low.
     ..removing 1 genes from module 8 because their KME is too low.
     ..removing 1 genes from module 21 because their KME is too low.
 ..merging modules that are too close..
     mergeCloseModules: Merging modules whose distance is less than 0.25
       Calculating new MEs...

Here are the detected modules (still labelled by numbers rather than colours), each with its size.¶

# Count the genes carrying each numeric module label.
table(net[["colors"]])

  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
 99 609 460 409 316 312 221 211 157 123 106 100  94  91  77  76  58  47  34

Here is the resulting dendrogram of the gene clustering, with the module assignment shown underneath.¶

# Code chunk 13
# Translate the numeric module labels into colour names for plotting.
mergedColors <- labels2colors(net$colors)
# Gene dendrogram of block 1 with the module colours of its genes underneath.
plotDendroAndColors(
  net$dendrograms[[1]],
  mergedColors[net$blockGenes[[1]]],
  "Module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05
)

Save some results of this part in an .RData file.¶

# Code chunk 14
# Collect the pieces of the network result needed later: numeric labels,
# colour labels, module eigengenes and the gene dendrogram of block 1.
moduleLabels <- net$colors
moduleColors <- labels2colors(net$colors)
MEs <- net$MEs
geneTree <- net$dendrograms[[1]]
# Persist them for the following sections.
save(
  MEs, moduleLabels, moduleColors, geneTree,
  file = "FemaleLiver-02-networkConstruction-auto.RData"
)

3. Relating modules to external information and identifying important genes¶

# Code chunk 15
# Restore the expression and trait data saved in the first part;
# load() returns the names of the restored variables.
lnames <- load(file = "FemaleLiver-01-dataInput.RData")
lnames
# Restore the network data saved in the second part.
lnames <- load(file = "FemaleLiver-02-networkConstruction-auto.RData")
lnames

Quantifying module–trait associations¶

Here we identify modules that are significantly associated with the measured clinical traits. We already have a computed summary profile (eigengene) for each module, so then we simply correlate eigengenes with phenotypic traits and look for the most significant associations:

# Code chunk 16
# Numbers of genes (columns) and samples (rows) in the expression data.
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
# Recompute the module eigengenes using the colour labels, then reorder them.
MEs0 <- moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs <- orderMEs(MEs0)
# Correlate every eigengene with every trait and attach Student p-values
# based on the number of samples.
moduleTraitCor <- cor(MEs, datTraits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)

Visualise the module-trait association.¶

The correlation coefficient of each module eigengene with each trait is plotted here. Since there are many values, a colour code aids the interpretation of the plot.

# Code chunk 17
# Cell labels: correlation (2 significant digits) with its p-value
# (1 significant digit) in parentheses on a second line.
textMatrix <- paste0(
  signif(moduleTraitCor, 2), "\n(",
  signif(moduleTraitPvalue, 1), ")"
)
# paste0() returns a plain vector; restore the module-by-trait matrix shape.
dim(textMatrix) <- dim(moduleTraitCor)
par(mar = c(6, 8, 1, 1))
# Heatmap of the correlations, with the labels built above in each cell.
labeledHeatmap(
  Matrix = moduleTraitCor,
  xLabels = names(datTraits),
  yLabels = names(MEs),
  ySymbols = names(MEs),
  colorLabels = FALSE,
  colors = blueWhiteRed(50),
  textMatrix = textMatrix,
  setStdMargins = FALSE,
  cex.text = 0.5,
  zlim = c(-1, 1),
  main = "Module-trait relationships"
)

# Code chunk 18
# Probe identifiers (column names of datExpr) assigned to the magenta module.
inMagenta <- moduleColors == "magenta"
names(datExpr)[inMagenta]

Load the probe annotation file provided by the manufacturer to facilitate functional annotation.¶

# Code chunk 19
# Probe annotation table.
annot <- read.csv(file = "GeneAnnotation.csv")
dim(annot)
names(annot)
# For each probe in the expression data, find its row in the annotation table.
probes <- names(datExpr)
probes2annot <- match(probes, annot$substanceBXH)
# Number of probes without annotation; this should be 0.
sum(is.na(probes2annot))

Collect all the information for significant genes related to body weight.¶

# Code chunk 20
# Build a per-gene summary table: annotation, module colour, gene significance
# (GS) for body weight and module membership (MM) in every module.
#
# The GS and MM quantities are computed here so that the table can be built.
# This uses datExpr, datTraits, moduleColors and the colour-labelled MEs from
# the earlier chunks.
# NOTE(review): assumes the body-weight trait column is named "weight_g" —
# confirm against names(datTraits).
stopifnot("weight_g" %in% names(datTraits))
nSamples <- nrow(datExpr)
weight <- as.data.frame(datTraits$weight_g)
names(weight) <- "weight"
# Module names: eigengene column names without their two-character prefix
# (presumably "ME" — confirm against names(MEs)).
modNames <- substring(names(MEs), 3)

# Module membership: correlation of each gene with each module eigengene,
# with the corresponding Student p-values.
geneModuleMembership <- as.data.frame(cor(datExpr, MEs, use = "p"))
MMPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneModuleMembership), nSamples)
)
names(geneModuleMembership) <- paste0("MM", modNames)
names(MMPvalue) <- paste0("p.MM", modNames)

# Gene significance: correlation of each gene with the weight trait,
# with the corresponding Student p-values.
geneTraitSignificance <- as.data.frame(cor(datExpr, weight, use = "p"))
GSPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneTraitSignificance), nSamples)
)
names(geneTraitSignificance) <- paste0("GS.", names(weight))
names(GSPvalue) <- paste0("p.GS.", names(weight))

# Starting data frame: one row per probe.
geneInfo0 <- data.frame(substanceBXH = probes,
                        geneSymbol = annot$gene_symbol[probes2annot],
                        LocusLinkID = annot$LocusLinkID[probes2annot],
                        moduleColor = moduleColors,
                        geneTraitSignificance,
                        GSPvalue)
# Order modules by the absolute correlation of their eigengene with weight,
# strongest first.
modOrder <- order(-abs(cor(MEs, weight, use = "p")))
# Append the module-membership columns in that order.
# seq_len() keeps the loop from running when there are no modules.
for (mod in seq_len(ncol(geneModuleMembership)))
{
  oldNames <- names(geneInfo0)
  geneInfo0 <- data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]],
                          MMPvalue[, modOrder[mod]])
  names(geneInfo0) <- c(oldNames,
                        paste0("MM.", modNames[modOrder[mod]]),
                        paste0("p.MM.", modNames[modOrder[mod]]))
}
# Sort genes by module colour, then by decreasing absolute gene significance.
geneOrder <- order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight))
geneInfo <- geneInfo0[geneOrder, ]

Save the results in an output file for further analysis.¶

# Code chunk 21
# Export the ordered gene summary table as CSV.
write.csv(
  geneInfo,
  file = "geneInfo.csv"
)

4. Interfacing network analysis with other data such as functional annotation and gene ontology¶

# Code chunk 22
# Restore the expression and trait data saved in the first part;
# load() returns the names of the restored variables.
lnames <- load(file = "FemaleLiver-01-dataInput.RData")
lnames
# Restore the network data saved in the second part.
lnames <- load(file = "FemaleLiver-02-networkConstruction-auto.RData")
lnames

# Code chunk 23
# Probe annotation table.
annot <- read.csv(file = "GeneAnnotation.csv")
# Locate every probe of the expression data in the annotation table.
probes <- names(datExpr)
probes2annot <- match(probes, annot$substanceBXH)
# Locus Link IDs of all probes, in the column order of datExpr.
allLLIDs <- annot$LocusLinkID[probes2annot]
# Modules of interest: write one ID file per module.
intModules <- c("brown", "red", "salmon")
for (module in intModules) {
  # Logical mask of the probes belonging to this module
  modGenes <- moduleColors == module
  # Their Locus Link IDs
  modLLIDs <- allLLIDs[modGenes]
  # One ID per line, without row or column names
  fileName <- paste0("LocusLinkIDs-", module, ".txt")
  write.table(
    as.data.frame(modLLIDs),
    file = fileName,
    row.names = FALSE,
    col.names = FALSE
  )
}
# Background set for the enrichment analysis: the IDs of all probes.
fileName <- "LocusLinkIDs-all.txt"
write.table(
  as.data.frame(allLLIDs),
  file = fileName,
  row.names = FALSE,
  col.names = FALSE
)

# Code chunk 24
# GO enrichment of every module, keeping the 10 best terms per module.
# GOenrichmentAnalysis() emits a deprecation warning that recommends
# enrichmentAnalysis() from the anRichment package as its replacement.
GOenr <- GOenrichmentAnalysis(
  moduleColors,
  allLLIDs,
  organism = "mouse",
  nBestP = 10
)

Warning message in GOenrichmentAnalysis(moduleColors, allLLIDs, organism = "mouse", :
“This function is deprecated and will be removed in the near future. 
We suggest using the replacement function enrichmentAnalysis 
in R package anRichment, available from the following URL:
https://labs.genetics.ucla.edu/horvath/htdocs/CoexpressionNetwork/GeneAnnotation/”

 GOenrichmentAnalysis: loading annotation data...
  ..of the 3038  Entrez identifiers submitted, 2829 are mapped in current GO categories.
  ..will use 2829 background genes for enrichment calculations.
  ..preparing term lists (this may take a while).. 
  ..working on label set 1 ..
    ..calculating enrichments (this may also take a while)..
    ..putting together terms with highest enrichment significance..

#anRichment(moduleColors, allLLIDs, organism = "mouse", nBestP = 10);  # Does not work yet.

# Code chunk 25
# Enrichment table stored in the 4th element of GOenr$bestPTerms.
tab <- GOenr$bestPTerms[[4]]$enrichment

# Code chunk 26
# Column names of the enrichment table.
names(tab)

# Code chunk 27
# Export the full table as a quoted, comma-separated file without row names.
write.table(
  tab,
  file = "GOEnrichmentTable.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)

# Code chunk 28
# Compact version of the enrichment table for on-screen display.
keepCols <- c(1, 2, 5, 6, 7, 12, 13)
screenTab <- tab[, keepCols]
# Round the two p-value columns to 2 significant digits.
numCols <- c(3, 4)
screenTab[, numCols] <- signif(apply(screenTab[, numCols], 2, as.numeric), 2)
# Truncate the term name to at most 40 characters.
screenTab[, 7] <- substr(screenTab[, 7], 1, 40)
# Short column headers and no row names.
colnames(screenTab) <- c(
  "module", "size", "p-val", "Bonf", "nInTerm", "ont", "term name"
)
rownames(screenTab) <- NULL
# Width of R's printed output; tune this number until the table reads well.
options(width = 95)
# Display the enrichment table.
screenTab

5. Export of networks to external software¶

# Code chunk 29
# Restore the expression and trait data saved in the first part;
# load() returns the names of the restored variables.
lnames <- load(file = "FemaleLiver-01-dataInput.RData")
lnames
# Restore the network data saved in the second part.
lnames <- load(file = "FemaleLiver-02-networkConstruction-auto.RData")
lnames

# Code chunk 30
# Topological overlap of all genes, using the same power (6) as the network
# construction step.
TOM <- TOMsimilarityFromExpr(datExpr, power = 6)
# Probe annotation table.
annot <- read.csv(file = "GeneAnnotation.csv")
# Modules to export.
modules <- c("brown", "red")
# Probes that belong to one of the selected modules, and their gene symbols.
probes <- names(datExpr)
inModule <- moduleColors %in% modules
modProbes <- probes[inModule]
modGenes <- annot$gene_symbol[match(modProbes, annot$substanceBXH)]
# Sub-matrix of the topological overlap restricted to those probes.
modTOM <- TOM[inModule, inModule]
dimnames(modTOM) <- list(modProbes, modProbes)
# Write edge and node list files that Cytoscape can read; the file names carry
# the selected module names joined by "-".
cyt <- exportNetworkToCytoscape(
  modTOM,
  edgeFile = paste0("CytoscapeInput-edges-", paste(modules, collapse = "-"), ".txt"),
  nodeFile = paste0("CytoscapeInput-nodes-", paste(modules, collapse = "-"), ".txt"),
  weighted = TRUE,
  threshold = 0.5,
  nodeNames = modProbes,
  altNodeNames = modGenes,
  nodeAttr = moduleColors[inModule]
)

TOM calculation: adjacency..
..will use 8 parallel threads.
 Fraction of slow calculations: 0.361682
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.

Open these two files as node table and edge table with Cytoscape and inspect the network

substanceBXH	gene_symbol	LocusLinkID	ProteomeID	cytogeneticLoc	CHROMOSOME	StartPosition	EndPosition	F2_2	F2_3	⋯	F2_324	F2_325	F2_326	F2_327	F2_328	F2_329	F2_330	F2_332	F2_355	F2_357
MMT00000044	1700007N18Rik	69339	286025	0	16	50911260	50912491	-0.01810	0.0642	⋯	0.047700	-0.0488	0.0168	-0.0309	0.02740	-0.0310	0.0660	-0.0199	-0.0146	0.065000
MMT00000046	Mast2	17776	157466	0	4	115215318	115372404	-0.07730	-0.0297	⋯	-0.049200	-0.0350	-0.0738	-0.1730	-0.07380	-0.2010	-0.0820	-0.0939	0.0192	-0.049900
MMT00000051	Ankrd32	105377	321939	0	13	74940309	74982847	-0.02260	0.0617	⋯	0.000612	0.1210	0.0996	0.1090	0.02730	0.1200	-0.0629	-0.0395	0.1090	0.000253
MMT00000076	0	383154	0	0	16	49345114	49477048	-0.00924	-0.1450	⋯	-0.270000	0.0803	0.0424	0.1610	0.05120	0.2410	0.3890	0.0251	-0.0348	0.114000
MMT00000080	Ldb2	16826	157383	0	5	43546124	43613704	-0.04870	0.0582	⋯	0.113000	-0.0859	-0.1340	0.0639	0.00731	0.1240	-0.0212	0.0870	0.0512	0.024300
MMT00000102	Rdhs	216453	0	10_70.0_cM	10	1337265	1347607	0.17600	-0.1890	⋯	-0.080000	-0.1200	0.1230	0.1870	0.05410	0.0699	0.0708	0.1450	-0.0399	0.037500

module	size	p-val	Bonf	nInTerm	ont	term name
black	166	3.9e-04	1.0e+00	4	BP	dopamine transport
black	166	6.5e-04	1.0e+00	5	BP	mRNA transport
black	166	8.1e-04	1.0e+00	13	MF	receptor ligand activity
black	166	9.9e-04	1.0e+00	13	MF	receptor regulator activity
black	166	1.0e-03	1.0e+00	6	BP	RNA transport
black	166	1.3e-03	1.0e+00	6	BP	RNA localization
black	166	1.6e-03	1.0e+00	6	BP	amine transport
black	166	2.4e-03	1.0e+00	6	MF	growth factor activity
black	166	2.6e-03	1.0e+00	2	BP	ventricular compact myocardium morphogen
black	166	2.6e-03	1.0e+00	2	BP	detection of chemical stimulus involved
blue	428	3.2e-33	5.7e-29	166	BP	immune system process
blue	428	3.8e-32	6.9e-28	121	BP	immune response
blue	428	3.6e-23	6.4e-19	109	BP	defense response
blue	428	1.5e-22	2.7e-18	73	BP	innate immune response
blue	428	2.2e-22	4.0e-18	101	BP	regulation of immune system process
blue	428	8.9e-22	1.6e-17	82	BP	positive regulation of immune system pro
blue	428	1.4e-21	2.5e-17	85	BP	cell activation
blue	428	1.9e-18	3.3e-14	71	BP	cytokine production
blue	428	1.4e-17	2.6e-13	64	BP	regulation of cytokine production
blue	428	1.6e-17	2.8e-13	65	BP	regulation of immune response
brown	396	7.7e-22	1.4e-17	46	CC	extracellular matrix
brown	396	2.6e-19	4.7e-15	125	CC	extracellular region
brown	396	5.9e-15	1.1e-10	30	CC	collagen-containing extracellular matrix
brown	396	1.4e-13	2.6e-09	91	CC	extracellular space
brown	396	1.2e-12	2.2e-08	57	BP	blood vessel development
brown	396	1.8e-12	3.3e-08	58	BP	vasculature development
brown	396	2.0e-12	3.6e-08	59	BP	cardiovascular system development
brown	396	6.3e-12	1.1e-07	29	BP	extracellular matrix organization
brown	396	3.3e-11	5.9e-07	49	BP	blood vessel morphogenesis
brown	396	7.9e-11	1.4e-06	69	BP	circulatory system development
⋮	⋮	⋮	⋮	⋮	⋮	⋮
tan	81	5.9e-04	1	13	CC	catalytic complex
tan	81	7.9e-04	1	37	MF	catalytic activity
tan	81	1.2e-03	1	3	CC	COPII-coated ER to Golgi transport vesic
tan	81	1.9e-03	1	2	BP	anterograde synaptic vesicle transport
tan	81	1.9e-03	1	2	BP	endoplasmic reticulum tubular network or
tan	81	3.8e-03	1	2	CC	axon cytoplasm
tan	81	4.6e-03	1	4	BP	cellular response to xenobiotic stimulus
tan	81	4.7e-03	1	14	CC	Golgi apparatus
tan	81	5.2e-03	1	4	BP	drug catabolic process
tan	81	5.5e-03	1	47	CC	cytoplasmic part
turquoise	529	6.3e-05	1	9	BP	nuclear-transcribed mRNA catabolic proce
turquoise	529	2.4e-04	1	97	MF	nucleic acid binding
turquoise	529	3.0e-04	1	74	BP	positive regulation of macromolecule bio
turquoise	529	3.2e-04	1	9	BP	translational initiation
turquoise	529	3.3e-04	1	15	BP	sensory perception of chemical stimulus
turquoise	529	4.5e-04	1	13	BP	sensory perception of smell
turquoise	529	5.3e-04	1	36	MF	transcription factor binding
turquoise	529	6.2e-04	1	64	BP	positive regulation of transcription, DN
turquoise	529	6.2e-04	1	64	BP	positive regulation of RNA biosynthetic
turquoise	529	1.0e-03	1	4	MF	DNA-directed 5'-3' RNA polymerase activi
yellow	199	1.2e-04	1	3	MF	nickel cation binding
yellow	199	1.7e-04	1	5	BP	vesicle budding from membrane
yellow	199	2.7e-04	1	45	CC	cytosol
yellow	199	3.0e-04	1	13	BP	aromatic compound catabolic process
yellow	199	6.6e-04	1	6	BP	cell-cell junction assembly
yellow	199	7.5e-04	1	10	BP	monosaccharide metabolic process
yellow	199	8.4e-04	1	14	BP	cofactor metabolic process
yellow	199	9.6e-04	1	4	BP	regulation of animal organ formation
yellow	199	1.1e-03	1	3	BP	antigen processing and presentation of e
yellow	199	1.5e-03	1	22	CC	mitochondrion