Pavkovic, M., Pantano, L., Gerlach, C.V. et al. Multi omics analysis of fibrotic kidneys in two mouse models. Sci Data 6, 92 (2019).
Samples from two mouse models were collected. The first one is a reversible chemical-induced injury model (folic acid (FA) induced nephropathy). The second one is an irreversible surgically-induced fibrosis model (unilateral ureteral obstruction (UUO)). mRNA and small RNA sequencing, as well as 10-plex tandem mass tag (TMT) proteomics were performed with kidney samples from different time points over the course of fibrosis development. In the FA model, mice were sacrificed before the treatment (day 0) and 1, 2, 7, and 14 days after a single injection of folic acid. For the UUO model, mice were sacrificed before obstruction (day 0) and 3, 7, and 14 days after the ureter of the left kidney was obstructed via ligation. For both studies, kidneys were removed at each time point for total RNA isolation and protein sample preparation.
We will first explore the UUO transcriptome data.
Doc | URL |
---|---|
Total RNA for the experiment on Unilateral ureter obstruction (UUO) model | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE118339 |
#### Define parameters for the analysis ####
## Keep a trace of the original parameters
par.ori <- par(no.readonly = TRUE)
## Analysis parameters
parameters <- list(
dataset = "uuo", ## Supported: uuo, fa
datatype = "transcriptome",
epsilon = 0.1,
minCount = 10,
forceDownload = FALSE)
kable(as.data.frame(parameters))
dataset | datatype | epsilon | minCount | forceDownload |
---|---|---|---|---|
uuo | transcriptome | 0.1 | 10 | FALSE |
#### Output directories ####
outdirs <- list()
# outdirs$main <- getwd()
outdirs$main <- "."
## Data directory, where the data will be downloaded and uncompressed
outdirs$data <- file.path(outdirs$main, "data")
dir.create(outdirs$data, recursive = TRUE, showWarnings = FALSE)
## Main result directory
outdirs$results <- file.path(outdirs$main, "results")
# Transcriptome results
outdirs$transcriptome <- file.path(outdirs$results, "transcriptome")
dir.create(outdirs$transcriptome, recursive = TRUE, showWarnings = FALSE)
#### Download transcriptome data ####
archiveFile <- "MouseKidneyFibrOmics-v1.0.zip"
archiveURL <- file.path("https://zenodo.org/record/2592516/files/hbc", archiveFile)
localDataArchive <- file.path(outdirs$data, archiveFile)
if (file.exists(localDataArchive) & !parameters$forceDownload) {
message("Data archive already downloaded:\n\t", localDataArchive)
} else {
message("Downloading data archive from zenodo: ", archiveURL)
download.file(url = archiveURL, destfile = localDataArchive)
## Uncompess the archive
message("Uncompressing data archive")
unzip(zipfile = localDataArchive, exdir = outdirs$data)
}
## Define destination directory
# outdirs$csv <- file.path(outdirs$data, "CSV")
# dir.create(outdirs$csv, showWarnings = FALSE, recursive = TRUE)
#### Load raw counts data table ####
## Note: the "raw" counts are decimal numbers, I suspect that they have been somewhat normalised. To check.
rawCountFile <- file.path(
outdirs$data,
paste0("hbc-MouseKidneyFibrOmics-a39e55a/tables/",
parameters$dataset,
"/results/counts/raw_counts.csv.gz"))
# "hbc-MouseKidneyFibrOmics-a39e55a/tables/fa/results/counts/raw_counts.csv.gz")
rawValues <- read.csv(file = rawCountFile, header = 1, row.names = 1)
The RNA-seq transcriptome data was loaded as raw counts. This table contains 46679 rows (genes) and 15 columns (samples).
#### Build metadata table ####
metadata <- data.frame(
dataType = "transcriptome",
sampleName = colnames(rawValues))
metadata[ , c("condition", "sampleNumber")] <-
str_split_fixed(string = metadata$sampleName, pattern = "_", n = 2)
## Colors per condition
colPerCondition <- c(normal = "#BBFFBB",
day3 = "#FFFFDD",
day7 = "#FFDD88",
day14 = "#FF4400")
metadata$color <- colPerCondition[metadata$condition]
sampleStat <- metadata
sampleStat$mean <- apply(X = rawValues, 2, mean)
sampleStat$median <- apply(X = rawValues, 2, median)
sampleStat$sd <- apply(X = rawValues, 2, sd)
sampleStat$min <- apply(X = rawValues, 2, min)
sampleStat$perc05 <- apply(X = rawValues, 2, quantile, prob = 0.05)
sampleStat$Q1 <- apply(X = rawValues, 2, quantile, prob = 0.25)
sampleStat$median <- apply(X = rawValues, 2, quantile, prob = 0.5)
sampleStat$Q3 <- apply(X = rawValues, 2, quantile, prob = 0.75)
sampleStat$perc95 <- apply(X = rawValues, 2, quantile, prob = 0.95)
sampleStat$max <- apply(X = rawValues, 2, max)
sampleStat$iqr <- apply(X = rawValues, 2, IQR)
## Print statistics per sample
kable(format(x = format(digits = 5, sampleStat)))
dataType | sampleName | condition | sampleNumber | color | mean | median | sd | min | perc05 | Q1 | Q3 | perc95 | max | iqr |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
transcriptome | day14_12 | day14 | 12 | #FF4400 | 577.40 | 1.32543 | 4020.5 | 0 | 0 | 0 | 227.855 | 2563.5 | 480841 | 227.855 |
transcriptome | day14_13 | day14 | 13 | #FF4400 | 317.42 | 0.99779 | 2538.1 | 0 | 0 | 0 | 130.509 | 1371.5 | 258977 | 130.509 |
transcriptome | day14_14 | day14 | 14 | #FF4400 | 439.77 | 1.19451 | 2758.5 | 0 | 0 | 0 | 190.710 | 2043.2 | 280431 | 190.710 |
transcriptome | day14_15 | day14 | 15 | #FF4400 | 464.17 | 1.24853 | 5165.5 | 0 | 0 | 0 | 180.824 | 1855.8 | 693041 | 180.824 |
transcriptome | day3_4 | day3 | 4 | #FFFFDD | 736.30 | 1.00880 | 8571.4 | 0 | 0 | 0 | 197.157 | 2901.5 | 923949 | 197.157 |
transcriptome | day3_5 | day3 | 5 | #FFFFDD | 670.62 | 1.67158 | 7646.5 | 0 | 0 | 0 | 222.837 | 2687.5 | 874143 | 222.837 |
transcriptome | day3_6 | day3 | 6 | #FFFFDD | 657.52 | 1.02132 | 8424.8 | 0 | 0 | 0 | 181.088 | 2513.6 | 987332 | 181.088 |
transcriptome | day3_7 | day3 | 7 | #FFFFDD | 762.34 | 1.20783 | 11164.8 | 0 | 0 | 0 | 216.455 | 2781.6 | 1349892 | 216.455 |
transcriptome | day7_10 | day7 | 10 | #FFDD88 | 747.06 | 1.72124 | 7495.7 | 0 | 0 | 0 | 242.263 | 3132.0 | 889506 | 242.263 |
transcriptome | day7_11 | day7 | 11 | #FFDD88 | 507.88 | 1.01550 | 4781.0 | 0 | 0 | 0 | 185.765 | 2135.0 | 556133 | 185.765 |
transcriptome | day7_8 | day7 | 8 | #FFDD88 | 724.46 | 1.47772 | 7201.4 | 0 | 0 | 0 | 236.473 | 3005.6 | 904431 | 236.473 |
transcriptome | day7_9 | day7 | 9 | #FFDD88 | 756.34 | 1.99637 | 6429.8 | 0 | 0 | 0 | 291.494 | 3264.0 | 733494 | 291.494 |
transcriptome | normal_1 | normal | 1 | #BBFFBB | 801.47 | 0.95108 | 11585.8 | 0 | 0 | 0 | 123.308 | 2740.6 | 1259045 | 123.308 |
transcriptome | normal_2 | normal | 2 | #BBFFBB | 585.72 | 0.00000 | 8237.1 | 0 | 0 | 0 | 88.908 | 2035.8 | 912206 | 88.908 |
transcriptome | normal_3 | normal | 3 | #BBFFBB | 638.52 | 0.77229 | 9846.7 | 0 | 0 | 0 | 95.103 | 2149.2 | 1101035 | 95.103 |
hist(unlist(rawValues), breaks = 1000,
main = "Raw count distribution",
xlab = "Raw counts",
ylab = "Number of genes (all samples)")
The distribution of raw counts is not very informative, because the range is defined by some outlier, i.e. a gene having a huge number of reads. Even with strong zoom on the abcsissa range from 0 to 500, the histogram shows a steep drop in the first bins.
#### Count distrib - truncated abscissa ####
hist(unlist(rawValues), breaks = 500000,
main = "Raw count distribution",
xlab = "Raw counts (truncated abscissa",
ylab = "Number of genes (all samples)",
xlim = c(0, 500))
A typical approach is to normalise the counts by applying a log2 transformation . This however creates a problem when the counts of a given gene in a given sample is 0. To circumvent this, we can add an epsilon (\(\epsilon = 0.1\)) before the log2 transformation.
contents 1 …
pas l’air de marcher …
We can now inspect the distribution of counts per sample with the boxplot()
function.
We notice an obvious problem: the vast majority of counts is very small. This can result from different causes, which will not be investigated in this context.
## Filter out the genes with very low counts in all conditions
undetectedGenes <- apply(rawValues, MARGIN = 1, FUN = sum) < parameters$minCount
# table(undetectedGenes)
log2Filtered <- log2Values[!undetectedGenes, ]
We filtered out all the genes whose maximal count value across all samples was lower than 10. Among the 46679 genes from the raw count table, 20851 were considered undetected according to this criterion. We use the remaining 25828 genes for the subsequent analyses.
Before going any further, it is important to ensure some normalisation of the counts, in order to correct for biases due to inter-sample differences in sequencing depth.
For the sake of simplicity, we will use here a very simple criterion: median-based normalisation. The principle is to multiply the counts of each sample by a scaling factor in order to bring each sample to the same median count.
#### Median-based normalisation ####
sampleMedians <- apply(log2Filtered, 2, median)
seriesMedian <- median(sampleMedians)
scalingFactors <- seriesMedian / sampleMedians
log2Standardised <- data.frame(matrix(
nrow = nrow(log2Filtered),
ncol = ncol(log2Filtered)))
colnames(log2Standardised) <- colnames(log2Filtered)
rownames(log2Standardised) <- rownames(log2Filtered)
for (j in 1:ncol(log2Filtered)) {
log2Standardised[, j] <- log2Filtered[, j] * scalingFactors[j]
}
## Check the remaining medians
apply(log2Standardised, 2, median)
day14_12 day14_13 day14_14 day14_15 day3_4 day3_5 day3_6 day3_7 day7_10 day7_11 day7_8 day7_9 normal_1 normal_2 normal_3
6.772065 6.772065 6.772065 6.772065 6.772065 6.772065 6.772065 6.772065 6.772065 6.772065 6.772065 6.772065 6.772065 6.772065 6.772065
We can also inspect the distribution of counts per sample with the vioplot()
function.
We can also inspect the distribution of counts per sample with the plot()
function.
We export the pre-processed data in separate tables for further reuse.
#### Save tables ####
outfiles <- vector()
## Raw counts, all the variables
outfiles["raw"] <- file.path(outdirs[parameters$datatype],
paste0(parameters$dataset,
"_", parameters$datatype,
"_raw.tsv.gz"))
write.table(x = format(digits = 3, big.mark = "", decimal.mark = ".", rawValues),
dec = ".",
file = gzfile(outfiles["raw"], "w"),
quote = FALSE, sep = "\t")
## Log2-transformed counts, all the genes
outfiles["log2"] <- file.path(outdirs[parameters$datatype],
paste0(parameters$dataset,
"_", parameters$datatype,
"_log2.tsv.gz"))
write.table(x = format(digits = 3, big.mark = "", decimal.mark = ".", log2Values),
dec = ".",
file = gzfile(outfiles["log2"], "w"),
quote = FALSE, sep = "\t")
## Filtered genes only, log2-transformed counts
outfiles["filtered"] <- file.path(outdirs[parameters$datatype],
paste0(parameters$dataset,
"_", parameters$datatype,
"_log2_filtered.tsv.gz"))
write.table(x = format(digits = 3, big.mark = "", decimal.mark = ".", log2Filtered),
dec = ".",
file = gzfile(outfiles["filtered"], "w"),
quote = FALSE, sep = "\t")
## Filtered genes only, log2-transformed and standardized counts
outfiles["standardised"] <- file.path(outdirs[parameters$datatype],
paste0(parameters$dataset,
"_", parameters$datatype,
"_log2_norm.tsv.gz"))
write.table(x = format(digits = 3, big.mark = "", decimal.mark = ".", log2Standardised),
dec = ".",
file = gzfile(outfiles["standardised"], "w"),
quote = FALSE, sep = "\t")
## Metadata
outfiles["metadata"] <- file.path(outdirs[parameters$datatype],
paste0(parameters$dataset,
"_", parameters$datatype,
"_metadata.tsv"))
write.table(x = metadata,
file = outfiles["metadata"] ,
quote = FALSE, sep = "\t")
## Build a table to display the links in the report
fileTable <- data.frame(outfiles)
fileTable$basename <- basename(fileTable$outfiles)
fileTable$dirname <- dirname(fileTable$outfiles)
fileTable$link <- paste0("[", fileTable$basename, "]", "(", fileTable$outfiles, ")")
## Print the directories
kable(t(as.data.frame.list(outdirs)), col.names = "Directories", caption = "Directories")
Directories | |
---|---|
main | . |
data | ./data |
results | ./results |
transcriptome | ./results/transcriptome |
## Print the link table
kable(cbind(rownames(fileTable), fileTable$link), row.names = FALSE, col.names = c("Contents", "Link"),
caption = "Output files")
Contents | Link |
---|---|
raw | uuo_transcriptome_raw.tsv.gz |
log2 | uuo_transcriptome_log2.tsv.gz |
filtered | uuo_transcriptome_log2_filtered.tsv.gz |
standardised | uuo_transcriptome_log2_norm.tsv.gz |
metadata | uuo_transcriptome_metadata.tsv |
R version 4.0.0 (2020-04-24)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Mojave 10.14.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] vioplot_0.3.4 zoo_1.8-8 sm_2.2-5.6 stringr_1.4.0 knitr_1.28
loaded via a namespace (and not attached):
[1] Rcpp_1.0.4.6 lattice_0.20-41 digest_0.6.25 grid_4.0.0 magrittr_1.5 evaluate_0.14 highr_0.8 rlang_0.4.6 stringi_1.4.6 rmarkdown_2.1 tools_4.0.0 xfun_0.14 yaml_2.2.1 compiler_4.0.0 tcltk_4.0.0 htmltools_0.4.0