1 Installation

Install relevant R packages:

install.packages("BiocManager",dependencies=T)

# stable release version from Bioconductor
BiocManager::install(c("supraHex","dnet","XGR","dplyr"), dependencies=T)

# development release version from github
BiocManager::install(c("hfang-bristol/supraHex"))

# reload the installed package
detach(package:supraHex, unload=T)
library(supraHex)

2 Benchmarking

We use human embryo datasets from Fang et al to illustrate how to choose the map shape tailored to this data. This data involves six successive developmental stages (S9-S14) with three replicates (R1-R3) for each stage, including Fang (an expression matrix of 5,441 genes X 18 samples), Fang.geneinfo (a matrix of 5,441 X 3 containing gene information) and Fang.sampleinfo (a matrix of 18 X 3 containing sample information). We already know there are two groups of genes, displaying gradually reduced or gradually increased expression patterns across this developmental window; such priori knowledge can be used to assess the performance.

First of all, load the packages:

library(supraHex)
library(XGR)

Then, import human embryo datasets:

# import three variables ('Fang', 'Fang.geneinfo' and 'Fang.sampleinfo')
data(Fang) 
# a matrix of 5,441 genes expressed in 18 samples
# transform data by row/gene centering
data <- Fang - matrix(rep(apply(Fang,1,mean),ncol(Fang)),ncol=ncol(Fang))

Justify the use of the butterfly-shaped map (two highly dense regions/centres):

df <- as.data.frame(stats::prcomp(data, center=T, scale.=T)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2)) 
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="terrain")(64)
gp <- gp + stat_binhex(bins=64) + scale_fill_gradientn(colours=myColor,trans="sqrt")
print(gp)

Train the butterfly-shaped map:

sM_butterfly <- sPipeline(data, shape="butterfly", finetuneSustain=T, scale=3)

Visualise human embryo map as the landscape:

xMap <- sM_butterfly
colormap <- xColormap("jet")
visHexMulComp(xMap, rect.grid=c(3,6), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(-1,1), gp=grid::gpar(cex=0.6), newpage=F)

As comparisons, train the sheet map:

sM_sheet <- sPipeline(data, shape="sheet", finetuneSustain=T, scale=3)

Visualise human embryo map as the landscape:

xMap <- sM_sheet
colormap <- xColormap("jet")
visHexMulComp(xMap, rect.grid=c(3,6), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(-1,1), gp=grid::gpar(cex=0.8), newpage=F)

3 Interpreting genetics of gene expression in brains

Here we illustrate the power of I3 in interpreting eGenes found in 10 brain subregions (and the whole blood as comparisons) with respect to their selective pressure against mutations. In brief, GTEx V6p tissue-specific eGene datasets are used as input data for training and ExAC loss-of-function (LoF) datasets used as additional data for integration.

Overview of the workflow:

First of all, load the packages:

library(supraHex)
library(XGR)

# Specify the local location of built-in RData
RData.location <- "http://galahad.well.ox.ac.uk/bigdata"

Then, import GTEx and ExAC datasets:

# import GTEx datasets
GTEx_V6p_matrix <- xRDataLoader('GTEx_V6p_matrix', RData.location=RData.location)
## brain or blood tissues
ind <- grepl('Brain|Blood', colnames(GTEx_V6p_matrix))
GTEx_V6p_matrix <- GTEx_V6p_matrix[,ind]
colnames(GTEx_V6p_matrix) <- gsub('Brain_','',colnames(GTEx_V6p_matrix))
## only significant eGenes (q-value<0.05)
GTEx_V6p_matrix <- -log10(GTEx_V6p_matrix)
ind <- apply(GTEx_V6p_matrix>-log10(0.05), 1, sum)
GTEx_matrix <- GTEx_V6p_matrix[ind!=0,]

# import ExAC datasets
ExAC <- xRDataLoader('ExAC', RData.location=RData.location)
ExAC_matrix <- data.frame(pLI=ExAC$pLI, stringsAsFactors=F)
rownames(ExAC_matrix) <- ExAC$Symbol
## ExAC LoF intolerance genes: binary (1 for yes, 0 for no)
ExAC_matrix[ExAC_matrix>0.9,1] <- 1
ExAC_matrix[ExAC_matrix<=0.9,1] <- 0

# extract genes commons to GTEx and ExAC datasets
ind <- match(rownames(ExAC_matrix), rownames(GTEx_matrix))
G2GTEx <- GTEx_matrix[ind[!is.na(ind)],]
G2ExAC <- data.frame(pLI=ExAC_matrix[!is.na(ind),], stringsAsFactors=F)
rownames(G2ExAC) <- rownames(ExAC_matrix)[!is.na(ind)]

3.1 Trained GTEx tissue map

Justify the use of the diamond-shaped map:

data <- G2GTEx
df <- as.data.frame(stats::prcomp(data, center=T, scale.=T)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2)) 
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="terrain")(64)
gp <- gp + stat_binhex(bins=64) + scale_fill_gradientn(colours=myColor,trans="sqrt")
print(gp)

Train the diamond-shaped map using GTEx tissue eGene datasets:

data <- G2GTEx
sMg <- sPipeline(data, shape="diamond", finetuneSustain=T, scale=3)

3.2 Overlaid ExAC LoF intolerance map

The ExAC LoF intolerance map is obtained by overlaying the ExAC data onto the trained GTEx tissue map:

sOverlay_ExAC <- sMapOverlay(sMg, data=G2GTEx, additional=G2ExAC)

Visualise ExAC LoF intolerance map:

colormap <- xColormap("jet.top")
visHexMulComp(sOverlay_ExAC, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(0,0.4), gp=grid::gpar(cex=1), newpage=F)

Rankings of 11 tissues in terms of similarity (Pearson correlation) to the ExAC LoF intolerance map:

M_GTEx <- sMg$codebook
M_ExAC <- sOverlay_ExAC$codebook
y <- M_ExAC[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_GTEx), function(i){
    x <- data.frame(name=1:nrow(M_GTEx), M_GTEx[,i], stringsAsFactors=F)
    ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
    df_summary <- ls_res$df_summary
    data.frame(tissue=colnames(M_GTEx)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)

## polar dotplot
df_tmp <- df
df_tmp$tissue <- gsub('_','\n',df_tmp$tissue)
gp_GTEx <- xPolarDot(df_tmp, size=1.5, parallel=T, signature=F)
gp_GTEx + labs(y="Pearson correlation") + scale_y_reverse(limits=c(0,-1))

Visualise GTEx tissue map (reordered by correlation) as the landscape:

xMap <- sMg
ind <- match(df$tissue, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=11:1, rect.grid=c(1,11), title.rotate=30, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,6), gp=grid::gpar(cex=0.6), newpage=F)

3.3 Gene clusters

Obtain gene clusters based on GTEx tissue map:

xM <- sMg
ind <- match(df$tissue, colnames(xM$codebook))
xM$codebook <- xM$codebook[,ind]
#seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)

3.4 Summary by cluster

Calculate the probability of genes being LoF intolerant per gene cluster:

colormap <- xColormap("jet.top")
output_ExAC <- visDmatHeatmap(xM, data=G2GTEx[,ind], sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(0,6), KeyValueName=expression(-log[10]("q-value")), labRow=NA, srtCol=90, cexCol=0.6, keep.data=T)

## Polar barplot
ind <- match(output_ExAC$ID, rownames(G2ExAC))
output_ExAC$val <- G2ExAC[ind,'pLI']
#write.table(output_ExAC,file='output.GTEx.txt',sep='\t',row.names=F)
ls_tmp <- split(x=output_ExAC$val, f=output_ExAC$Cluster_base)
ls_df <- lapply(1:length(ls_tmp), function(i){
    x <- ls_tmp[[i]]
    y <- mean(x)
    data.frame(Cluster=paste0('C',names(ls_tmp)[i]), Val=y, Num=length(x), stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
gp <- xPolarBar(df, colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
gp

3.5 Enrichment

Perform gene set enrichment analysis for clustered genes:

ls_cluster <- split(x=output_ExAC$ID, f=output_ExAC$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- output_ExAC$ID
ontologies <- "CGL"
ls_res_CGL <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(10,20000), test="hypergeo", min.overlap=5, p.tail="two-tails", RData.location=RData.location, plot=T, fdr.cutoff=0.05, displayBy="zscore")
#gp <- xEnrichForest(ls_res_CGL, top_num="auto", FDR.cutoff=0.05, signature=F)
ls_res_CGL$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))

Perform evolutionary analysis for clustered genes:

ls_cluster <- split(x=output_ExAC$ID, f=output_ExAC$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- NULL
ontologies <- "PSG"
ls_res_PSG <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(10,20000), test="fisher", min.overlap=5, p.tail="two-tails", RData.location=RData.location, plot=T, fdr.cutoff=0.05, displayBy="zscore")
#gp <- xEnrichForest(ls_res_PSG, top_num="auto", FDR.cutoff=0.05, signature=F)
ls_res_PSG$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))

4 Interpreting genetic regulators of protein phenotypes

We here demonstrate the utility of I3 in understanding positive genetic regulators with respect to three factors: phenotypic effects, expression and LoF intolerance. That is, we use haploid genetic screen datasets as input for the training and datasets regarding phenotypic effects, expression and LoF intolerance used as additional data for integration.

Overview of the workflow:

First of all, load the packages:

library(supraHex)
library(XGR)

# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"

Then, import Haploid and define combined datasets (phenotypic effects, expression and LoF):

# import haploid datasets (only positive regulators with negative mutation index)
Haploid <- xRDataLoader('Haploid_regulators', RData.location=RData.location)
## all phenotypes except 'HMGCR'
ind <- grepl('HMGCR', Haploid$Phenotype)
df <- Haploid[!ind,]
df <- df[df$MI<0,]
Haploid_matrix <- as.matrix(xSparseMatrix(df[,c('Gene','Phenotype','MI')]))

# number of phenotypes per gene
vec_count <- apply(Haploid_matrix!=0, 1, sum)
# median expression
EXP <- xRDataLoader('Haploid_expression', RData.location=RData.location)
vec_expression <- apply(EXP, 1, median)
# ExAC LoF intolerant
ExAC <- xRDataLoader('ExAC', RData.location=RData.location)
ExAC_matrix <- data.frame(pLI=ExAC$pLI, stringsAsFactors=F)
rownames(ExAC_matrix) <- ExAC$Symbol
ExAC_matrix[ExAC_matrix>0.9,1] <- 1
ExAC_matrix[ExAC_matrix<=0.9,1] <- 0
vec_lof <- ExAC_matrix[,1]
names(vec_lof) <- rownames(ExAC_matrix)
# combine three above
mat <- matrix(0, nrow=nrow(Haploid_matrix), ncol=3)
rownames(mat) <- rownames(Haploid_matrix)
colnames(mat) <- c('Count','Expression','LoF')
ind <- match(rownames(mat), names(vec_count))
mat[!is.na(ind),1] <- vec_count[ind[!is.na(ind)]]
ind <- match(rownames(mat), names(vec_expression))
mat[!is.na(ind),2] <- vec_expression[ind[!is.na(ind)]]
ind <- match(rownames(mat), names(vec_lof))
mat[!is.na(ind),3] <- vec_lof[ind[!is.na(ind)]]
Combined_matrix <- mat

# extract genes commons to Haploid and Combined datasets
ind <- match(rownames(Combined_matrix), rownames(Haploid_matrix))
G2Haploid <- Haploid_matrix[ind[!is.na(ind)],]
G2Combined <- data.frame(Combined_matrix[!is.na(ind),], stringsAsFactors=F)
rownames(G2Combined) <- rownames(Combined_matrix)[!is.na(ind)]

4.1 Trained Haploid protein map

Justify the use of the trefoil-shaped map (two long branches and one short):

data <- G2Haploid
df <- as.data.frame(stats::prcomp(data, center=T, scale.=T)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2)) 
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="terrain")(32)
gp <- gp + stat_binhex(bins=32) + scale_fill_gradientn(colours=myColor)
print(gp)

Train the trefoil-shaped map using Haploid genetic regulator datasets:

sMh <- sPipeline(data=G2Haploid, shape="trefoil", finetuneSustain=T, scale=3)

4.2 Overlaid map

The map is obtained by overlaying/fusing the combined datasets onto the trained Haploid protein map:

sO_Combined <- sMapOverlay(sMh, data=G2Haploid, additional=G2Combined)

Visualise overlaid phenotypic effect map:

colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=1, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(1,5), gp=grid::gpar(cex=1), newpage=F)

Visualise overlaid expression map:

colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=2, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(6,10), gp=grid::gpar(cex=1), newpage=F)

Visualise overlaid LoF map:

colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=3, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(0.3,0.7), gp=grid::gpar(cex=1), newpage=F)

Visualise Haploid protein map in 2D landscape:

sReorder <- sCompReorder(sMh, metric="euclidean")
colormap <- colorRampPalette(c("blue","#007FFF","cyan"), interpolate="spline")
visCompReorder(sMh, sReorder, title.rotate=0, title.xy=c(0.4,0.95), colormap=colormap, zlim=c(-1,0), gp=grid::gpar(cex=0.6), newpage=F)

4.3 Gene clusters

Obtain gene clusters based on Haploid protein map:

xM <- sMh
#seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)

4.4 Summary by cluster

Calculate the overlaid values averaged per gene cluster:

colormap <- xColormap("jet.both")
output_Combined <- visDmatHeatmap(xM, data=G2Haploid, sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(-1,1), KeyValueName=expression(-log[2]("Mutation index")), labRow=NA, reorderRow="none", srtCol=90, cexCol=0.6, keep.data=T)

## Polar barplot
ind <- match(output_Combined$ID, rownames(G2Combined))
df_tmp <- cbind(output_Combined, G2Combined[ind,])
#write.table(df_tmp,file='output.Combined.txt',sep='\t',row.names=F)
df <- as.data.frame(df_tmp %>% dplyr::mutate(Cluster=paste0('C',Cluster_base)) %>% dplyr::group_by(Cluster) %>% dplyr::summarise(Count=mean(Count), Expression=mean(Expression),LoF=mean(LoF)))
ls_gp <- lapply(2:ncol(df), function(i){
    gp <- xPolarBar(df[,c(1,i)], colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
    gp + labs(title=colnames(df)[i])
})
gridExtra::grid.arrange(grobs=ls_gp, nrow=1, ncol=3)

4.5 Enrichment

Perform pathway analysis for clustered genes:

ls_cluster <- split(x=output_Combined$ID, f=output_Combined$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- NULL
ontologies <- "REACTOME"
ls_res_reactome <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(20,500), test="hypergeo", min.overlap=10, p.tail="one-tail", RData.location=RData.location, plot=T, fdr.cutoff=0.01, displayBy="zscore")
#ls_res_reactome$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))

df <- ls_res_reactome$df
df <- as.data.frame(df %>% dplyr::filter(adjp < 0.01))
mat <- as.matrix(xSparseMatrix(df[,c('name','group','zscore')], columns=names(ls_cluster)))
mat[mat==0] <- NA
gp <- xHeatmap(t(mat), reorder=c("none","row","col","both")[3], colormap="#E6F598-#ABDDA4-#66C2A5-#3288BD", zlim=c(2,10), na.color='transparent', size=2.5, x.rotate=30, x.text.size=5, y.text.size=6, barheight=5, legend.title='Z-score')
gp

Perform druggable analysis for clustered genes:

ls_cluster <- split(x=output_Combined$ID, f=output_Combined$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- NULL
ontologies <- "DGIdb"
ls_res_dgidb <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(10,2000), test="fisher", min.overlap=5, p.tail="one-tail", RData.location=RData.location, plot=T, fdr.cutoff=0.05, displayBy="zscore")
#ls_res_dgidb$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))
gp <- xEnrichForest(ls_res_dgidb, top_num="auto", FDR.cutoff=0.01, signature=F)
gp
#write.table(gp$data,file='output.DGIdb.txt',sep='\t',row.names=F)

5 Session Info

Here is the output of sessionInfo():

> R version 3.6.1 (2019-07-05)
> Platform: x86_64-apple-darwin15.6.0 (64-bit)
> Running under: macOS Catalina 10.15
> 
> Matrix products: default
> BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
> LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
> 
> locale:
> [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
> 
> attached base packages:
> [1] grid      stats     graphics  grDevices utils     datasets  methods  
> [8] base     
> 
> other attached packages:
> [1] png_0.1-7        XGR_1.1.7        ggplot2_3.2.0    dnet_1.1.5      
> [5] igraph_1.2.4.1   supraHex_1.22.0  hexbin_1.27.3    BiocStyle_2.12.0
> 
> loaded via a namespace (and not attached):
>  [1] network_1.15           tidyselect_0.2.5       xfun_0.8              
>  [4] purrr_0.3.2            sna_2.4                lattice_0.20-38       
>  [7] colorspace_1.4-1       RCircos_1.2.1          htmltools_0.3.6       
> [10] stats4_3.6.1           yaml_2.2.0             rlang_0.4.0           
> [13] pillar_1.4.1           withr_2.1.2            glue_1.3.1            
> [16] ggnetwork_0.5.1        Rgraphviz_2.28.0       BiocGenerics_0.30.0   
> [19] GenomeInfoDbData_1.2.1 stringr_1.4.0          zlibbioc_1.30.0       
> [22] munsell_0.5.0          gtable_0.3.0           coda_0.19-2           
> [25] evaluate_0.14          knitr_1.23             IRanges_2.18.1        
> [28] GenomeInfoDb_1.20.0    parallel_3.6.1         Rcpp_1.0.1            
> [31] scales_1.0.0           BiocManager_1.30.4     S4Vectors_0.22.0      
> [34] graph_1.62.0           XVector_0.24.0         digest_0.6.19         
> [37] stringi_1.4.3          bookdown_0.11          dplyr_0.8.1           
> [40] ggrepel_0.8.1          GenomicRanges_1.36.0   bitops_1.0-6          
> [43] tools_3.6.1            magrittr_1.5           lazyeval_0.2.2        
> [46] RCurl_1.95-4.12        tibble_2.1.3           crayon_1.3.4          
> [49] ape_5.3                pkgconfig_2.0.2        MASS_7.3-51.4         
> [52] Matrix_1.2-17          assertthat_0.2.1       rmarkdown_1.13        
> [55] statnet.common_4.3.0   R6_2.4.0               nlme_3.1-140          
> [58] compiler_3.6.1

Intuitive Integrative Interpretation (I3) of complex genetic data

29 October 2019

1 Installation

2 Benchmarking

3 Interpreting genetics of gene expression in brains

3.1 Trained GTEx tissue map

3.2 Overlaid ExAC LoF intolerance map

3.3 Gene clusters

3.4 Summary by cluster

3.5 Enrichment

4 Interpreting genetic regulators of protein phenotypes

4.1 Trained Haploid protein map

4.2 Overlaid map

4.3 Gene clusters

4.4 Summary by cluster

4.5 Enrichment

5 Session Info