Commit 573617da authored by dmattek's avatar dmattek
Browse files

Initial commit

parents
# FreeClust: clustering made easy
#### Running on the server
This is source code of an interactive clustering web-app written in R/Shiny. A running instance can be found here:
http://bioz-lcms-chromclust.bioz.unibas.ch:3838/shiny-freeclust
#### Running locally from RStudio
After downloading the source code, open `server.R` or `ui.R` and click `Run App` button in the upper right corner of the window with the code. The following packages need to be installed before running this code:
- shiny
- shinyjs
- gplots (provides `heatmap.2`)
- dendextend (provides `color_branches`)
- RColorBrewer (provides `brewer.pal`)
- d3heatmap (provides interactive `d3heatmap`)
- sparcl (provides sparse hierarchical and k-means clustering)
- bclust (provides Bayesian clustering)
#### About
The web-app integrates several clustering algorithms. These include a widely-used hierarchical clustering (hclust) with a choice of commonly used linkage methods to construct the tree diagram (dendrogram). Additionally, sparse hierarchical clustering (sparcl) and a model-based Bayesian approach tailored to cluster high-dimensional data (with many more variables than samples). The sparse hierarchical and Bayesian clustering provide the information about the importance of a specific categories across the samples.
The web-app allows for rudimentary data manipulation. Users have an option to convert missing values to zeroes which is necessary to use Bayesian clustering. In general, it is important to understand the causes of missing data and treat them accordingly. Viable options include omitting such samples entirely or replacing missing data with imputed values, the population mean, etc.
Rescaling is another data manipulation option available to users. When switched on, each row has its mean subtracted and the result is divided by row’s standard deviation; this corresponds to calculating z-scores. Taking log10 of data is another option available.Users can also trim data to omit values below and above a threshold. Such data points are turned into missing values, however these are no longer subject to conversion of missing values in source data described above. Also available is data clipping which assigns threshold values to data exceeding (from below or above) these thresholds. Neither trimming nor clipping affects the zeroes resulting from replacement of missing data.
#### Data format
Users can generate artificial random dataset or can upload a text file in CSV format where rows correspond to different categories, and columns correspond to samples. The first column should include labels of categories, while the first row should contain sample names. An example file is in example-data folder.
Depending on regional settings (i.e. locale), Excel might save the CSV file differently from the default setting where columns are separated by a comma and dot is used as a decimal separator. For example, a German locale would result in a CSV file with columns separated by a semicolon and decimal point signified by a comma. The app can account for such variations of the input format as well as the convention used to represent missing values, e.g. whether it is a character string “NA”, a dash “-“, or an empty space.
\ No newline at end of file
categories,species1,species2,species3,species4,species5,species6,species7,species8,species9,species10,,,,
cat1,-,-,-,-,-,-,-,-,-,307609408,,,,
cat2,-,-,-,-,-,537825024,307609408,-,-,-,,,,
cat3,-,-,-,-,-,771978432,494108320,-,-,-,,,,
cat4,-,-,-,-,-,95841840,292572000,-,-,-,,,,
cat5,-,-,-,-,-,-,-,-,-,307609408,,,,
cat6,-,67445448,-,-,-,234705904,-,-,-,-,,,,
cat7,-,-,-,-,-,-,-,-,-,307609408,,,,
cat8,-,-,-,-,-,-,-,-,-,307609408,,,,
cat9,-,-,-,-,-,214319648,-,-,-,-,,,,
cat10,341216384,-,134485616,76037640,70646576,-,-,-,134183992,182809296,,,,
cat11,328572096,428627072,-,-,832382976,-,-,636496768,-,467344960,,,,
cat12,188422592,657460480,844082816,-,-,-,-,-,-,404283968,,,,
cat13,532154400,80290832,-,-,-,-,-,-,-,-,,,,
cat14,-,133105152,548460160,-,-,-,-,-,-,-,,,,
cat15,-,-,-,-,-,-,-,-,-,404283968,,,,
cat16,25866828,8269378,21645604,21738014,102893520,153650896,305186240,111867232,153310176,6320292,,,,
cat17,201420064,430386144,1117632256,561706880,293825472,1011808192,-,226602880,1869593984,258388720,,,,
cat18,-,-,-,1007009600,-,56902716,5228181,30271422,31765980,-,,,,
cat19,-,241968992,170721760,158485136,149803744,124493976,293502880,235966256,157396672,-,,,,
cat20,719030144,857534016,616804544,713191680,578310528,203675584,666675584,826807936,620248192,678372096,,,,
\ No newline at end of file
source('modules/auxfn.R')
source('modules/downPlot.R')
source('modules/tabHier.R')
source('modules/tabHierSparse.R')
source('modules/tabBayClust.R')
# From: https://www.r-bloggers.com/winsorization/
winsor1 <-
function (x, fraction=.05)
{
if(length(fraction) != 1 || fraction < 0 ||
fraction > 0.5) {
stop("bad value for 'fraction'")
}
lim <- quantile(x, probs=c(fraction, 1-fraction))
x[ x < lim[1] ] <- lim[1]
x[ x > lim[2] ] <- lim[2]
x
}
winsor2 <- function (x, multiple=3)
{
if(length(multiple) != 1 || multiple <= 0) {
stop("bad value for 'multiple'")
}
med <- median(x, na.rm = TRUE)
y <- x - med
sc <- mad(y, center=0, na.rm = TRUE) * multiple
y[ y > sc ] <- sc
y[ y < -sc ] <- -sc
y + med
}
# From: https://gist.github.com/jcheng5/5913297
helpPopup <- function(title, content,
placement=c('right', 'top', 'left', 'bottom'),
trigger=c('click', 'hover', 'focus', 'manual')) {
tagList(
singleton(
tags$head(
tags$script("$(function() { $(\"[data-toggle='popover']\").popover(); })")
)
),
tags$a(
href = "#", class = "btn btn-mini", `data-toggle` = "popover",
title = title, `data-content` = content, `data-animation` = TRUE,
`data-placement` = match.arg(placement, several.ok=TRUE)[1],
`data-trigger` = match.arg(trigger, several.ok=TRUE)[1],
#tags$i(class="icon-question-sign")
# changed based on http://stackoverflow.com/questions/30436013/info-bubble-text-in-a-shiny-interface
icon("question")
)
)
}
userDataGen <- function() {
require("MASS")
cat(file = stderr(), 'generate data \n')
# assign result to shared 'dataIn' variable
loc.x <-
rbind(mvrnorm(10, c(0, 0), matrix(c(1, 0.9, 0.9, 1), 2, 2)), mvrnorm(10, c(4, 0), matrix(c(1,-0.9,-0.9, 1), 2, 2)))
loc.x <- cbind(loc.x, matrix(rnorm(20 * 18), 20, 18))
rownames(loc.x) = c(paste("A", sprintf("%02d", 1:10), sep = ""), paste("B", sprintf("%02d", 1:10), sep = ""))
colnames(loc.x) = LETTERS[1:20]
return(loc.x)
}
# RShiny module for downloading pdf of the plot
# Use:
# in ui.R
# downPlotUI('uniqueID', "your_label")
#
# in server.R
# callModule(downPlot, "uniqueID", 'fname.pdf', input_plot_to_save)
downPlotUI <- function(id, label = "Download Plot") {
ns <- NS(id)
tagList(
# Label to display as h4 header
h4(label),
fluidRow(
column(
3,
numericInput(
ns('inPlotWidth'),
"Width",
17,
min = 1,
width = 100
)
),
column(
3,
numericInput(
ns('inPlotHeight'),
"Height",
10,
min = 1,
width = 100
)
),
column(6,
downloadButton(ns('downPlot'), 'PDF'))
)
)
}
downPlot <- function(input, output, session, in.fname, in.plot) {
output$downPlot <- downloadHandler(
filename = function() {
in.fname
},
content = function(file) {
pdf(file,
width = input$inPlotWidth,
height = input$inPlotHeight)
in.plot()
dev.off()
}
)
}
\ No newline at end of file
# RShiny module for performing Bayesian clustering using bclust
# Use:
# in ui.R
# tabPanel(
# 'Hierarchical',
# clustBayUI('TabClustBay'))
#
# in server.R
# callModule(clustBay, 'TabClustBay', dataMod)
# where dataMod is the output from a reactive function that returns dataset ready for clustering
require(gplots) # heatmap.2
require(dendextend) # color_branches
require(RColorBrewer) # brewer.pal
require(d3heatmap) # interactive heatmap
require(bclust) # Bayesian clustering
l.col.pal = list(
"Spectral" = 'Spectral',
"White-Orange-Red" = 'OrRd',
"Yellow-Orange-Red" = 'YlOrRd',
"Reds" = "Reds",
"Oranges" = "Oranges",
"Greens" = "Greens",
"Blues" = "Blues"
)
# UI
clustBayUI <- function(id, label = "Sparse Hierarchical CLustering") {
ns <- NS(id)
tagList(
h4(
"Bayesian clustering using ",
a("bclust", href = "https://cran.r-project.org/web/packages/bclust/index.html")
),
p('The algorithm does not deal with missing values. Use conversion to zeroes in the right panel.'),
p(
'Column labels in the heat-map are additionally labeled according to their Bayes weight (\"importance\"):'
),
tags$ol(
tags$li("Blue with \"-\" - variable not likely to participate in optimal clustering (negative weight)"),
tags$li("Black - low importance (weight factor in 1st quartile)"),
tags$li("Green with \"*\" - medium importance (weight factor in 2nd quartile)"),
tags$li("Orange with \"**\" - high importance (weight factor in 3rd quartile)"),
tags$li("Red with \"***\" - very high importance (weight factor in 4th quartile)")
),
br(),
fluidRow(
column(6,
selectInput(
ns("selectPlotBayHmPalette"),
label = "Select colour palette:",
choices = l.col.pal,
selected = 'Spctral'
),
checkboxInput(ns('inPlotBayHmRevPalette'), 'Reverse colour palette', TRUE),
checkboxInput(ns('selectPlotBayDend'),
'Plot dendrogram and re-order samples', TRUE),
checkboxInput(ns('selectPlotBayKey'), 'Plot colour key', TRUE)
),
column(6,
uiOutput(ns('inPlotBayHmNclustSlider')),
sliderInput(
ns('inPlotBayHmGridColor'),
'Shade of grey for grid lines (0 - black, 1 - white)',
min = 0,
max = 1,
value = 0.6,
step = .1,
ticks = TRUE
)
)
),
fluidRow(
column(
3,
numericInput(
ns('inPlotBayHmMarginX'),
'Margin below x-axis',
10,
min = 1,
width = 100
)
),
column(
3,
numericInput(
ns('inPlotBayHmMarginY'),
'Margin right of y-axis',
10,
min = 1,
width = 100
)
),
column(
3,
numericInput(
ns('inPlotBayHmFontX'),
'Font size row labels',
1,
min = 0,
width = 100,
step = 0.1
)
),
column(
3,
numericInput(
ns('inPlotBayHmFontY'),
'Font size column labels',
1,
min = 0,
width = 100,
step = 0.1
)
)
),
br(),
downPlotUI(ns('downPlotBayHM')),
br(),
checkboxInput(ns('inPlotBayInteractive'), 'Interactive Plot?', value = FALSE),
uiOutput(ns("plotBayInt_ui"))
)
}
# SERVER
clustBay <- function(input, output, session, dataMod) {
userFitBclus <- reactive({
cat(file = stderr(), 'userFitBclus \n')
loc.dm = dataMod()
if (is.null(loc.dm))
return(NULL)
bclust(loc.dm, transformed.par = c(0, -50, log(16), 0, 0, 0))
})
userDendBclus <- reactive({
cat(file = stderr(), 'userDendBclus \n')
d.bclus = userFitBclus()
if (is.null(d.bclus))
return(NULL)
dend <- as.dendrogram(d.bclus)
# dend <- color_branches(dend, k = d.bclus$optim.clustno)
dend <- color_branches(dend, k = input$inPlotBayHmNclust)
# browser()
})
userVarImpBclus <- reactive({
cat(file = stderr(), 'userVarImpBclus \n')
d.bclus = userFitBclus()
if (is.null(d.bclus))
return(NULL)
return(imp(d.bclus)$var)
})
output$inPlotBayHmNclustSlider = renderUI({
ns <- session$ns
loc.dm = dataMod()
if (is.null(loc.dm))
return(NULL)
loc.d.bclus = userFitBclus()
if (is.null(loc.d.bclus))
return(NULL)
sliderInput(
ns('inPlotBayHmNclust'),
'#clusters to colour (default: optimal # from bclust)',
min = 1,
max = nrow(loc.dm),
value = loc.d.bclus$optim.clustno,
step = 1,
ticks = TRUE,
round = TRUE
)
})
plotBayHm <- function() {
cat(file = stderr(), 'plotBayHm \n')
loc.dm = dataMod()
if (is.null(loc.dm))
return(NULL)
loc.dend = userDendBclus()
if (is.null(loc.dend))
return(NULL)
loc.var.imp = imp(userFitBclus())$var
if (is.null(loc.var.imp))
return(NULL)
col_labels <- get_leaves_branches_col(loc.dend)
col_labels <- col_labels[order(order.dendrogram(loc.dend))]
if (input$inPlotBayHmRevPalette)
my_palette <-
rev(colorRampPalette(brewer.pal(9, input$selectPlotBayHmPalette))(n = 99))
else
my_palette <-
colorRampPalette(brewer.pal(9, input$selectPlotBayHmPalette))(n = 99)
if (input$selectPlotBayDend) {
assign("var.tmp.1", loc.dend)
var.tmp.2 = "row"
} else {
assign("var.tmp.1", FALSE)
var.tmp.2 = "none"
}
loc.colnames = paste0(ifelse(loc.var.imp < 0, "- ",
ifelse(
loc.var.imp < quantile(loc.var.imp, 0.25), "",
ifelse(loc.var.imp < quantile(loc.var.imp, 0.5), "* ",
ifelse(loc.var.imp < quantile(loc.var.imp, 0.75), "** ", "*** "))
)), colnames(loc.dm))
loc.colcol = ifelse(loc.var.imp < 0, "blue",
ifelse(
loc.var.imp < quantile(loc.var.imp, 0.25), "black",
ifelse(loc.var.imp < quantile(loc.var.imp, 0.5), "green",
ifelse(loc.var.imp < quantile(loc.var.imp, 0.75), "orange", "red"))
))
heatmap.2(
loc.dm,
Colv = "NA",
Rowv = var.tmp.1,
srtCol = 90,
dendrogram = var.tmp.2,
trace = "none",
key = input$selectPlotBayKey,
margins = c(input$inPlotBayHmMarginX, input$inPlotBayHmMarginY),
col = my_palette,
na.col = grey(input$inPlotBayHmNAcolor),
denscol = "black",
density.info = "density",
RowSideColors = col_labels,
colRow = col_labels,
colCol = loc.colcol,
labCol = loc.colnames,
sepcolor = grey(input$inPlotBayHmGridColor),
colsep = 1:ncol(loc.dm),
rowsep = 1:nrow(loc.dm),
cexRow = input$inPlotBayHmFontX,
cexCol = input$inPlotBayHmFontY,
main = "Bayesian Clustering (bclust)"
)
}
plotBayImp <- function() {
cat(file = stderr(), 'plotBayImp \n')
loc.dm = dataMod()
if (is.null(loc.dm))
return(NULL)
loc.d.bclus = userFitBclus()
if (is.null(loc.d.bclus))
return(NULL)
#cat(imp(loc.d.bclus)$var)
viplot(
imp(loc.d.bclus)$var,
xlab = colnames(loc.dm),
xlab.srt = 90,
xlab.mar = input$inPlotBayHmMarginX,
xlab.cex = input$inPlotBayHmFontY,
main = '\nVariable importance\n'
)
}
output$outPlotBayHm <- renderPlot({
plotBayHm()
}, height = 800)
output$outPlotBayImp <- renderPlot({
plotBayImp()
}, height = 800)
output$plotBayInt <- renderD3heatmap({
cat(file = stderr(), 'plotBayInt \n')
loc.dm = dataMod()
if (is.null(loc.dm))
return(NULL)
loc.dend = userDendBclus()
if (is.null(loc.dend))
return(NULL)
loc.var.imp = imp(userFitBclus())$var
if (is.null(loc.var.imp))
return(NULL)
col_labels <- get_leaves_branches_col(loc.dend)
col_labels <- col_labels[order(order.dendrogram(loc.dend))]
if (input$inPlotBayHmRevPalette)
my_palette <-
rev(colorRampPalette(brewer.pal(9, input$selectPlotBayHmPalette))(n = 99))
else
my_palette <-
colorRampPalette(brewer.pal(9, input$selectPlotBayHmPalette))(n = 99)
if (input$selectPlotBayDend) {
assign("var.tmp.1", loc.dend)
var.tmp.2 = "row"
} else {
assign("var.tmp.1", FALSE)
var.tmp.2 = "none"
}
loc.colnames = paste0(ifelse(loc.var.imp < 0, "- ",
ifelse(
loc.var.imp < quantile(loc.var.imp, 0.25), "",
ifelse(loc.var.imp < quantile(loc.var.imp, 0.5), "* ",
ifelse(loc.var.imp < quantile(loc.var.imp, 0.75), "** ", "*** "))
)), colnames(loc.dm))
d3heatmap(
loc.dm,
Rowv = var.tmp.1,
dendrogram = var.tmp.2,
trace = "none",
revC = FALSE,
margins = c(input$inPlotBayHmMarginX, input$inPlotBayHmMarginY),
colors = my_palette,
na.col = grey(input$inPlotBayNAcolor),
cexRow = input$inPlotBayHmFontY,
cexCol = input$inPlotBayHmFontX,
xaxis_height = input$inPlotBayHmMarginX,
yaxis_width = input$inPlotBayHmMarginY,
show_grid = TRUE,
labRow = rownames(loc.dm),
labCol = loc.colnames
)
})
output$plotBayInt_ui <- renderUI({
ns <- session$ns
if (input$inPlotBayInteractive)
d3heatmapOutput(ns("plotBayInt"))
else {
plotOutput(ns('outPlotBayHm'))
}
})
callModule(downPlot, "downPlotBayHM", 'clust_bayesian_dend.pdf', plotBayHm)
}
# RShiny module for performing hierarchical clustering
# Use:
# in ui.R
# tabPanel(
# 'Hierarchical',
# clustHierUI('TabClustHier'))
#
# in server.R
# callModule(clustHier, 'TabClustHier', dataMod)
# where dataMod is the output from a reactive function that returns dataset ready for clustering
require(gplots) # heatmap.2
require(dendextend) # color_branches
require(RColorBrewer) # brewer.pal
require(d3heatmap) # interactive heatmap
s.cl.linkage = c("ward.D",
"ward.D2",
"single",
"complete",
"average",
"mcquitty",
"centroid")
s.cl.dist = c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski")
l.col.pal = list(
"Spectral" = 'Spectral',
"White-Orange-Red" = 'OrRd',
"Yellow-Orange-Red" = 'YlOrRd',
"Reds" = "Reds",
"Oranges" = "Oranges",
"Greens" = "Greens",
"Blues" = "Blues"
)
# UI
clustHierUI <- function(id, label = "Hierarchical CLustering") {
ns <- NS(id)
tagList(
h4(
"Hierarchical clustering using standard",
a("hclust", href = "https://stat.ethz.ch/R-manual/R-devel/library/stats/html/hclust.html")
),
br(),