Commit 573617da authored by dmattek's avatar dmattek

Initial commit

parents
# FreeClust: clustering made easy
#### Running on the server
This is source code of an interactive clustering web-app written in R/Shiny. A running instance can be found here:
http://bioz-lcms-chromclust.bioz.unibas.ch:3838/shiny-freeclust
#### Running locally from RStudio
After downloading the source code, open `server.R` or `ui.R` and click `Run App` button in the upper right corner of the window with the code. The following packages need to be installed before running this code:
- shiny
- shinyjs
- gplots (provides `heatmap.2`)
- dendextend (provides `color_branches`)
- RColorBrewer (provides `brewer.pal`)
- d3heatmap (provides interactive `d3heatmap`)
- sparcl (provides sparse hierarchical and k-means clustering)
- bclust (provides Bayesian clustering)
#### About
The web-app integrates several clustering algorithms. These include a widely-used hierarchical clustering (hclust) with a choice of commonly used linkage methods to construct the tree diagram (dendrogram). Additionally, sparse hierarchical clustering (sparcl) and a model-based Bayesian approach tailored to cluster high-dimensional data (with many more variables than samples). The sparse hierarchical and Bayesian clustering provide the information about the importance of a specific categories across the samples.
The web-app allows for rudimentary data manipulation. Users have an option to convert missing values to zeroes which is necessary to use Bayesian clustering. In general, it is important to understand the causes of missing data and treat them accordingly. Viable options include omitting such samples entirely or replacing missing data with imputed values, the population mean, etc.
Rescaling is another data manipulation option available to users. When switched on, each row has its mean subtracted and the result is divided by row’s standard deviation; this corresponds to calculating z-scores. Taking log10 of data is another option available.Users can also trim data to omit values below and above a threshold. Such data points are turned into missing values, however these are no longer subject to conversion of missing values in source data described above. Also available is data clipping which assigns threshold values to data exceeding (from below or above) these thresholds. Neither trimming nor clipping affects the zeroes resulting from replacement of missing data.
#### Data format
Users can generate artificial random dataset or can upload a text file in CSV format where rows correspond to different categories, and columns correspond to samples. The first column should include labels of categories, while the first row should contain sample names. An example file is in example-data folder.
Depending on regional settings (i.e. locale), Excel might save the CSV file differently from the default setting where columns are separated by a comma and dot is used as a decimal separator. For example, a German locale would result in a CSV file with columns separated by a semicolon and decimal point signified by a comma. The app can account for such variations of the input format as well as the convention used to represent missing values, e.g. whether it is a character string “NA”, a dash “-“, or an empty space.
\ No newline at end of file
categories,species1,species2,species3,species4,species5,species6,species7,species8,species9,species10,,,,
cat1,-,-,-,-,-,-,-,-,-,307609408,,,,
cat2,-,-,-,-,-,537825024,307609408,-,-,-,,,,
cat3,-,-,-,-,-,771978432,494108320,-,-,-,,,,
cat4,-,-,-,-,-,95841840,292572000,-,-,-,,,,
cat5,-,-,-,-,-,-,-,-,-,307609408,,,,
cat6,-,67445448,-,-,-,234705904,-,-,-,-,,,,
cat7,-,-,-,-,-,-,-,-,-,307609408,,,,
cat8,-,-,-,-,-,-,-,-,-,307609408,,,,
cat9,-,-,-,-,-,214319648,-,-,-,-,,,,
cat10,341216384,-,134485616,76037640,70646576,-,-,-,134183992,182809296,,,,
cat11,328572096,428627072,-,-,832382976,-,-,636496768,-,467344960,,,,
cat12,188422592,657460480,844082816,-,-,-,-,-,-,404283968,,,,
cat13,532154400,80290832,-,-,-,-,-,-,-,-,,,,
cat14,-,133105152,548460160,-,-,-,-,-,-,-,,,,
cat15,-,-,-,-,-,-,-,-,-,404283968,,,,
cat16,25866828,8269378,21645604,21738014,102893520,153650896,305186240,111867232,153310176,6320292,,,,
cat17,201420064,430386144,1117632256,561706880,293825472,1011808192,-,226602880,1869593984,258388720,,,,
cat18,-,-,-,1007009600,-,56902716,5228181,30271422,31765980,-,,,,
cat19,-,241968992,170721760,158485136,149803744,124493976,293502880,235966256,157396672,-,,,,
cat20,719030144,857534016,616804544,713191680,578310528,203675584,666675584,826807936,620248192,678372096,,,,
\ No newline at end of file
source('modules/auxfn.R')
source('modules/downPlot.R')
source('modules/tabHier.R')
source('modules/tabHierSparse.R')
source('modules/tabBayClust.R')
# From: https://www.r-bloggers.com/winsorization/
winsor1 <-
function (x, fraction=.05)
{
if(length(fraction) != 1 || fraction < 0 ||
fraction > 0.5) {
stop("bad value for 'fraction'")
}
lim <- quantile(x, probs=c(fraction, 1-fraction))
x[ x < lim[1] ] <- lim[1]
x[ x > lim[2] ] <- lim[2]
x
}
winsor2 <- function (x, multiple=3)
{
if(length(multiple) != 1 || multiple <= 0) {
stop("bad value for 'multiple'")
}
med <- median(x, na.rm = TRUE)
y <- x - med
sc <- mad(y, center=0, na.rm = TRUE) * multiple
y[ y > sc ] <- sc
y[ y < -sc ] <- -sc
y + med
}
# From: https://gist.github.com/jcheng5/5913297
helpPopup <- function(title, content,
placement=c('right', 'top', 'left', 'bottom'),
trigger=c('click', 'hover', 'focus', 'manual')) {
tagList(
singleton(
tags$head(
tags$script("$(function() { $(\"[data-toggle='popover']\").popover(); })")
)
),
tags$a(
href = "#", class = "btn btn-mini", `data-toggle` = "popover",
title = title, `data-content` = content, `data-animation` = TRUE,
`data-placement` = match.arg(placement, several.ok=TRUE)[1],
`data-trigger` = match.arg(trigger, several.ok=TRUE)[1],
#tags$i(class="icon-question-sign")
# changed based on http://stackoverflow.com/questions/30436013/info-bubble-text-in-a-shiny-interface
icon("question")
)
)
}
userDataGen <- function() {
require("MASS")
cat(file = stderr(), 'generate data \n')
# assign result to shared 'dataIn' variable
loc.x <-
rbind(mvrnorm(10, c(0, 0), matrix(c(1, 0.9, 0.9, 1), 2, 2)), mvrnorm(10, c(4, 0), matrix(c(1,-0.9,-0.9, 1), 2, 2)))
loc.x <- cbind(loc.x, matrix(rnorm(20 * 18), 20, 18))
rownames(loc.x) = c(paste("A", sprintf("%02d", 1:10), sep = ""), paste("B", sprintf("%02d", 1:10), sep = ""))
colnames(loc.x) = LETTERS[1:20]
return(loc.x)
}
# RShiny module for downloading pdf of the plot
# Use:
# in ui.R
# downPlotUI('uniqueID', "your_label")
#
# in server.R
# callModule(downPlot, "uniqueID", 'fname.pdf', input_plot_to_save)
downPlotUI <- function(id, label = "Download Plot") {
ns <- NS(id)
tagList(
# Label to display as h4 header
h4(label),
fluidRow(
column(
3,
numericInput(
ns('inPlotWidth'),
"Width",
17,
min = 1,
width = 100
)
),
column(
3,
numericInput(
ns('inPlotHeight'),
"Height",
10,
min = 1,
width = 100
)
),
column(6,
downloadButton(ns('downPlot'), 'PDF'))
)
)
}
downPlot <- function(input, output, session, in.fname, in.plot) {
output$downPlot <- downloadHandler(
filename = function() {
in.fname
},
content = function(file) {
pdf(file,
width = input$inPlotWidth,
height = input$inPlotHeight)
in.plot()
dev.off()
}
)
}
\ No newline at end of file
This diff is collapsed.
# RShiny module for performing hierarchical clustering
# Use:
# in ui.R
# tabPanel(
# 'Hierarchical',
# clustHierUI('TabClustHier'))
#
# in server.R
# callModule(clustHier, 'TabClustHier', dataMod)
# where dataMod is the output from a reactive function that returns dataset ready for clustering
require(gplots) # heatmap.2
require(dendextend) # color_branches
require(RColorBrewer) # brewer.pal
require(d3heatmap) # interactive heatmap
s.cl.linkage = c("ward.D",
"ward.D2",
"single",
"complete",
"average",
"mcquitty",
"centroid")
s.cl.dist = c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski")
l.col.pal = list(
"Spectral" = 'Spectral',
"White-Orange-Red" = 'OrRd',
"Yellow-Orange-Red" = 'YlOrRd',
"Reds" = "Reds",
"Oranges" = "Oranges",
"Greens" = "Greens",
"Blues" = "Blues"
)
# UI
clustHierUI <- function(id, label = "Hierarchical CLustering") {
ns <- NS(id)
tagList(
h4(
"Hierarchical clustering using standard",
a("hclust", href = "https://stat.ethz.ch/R-manual/R-devel/library/stats/html/hclust.html")
),
br(),
fluidRow(
column(
6,
selectInput(
ns("selectDist"),
label = ("Select distance method:"),
choices = list(
"Euclidean" = 1,
"Maximum" = 2,
"Manhattan" = 3,
"Canberra" = 4,
"Binary" = 5,
"Minkowski" = 6
),
selected = 1
),
selectInput(
ns("selectLinkage"),
label = ("Select linkage method:"),
choices = list(
"Ward" = 1,
"Ward D2" = 2,
"Single" = 3,
"Complete" = 4,
"Average" = 5,
"McQuitty" = 6,
"Centroid" = 7
),
selected = 1
),
checkboxInput(ns('selectDend'),
'Plot dendrogram and re-order samples',
TRUE),
sliderInput(
ns('inNclust'),
'#dendrogram branches to colour',
min = 1,
max = 10,
value = 1,
step = 1,
ticks = TRUE,
round = TRUE
),
checkboxInput(ns('selectKey'),
'Plot colour key',
TRUE)
),
column(
6,
selectInput(
ns("selectPalette"),
label = "Select colour palette:",
choices = l.col.pal,
selected = 'Spectral'
),
checkboxInput(ns('inRevPalette'),
'Reverse colour palette',
TRUE),
sliderInput(
ns('inNAcolor'),
'Shade of grey for NA values (0 - black, 1 - white)',
min = 0,
max = 1,
value = 0.8,
step = .1,
ticks = TRUE
),
sliderInput(
ns('inGridColor'),
'Shade of grey for grid lines (0 - black, 1 - white)',
min = 0,
max = 1,
value = 0.6,
step = .1,
ticks = TRUE
)
)
),
br(),
fluidRow(
column(3,
numericInput(
ns('inMarginX'),
'Margin below x-axis',
10,
min = 1,
width = 100
)
),
column(3,
numericInput(
ns('inMarginY'),
'Margin right of y-axis',
10,
min = 1,
width = 100
)
),
column(3,
numericInput(
ns('inFontX'),
'Font size row labels',
1,
min = 0,
width = 100,
step = 0.1
)
),
column(3,
numericInput(
ns('inFontY'),
'Font size column labels',
1,
min = 0,
width = 100,
step = 0.1
)
)
),
br(),
downPlotUI(ns('downPlotHier'), "Download PDF"),
br(),
checkboxInput(ns('plotInt'),
'Interactive Plot?',
value = FALSE),
uiOutput(ns("plotInt_ui"))
)
}
# SERVER
clustHier <- function(input, output, session, dataMod) {
userFitDendHier <- reactive({
cat(file = stderr(), 'userFitDendHier \n')
loc.dm = dataMod()
if (is.null(loc.dm))
return(NULL)
loc.cl.dist = dist(loc.dm, method = s.cl.dist[as.numeric(input$selectDist)])
loc.cl.hc = hclust(loc.cl.dist, method = s.cl.linkage[as.numeric(input$selectLinkage)])
loc.dend <- as.dendrogram(loc.cl.hc)
loc.dend <- color_branches(loc.dend, k = input$inNclust)
return(loc.dend)
})
# Function instead of reactive as per:
# http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
# This function is used to plot and to downoad a pdf
plotHier <- function() {
cat(file = stderr(), 'plotHier \n')
in.dm = dataMod()
if (is.null(in.dm))
return(NULL)
in.dend <- userFitDendHier()
if (is.null(in.dend))
return(NULL)
if (input$inRevPalette)
my_palette <-
rev(colorRampPalette(brewer.pal(9, input$selectPalette))(n = 99))
else
my_palette <-
colorRampPalette(brewer.pal(9, input$selectPalette))(n = 99)
col_labels <- get_leaves_branches_col(in.dend)
col_labels <- col_labels[order(order.dendrogram(in.dend))]
if (input$selectDend) {
assign("var.tmp.1", in.dend)
var.tmp.2 = "row"
} else {
assign("var.tmp.1", FALSE)
var.tmp.2 = "none"
}
heatmap.2(
in.dm,
Colv = "NA",
Rowv = var.tmp.1,
srtCol = 90,
dendrogram = var.tmp.2,
trace = "none",
key = input$selectKey,
margins = c(input$inMarginX, input$inMarginY),
col = my_palette,
na.col = grey(input$inNAcolor),
denscol = "black",
density.info = "density",
RowSideColors = col_labels,
colRow = col_labels,
sepcolor = grey(input$inGridColor),
colsep = 1:ncol(in.dm),
rowsep = 1:nrow(in.dm),
cexRow = input$inFontX,
cexCol = input$inFontY,
main = paste(
"Distance measure: ",
s.cl.dist[as.numeric(input$selectDist)],
"\nLinkage method: ",
s.cl.linkage[as.numeric(input$selectLinkage)]
)
)
}
# Hierarchical - display plot
output$outPlotHier <- renderPlot({
plotHier()
}, height = 800)
# Hierarchical - download pdf
callModule(downPlot, "downPlotHier", paste0('clust_hierch_',
s.cl.dist[as.numeric(input$selectDist)],
'_',
s.cl.linkage[as.numeric(input$selectLinkage)], '.pdf'), plotHier)
# Hierarchical clustering - interactive version
output$outPlotInt <- renderD3heatmap({
cat(file = stderr(), 'Int \n')
loc.dm = dataMod()
if (is.null(loc.dm))
return(NULL)
loc.dend <- userFitDendHier()
if (is.null(loc.dend))
return(NULL)
if (input$inRevPalette)
my_palette <-
rev(colorRampPalette(brewer.pal(9, input$selectPalette))(n = 99))
else
my_palette <-
colorRampPalette(brewer.pal(9, input$selectPalette))(n = 99)
col_labels <- get_leaves_branches_col(loc.dend)
col_labels <- col_labels[order(order.dendrogram(loc.dend))]
if (input$selectDend) {
assign("var.tmp.1", loc.dend)
var.tmp.2 = "row"
} else {
assign("var.tmp.1", FALSE)
var.tmp.2 = "none"
}
d3heatmap(
loc.dm,
Rowv = var.tmp.1,
dendrogram = var.tmp.2,
trace = "none",
revC = FALSE,
na.rm = FALSE,
margins = c(input$inMarginX * 10, input$inMarginY * 10),
colors = my_palette,
na.col = grey(input$inNAcolor),
cexRow = input$inFontY * 0.5,
cexCol = input$inFontX * 0.5,
xaxis_height = input$inMarginX * 10,
yaxis_width = input$inMarginY * 10,
show_grid = TRUE
)
})
# Hierarchical - choose to display regular heatmap.2 or d3heatmap (interactive)
output$plotInt_ui <- renderUI({
ns <- session$ns
if (input$plotInt)
tagList(d3heatmapOutput(ns("outPlotInt")))
else
tagList(plotOutput(ns('outPlotHier')))
})
}
\ No newline at end of file
This diff is collapsed.
# This is the server logic for a Shiny web application.
# You can find out more about building applications with Shiny here:
#
# http://shiny.rstudio.com
#
library(shiny)
library(shinyjs) #http://deanattali.com/shinyjs/
shinyServer(function(input, output, session) {
# This is only set at session start
# we use this as a way to determine which input was
# clicked in the dataInBoth reactive
counter <- reactiveValues(
# The value of inDataGen1,2 actionButton is the number of times they were pressed
dataGen1 = isolate(input$butDataGen1),
dataLoad = isolate(input$butDataLoad)
)
# This button will reset the inFileDataLoad
observeEvent(input$butReset, {
reset("fileDataLoad") # reset is a shinyjs function
})
# generate random dataset 1
dataGen1 <- eventReactive(input$butDataGen1, {
cat("dataGen1\n")
return(userDataGen())
})
# load main data file
dataLoad <- eventReactive(input$butDataLoad, {
cat("dataLoad\n")
locFilePath = input$fileDataLoad$datapath
counter$dataLoad <- input$butDataLoad - 1
if (is.null(locFilePath) || locFilePath == '') {
cat("dataLoad: null\n")
return(NULL)
}
else {
cat("dataLoad: read\n")
loc.df = read.csv(
locFilePath,
na.strings = input$rButDataNA,
sep = input$rButDataSep,
dec = input$rButDataDec
)
row.names(loc.df) = loc.df[, 1]
loc.df[, 1] = NULL
# work with data matrix, where:
# columns - categories
# rows - samples
return(as.matrix(t(loc.df)))
}
})
dataInBoth <- reactive({
# Without direct references to inDataGen1,2 and inFileLoad, inDataGen2
# does not trigger running this reactive once inDataGen1 is used.
# This is one of the more nuanced areas of reactive programming in shiny
# due to the if else logic, it isn't fetched once inDataGen1 is available
# The morale is use direct retrieval of inputs to guarantee they are available
# for if else logic checks!
locInGen1 = input$butDataGen1
locInDataLoad = input$butDataLoad
cat(
"dataInBoth\ninGen1: ",
locInGen1,
" prev=",
isolate(counter$dataGen1),
"\ninDataNuc: ",
locInDataLoad,
" prev=",
isolate(counter$dataLoad),
"\n"
)
# isolate the checks of counter reactiveValues
# as we set the values in this same reactive
if (locInGen1 != isolate(counter$dataGen1)) {
cat("dataInBoth: inDataGen1\n")
dm = userDataGen()
# no need to isolate updating the counter reactive values!
counter$dataGen1 <- locInGen1
} else if (locInDataLoad != isolate(counter$dataLoad)) {
cat("dataInBoth: inDataLoad\n")
dm = dataLoad()
# no need to isolate updating the counter reactive values!
counter$dataLoad <- locInDataLoad
} else {
cat("dataInBoth: else\n")
dm = NULL
}
return(dm)
})
# return dt with an added column with unique track object label
dataMod <- reactive({
cat(file = stderr(), 'dataMod\n')
loc.dm = dataInBoth()
if (is.null(loc.dm))
return(NULL)
if (input$chBdataScale)
loc.dm = scale(loc.dm, center = TRUE, scale = TRUE)
# take log10 of data
if (input$chBdataLog)
loc.dm = log10(loc.dm)
# winsorize
if (input$chBdataWinsor2)
loc.dm = winsor2(loc.dm)
# convert missing values in the input data to 0's
if (input$chBdataNA20)
loc.dm[is.na(loc.dm)] <- 0
# Data trimming
# data points below a threshold are set to NA
# this isn't affected by conversion to 0's above
if (input$chBdataTrim) {
loc.dm[loc.dm < as.numeric(input$inDataTrimMin) & loc.dm != 0] <- NA
# data points above a threshold are set to NA
# this isn't affected by conversion to 0's above
loc.dm[loc.dm > as.numeric(input$inDataTrimMax)] <- NA
}
# Data clipping
if (input$chBdataClip) {
loc.dm[loc.dm < as.numeric(input$inDataClipMin) &
loc.dm != 0] <- input$inDataClipMin
loc.dm[loc.dm > as.numeric(input$inDataClipMax)] <-
input$inDataClipMax
}
return(loc.dm)
})
#####
## Dynamic UI in the side panel
output$dataMin <- renderText({