# # Time Course Inspector: Shiny app for plotting time series data # Author: Maciej Dobrzynski # # This module is a tab for hierarchical clustering (base R hclust + dist) helpText.clValid = c(alertClValidNAsPresent = paste0("NAs present. The selected distance measure will work, ", "however PCA will not be avaliable."), alertClValidNAsPresentDTW = paste0("NAs present. DTW distance measure will NOT work."), alLearnMore = paste0("
Clustering ", "is an unsupervised machine learning method for partitioning ", "dataset into a set of groups called clusters. The procedure will return clusters ", "even if the data does not contain any! ", "Therefore, it’s necessary to ", "assess clustering tendency before the analysis, and ", "validate the quality of the result after clustering.
" ), alLearnMoreRel = paste0("
Determine the optimal number of clusters by inspecting ", "the average silhouette width and the total within cluster sum of squares (WSS) ", "for a range of cluster numbers.
", "Silhouette analysis first computes how close each trajectory is with others in the cluster it is assigned to, ", "this is then compared to closeness with trajectories in other clusters. ", "Larger average silhouette widths usually indicate better clustering. To make sure averaging does not hide a locally bad", "clustering, this should be inspected along with the silhouette plot in the \"Internal\" tab.
", "
WSS evaluates the compactness of clusters. ", "Compact clusters achieve low WSS values. ", "Look for the elbow in the plot of WSS as function of cluster numbers.
"), alLearnMoreInt = paste0("Evaluate the goodness of a clustering structure by inspecting ", "principal components, the dendrogram, ", "and the silhouette for a given number of clusters.
", "Principal components: Each point in the scatter plot corresponds to a single time series in the first 2 PCs space. ", "Points are coloured by cluster numbers. Compact, well separated clusters ", "indicate good partitioning. The percentage of total variance carried by each PC is indicated.
", "Dendrogram: The height of branches indicates how well clusters are separated.
", "Silhouette plot: The plot indicates for each series whether it is on average closer to series within its cluster ",
"or to series in other clusters. Each bar represents the silhouette score ",
"(Si) for one series. The height of the bars varies ",
"between 1 (the series is much closer to series in its cluster) and -1 (the series is much closer to series in an other cluster). ",
"Hence, large positive values of Si are usually associated with better clustering, while negative values are associated with worse clustering.")
)
# UI ----
clustValidUI <- function(id, label = "Validation") {
ns <- NS(id)
tagList(
h4(
"Cluster validation using ",
a("factoextra",
href="https://cran.r-project.org/web/packages/factoextra/",
title="External link")
),
actionLink(ns("alLearnMore"), "Learn more"),
br(),
br(),
fluidRow(
column(4,
selectInput(
ns("selectDiss"),
label = ("Dissimilarity measure"),
choices = list("Euclidean" = "euclidean",
"Manhattan" = "manhattan",
"Maximum" = "maximum",
"Canberra" = "canberra",
"DTW" = "DTW"),
selected = "euclidean"
),
bsAlert("alertAnchorClValidNAsPresent")
),
column(4,
selectInput(
ns("selectLinkage"),
label = ("Linkage method"),
choices = list(
"Average" = "average",
"Complete" = "complete",
"Single" = "single",
"Centroid" = "centroid",
"Ward" = "ward.D",
"Ward D2" = "ward.D2",
"McQuitty" = "mcquitty"
),
selected = "average"
)
)
),
br(),
tabsetPanel(
tabPanel("Relative",
br(),
p("Determine and visualise the optimal number of clusters. ",
actionLink(ns("alLearnMoreRel"), "Learn more")),
fluidRow(
column(2,
actionButton(ns('butPlotRel'), 'Validate!')
),
column(6,
sliderInput(
ns('slClValidMaxClust'),
'Maximum number of clusters to consider',
min = 2,
max = 20,
value = 10,
step = 1,
ticks = TRUE,
round = TRUE
)
)
),
br(),
withSpinner(plotOutput(ns('outPlotSilhAvg'))),
br(),
withSpinner(plotOutput(ns('outPlotWss')))
),
tabPanel("Internal",
br(),
p("Validate a given data partitioning. ",
actionLink(ns("alLearnMoreInt"), "Learn more")),
fluidRow(
column(2,
actionButton(ns('butPlotInt'), 'Validate!')
),
column(6,
sliderInput(
ns('slClValidNclust'),
'Number of clusters to evaluate',
min = 2,
max = 20,
value = 1,
step = 1,
ticks = TRUE,
round = TRUE
)
)
),
br(),
withSpinner(plotOutput(ns('outPlotTree'))),
br(),
withSpinner(plotOutput(ns('outPlotSilhForCut'))),
br(),
withSpinner(plotOutput(ns('outPlotClPCA')))
)
)
)
}
# SERVER ----
clustValid <- function(input, output, session, in.dataWide) {
ns = session$ns
# Return the number of clusters from the slider
# and delay by a constant in milliseconds defined in auxfunc.R
returnNclust = reactive({
return(input$slClValidNclust)
}) %>% debounce(MILLIS)
# Return max number of clusters from the slider
# and delay by a constant in milliseconds defined in auxfunc.R
returnMaxNclust = reactive({
return(input$slClValidMaxClust)
}) %>% debounce(MILLIS)
# calculate distance matrix for further clustering
# time series arranged in rows with columns corresponding to time points
calcDist <- reactive({
cat(file = stderr(), 'clustValid:calcDist \n')
loc.dm = in.dataWide()
if (is.null(loc.dm)) {
return(NULL)
}
# Throw some warnings if NAs present in the dataset.
# DTW cannot compute distance when NA's are preset.
# Other distance measures can be calculated but caution is required with interpretation.
# NAs in the wide format can result from explicit NAs in the measurment column or
# from missing rows that cause NAs to appear when convertinf from long to wide (dcast)
if(sum(is.na(loc.dm)) > 0) {
if (input$selectDiss == "DTW") {
createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresentDTW", title = "Error",
content = helpText.clValid[["alertClValidNAsPresentDTW"]],
append = FALSE,
style = "danger")
closeAlert(session, 'alertClValidNAsPresent')
return(NULL)
} else {
createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", title = "Warning",
content = helpText.clValid[["alertClValidNAsPresent"]],
append = FALSE,
style = "warning")
closeAlert(session, 'alertClValidNAsPresentDTW')
}
} else {
closeAlert(session, 'alertClValidNAsPresentDTW')
closeAlert(session, 'alertClValidNAsPresent')
}
# calculate distance matrix
return(proxy::dist(loc.dm, method = input$selectDiss))
})
# calculate dendrogram for a chosen number of clusters and the linkage method
calcDendCut = reactive({
cat(file = stderr(), 'clustValid:calcDendCut \n')
loc.dist = calcDist()
if (is.null(loc.dist)) {
return(NULL)
}
return(factoextra::hcut(x = loc.dist,
k = returnNclust(),
FUNcluster = "hclust",
hc_method = input$selectLinkage,
graph = FALSE))
})
# Plotting ----
# Function instead of reactive as per:
# http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
# This function is used to plot and to downoad a pdf
# plot average silhouette
plotSilhAvg <- function() {
cat(file = stderr(), 'plotSilhAvg: in\n')
# make the f-n dependent on the button click
locBut = input$butPlotRel
# Check if required data exists
# Thanks to isolate all mods in the left panel are delayed
# until clicking the Plot button
loc.dist = isolate(calcDist())
validate(
need(!is.null(loc.dist), "Nothing to plot. Load data first!")
)
loc.p = LOCnbclust(loc.dist,
method = "silhouette",
k.max = returnMaxNclust(),
hc_metric = input$selectDiss,
hc_method = input$selectLinkage) +
xlab("Number of clusters") +
ylab("Average silhouette width") +
ggtitle("Average silhouette width for different cluster numbers") +
LOCggplotTheme(in.font.base = PLOTFONTBASE,
in.font.axis.text = PLOTFONTAXISTEXT,
in.font.axis.title = PLOTFONTAXISTITLE,
in.font.strip = PLOTFONTFACETSTRIP,
in.font.legend = PLOTFONTLEGEND)
return(loc.p)
}
# plot Ws
plotWss <- function() {
cat(file = stderr(), 'plotWss: in\n')
# make the f-n dependent on the button click
locBut = input$butPlotRel
# Check if required data exists
# Thanks to isolate all mods in the left panel are delayed
# until clicking the Plot button
loc.dist = isolate(calcDist())
validate(
need(!is.null(loc.dist), "Nothing to plot. Load data first!")
)
loc.p = LOCnbclust(loc.dist,
method = "wss",
k.max = returnMaxNclust(),
hc_metric = input$selectDiss,
hc_method = input$selectLinkage) +
xlab("Number of clusters") +
ylab("Total within cluster sum of squares") +
ggtitle("Within cluster sum of squares for different cluster numbers") +
LOCggplotTheme(in.font.base = PLOTFONTBASE,
in.font.axis.text = PLOTFONTAXISTEXT,
in.font.axis.title = PLOTFONTAXISTITLE,
in.font.strip = PLOTFONTFACETSTRIP,
in.font.legend = PLOTFONTLEGEND)
return(loc.p)
}
# plot dendrogram tree
plotTree <- function() {
cat(file = stderr(), 'plotTree: in\n')
# make the f-n dependent on the button click
locBut = input$butPlotInt
# Check if required data exists
# Thanks to isolate all mods in the left panel are delayed
# until clicking the Plot button
loc.part = calcDendCut()
validate(
need(!is.null(loc.part), "Nothing to plot. Load data first!")
)
loc.p = factoextra::fviz_dend(loc.part,
show_labels = F,
rect = T,
xlab = "Time series",
main = "Dendrogram") +
LOCggplotTheme(in.font.base = PLOTFONTBASE,
in.font.axis.text = PLOTFONTAXISTEXT,
in.font.axis.title = PLOTFONTAXISTITLE,
in.font.strip = PLOTFONTFACETSTRIP,
in.font.legend = PLOTFONTLEGEND)
return(loc.p)
}
# PCA visualization of partitioning methods
plotClPCA <- function() {
cat(file = stderr(), 'plotTree: in\n')
# make the f-n dependent on the button click
locBut = input$butPlotInt
# Check if required data exists
# Thanks to isolate all mods in the left panel are delayed
# until clicking the Plot button
loc.part = calcDendCut()
loc.dm = in.dataWide()
print(sum(is.na(loc.dm)))
validate(
need(!is.null(loc.part), "Nothing to plot. Load data first!"),
need(!is.null(loc.dm), "Nothing to plot. Load data first!"),
need(sum(is.na(loc.dm)), "Cannot calculate PCA in the presence of missing data and/or NAs.")
)
if (sum(is.na(loc.dm)) > 0)
return(NULL)
loc.p = factoextra::fviz_cluster(loc.part,
data = loc.dm,
geom = "point",
elipse.type = "convex",
main = "Principal components"
)+
LOCggplotTheme(in.font.base = PLOTFONTBASE,
in.font.axis.text = PLOTFONTAXISTEXT,
in.font.axis.title = PLOTFONTAXISTITLE,
in.font.strip = PLOTFONTFACETSTRIP,
in.font.legend = PLOTFONTLEGEND)
return(loc.p)
}
# plot silhouetts for a particular dendrogram cut
plotSilhForCut <- function() {
cat(file = stderr(), 'plotSilhForCut: in\n')
# make the f-n dependent on the button click
locBut = input$butPlotInt
# Check if required data exists
# Thanks to isolate all mods in the left panel are delayed
# until clicking the Plot button
loc.part = calcDendCut()
validate(
need(!is.null(loc.part), "Nothing to plot. Load data first!")
)
loc.p = factoextra::fviz_silhouette(loc.part,
print.summary = FALSE,
main = "Silhouette") +
xlab("Time series") +
LOCggplotTheme(in.font.base = PLOTFONTBASE,
in.font.axis.text = PLOTFONTAXISTEXT,
in.font.axis.title = PLOTFONTAXISTITLE,
in.font.strip = PLOTFONTFACETSTRIP,
in.font.legend = PLOTFONTLEGEND) +
theme(axis.text.x = element_blank())
return(loc.p)
}
# Plot rendering ----
# Display silhouette
output$outPlotSilhAvg <- renderPlot({
loc.p = plotSilhAvg()
if(is.null(loc.p))
return(NULL)
return(loc.p)
})
# Display wss
output$outPlotWss <- renderPlot({
loc.p = plotWss()
if(is.null(loc.p))
return(NULL)
return(loc.p)
})
# Display PCA of clustering
output$outPlotClPCA <- renderPlot({
# This is required to avoid
# "Warning: Error in