Commit ebddc4e8 authored by dmattek's avatar dmattek

Using builtin functions of factoextra

parent 578defa6
...@@ -4,16 +4,8 @@ ...@@ -4,16 +4,8 @@
# #
# This module is a tab for hierarchical clustering (base R hclust + dist) # This module is a tab for hierarchical clustering (base R hclust + dist)
helpText.clValid = c(alertNAsPresentDTW = paste0("NAs present. DTW cannot calculate the distance. ", helpText.clValid = c(alertClValidNAsPresent = paste0("NAs present. The selected distance measure will work, ",
"NAs and missing data can be interpolated by activating the option in the left panel. ", "however caution is recommended. Consider interpolation of NAs and missing data in the left panel."),
"If outlier points were removed, activate \"Interpolate gaps\" or ",
"decrease the threshold for maximum allowed gap length. ",
"The latter will result in entire trajectories with outliers being removed."),
alertNAsPresent = paste0("NAs present. The selected distance measure will work with missing data, ",
"however caution is recommended. NAs and missing data can be interpolated by activating the option in the left panel. ",
"If outlier points were removed, activate \"Interpolate gaps\" or ",
"decrease the threshold for maximum allowed gap length. ",
"The latter will result in entire trajectories with outliers being removed."),
alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ", alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ",
"is an <b>unsupervised</b> machine learning method for partitioning ", "is an <b>unsupervised</b> machine learning method for partitioning ",
"dataset into a set of groups or clusters. The procedure will return clusters ", "dataset into a set of groups or clusters. The procedure will return clusters ",
...@@ -30,8 +22,12 @@ helpText.clValid = c(alertNAsPresentDTW = paste0("NAs present. DTW cannot calcul ...@@ -30,8 +22,12 @@ helpText.clValid = c(alertNAsPresentDTW = paste0("NAs present. DTW cannot calcul
"<p><b>WSS</b> evaluates the compactness of clusters. ", "<p><b>WSS</b> evaluates the compactness of clusters. ",
"Compact clusters achieve low WSS values. ", "Compact clusters achieve low WSS values. ",
"Look for the <i>knee</i> in the plot of WSS as function of cluster numbers.</p>"), "Look for the <i>knee</i> in the plot of WSS as function of cluster numbers.</p>"),
alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting <b>the dendrogram</b> ", alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting ",
"and <b>the silhouette</b> for a given number of clusters.</p>", "principle components, the dendrogram, ",
"and the silhouette for a given number of clusters.</p>",
"<p>Each point in the scatter plot of 2 principle components corresponds to a single time series. ",
"Points are coloured by cluster numbers. Compact, well separated clusters ",
"indicate good partitioning.</p>",
"<p>The height of dendrogram branches indicates how well clusters are separated.</p>", "<p>The height of dendrogram branches indicates how well clusters are separated.</p>",
"<p>The silhouette plot displays how close each time series in one cluster ", "<p>The silhouette plot displays how close each time series in one cluster ",
"is to time series in the neighboring clusters. ", "is to time series in the neighboring clusters. ",
...@@ -57,20 +53,19 @@ clustValidUI <- function(id, label = "Validation") { ...@@ -57,20 +53,19 @@ clustValidUI <- function(id, label = "Validation") {
br(), br(),
fluidRow( fluidRow(
column(3, column(4,
selectInput( selectInput(
ns("selectDiss"), ns("selectDiss"),
label = ("Dissimilarity measure"), label = ("Dissimilarity measure"),
choices = list("Euclidean" = "euclidean", choices = list("Euclidean" = "euclidean",
"Manhattan" = "manhattan", "Manhattan" = "manhattan",
"Maximum" = "maximum", "Maximum" = "maximum",
"Canberra" = "canberra", "Canberra" = "canberra"),
"DTW" = "DTW"), selected = "euclidean"
selected = 1
), ),
bsAlert("alertAnchorClHierNAsPresent") bsAlert("alertAnchorClValidNAsPresent")
), ),
column(3, column(4,
selectInput( selectInput(
ns("selectLinkage"), ns("selectLinkage"),
label = ("Linkage method"), label = ("Linkage method"),
...@@ -83,7 +78,7 @@ clustValidUI <- function(id, label = "Validation") { ...@@ -83,7 +78,7 @@ clustValidUI <- function(id, label = "Validation") {
"Ward D2" = "ward.D2", "Ward D2" = "ward.D2",
"McQuitty" = "mcquitty" "McQuitty" = "mcquitty"
), ),
selected = 2 selected = "average"
) )
) )
), ),
...@@ -128,7 +123,7 @@ clustValidUI <- function(id, label = "Validation") { ...@@ -128,7 +123,7 @@ clustValidUI <- function(id, label = "Validation") {
column(6, column(6,
sliderInput( sliderInput(
ns('slClValidNclust'), ns('slClValidNclust'),
'Number of dendrogram branches to cut', 'Number of clusters to evaluate',
min = 2, min = 2,
max = 20, max = 20,
value = 1, value = 1,
...@@ -139,9 +134,9 @@ clustValidUI <- function(id, label = "Validation") { ...@@ -139,9 +134,9 @@ clustValidUI <- function(id, label = "Validation") {
) )
), ),
br(), br(),
withSpinner(plotOutput(ns('outPlotTree'))), withSpinner(plotOutput(ns('outPlotClPCA'))),
br(), br(),
#withSpinner(plotOutput(ns('outPlotClPCA'))), withSpinner(plotOutput(ns('outPlotTree'))),
br(), br(),
withSpinner(plotOutput(ns('outPlotSilhForCut'))) withSpinner(plotOutput(ns('outPlotSilhForCut')))
) )
...@@ -168,23 +163,14 @@ clustValid <- function(input, output, session, in.data4clust) { ...@@ -168,23 +163,14 @@ clustValid <- function(input, output, session, in.data4clust) {
# Throw some warnings if NAs present in the dataset. # Throw some warnings if NAs present in the dataset.
# DTW cannot compute distance when NA's are preset. # DTW cannot compute distance when NA's are preset.
# Other distance measures can be calculated but caution is required with interpretation. # Other distance measures can be calculated but caution is required with interpretation.
print(sum(is.na(loc.dm)))
if(sum(is.na(loc.dm)) > 0) { if(sum(is.na(loc.dm)) > 0) {
if (input$selectPlotHierDiss == "DTW") { createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", title = "Warning",
createAlert(session, "alertAnchorClHierNAsPresent", "alertNAsPresentDTW", title = "Error", content = helpText.clValid[["alertClValidNAsPresent"]],
content = helpText.clHier[["alertNAsPresentDTW"]],
append = FALSE,
style = "danger")
return(NULL)
} else {
createAlert(session, "alertAnchorClHierNAsPresent", "alertNAsPresent", title = "Warning",
content = helpText.clHier[["alertNAsPresent"]],
append = FALSE, append = FALSE,
style = "warning") style = "warning")
closeAlert(session, 'alertNAsPresentDTW')
}
} else { } else {
closeAlert(session, 'alertNAsPresentDTW') closeAlert(session, 'alertClValidNAsPresent')
closeAlert(session, 'alertNAsPresent')
} }
# calculate distance matrix # calculate distance matrix
...@@ -196,16 +182,49 @@ clustValid <- function(input, output, session, in.data4clust) { ...@@ -196,16 +182,49 @@ clustValid <- function(input, output, session, in.data4clust) {
calcDendCut = reactive({ calcDendCut = reactive({
cat(file = stderr(), 'clustValid:calcDendCut \n') cat(file = stderr(), 'clustValid:calcDendCut \n')
loc.dmdist = userFitDistHier() loc.dm = returnDMwithChecks()
if (is.null(loc.dmdist)) { if (is.null(loc.dm)) {
return(NULL) return(NULL)
} }
return(LOChcut(x = loc.dmdist, return(factoextra::eclust(x = loc.dm,
k = input$slClValidNclust, FUNcluster = "hclust",
hc_func = "hclust", k = input$slClValidNclust,
hc_method = input$selectLinkage, hc_metric = input$selectDiss)) hc_method = input$selectLinkage,
hc_metric = input$selectDiss,
graph = FALSE))
})
# Return a matrix with time series in wide format
# If data contains NAs (from explicit NAs or due to missing time points,
# or due to missing time points after outlier removal),
# some warnings are thrown. E.g. DTW cannot caluclate distance if NAs are present.
returnDMwithChecks = reactive({
cat(file = stderr(), 'clustValid:returnDMwithChecks \n')
loc.dm = in.data4clust()
if (is.null(loc.dm)) {
return(NULL)
}
# Throw some warnings if NAs present in the dataset.
# DTW cannot compute distance when NA's are preset.
# Other distance measures can be calculated but caution is required with interpretation.
print(sum(is.na(loc.dm)))
if(sum(is.na(loc.dm)) > 0) {
createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent",
title = "Warning",
content = helpText.clValid[["alertClValidNAsPresent"]],
append = FALSE,
style = "warning")
} else {
closeAlert(session, 'alertClValidNAsPresent')
}
return(loc.dm)
}) })
# Plotting ---- # Plotting ----
...@@ -216,19 +235,27 @@ clustValid <- function(input, output, session, in.data4clust) { ...@@ -216,19 +235,27 @@ clustValid <- function(input, output, session, in.data4clust) {
# plot average silhouette # plot average silhouette
plotSilhAvg <- function() { plotSilhAvg <- function() {
loc.dmdist = userFitDistHier() locBut = input$butPlotRel
if (locBut == 0) {
cat(file = stderr(), 'plotSilhAvg: Go button not pressed\n')
return(NULL)
}
if (is.null(loc.dmdist)) { loc.dm = returnDMwithChecks()
if (is.null(loc.dm)) {
return(NULL) return(NULL)
} }
loc.p = LOCnbclust(x = loc.dmdist, loc.p = factoextra::fviz_nbclust(loc.dm,
FUNcluster = LOChcut, hcut,
method = "silhouette", method = "silhouette",
verbose = TRUE, k.max = input$slClValidMaxClust,
k.max = input$slClValidMaxClust, hc_metric = input$selectDiss,
hc_metric = input$selectDiss, hc_method = input$selectLinkage) +
hc_method = input$selectLinkage) + xlab("Number of clusters") +
ylab("Average silhouette width") +
ggtitle("Optimal number of clusters from silhouette analysis") +
LOCggplotTheme(in.font.base = PLOTFONTBASE, LOCggplotTheme(in.font.base = PLOTFONTBASE,
in.font.axis.text = PLOTFONTAXISTEXT, in.font.axis.text = PLOTFONTAXISTEXT,
in.font.axis.title = PLOTFONTAXISTITLE, in.font.axis.title = PLOTFONTAXISTITLE,
...@@ -240,19 +267,27 @@ clustValid <- function(input, output, session, in.data4clust) { ...@@ -240,19 +267,27 @@ clustValid <- function(input, output, session, in.data4clust) {
# plot Ws # plot Ws
plotWss <- function() { plotWss <- function() {
loc.dmdist = userFitDistHier() locBut = input$butPlotRel
if (locBut == 0) {
cat(file = stderr(), 'plotWss: Go button not pressed\n')
return(NULL)
}
if (is.null(loc.dmdist)) { loc.dm = returnDMwithChecks()
if (is.null(loc.dm)) {
return(NULL) return(NULL)
} }
loc.p = LOCnbclust(x = loc.dmdist, loc.p = factoextra::fviz_nbclust(loc.dm,
FUNcluster = LOChcut, hcut,
method = "wss", method = "wss",
verbose = TRUE, k.max = input$slClValidMaxClust,
k.max = input$slClValidMaxClust, hc_metric = input$selectDiss,
hc_metric = input$selectDiss, hc_method = input$selectLinkage) +
hc_method = input$selectLinkage) + xlab("Number of clusters") +
ylab("Total within cluster sum of squares") +
ggtitle("Within cluster sum of squares for different cluster numbers") +
LOCggplotTheme(in.font.base = PLOTFONTBASE, LOCggplotTheme(in.font.base = PLOTFONTBASE,
in.font.axis.text = PLOTFONTAXISTEXT, in.font.axis.text = PLOTFONTAXISTEXT,
in.font.axis.title = PLOTFONTAXISTITLE, in.font.axis.title = PLOTFONTAXISTITLE,
...@@ -265,17 +300,23 @@ clustValid <- function(input, output, session, in.data4clust) { ...@@ -265,17 +300,23 @@ clustValid <- function(input, output, session, in.data4clust) {
# plot dendrogram tree # plot dendrogram tree
plotTree <- function() { plotTree <- function() {
loc.part = calcDendCut() locBut = input$butPlotInt
if (locBut == 0) {
cat(file = stderr(), 'plotTree: Go button not pressed\n')
return(NULL)
}
loc.part = calcDendCut()
if (is.null(loc.part)) { if (is.null(loc.part)) {
return(NULL) return(NULL)
} }
loc.p = factoextra::fviz_dend(x = loc.part, loc.p = factoextra::fviz_dend(loc.part,
show_labels = F, show_labels = F,
rect = T, rect = T,
xlab = "Time series", xlab = "Time series",
k = input$slClValidNclust) + main = "Dendrogram") +
LOCggplotTheme(in.font.base = PLOTFONTBASE, LOCggplotTheme(in.font.base = PLOTFONTBASE,
in.font.axis.text = PLOTFONTAXISTEXT, in.font.axis.text = PLOTFONTAXISTEXT,
in.font.axis.title = PLOTFONTAXISTITLE, in.font.axis.title = PLOTFONTAXISTITLE,
...@@ -289,13 +330,23 @@ clustValid <- function(input, output, session, in.data4clust) { ...@@ -289,13 +330,23 @@ clustValid <- function(input, output, session, in.data4clust) {
# PCA visualization of partitioning methods # PCA visualization of partitioning methods
plotClPCA <- function() { plotClPCA <- function() {
loc.part = calcDendCut() locBut = input$butPlotInt
if (locBut == 0) {
cat(file = stderr(), 'plotClPCA: Go button not pressed\n')
return(NULL)
}
loc.part = calcDendCut()
if (is.null(loc.part)) { if (is.null(loc.part)) {
return(NULL) return(NULL)
} }
loc.p = factoextra::fviz_cluster(object = loc.part, ellipse.type = "convex") loc.p = factoextra::fviz_cluster(loc.part,
geom = "point",
elipse.type = "norm",
main = "Principle components"
)
return(loc.p) return(loc.p)
} }
...@@ -303,14 +354,21 @@ clustValid <- function(input, output, session, in.data4clust) { ...@@ -303,14 +354,21 @@ clustValid <- function(input, output, session, in.data4clust) {
# plot silhouetts for a particular dendrogram cut # plot silhouetts for a particular dendrogram cut
plotSilhForCut <- function() { plotSilhForCut <- function() {
loc.part = calcDendCut() locBut = input$butPlotInt
if (locBut == 0) {
cat(file = stderr(), 'plotSilhForCut: Go button not pressed\n')
return(NULL)
}
loc.part = calcDendCut()
if (is.null(loc.part)) { if (is.null(loc.part)) {
return(NULL) return(NULL)
} }
loc.p = factoextra::fviz_silhouette(sil.obj = loc.part, loc.p = factoextra::fviz_silhouette(loc.part,
print.summary = FALSE) + print.summary = FALSE,
main = "Silhouette") +
xlab("Time series") + xlab("Time series") +
LOCggplotTheme(in.font.base = PLOTFONTBASE, LOCggplotTheme(in.font.base = PLOTFONTBASE,
in.font.axis.text = PLOTFONTAXISTEXT, in.font.axis.text = PLOTFONTAXISTEXT,
...@@ -325,55 +383,72 @@ clustValid <- function(input, output, session, in.data4clust) { ...@@ -325,55 +383,72 @@ clustValid <- function(input, output, session, in.data4clust) {
# Plot rendering ---- # Plot rendering ----
# Display silhouette # Display silhouette
output$outPlotSilhAvg <- renderPlot({ output$outPlotSilhAvg <- renderPlot({
locBut = input$butPlotRel loc.p = plotSilhAvg()
if(is.null(loc.p))
if (locBut == 0) {
cat(file = stderr(), 'outPlotSilhAvg: Go button not pressed\n')
return(NULL) return(NULL)
}
plotSilhAvg() return(loc.p)
}) })
# Display wss # Display wss
output$outPlotWss <- renderPlot({ output$outPlotWss <- renderPlot({
locBut = input$butPlotRel loc.p = plotWss()
if(is.null(loc.p))
return(NULL)
if (locBut == 0) { return(loc.p)
cat(file = stderr(), 'outPlotWss: Go button not pressed\n') })
# Display PCA of clustering
output$outPlotClPCA <- renderPlot({
# This is required to avoid
# "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
# When running on a server. Based on:
# https://github.com/ropensci/plotly/issues/494
# if (names(dev.cur()) != "null device")
# dev.off()
# pdf(NULL)
loc.p = plotClPCA()
if(is.null(loc.p))
return(NULL) return(NULL)
}
plotWss() return(loc.p)
}) })
# Display tree # Display tree
output$outPlotTree <- renderPlot({ output$outPlotTree <- renderPlot({
locBut = input$butPlotInt # This is required to avoid
# "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
if (locBut == 0) { # When running on a server. Based on:
cat(file = stderr(), 'outPlotTree: Go button not pressed\n') # https://github.com/ropensci/plotly/issues/494
# if (names(dev.cur()) != "null device")
# dev.off()
# pdf(NULL)
loc.p = plotTree()
if(is.null(loc.p))
return(NULL) return(NULL)
}
plotTree() return(loc.p)
}) })
# Display silhouette for a dendrogram cut # Display silhouette for a dendrogram cut
output$outPlotSilhForCut <- renderPlot({ output$outPlotSilhForCut <- renderPlot({
locBut = input$butPlotInt # This is required to avoid
# "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
if (locBut == 0) { # When running on a server. Based on:
cat(file = stderr(), 'outPlotSilhForCut: Go button not pressed\n') # https://github.com/ropensci/plotly/issues/494
# if (names(dev.cur()) != "null device")
# dev.off()
# pdf(NULL)
loc.p = plotSilhForCut()
if(is.null(loc.p))
return(NULL) return(NULL)
}
plotSilhForCut() return(loc.p)
}) })
# Pop-overs ---- # Pop-overs ----
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment