This server has been upgraded to GitLab release 12.10.6.

tabClValid.R 14.2 KB
Newer Older
dmattek's avatar
dmattek committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This module is a tab for hierarchical clustering (base R hclust + dist)

helpText.clValid = c(alertNAsPresentDTW = paste0("NAs present. DTW cannot calculate the distance. ",
                                                "NAs and missing data can be interpolated by activating the option in the left panel. ",
                                                "If outlier points were removed, activate \"Interpolate gaps\" or ",
                                                "decrease the threshold for maximum allowed gap length. ",
                                                "The latter will result in entire trajectories with outliers being removed."),
                    alertNAsPresent = paste0("NAs present. The selected distance measure will work with missing data, ",
                                             "however caution is recommended. NAs and missing data can be interpolated by activating the option in the left panel. ",
                                             "If outlier points were removed, activate \"Interpolate gaps\" or ",
                                             "decrease the threshold for maximum allowed gap length. ",
                                             "The latter will result in entire trajectories with outliers being removed."),
dmattek's avatar
dmattek committed
17 18
                    alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ",
                                         "is an <b>unsupervised</b> machine learning method for partitioning ",
dmattek's avatar
dmattek committed
19 20 21 22
                                         "dataset into a set of groups or clusters. The procedure will return clusters ",
                                         "even if the data <b>does not</b> contain any! ",
                                         "Therefore, it’s necessary to ",
                                         "assess clustering tendency before the analysis, and ",
dmattek's avatar
dmattek committed
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
                                         "validate the quality of the result after clustering.<p>"
                                         ),
                    alLearnMoreRel = paste0("<p>Determine the optimal number of clusters by inspecting ",
                                            "the average silhouette width and the total within cluster sum of squares (WSS) ",
                                            "for a range of cluster numbers.</p>", 
                                            "<p><b>Silhouette analysis</b> estimates the average distance between clusters. ",
                                            "Larger silhouette widths indicate better.<p>",
                                            "<p><b>WSS</b> evaluates the compactness of clusters. ",
                                            "Compact clusters achieve low WSS values. ",
                                            "Look for the <i>knee</i> in the plot of WSS as function of cluster numbers.</p>"),
                    alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting <b>the dendrogram</b> ",
                                            "and <b>the silhouette</b> for a given number of clusters.</p>",
                                            "<p>The height of dendrogram branches indicates how well clusters are separated.</p>",
                                            "<p>The silhouette plot displays how close each time series in one cluster ", 
                                            "is to time series in the neighboring clusters. ",
                                            "A large positive silhouette (Si) indicates time series that are well clustered.",
                                            "A negative Si indicates time series that are closer to ",
                                            "a neighboring cluster, and are placed in the wrong cluster.</p>")
                    )
dmattek's avatar
dmattek committed
42 43 44 45 46 47 48


# UI ----
clustValidUI <- function(id, label = "Validation") {
  ns <- NS(id)
  
  tagList(
dmattek's avatar
dmattek committed
49 50 51 52 53 54
    h4(
      "Cluster validation using ",
      a("factoextra", 
        href="https://cran.r-project.org/web/packages/factoextra/",
        title="External link")
    ),
dmattek's avatar
dmattek committed
55 56 57 58
    actionLink(ns("alLearnMore"), "Learn more"),
    br(),
    br(),
    fluidRow(
dmattek's avatar
dmattek committed
59

dmattek's avatar
dmattek committed
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
      column(3,
             selectInput(
               ns("selectDiss"),
               label = ("Dissimilarity measure"),
               choices = list("Euclidean" = "euclidean",
                              "Manhattan" = "manhattan",
                              "Maximum"   = "maximum",
                              "Canberra"  = "canberra",
                              "DTW"       = "DTW"),
               selected = 1
             ),
             bsAlert("alertAnchorClHierNAsPresent")
             ),
      column(3,
             selectInput(
               ns("selectLinkage"),
               label = ("Linkage method"),
               choices = list(
                 "Average"  = "average",
                 "Complete" = "complete",
                 "Single"   = "single",
                 "Centroid" = "centroid",
                 "Ward"     = "ward.D",
                 "Ward D2"  = "ward.D2",
                 "McQuitty" = "mcquitty"
               ),
               selected = 2
               )
             )
    ),
    
    br(),
    tabsetPanel(
      tabPanel("Relative",
               br(),
dmattek's avatar
dmattek committed
95 96
               p("Determine and visualise the optimal number of clusters. ",
                 actionLink(ns("alLearnMoreRel"), "Learn more")),
dmattek's avatar
dmattek committed
97 98 99 100 101 102 103
               fluidRow(
                 column(2, 
                        actionButton(ns('butPlotRel'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidMaxClust'),
dmattek's avatar
dmattek committed
104
                          'Maximum number of clusters to consider',
dmattek's avatar
dmattek committed
105 106 107 108 109 110 111 112 113 114 115 116
                          min = 2,
                          max = 20,
                          value = 10,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
               withSpinner(plotOutput(ns('outPlotSilhAvg'))),
               br(),
dmattek's avatar
dmattek committed
117
               withSpinner(plotOutput(ns('outPlotWss')))
dmattek's avatar
dmattek committed
118 119 120 121
               
      ),
      tabPanel("Internal",
               br(),
dmattek's avatar
dmattek committed
122 123
               p("Validate a given data partitioning. ",
                 actionLink(ns("alLearnMoreInt"), "Learn more")),
dmattek's avatar
dmattek committed
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
               fluidRow(
                 column(2,
                        actionButton(ns('butPlotInt'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidNclust'),
                          'Number of dendrogram branches to cut',
                          min = 2,
                          max = 20,
                          value = 1,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
               withSpinner(plotOutput(ns('outPlotTree'))),
               br(),
dmattek's avatar
dmattek committed
144 145 146
               #withSpinner(plotOutput(ns('outPlotClPCA'))),
               br(),
               withSpinner(plotOutput(ns('outPlotSilhForCut')))
dmattek's avatar
dmattek committed
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
      )
    )
  )
}

# SERVER ----
clustValid <- function(input, output, session, in.data4clust) {

  ns = session$ns
  
  # calculate distance matrix for further clustering
  # time series arranged in rows with columns corresponding to time points
  userFitDistHier <- reactive({
    cat(file = stderr(), 'clustValid:userFitDistHier \n')
    
    loc.dm = in.data4clust()
    
    if (is.null(loc.dm)) {
      return(NULL)
    }
    
    # Throw some warnings if NAs present in the dataset.
    # DTW cannot compute distance when NA's are preset.
    # Other distance measures can be calculated but caution is required with interpretation.
    if(sum(is.na(loc.dm)) > 0) {
      if (input$selectPlotHierDiss == "DTW") {
        createAlert(session, "alertAnchorClHierNAsPresent", "alertNAsPresentDTW", title = "Error",
                    content = helpText.clHier[["alertNAsPresentDTW"]], 
                    append = FALSE,
                    style = "danger")
        return(NULL)
      } else {
        createAlert(session, "alertAnchorClHierNAsPresent", "alertNAsPresent", title = "Warning",
                    content = helpText.clHier[["alertNAsPresent"]], 
                    append = FALSE, 
                    style = "warning")
        closeAlert(session, 'alertNAsPresentDTW')
      }
    } else {
      closeAlert(session, 'alertNAsPresentDTW')
      closeAlert(session, 'alertNAsPresent')
    }
    
    # calculate distance matrix
    
    return(dist(loc.dm, method = input$selectPlotHierDiss))
  })
  
  
  calcDendCut = reactive({
    cat(file = stderr(), 'clustValid:calcDendCut \n')
    
    loc.dmdist = userFitDistHier()
    
    if (is.null(loc.dmdist)) {
      return(NULL)
    }
    
    return(LOChcut(x = loc.dmdist, 
                   k = input$slClValidNclust, 
                   hc_func = "hclust", 
                   hc_method = input$selectLinkage, hc_metric = input$selectDiss))
  })
  
dmattek's avatar
dmattek committed
211
  # Plotting ----
dmattek's avatar
dmattek committed
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
  # Function instead of reactive as per:
  # http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
  # This function is used to plot and to downoad a pdf
  
  # plot average silhouette
  plotSilhAvg <- function() {

    loc.dmdist = userFitDistHier()
    
    if (is.null(loc.dmdist)) {
      return(NULL)
    }
    
    loc.p = LOCnbclust(x = loc.dmdist, 
                       FUNcluster = LOChcut,  
                       method = "silhouette", 
                       verbose = TRUE, 
                       k.max = input$slClValidMaxClust,
                       hc_metric = input$selectDiss,
dmattek's avatar
dmattek committed
231 232 233 234 235 236
                       hc_method = input$selectLinkage) +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254
    return(loc.p)
  }

  # plot Ws
  plotWss <- function() {
    
    loc.dmdist = userFitDistHier()
    
    if (is.null(loc.dmdist)) {
      return(NULL)
    }
    
    loc.p = LOCnbclust(x = loc.dmdist, 
                       FUNcluster = LOChcut,  
                       method = "wss", 
                       verbose = TRUE, 
                       k.max = input$slClValidMaxClust,
                       hc_metric = input$selectDiss,
dmattek's avatar
dmattek committed
255 256 257 258 259 260
                       hc_method = input$selectLinkage) +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
261 262 263 264 265 266 267
    
    return(loc.p)
  }

  # plot dendrogram tree
  plotTree <- function() {
    
dmattek's avatar
dmattek committed
268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292
    loc.part = calcDendCut()
    
    if (is.null(loc.part)) {
      return(NULL)
    }
    
    loc.p = factoextra::fviz_dend(x = loc.part, 
                                  show_labels = F,
                                  rect = T,
                                  xlab = "Time series",
                                  k = input$slClValidNclust) +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
    
    return(loc.p)
  }
  
  
  # PCA visualization of partitioning methods 
  plotClPCA <- function() {
    
    loc.part = calcDendCut()
dmattek's avatar
dmattek committed
293
    
dmattek's avatar
dmattek committed
294
    if (is.null(loc.part)) {
dmattek's avatar
dmattek committed
295 296 297
      return(NULL)
    }
    
dmattek's avatar
dmattek committed
298
    loc.p = factoextra::fviz_cluster(object = loc.part, ellipse.type = "convex")
dmattek's avatar
dmattek committed
299 300 301 302 303 304 305
    
    return(loc.p)
  }
  
  # plot silhouetts for a particular dendrogram cut
  plotSilhForCut <- function() {
    
dmattek's avatar
dmattek committed
306
    loc.part = calcDendCut()
dmattek's avatar
dmattek committed
307
    
dmattek's avatar
dmattek committed
308
    if (is.null(loc.part)) {
dmattek's avatar
dmattek committed
309 310 311
      return(NULL)
    }
    
dmattek's avatar
dmattek committed
312 313 314 315 316 317 318 319 320
    loc.p = factoextra::fviz_silhouette(sil.obj = loc.part, 
                                        print.summary = FALSE) +
      xlab("Time series") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND) +
      theme(axis.text.x = element_blank())
dmattek's avatar
dmattek committed
321 322 323 324
    
    return(loc.p)
  }
  
dmattek's avatar
dmattek committed
325
  # Plot rendering ----
dmattek's avatar
dmattek committed
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384
  # Display silhouette
  output$outPlotSilhAvg <- renderPlot({
    locBut = input$butPlotRel
    
    if (locBut == 0) {
      cat(file = stderr(), 'outPlotSilhAvg: Go button not pressed\n')
      
      return(NULL)
    }
    
    plotSilhAvg()
  })

  
  # Display wss
  output$outPlotWss <- renderPlot({
    locBut = input$butPlotRel
    
    if (locBut == 0) {
      cat(file = stderr(), 'outPlotWss: Go button not pressed\n')
      
      return(NULL)
    }
    
    plotWss()
  })
  
  # Display tree
  output$outPlotTree <- renderPlot({
    locBut = input$butPlotInt
    
    if (locBut == 0) {
      cat(file = stderr(), 'outPlotTree: Go button not pressed\n')
      
      return(NULL)
    }
    
    plotTree()
  })
  
  # Display silhouette for a dendrogram cut
  output$outPlotSilhForCut <- renderPlot({
    locBut = input$butPlotInt
    
    if (locBut == 0) {
      cat(file = stderr(), 'outPlotSilhForCut: Go button not pressed\n')
      
      return(NULL)
    }
    
    plotSilhForCut()
  })
  
  # Pop-overs ----
  addPopover(session, 
             ns("alLearnMore"),
             title = "Classes of cluster validation",
             content = helpText.clValid[["alLearnMore"]],
             trigger = "click")
dmattek's avatar
dmattek committed
385 386 387 388 389 390 391 392 393 394 395 396
  
  addPopover(session, 
             ns("alLearnMoreRel"),
             title = "Relative validation",
             content = helpText.clValid[["alLearnMoreRel"]],
             trigger = "click")
  
  addPopover(session, 
             ns("alLearnMoreInt"),
             title = "Internal validation",
             content = helpText.clValid[["alLearnMoreInt"]],
             trigger = "click")
dmattek's avatar
dmattek committed
397 398 399
}