This server has been upgraded to GitLab release 12.10.6.

tabClValid.R 18.4 KB
Newer Older
dmattek's avatar
dmattek committed
1 2 3 4 5 6
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This module is a tab for hierarchical clustering (base R hclust + dist)

7
helpText.clValid = c(alertClValidNAsPresent = paste0("NAs present. The selected distance measure will work, ",
8 9
                                              "however PCA will not be avaliable."),
                     alertClValidNAsPresentDTW = paste0("NAs present. DTW distance measure will NOT work."),
dmattek's avatar
dmattek committed
10 11
                    alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ",
                                         "is an <b>unsupervised</b> machine learning method for partitioning ",
12
                                         "dataset into a set of groups called clusters. The procedure will return clusters ",
dmattek's avatar
dmattek committed
13 14 15
                                         "even if the data <b>does not</b> contain any! ",
                                         "Therefore, it’s necessary to ",
                                         "assess clustering tendency before the analysis, and ",
dmattek's avatar
dmattek committed
16 17 18 19 20
                                         "validate the quality of the result after clustering.<p>"
                                         ),
                    alLearnMoreRel = paste0("<p>Determine the optimal number of clusters by inspecting ",
                                            "the average silhouette width and the total within cluster sum of squares (WSS) ",
                                            "for a range of cluster numbers.</p>", 
21 22 23 24
                                            "<p><b>Silhouette analysis</b> first computes how close each trajectory is with others in the cluster it is assigned to, ",
                                            "this is then compared to closeness with trajectories in other clusters. ",
                                            "Larger average silhouette widths usually indicate better clustering. To make sure averaging does not hide a locally bad",
                                            "clustering, this should be inspected along with the silhouette plot in the \"Internal\" tab.<p>",
dmattek's avatar
dmattek committed
25 26
                                            "<p><b>WSS</b> evaluates the compactness of clusters. ",
                                            "Compact clusters achieve low WSS values. ",
27
                                            "Look for the <i>elbow</i> in the plot of WSS as function of cluster numbers.</p>"),
28
                    alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting ",
29
                                            "principal components, the dendrogram, ",
30
                                            "and the silhouette for a given number of clusters.</p>",
31
                                            "<p><b>Principal components:</b> Each point in the scatter plot corresponds to a single time series in the first 2 PCs space. ",
32
                                            "Points are coloured by cluster numbers. Compact, well separated clusters ",
33 34 35 36 37 38 39
                                            "indicate good partitioning. The percentage of total variance carried by each PC is indicated.</p>",
                                            "<p><b>Dendrogram:</b> The height of branches indicates how well clusters are separated.</p>",
                                            "<p><b>Silhouette plot:</b> The plot indicates for each series whether it is on average closer to series within its cluster ",
                                            "or to series in other clusters. Each bar represents the <a href=https://en.wikipedia.org/wiki/Silhouette_(clustering) title=\"External link\">silhouette score</a> ",
                                            "(Si) for one series. The height of the bars varies ",
                                            "between 1 (the series is much closer to series in its cluster) and -1 (the series is much closer to series in an other cluster). ",
                                            "Hence, large positive values of Si are usually associated with better clustering, while negative values are associated with worse clustering.")
dmattek's avatar
dmattek committed
40
                    )
dmattek's avatar
dmattek committed
41 42 43 44 45 46 47


# UI ----
clustValidUI <- function(id, label = "Validation") {
  ns <- NS(id)
  
  tagList(
dmattek's avatar
dmattek committed
48 49 50 51 52 53
    h4(
      "Cluster validation using ",
      a("factoextra", 
        href="https://cran.r-project.org/web/packages/factoextra/",
        title="External link")
    ),
dmattek's avatar
dmattek committed
54 55 56 57
    actionLink(ns("alLearnMore"), "Learn more"),
    br(),
    br(),
    fluidRow(
dmattek's avatar
dmattek committed
58

59
      column(4,
dmattek's avatar
dmattek committed
60 61 62 63 64 65
             selectInput(
               ns("selectDiss"),
               label = ("Dissimilarity measure"),
               choices = list("Euclidean" = "euclidean",
                              "Manhattan" = "manhattan",
                              "Maximum"   = "maximum",
66 67
                              "Canberra"  = "canberra",
                              "DTW"       = "DTW"),
68
               selected = "euclidean"
dmattek's avatar
dmattek committed
69
             ),
70
             bsAlert("alertAnchorClValidNAsPresent")
dmattek's avatar
dmattek committed
71
             ),
72
      column(4,
dmattek's avatar
dmattek committed
73 74 75 76 77 78 79 80 81 82 83 84
             selectInput(
               ns("selectLinkage"),
               label = ("Linkage method"),
               choices = list(
                 "Average"  = "average",
                 "Complete" = "complete",
                 "Single"   = "single",
                 "Centroid" = "centroid",
                 "Ward"     = "ward.D",
                 "Ward D2"  = "ward.D2",
                 "McQuitty" = "mcquitty"
               ),
85
               selected = "average"
dmattek's avatar
dmattek committed
86 87 88 89 90 91 92 93
               )
             )
    ),
    
    br(),
    tabsetPanel(
      tabPanel("Relative",
               br(),
dmattek's avatar
dmattek committed
94 95
               p("Determine and visualise the optimal number of clusters. ",
                 actionLink(ns("alLearnMoreRel"), "Learn more")),
dmattek's avatar
dmattek committed
96 97 98 99 100 101 102
               fluidRow(
                 column(2, 
                        actionButton(ns('butPlotRel'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidMaxClust'),
dmattek's avatar
dmattek committed
103
                          'Maximum number of clusters to consider',
dmattek's avatar
dmattek committed
104 105 106 107 108 109 110 111 112 113 114 115
                          min = 2,
                          max = 20,
                          value = 10,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
               withSpinner(plotOutput(ns('outPlotSilhAvg'))),
               br(),
dmattek's avatar
dmattek committed
116
               withSpinner(plotOutput(ns('outPlotWss')))
dmattek's avatar
dmattek committed
117 118 119 120
               
      ),
      tabPanel("Internal",
               br(),
dmattek's avatar
dmattek committed
121 122
               p("Validate a given data partitioning. ",
                 actionLink(ns("alLearnMoreInt"), "Learn more")),
dmattek's avatar
dmattek committed
123 124 125 126 127 128 129
               fluidRow(
                 column(2,
                        actionButton(ns('butPlotInt'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidNclust'),
130
                          'Number of clusters to evaluate',
dmattek's avatar
dmattek committed
131 132 133 134 135 136 137 138 139 140
                          min = 2,
                          max = 20,
                          value = 1,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
141
               withSpinner(plotOutput(ns('outPlotTree'))),
dmattek's avatar
dmattek committed
142
               br(),
143 144 145
               withSpinner(plotOutput(ns('outPlotSilhForCut'))),
               br(),
               withSpinner(plotOutput(ns('outPlotClPCA')))
dmattek's avatar
dmattek committed
146 147 148 149 150 151
      )
    )
  )
}

# SERVER ----
152
clustValid <- function(input, output, session, in.dataWide) {
dmattek's avatar
dmattek committed
153 154 155

  ns = session$ns
  
156 157 158 159 160 161 162 163 164 165 166 167
  # Return the number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnNclust = reactive({
    return(input$slClValidNclust)
  }) %>% debounce(MILLIS)
  
  # Return max number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnMaxNclust = reactive({
    return(input$slClValidMaxClust)
  }) %>% debounce(MILLIS)
  
dmattek's avatar
dmattek committed
168 169
  # calculate distance matrix for further clustering
  # time series arranged in rows with columns corresponding to time points
170 171
  calcDist <- reactive({
    cat(file = stderr(), 'clustValid:calcDist \n')
dmattek's avatar
dmattek committed
172
    
173
    loc.dm = in.dataWide()
dmattek's avatar
dmattek committed
174 175 176 177 178 179 180 181
    
    if (is.null(loc.dm)) {
      return(NULL)
    }
    
    # Throw some warnings if NAs present in the dataset.
    # DTW cannot compute distance when NA's are preset.
    # Other distance measures can be calculated but caution is required with interpretation.
182 183
    # NAs in the wide format can result from explicit NAs in the measurment column or
    # from missing rows that cause NAs to appear when convertinf from long to wide (dcast)
dmattek's avatar
dmattek committed
184
    if(sum(is.na(loc.dm)) > 0) {
185 186 187 188 189 190 191 192 193 194
      if (input$selectDiss == "DTW") {
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresentDTW", title = "Error",
                    content = helpText.clValid[["alertClValidNAsPresentDTW"]], 
                    append = FALSE,
                    style = "danger")
        closeAlert(session, 'alertClValidNAsPresent')
        
        return(NULL)
        
      } else {
195 196
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", title = "Warning",
                    content = helpText.clValid[["alertClValidNAsPresent"]], 
dmattek's avatar
dmattek committed
197 198
                    append = FALSE, 
                    style = "warning")
199 200
        closeAlert(session, 'alertClValidNAsPresentDTW')
      }
dmattek's avatar
dmattek committed
201
    } else {
202
      closeAlert(session, 'alertClValidNAsPresentDTW')
203
      closeAlert(session, 'alertClValidNAsPresent')
204
    }    
dmattek's avatar
dmattek committed
205 206
    
    
207 208
    # calculate distance matrix
    return(proxy::dist(loc.dm, method = input$selectDiss))
dmattek's avatar
dmattek committed
209 210
  })
  
211
  # calculate dendrogram for a chosen number of clusters and the linkage method
dmattek's avatar
dmattek committed
212 213 214
  calcDendCut = reactive({
    cat(file = stderr(), 'clustValid:calcDendCut \n')
    
215
    loc.dist = calcDist()
dmattek's avatar
dmattek committed
216
    
217
    if (is.null(loc.dist)) {
dmattek's avatar
dmattek committed
218 219 220
      return(NULL)
    }
    
221 222
    return(factoextra::hcut(x = loc.dist,
                            k = returnNclust(),
223 224 225 226 227
                              FUNcluster = "hclust",
                              hc_method = input$selectLinkage, 
                              graph = FALSE))
  })
  
dmattek's avatar
dmattek committed
228
  # Plotting ----
dmattek's avatar
dmattek committed
229 230 231 232 233 234
  # Function instead of reactive as per:
  # http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
  # This function is used to plot and to downoad a pdf
  
  # plot average silhouette
  plotSilhAvg <- function() {
235
    cat(file = stderr(), 'plotSilhAvg: in\n')
dmattek's avatar
dmattek committed
236
    
237 238 239 240 241 242 243 244 245 246 247 248
    # make the f-n dependent on the button click
    locBut = input$butPlotRel

    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
    validate(
      need(!is.null(loc.dist), "Nothing to plot. Load data first!")
    )    

    loc.p = LOCnbclust(loc.dist,
249
                                     method = "silhouette",
250
                                     k.max = returnMaxNclust(),
251 252 253 254
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Average silhouette width") +
255
      ggtitle("Average silhouette width for different cluster numbers") +
dmattek's avatar
dmattek committed
256 257 258 259 260
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
261 262 263 264 265
    return(loc.p)
  }

  # plot Ws
  plotWss <- function() {
266
    cat(file = stderr(), 'plotWss: in\n')
dmattek's avatar
dmattek committed
267
    
268
    # make the f-n dependent on the button click
269
    locBut = input$butPlotRel
dmattek's avatar
dmattek committed
270
    
271 272 273 274 275 276 277
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
    validate(
      need(!is.null(loc.dist), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
278
    
279
    loc.p = LOCnbclust(loc.dist,
280
                                     method = "wss",
281
                                     k.max = returnMaxNclust(),
282 283 284 285 286
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Total within cluster sum of squares") +
      ggtitle("Within cluster sum of squares for different cluster numbers") +
dmattek's avatar
dmattek committed
287 288 289 290 291
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
292 293 294 295 296 297
    
    return(loc.p)
  }

  # plot dendrogram tree
  plotTree <- function() {
298
    cat(file = stderr(), 'plotTree: in\n')
dmattek's avatar
dmattek committed
299
    
300
    # make the f-n dependent on the button click
301
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
302
    
303 304 305
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
306
    loc.part = calcDendCut()
307 308 309
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
310
    
311
    loc.p = factoextra::fviz_dend(loc.part, 
dmattek's avatar
dmattek committed
312 313
                                  show_labels = F,
                                  rect = T,
314 315
                                  xlab = "Time series", 
                                  main = "Dendrogram") +
dmattek's avatar
dmattek committed
316 317 318 319 320 321 322 323 324 325 326 327
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
    
    return(loc.p)
  }
  
  
  # PCA visualization of partitioning methods 
  plotClPCA <- function() {
328
    cat(file = stderr(), 'plotTree: in\n')
dmattek's avatar
dmattek committed
329
    
330
    # make the f-n dependent on the button click
331
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
332
    
333 334 335
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
336
    loc.part = calcDendCut()
337 338 339 340 341 342 343 344 345 346
    loc.dm = in.dataWide()
    print(sum(is.na(loc.dm)))
    
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
      need(!is.null(loc.dm),   "Nothing to plot. Load data first!"),
      need(sum(is.na(loc.dm)), "Cannot calculate PCA in the presence of missing data and/or NAs.")
    )    
    
    if (sum(is.na(loc.dm)) > 0)
dmattek's avatar
dmattek committed
347 348
      return(NULL)
    
349
    loc.p = factoextra::fviz_cluster(loc.part, 
350
                                     data = loc.dm,
351
                                     geom = "point",
352
                                     elipse.type = "convex", 
353
                                     main = "Principal components"
354 355 356 357 358 359
                                     )+
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
360 361 362 363 364 365
    
    return(loc.p)
  }
  
  # plot silhouetts for a particular dendrogram cut
  plotSilhForCut <- function() {
366
    cat(file = stderr(), 'plotSilhForCut: in\n')
dmattek's avatar
dmattek committed
367
    
368
    # make the f-n dependent on the button click
369
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
370
    
371 372 373
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
374
    loc.part = calcDendCut()
375 376 377
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
378
    
379 380 381
    loc.p = factoextra::fviz_silhouette(loc.part, 
                                        print.summary = FALSE, 
                                        main = "Silhouette") +
dmattek's avatar
dmattek committed
382 383 384 385 386 387 388
      xlab("Time series") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND) +
      theme(axis.text.x = element_blank())
dmattek's avatar
dmattek committed
389 390 391 392
    
    return(loc.p)
  }
  
dmattek's avatar
dmattek committed
393
  # Plot rendering ----
dmattek's avatar
dmattek committed
394 395
  # Display silhouette
  output$outPlotSilhAvg <- renderPlot({
396 397
    loc.p = plotSilhAvg()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
398 399
      return(NULL)
    
400
    return(loc.p)
dmattek's avatar
dmattek committed
401 402 403 404 405
  })

  
  # Display wss
  output$outPlotWss <- renderPlot({
406 407 408
    loc.p = plotWss()
    if(is.null(loc.p))
      return(NULL)
dmattek's avatar
dmattek committed
409
    
410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
    return(loc.p)
  })
  
  # Display PCA of clustering
  output$outPlotClPCA <- renderPlot({
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotClPCA()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
425 426
      return(NULL)
    
427
    return(loc.p)
dmattek's avatar
dmattek committed
428 429 430 431
  })
  
  # Display tree
  output$outPlotTree <- renderPlot({
432 433 434 435 436 437 438 439 440 441
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotTree()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
442 443
      return(NULL)
    
444
    return(loc.p)
dmattek's avatar
dmattek committed
445 446 447 448
  })
  
  # Display silhouette for a dendrogram cut
  output$outPlotSilhForCut <- renderPlot({
449 450 451 452 453 454 455 456 457 458
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotSilhForCut()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
459 460
      return(NULL)
    
461
    return(loc.p)
dmattek's avatar
dmattek committed
462 463 464 465 466 467 468 469
  })
  
  # Pop-overs ----
  addPopover(session, 
             ns("alLearnMore"),
             title = "Classes of cluster validation",
             content = helpText.clValid[["alLearnMore"]],
             trigger = "click")
dmattek's avatar
dmattek committed
470 471 472 473 474 475 476 477 478 479 480 481
  
  addPopover(session, 
             ns("alLearnMoreRel"),
             title = "Relative validation",
             content = helpText.clValid[["alLearnMoreRel"]],
             trigger = "click")
  
  addPopover(session, 
             ns("alLearnMoreInt"),
             title = "Internal validation",
             content = helpText.clValid[["alLearnMoreInt"]],
             trigger = "click")
dmattek's avatar
dmattek committed
482 483 484
}