tabClValid.R 19.6 KB
Newer Older
dmattek's avatar
dmattek committed
1 2 3 4 5 6
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This module is a tab for hierarchical clustering (base R hclust + dist)

7
helpText.clValid = c(alertClValidNAsPresent = paste0("NAs present. The selected distance measure will work, ",
8 9
                                              "however PCA will not be avaliable."),
                     alertClValidNAsPresentDTW = paste0("NAs present. DTW distance measure will NOT work."),
dmattek's avatar
dmattek committed
10 11
                    alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ",
                                         "is an <b>unsupervised</b> machine learning method for partitioning ",
12
                                         "dataset into a set of groups called clusters. The procedure will return clusters ",
dmattek's avatar
dmattek committed
13 14 15
                                         "even if the data <b>does not</b> contain any! ",
                                         "Therefore, it’s necessary to ",
                                         "assess clustering tendency before the analysis, and ",
dmattek's avatar
dmattek committed
16 17 18 19 20
                                         "validate the quality of the result after clustering.<p>"
                                         ),
                    alLearnMoreRel = paste0("<p>Determine the optimal number of clusters by inspecting ",
                                            "the average silhouette width and the total within cluster sum of squares (WSS) ",
                                            "for a range of cluster numbers.</p>", 
21 22 23 24
                                            "<p><b>Silhouette analysis</b> first computes how close each trajectory is with others in the cluster it is assigned to, ",
                                            "this is then compared to closeness with trajectories in other clusters. ",
                                            "Larger average silhouette widths usually indicate better clustering. To make sure averaging does not hide a locally bad",
                                            "clustering, this should be inspected along with the silhouette plot in the \"Internal\" tab.<p>",
dmattek's avatar
dmattek committed
25 26
                                            "<p><b>WSS</b> evaluates the compactness of clusters. ",
                                            "Compact clusters achieve low WSS values. ",
27
                                            "Look for the <i>elbow</i> in the plot of WSS as function of cluster numbers.</p>"),
28
                    alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting ",
29
                                            "principal components, the dendrogram, ",
30
                                            "and the silhouette for a given number of clusters.</p>",
31
                                            "<p><b>Principal components:</b> Each point in the scatter plot corresponds to a single time series in the first 2 PCs space. ",
32
                                            "Points are coloured by cluster numbers. Compact, well separated clusters ",
33 34 35 36 37 38 39
                                            "indicate good partitioning. The percentage of total variance carried by each PC is indicated.</p>",
                                            "<p><b>Dendrogram:</b> The height of branches indicates how well clusters are separated.</p>",
                                            "<p><b>Silhouette plot:</b> The plot indicates for each series whether it is on average closer to series within its cluster ",
                                            "or to series in other clusters. Each bar represents the <a href=https://en.wikipedia.org/wiki/Silhouette_(clustering) title=\"External link\">silhouette score</a> ",
                                            "(Si) for one series. The height of the bars varies ",
                                            "between 1 (the series is much closer to series in its cluster) and -1 (the series is much closer to series in an other cluster). ",
                                            "Hence, large positive values of Si are usually associated with better clustering, while negative values are associated with worse clustering.")
dmattek's avatar
dmattek committed
40
                    )
dmattek's avatar
dmattek committed
41 42 43 44 45 46 47


# UI ----
clustValidUI <- function(id, label = "Validation") {
  ns <- NS(id)
  
  tagList(
dmattek's avatar
dmattek committed
48 49 50 51 52 53
    h4(
      "Cluster validation using ",
      a("factoextra", 
        href="https://cran.r-project.org/web/packages/factoextra/",
        title="External link")
    ),
dmattek's avatar
dmattek committed
54 55 56 57
    actionLink(ns("alLearnMore"), "Learn more"),
    br(),
    br(),
    fluidRow(
dmattek's avatar
dmattek committed
58

59
      column(4,
dmattek's avatar
dmattek committed
60 61 62 63 64 65
             selectInput(
               ns("selectDiss"),
               label = ("Dissimilarity measure"),
               choices = list("Euclidean" = "euclidean",
                              "Manhattan" = "manhattan",
                              "Maximum"   = "maximum",
66 67
                              "Canberra"  = "canberra",
                              "DTW"       = "DTW"),
68
               selected = "euclidean"
dmattek's avatar
dmattek committed
69
             ),
70
             bsAlert("alertAnchorClValidNAsPresent")
dmattek's avatar
dmattek committed
71
             ),
72
      column(4,
dmattek's avatar
dmattek committed
73 74 75 76 77 78 79 80 81 82 83 84
             selectInput(
               ns("selectLinkage"),
               label = ("Linkage method"),
               choices = list(
                 "Average"  = "average",
                 "Complete" = "complete",
                 "Single"   = "single",
                 "Centroid" = "centroid",
                 "Ward"     = "ward.D",
                 "Ward D2"  = "ward.D2",
                 "McQuitty" = "mcquitty"
               ),
85
               selected = "average"
dmattek's avatar
dmattek committed
86 87 88 89 90 91 92 93
               )
             )
    ),
    
    br(),
    tabsetPanel(
      tabPanel("Relative",
               br(),
dmattek's avatar
dmattek committed
94 95
               p("Determine and visualise the optimal number of clusters. ",
                 actionLink(ns("alLearnMoreRel"), "Learn more")),
dmattek's avatar
dmattek committed
96 97 98 99 100 101 102
               fluidRow(
                 column(2, 
                        actionButton(ns('butPlotRel'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidMaxClust'),
dmattek's avatar
dmattek committed
103
                          'Maximum number of clusters to consider',
dmattek's avatar
dmattek committed
104 105 106 107 108 109 110 111 112 113 114 115
                          min = 2,
                          max = 20,
                          value = 10,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
               withSpinner(plotOutput(ns('outPlotSilhAvg'))),
               br(),
dmattek's avatar
dmattek committed
116
               withSpinner(plotOutput(ns('outPlotWss')))
dmattek's avatar
dmattek committed
117 118 119 120
               
      ),
      tabPanel("Internal",
               br(),
dmattek's avatar
dmattek committed
121 122
               p("Validate a given data partitioning. ",
                 actionLink(ns("alLearnMoreInt"), "Learn more")),
dmattek's avatar
dmattek committed
123 124 125 126 127 128 129
               fluidRow(
                 column(2,
                        actionButton(ns('butPlotInt'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidNclust'),
130
                          'Number of clusters to evaluate',
dmattek's avatar
dmattek committed
131 132 133 134 135 136 137 138 139 140
                          min = 2,
                          max = 20,
                          value = 1,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
141
               withSpinner(plotOutput(ns('outPlotTree'))),
dmattek's avatar
dmattek committed
142
               br(),
143 144 145
               withSpinner(plotOutput(ns('outPlotSilhForCut'))),
               br(),
               withSpinner(plotOutput(ns('outPlotClPCA')))
dmattek's avatar
dmattek committed
146 147 148 149 150 151
      )
    )
  )
}

# SERVER ----
152
clustValid <- function(input, output, session, in.dataWide) {
dmattek's avatar
dmattek committed
153 154 155

  ns = session$ns
  
156 157 158 159 160 161 162 163 164 165 166 167
  # Return the number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnNclust = reactive({
    return(input$slClValidNclust)
  }) %>% debounce(MILLIS)
  
  # Return max number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnMaxNclust = reactive({
    return(input$slClValidMaxClust)
  }) %>% debounce(MILLIS)
  
dmattek's avatar
dmattek committed
168 169
  # calculate distance matrix for further clustering
  # time series arranged in rows with columns corresponding to time points
170 171
  calcDist <- reactive({
    cat(file = stderr(), 'clustValid:calcDist \n')
dmattek's avatar
dmattek committed
172
    
173
    loc.dm = in.dataWide()
dmattek's avatar
dmattek committed
174 175 176 177 178 179 180 181
    
    if (is.null(loc.dm)) {
      return(NULL)
    }
    
    # Throw some warnings if NAs present in the dataset.
    # DTW cannot compute distance when NA's are preset.
    # Other distance measures can be calculated but caution is required with interpretation.
182 183
    # NAs in the wide format can result from explicit NAs in the measurment column or
    # from missing rows that cause NAs to appear when convertinf from long to wide (dcast)
dmattek's avatar
dmattek committed
184
    if(sum(is.na(loc.dm)) > 0) {
185 186 187 188 189 190 191 192 193 194
      if (input$selectDiss == "DTW") {
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresentDTW", title = "Error",
                    content = helpText.clValid[["alertClValidNAsPresentDTW"]], 
                    append = FALSE,
                    style = "danger")
        closeAlert(session, 'alertClValidNAsPresent')
        
        return(NULL)
        
      } else {
195 196
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", title = "Warning",
                    content = helpText.clValid[["alertClValidNAsPresent"]], 
dmattek's avatar
dmattek committed
197 198
                    append = FALSE, 
                    style = "warning")
199 200
        closeAlert(session, 'alertClValidNAsPresentDTW')
      }
dmattek's avatar
dmattek committed
201
    } else {
202
      closeAlert(session, 'alertClValidNAsPresentDTW')
203
      closeAlert(session, 'alertClValidNAsPresent')
204
    }    
dmattek's avatar
dmattek committed
205 206
    
    
207 208
    # calculate distance matrix
    return(proxy::dist(loc.dm, method = input$selectDiss))
dmattek's avatar
dmattek committed
209 210
  })
  
211
  # calculate dendrogram for a chosen number of clusters and the linkage method
dmattek's avatar
dmattek committed
212 213 214
  calcDendCut = reactive({
    cat(file = stderr(), 'clustValid:calcDendCut \n')
    
215
    loc.dist = calcDist()
dmattek's avatar
dmattek committed
216
    
217
    if (is.null(loc.dist)) {
dmattek's avatar
dmattek committed
218 219 220
      return(NULL)
    }
    
221 222 223 224 225 226
    return(LOChcut(x = loc.dist,
                   k = returnNclust(),
                   hc_func = "hclust",
                   hc_method = input$selectLinkage,
                   hc_metric = input$selectDiss
                   ))    
227 228
  })
  
dmattek's avatar
dmattek committed
229
  # Plotting ----
dmattek's avatar
dmattek committed
230 231 232 233 234 235
  # Function instead of reactive as per:
  # http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
  # This function is used to plot and to downoad a pdf
  
  # plot average silhouette
  plotSilhAvg <- function() {
236
    cat(file = stderr(), 'plotSilhAvg: in\n')
dmattek's avatar
dmattek committed
237
    
238 239 240 241 242 243 244
    # make the f-n dependent on the button click
    locBut = input$butPlotRel

    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
dmattek's avatar
dmattek committed
245
    
246
    validate(
dmattek's avatar
dmattek committed
247 248
      need(!is.null(loc.dist), "Nothing to plot. Load data first!"),
      need(returnMaxNclust() <  nrow(loc.dist), "Maximum number of clusters to conisder should be smaller than the number of time series.")
249 250 251
    )    

    loc.p = LOCnbclust(loc.dist,
252
                                     method = "silhouette",
253
                                     k.max = returnMaxNclust(),
254 255 256 257
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Average silhouette width") +
258
      ggtitle("Average silhouette width for different cluster numbers") +
dmattek's avatar
dmattek committed
259 260 261 262 263
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
264 265 266 267 268
    return(loc.p)
  }

  # plot Ws
  plotWss <- function() {
269
    cat(file = stderr(), 'plotWss: in\n')
dmattek's avatar
dmattek committed
270
    
271
    # make the f-n dependent on the button click
272
    locBut = input$butPlotRel
dmattek's avatar
dmattek committed
273
    
274 275 276 277
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
dmattek's avatar
dmattek committed
278
    
279
    validate(
dmattek's avatar
dmattek committed
280 281
      need(!is.null(loc.dist), "Nothing to plot. Load data first!"),
      need(returnMaxNclust() <  nrow(loc.dist), "Maximum number of clusters to conisder should be smaller than the number of time series.")
282
    )    
dmattek's avatar
dmattek committed
283
    
284
    loc.p = LOCnbclust(loc.dist,
285
                                     method = "wss",
286
                                     k.max = returnMaxNclust(),
287 288 289 290 291
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Total within cluster sum of squares") +
      ggtitle("Within cluster sum of squares for different cluster numbers") +
dmattek's avatar
dmattek committed
292 293 294 295 296
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
297 298 299
    
    return(loc.p)
  }
dmattek's avatar
dmattek committed
300 301 302
  
  # PCA visualization of partitioning methods 
  plotClPCA <- function() {
303
    cat(file = stderr(), 'plotTree: in\n')
dmattek's avatar
dmattek committed
304
    
305
    # make the f-n dependent on the button click
306
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
307
    
308
    # until clicking the Plot button
309
    loc.part = calcDendCut()
310 311 312 313 314 315 316 317 318
    loc.dm = in.dataWide()
    
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
      need(!is.null(loc.dm),   "Nothing to plot. Load data first!"),
      need(sum(is.na(loc.dm)), "Cannot calculate PCA in the presence of missing data and/or NAs.")
    )    
    
    if (sum(is.na(loc.dm)) > 0)
dmattek's avatar
dmattek committed
319 320
      return(NULL)
    
321
    loc.p = factoextra::fviz_cluster(loc.part, 
322
                                     data = loc.dm,
323
                                     geom = "point",
324
                                     elipse.type = "convex", 
325
                                     main = "Principal components"
326 327 328 329 330 331
                                     )+
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352

    
    # Retrieve association of cluster and colours and use it for dendrogram for color matching between dend, silhouette and PCA plot
    temp = ggplot_build(loc.p)
    map_individual = as.data.table(temp$data[[1]][, c("colour", "shape")])
    map_cluster = map_individual[, .SD[1], by = shape]
    map_cluster[, cluster := 1:nrow(map_cluster)]

    return(list(plot = loc.p, mapping_individual = map_individual, mapping_cluster = map_cluster))
  }
  
  
  # plot dendrogram tree
  plotTree <- function() {
    cat(file = stderr(), 'plotTree: in\n')
    
    # make the f-n dependent on the button click
    locBut = input$butPlotInt
    
    # Check if required data exists
    loc.part = calcDendCut()
dmattek's avatar
dmattek committed
353
    
354 355
    # Rerun the PCA plot to obtain clour mapping of clusters in PCA and silhouette plot and match it with dendrogram colors.
    loc.map = plotClPCA()
dmattek's avatar
dmattek committed
356
    
357 358
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
dmattek's avatar
dmattek committed
359
      need(!is.null(loc.map),  "Cannot assign colours to clusters. Possible NAs in the dataset!")
360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
    )    
    
    # Determine cluster order of occurence from left to right in the dendrogram
    # This is necessary because fviz_dend colors clusters from left to right,
    # whereas fviz_silhouette and fviz_cluster use the order of cluster first occurence in the list of individuals.
    loc.mapClus = loc.map$mapping_cluster
    ord.clusDend = unique(loc.part$cluster[loc.part$order])
    col.clusDend = loc.mapClus[, colour][ord.clusDend]
        
    loc.p = factoextra::fviz_dend(loc.part,
                                  k = returnNclust(),
                                  k_colors = col.clusDend,
                                  show_labels = F,
                                  rect = T,
                                  xlab = "Time series",
                                  main = "Dendrogram") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE,
                     in.font.axis.text = PLOTFONTAXISTEXT,
                     in.font.axis.title = PLOTFONTAXISTITLE,
                     in.font.strip = PLOTFONTFACETSTRIP,
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
381 382 383 384
    
    return(loc.p)
  }
  
385 386
  
  # plot silhouettes for a particular dendrogram cut
dmattek's avatar
dmattek committed
387
  plotSilhForCut <- function() {
388
    cat(file = stderr(), 'plotSilhForCut: in\n')
dmattek's avatar
dmattek committed
389
    
390
    # make the f-n dependent on the button click
391
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
392
    
393
    # until clicking the Plot button
394
    loc.part = calcDendCut()
395 396 397
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
398
    
399 400 401
    loc.p = factoextra::fviz_silhouette(loc.part, 
                                        print.summary = FALSE, 
                                        main = "Silhouette") +
dmattek's avatar
dmattek committed
402 403 404 405 406 407 408
      xlab("Time series") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND) +
      theme(axis.text.x = element_blank())
dmattek's avatar
dmattek committed
409 410 411 412
    
    return(loc.p)
  }
  
dmattek's avatar
dmattek committed
413
  # Plot rendering ----
dmattek's avatar
dmattek committed
414 415
  # Display silhouette
  output$outPlotSilhAvg <- renderPlot({
416 417
    loc.p = plotSilhAvg()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
418 419
      return(NULL)
    
420
    return(loc.p)
dmattek's avatar
dmattek committed
421 422 423 424 425
  })

  
  # Display wss
  output$outPlotWss <- renderPlot({
426 427 428
    loc.p = plotWss()
    if(is.null(loc.p))
      return(NULL)
dmattek's avatar
dmattek committed
429
    
430 431 432 433 434 435 436 437 438 439 440 441 442 443
    return(loc.p)
  })
  
  # Display PCA of clustering
  output$outPlotClPCA <- renderPlot({
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotClPCA()
444
    loc.p = loc.p$plot
445
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
446 447
      return(NULL)
    
448
    return(loc.p)
dmattek's avatar
dmattek committed
449 450 451 452
  })
  
  # Display tree
  output$outPlotTree <- renderPlot({
453 454 455 456 457 458 459 460 461 462
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotTree()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
463 464
      return(NULL)
    
465
    return(loc.p)
dmattek's avatar
dmattek committed
466 467 468 469
  })
  
  # Display silhouette for a dendrogram cut
  output$outPlotSilhForCut <- renderPlot({
470 471 472 473 474 475 476 477 478 479
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotSilhForCut()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
480 481
      return(NULL)
    
482
    return(loc.p)
dmattek's avatar
dmattek committed
483 484 485 486 487 488 489 490
  })
  
  # Pop-overs ----
  addPopover(session, 
             ns("alLearnMore"),
             title = "Classes of cluster validation",
             content = helpText.clValid[["alLearnMore"]],
             trigger = "click")
dmattek's avatar
dmattek committed
491 492 493 494 495 496 497 498 499 500 501 502
  
  addPopover(session, 
             ns("alLearnMoreRel"),
             title = "Relative validation",
             content = helpText.clValid[["alLearnMoreRel"]],
             trigger = "click")
  
  addPopover(session, 
             ns("alLearnMoreInt"),
             title = "Internal validation",
             content = helpText.clValid[["alLearnMoreInt"]],
             trigger = "click")
dmattek's avatar
dmattek committed
503 504 505
}