tabClValid.R 19.3 KB
Newer Older
dmattek's avatar
dmattek committed
1 2 3 4 5 6
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This module is a tab for hierarchical clustering (base R hclust + dist)

7
helpText.clValid = c(alertClValidNAsPresent = paste0("NAs present. The selected distance measure will work, ",
8 9
                                              "however PCA will not be avaliable."),
                     alertClValidNAsPresentDTW = paste0("NAs present. DTW distance measure will NOT work."),
dmattek's avatar
dmattek committed
10 11
                    alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ",
                                         "is an <b>unsupervised</b> machine learning method for partitioning ",
12
                                         "dataset into a set of groups called clusters. The procedure will return clusters ",
dmattek's avatar
dmattek committed
13 14 15
                                         "even if the data <b>does not</b> contain any! ",
                                         "Therefore, it’s necessary to ",
                                         "assess clustering tendency before the analysis, and ",
dmattek's avatar
dmattek committed
16 17 18 19 20
                                         "validate the quality of the result after clustering.<p>"
                                         ),
                    alLearnMoreRel = paste0("<p>Determine the optimal number of clusters by inspecting ",
                                            "the average silhouette width and the total within cluster sum of squares (WSS) ",
                                            "for a range of cluster numbers.</p>", 
21 22 23 24
                                            "<p><b>Silhouette analysis</b> first computes how close each trajectory is with others in the cluster it is assigned to, ",
                                            "this is then compared to closeness with trajectories in other clusters. ",
                                            "Larger average silhouette widths usually indicate better clustering. To make sure averaging does not hide a locally bad",
                                            "clustering, this should be inspected along with the silhouette plot in the \"Internal\" tab.<p>",
dmattek's avatar
dmattek committed
25 26
                                            "<p><b>WSS</b> evaluates the compactness of clusters. ",
                                            "Compact clusters achieve low WSS values. ",
27
                                            "Look for the <i>elbow</i> in the plot of WSS as function of cluster numbers.</p>"),
28
                    alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting ",
29
                                            "principal components, the dendrogram, ",
30
                                            "and the silhouette for a given number of clusters.</p>",
31
                                            "<p><b>Principal components:</b> Each point in the scatter plot corresponds to a single time series in the first 2 PCs space. ",
32
                                            "Points are coloured by cluster numbers. Compact, well separated clusters ",
33 34 35 36 37 38 39
                                            "indicate good partitioning. The percentage of total variance carried by each PC is indicated.</p>",
                                            "<p><b>Dendrogram:</b> The height of branches indicates how well clusters are separated.</p>",
                                            "<p><b>Silhouette plot:</b> The plot indicates for each series whether it is on average closer to series within its cluster ",
                                            "or to series in other clusters. Each bar represents the <a href=https://en.wikipedia.org/wiki/Silhouette_(clustering) title=\"External link\">silhouette score</a> ",
                                            "(Si) for one series. The height of the bars varies ",
                                            "between 1 (the series is much closer to series in its cluster) and -1 (the series is much closer to series in an other cluster). ",
                                            "Hence, large positive values of Si are usually associated with better clustering, while negative values are associated with worse clustering.")
dmattek's avatar
dmattek committed
40
                    )
dmattek's avatar
dmattek committed
41 42 43 44 45 46 47


# UI ----
clustValidUI <- function(id, label = "Validation") {
  ns <- NS(id)
  
  tagList(
dmattek's avatar
dmattek committed
48 49 50 51 52 53
    h4(
      "Cluster validation using ",
      a("factoextra", 
        href="https://cran.r-project.org/web/packages/factoextra/",
        title="External link")
    ),
dmattek's avatar
dmattek committed
54 55 56 57
    actionLink(ns("alLearnMore"), "Learn more"),
    br(),
    br(),
    fluidRow(
dmattek's avatar
dmattek committed
58

59
      column(4,
dmattek's avatar
dmattek committed
60 61 62 63 64 65
             selectInput(
               ns("selectDiss"),
               label = ("Dissimilarity measure"),
               choices = list("Euclidean" = "euclidean",
                              "Manhattan" = "manhattan",
                              "Maximum"   = "maximum",
66 67
                              "Canberra"  = "canberra",
                              "DTW"       = "DTW"),
68
               selected = "euclidean"
dmattek's avatar
dmattek committed
69
             ),
70
             bsAlert("alertAnchorClValidNAsPresent")
dmattek's avatar
dmattek committed
71
             ),
72
      column(4,
dmattek's avatar
dmattek committed
73 74 75 76 77 78 79 80 81 82 83 84
             selectInput(
               ns("selectLinkage"),
               label = ("Linkage method"),
               choices = list(
                 "Average"  = "average",
                 "Complete" = "complete",
                 "Single"   = "single",
                 "Centroid" = "centroid",
                 "Ward"     = "ward.D",
                 "Ward D2"  = "ward.D2",
                 "McQuitty" = "mcquitty"
               ),
85
               selected = "average"
dmattek's avatar
dmattek committed
86 87 88 89 90 91 92 93
               )
             )
    ),
    
    br(),
    tabsetPanel(
      tabPanel("Relative",
               br(),
dmattek's avatar
dmattek committed
94 95
               p("Determine and visualise the optimal number of clusters. ",
                 actionLink(ns("alLearnMoreRel"), "Learn more")),
dmattek's avatar
dmattek committed
96 97 98 99 100 101 102
               fluidRow(
                 column(2, 
                        actionButton(ns('butPlotRel'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidMaxClust'),
dmattek's avatar
dmattek committed
103
                          'Maximum number of clusters to consider',
dmattek's avatar
dmattek committed
104 105 106 107 108 109 110 111 112 113 114 115
                          min = 2,
                          max = 20,
                          value = 10,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
               withSpinner(plotOutput(ns('outPlotSilhAvg'))),
               br(),
dmattek's avatar
dmattek committed
116
               withSpinner(plotOutput(ns('outPlotWss')))
dmattek's avatar
dmattek committed
117 118 119 120
               
      ),
      tabPanel("Internal",
               br(),
dmattek's avatar
dmattek committed
121 122
               p("Validate a given data partitioning. ",
                 actionLink(ns("alLearnMoreInt"), "Learn more")),
dmattek's avatar
dmattek committed
123 124 125 126 127 128 129
               fluidRow(
                 column(2,
                        actionButton(ns('butPlotInt'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidNclust'),
130
                          'Number of clusters to evaluate',
dmattek's avatar
dmattek committed
131 132 133 134 135 136 137 138 139 140
                          min = 2,
                          max = 20,
                          value = 1,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
141
               withSpinner(plotOutput(ns('outPlotTree'))),
dmattek's avatar
dmattek committed
142
               br(),
143 144 145
               withSpinner(plotOutput(ns('outPlotSilhForCut'))),
               br(),
               withSpinner(plotOutput(ns('outPlotClPCA')))
dmattek's avatar
dmattek committed
146 147 148 149 150 151
      )
    )
  )
}

# SERVER ----
152
clustValid <- function(input, output, session, in.dataWide) {
dmattek's avatar
dmattek committed
153 154 155

  ns = session$ns
  
156 157 158 159 160 161 162 163 164 165 166 167
  # Return the number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnNclust = reactive({
    return(input$slClValidNclust)
  }) %>% debounce(MILLIS)
  
  # Return max number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnMaxNclust = reactive({
    return(input$slClValidMaxClust)
  }) %>% debounce(MILLIS)
  
dmattek's avatar
dmattek committed
168 169
  # calculate distance matrix for further clustering
  # time series arranged in rows with columns corresponding to time points
170 171
  calcDist <- reactive({
    cat(file = stderr(), 'clustValid:calcDist \n')
dmattek's avatar
dmattek committed
172
    
173
    loc.dm = in.dataWide()
dmattek's avatar
dmattek committed
174 175 176 177 178 179 180 181
    
    if (is.null(loc.dm)) {
      return(NULL)
    }
    
    # Throw some warnings if NAs present in the dataset.
    # DTW cannot compute distance when NA's are preset.
    # Other distance measures can be calculated but caution is required with interpretation.
182 183
    # NAs in the wide format can result from explicit NAs in the measurment column or
    # from missing rows that cause NAs to appear when convertinf from long to wide (dcast)
dmattek's avatar
dmattek committed
184
    if(sum(is.na(loc.dm)) > 0) {
185 186 187 188 189 190 191 192 193 194
      if (input$selectDiss == "DTW") {
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresentDTW", title = "Error",
                    content = helpText.clValid[["alertClValidNAsPresentDTW"]], 
                    append = FALSE,
                    style = "danger")
        closeAlert(session, 'alertClValidNAsPresent')
        
        return(NULL)
        
      } else {
195 196
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", title = "Warning",
                    content = helpText.clValid[["alertClValidNAsPresent"]], 
dmattek's avatar
dmattek committed
197 198
                    append = FALSE, 
                    style = "warning")
199 200
        closeAlert(session, 'alertClValidNAsPresentDTW')
      }
dmattek's avatar
dmattek committed
201
    } else {
202
      closeAlert(session, 'alertClValidNAsPresentDTW')
203
      closeAlert(session, 'alertClValidNAsPresent')
204
    }    
dmattek's avatar
dmattek committed
205 206
    
    
207 208
    # calculate distance matrix
    return(proxy::dist(loc.dm, method = input$selectDiss))
dmattek's avatar
dmattek committed
209 210
  })
  
211
  # calculate dendrogram for a chosen number of clusters and the linkage method
dmattek's avatar
dmattek committed
212 213 214
  calcDendCut = reactive({
    cat(file = stderr(), 'clustValid:calcDendCut \n')
    
215
    loc.dist = calcDist()
dmattek's avatar
dmattek committed
216
    
217
    if (is.null(loc.dist)) {
dmattek's avatar
dmattek committed
218 219 220
      return(NULL)
    }
    
221 222 223 224 225 226
    return(LOChcut(x = loc.dist,
                   k = returnNclust(),
                   hc_func = "hclust",
                   hc_method = input$selectLinkage,
                   hc_metric = input$selectDiss
                   ))    
227 228
  })
  
dmattek's avatar
dmattek committed
229
  # Plotting ----
dmattek's avatar
dmattek committed
230 231 232 233 234 235
  # Function instead of reactive as per:
  # http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
  # This function is used to plot and to downoad a pdf
  
  # plot average silhouette
  plotSilhAvg <- function() {
236
    cat(file = stderr(), 'plotSilhAvg: in\n')
dmattek's avatar
dmattek committed
237
    
238 239 240 241 242 243 244 245 246 247 248 249
    # make the f-n dependent on the button click
    locBut = input$butPlotRel

    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
    validate(
      need(!is.null(loc.dist), "Nothing to plot. Load data first!")
    )    

    loc.p = LOCnbclust(loc.dist,
250
                                     method = "silhouette",
251
                                     k.max = returnMaxNclust(),
252 253 254 255
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Average silhouette width") +
256
      ggtitle("Average silhouette width for different cluster numbers") +
dmattek's avatar
dmattek committed
257 258 259 260 261
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
262 263 264 265 266
    return(loc.p)
  }

  # plot Ws
  plotWss <- function() {
267
    cat(file = stderr(), 'plotWss: in\n')
dmattek's avatar
dmattek committed
268
    
269
    # make the f-n dependent on the button click
270
    locBut = input$butPlotRel
dmattek's avatar
dmattek committed
271
    
272 273 274 275 276 277 278
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
    validate(
      need(!is.null(loc.dist), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
279
    
280
    loc.p = LOCnbclust(loc.dist,
281
                                     method = "wss",
282
                                     k.max = returnMaxNclust(),
283 284 285 286 287
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Total within cluster sum of squares") +
      ggtitle("Within cluster sum of squares for different cluster numbers") +
dmattek's avatar
dmattek committed
288 289 290 291 292
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
293 294 295
    
    return(loc.p)
  }
dmattek's avatar
dmattek committed
296 297 298
  
  # PCA visualization of partitioning methods 
  plotClPCA <- function() {
299
    cat(file = stderr(), 'plotTree: in\n')
dmattek's avatar
dmattek committed
300
    
301
    # make the f-n dependent on the button click
302
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
303
    
304
    # until clicking the Plot button
305
    loc.part = calcDendCut()
306 307
    loc.dm = in.dataWide()
    print(sum(is.na(loc.dm)))
308

309 310 311 312 313 314 315 316
    
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
      need(!is.null(loc.dm),   "Nothing to plot. Load data first!"),
      need(sum(is.na(loc.dm)), "Cannot calculate PCA in the presence of missing data and/or NAs.")
    )    
    
    if (sum(is.na(loc.dm)) > 0)
dmattek's avatar
dmattek committed
317 318
      return(NULL)
    
319
    loc.p = factoextra::fviz_cluster(loc.part, 
320
                                     data = loc.dm,
321
                                     geom = "point",
322
                                     elipse.type = "convex", 
323
                                     main = "Principal components"
324 325 326 327 328 329
                                     )+
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376

    
    # Retrieve association of cluster and colours and use it for dendrogram for color matching between dend, silhouette and PCA plot
    temp = ggplot_build(loc.p)
    map_individual = as.data.table(temp$data[[1]][, c("colour", "shape")])
    map_cluster = map_individual[, .SD[1], by = shape]
    map_cluster[, cluster := 1:nrow(map_cluster)]

    return(list(plot = loc.p, mapping_individual = map_individual, mapping_cluster = map_cluster))
  }
  
  
  # plot dendrogram tree
  plotTree <- function() {
    cat(file = stderr(), 'plotTree: in\n')
    
    # make the f-n dependent on the button click
    locBut = input$butPlotInt
    
    # Check if required data exists
    loc.part = calcDendCut()
    # Rerun the PCA plot to obtain clour mapping of clusters in PCA and silhouette plot and match it with dendrogram colors.
    loc.map = plotClPCA()
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
      need(!is.null(loc.map), "Nothing to plot. Load data first!")
    )    
    
    # Determine cluster order of occurence from left to right in the dendrogram
    # This is necessary because fviz_dend colors clusters from left to right,
    # whereas fviz_silhouette and fviz_cluster use the order of cluster first occurence in the list of individuals.
    loc.mapClus = loc.map$mapping_cluster
    ord.clusDend = unique(loc.part$cluster[loc.part$order])
    col.clusDend = loc.mapClus[, colour][ord.clusDend]
        
    loc.p = factoextra::fviz_dend(loc.part,
                                  k = returnNclust(),
                                  k_colors = col.clusDend,
                                  show_labels = F,
                                  rect = T,
                                  xlab = "Time series",
                                  main = "Dendrogram") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE,
                     in.font.axis.text = PLOTFONTAXISTEXT,
                     in.font.axis.title = PLOTFONTAXISTITLE,
                     in.font.strip = PLOTFONTFACETSTRIP,
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
377 378 379 380
    
    return(loc.p)
  }
  
381 382
  
  # plot silhouettes for a particular dendrogram cut
dmattek's avatar
dmattek committed
383
  plotSilhForCut <- function() {
384
    cat(file = stderr(), 'plotSilhForCut: in\n')
dmattek's avatar
dmattek committed
385
    
386
    # make the f-n dependent on the button click
387
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
388
    
389
    # until clicking the Plot button
390
    loc.part = calcDendCut()
391 392 393
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
394
    
395 396 397
    loc.p = factoextra::fviz_silhouette(loc.part, 
                                        print.summary = FALSE, 
                                        main = "Silhouette") +
dmattek's avatar
dmattek committed
398 399 400 401 402 403 404
      xlab("Time series") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND) +
      theme(axis.text.x = element_blank())
dmattek's avatar
dmattek committed
405 406 407 408
    
    return(loc.p)
  }
  
dmattek's avatar
dmattek committed
409
  # Plot rendering ----
dmattek's avatar
dmattek committed
410 411
  # Display silhouette
  output$outPlotSilhAvg <- renderPlot({
412 413
    loc.p = plotSilhAvg()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
414 415
      return(NULL)
    
416
    return(loc.p)
dmattek's avatar
dmattek committed
417 418 419 420 421
  })

  
  # Display wss
  output$outPlotWss <- renderPlot({
422 423 424
    loc.p = plotWss()
    if(is.null(loc.p))
      return(NULL)
dmattek's avatar
dmattek committed
425
    
426 427 428 429 430 431 432 433 434 435 436 437 438 439
    return(loc.p)
  })
  
  # Display PCA of clustering
  output$outPlotClPCA <- renderPlot({
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotClPCA()
440
    loc.p = loc.p$plot
441
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
442 443
      return(NULL)
    
444
    return(loc.p)
dmattek's avatar
dmattek committed
445 446 447 448
  })
  
  # Display tree
  output$outPlotTree <- renderPlot({
449 450 451 452 453 454 455 456 457 458
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotTree()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
459 460
      return(NULL)
    
461
    return(loc.p)
dmattek's avatar
dmattek committed
462 463 464 465
  })
  
  # Display silhouette for a dendrogram cut
  output$outPlotSilhForCut <- renderPlot({
466 467 468 469 470 471 472 473 474 475
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotSilhForCut()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
476 477
      return(NULL)
    
478
    return(loc.p)
dmattek's avatar
dmattek committed
479 480 481 482 483 484 485 486
  })
  
  # Pop-overs ----
  addPopover(session, 
             ns("alLearnMore"),
             title = "Classes of cluster validation",
             content = helpText.clValid[["alLearnMore"]],
             trigger = "click")
dmattek's avatar
dmattek committed
487 488 489 490 491 492 493 494 495 496 497 498
  
  addPopover(session, 
             ns("alLearnMoreRel"),
             title = "Relative validation",
             content = helpText.clValid[["alLearnMoreRel"]],
             trigger = "click")
  
  addPopover(session, 
             ns("alLearnMoreInt"),
             title = "Internal validation",
             content = helpText.clValid[["alLearnMoreInt"]],
             trigger = "click")
dmattek's avatar
dmattek committed
499 500 501
}