This server has been upgraded to GitLab release 12.10.6.

tabClValid.R 17.7 KB
Newer Older
dmattek's avatar
dmattek committed
1 2 3 4 5 6
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This module is a tab for hierarchical clustering (base R hclust + dist)

7
helpText.clValid = c(alertClValidNAsPresent = paste0("NAs present. The selected distance measure will work, ",
8 9
                                              "however PCA will not be avaliable."),
                     alertClValidNAsPresentDTW = paste0("NAs present. DTW distance measure will NOT work."),
dmattek's avatar
dmattek committed
10 11
                    alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ",
                                         "is an <b>unsupervised</b> machine learning method for partitioning ",
dmattek's avatar
dmattek committed
12 13 14 15
                                         "dataset into a set of groups or clusters. The procedure will return clusters ",
                                         "even if the data <b>does not</b> contain any! ",
                                         "Therefore, it’s necessary to ",
                                         "assess clustering tendency before the analysis, and ",
dmattek's avatar
dmattek committed
16 17 18 19 20 21 22 23 24 25
                                         "validate the quality of the result after clustering.<p>"
                                         ),
                    alLearnMoreRel = paste0("<p>Determine the optimal number of clusters by inspecting ",
                                            "the average silhouette width and the total within cluster sum of squares (WSS) ",
                                            "for a range of cluster numbers.</p>", 
                                            "<p><b>Silhouette analysis</b> estimates the average distance between clusters. ",
                                            "Larger silhouette widths indicate better.<p>",
                                            "<p><b>WSS</b> evaluates the compactness of clusters. ",
                                            "Compact clusters achieve low WSS values. ",
                                            "Look for the <i>knee</i> in the plot of WSS as function of cluster numbers.</p>"),
26 27 28 29 30 31
                    alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting ",
                                            "principle components, the dendrogram, ",
                                            "and the silhouette for a given number of clusters.</p>",
                                            "<p>Each point in the scatter plot of 2 principle components corresponds to a single time series. ",
                                            "Points are coloured by cluster numbers. Compact, well separated clusters ",
                                            "indicate good partitioning.</p>",
dmattek's avatar
dmattek committed
32 33 34 35 36 37 38
                                            "<p>The height of dendrogram branches indicates how well clusters are separated.</p>",
                                            "<p>The silhouette plot displays how close each time series in one cluster ", 
                                            "is to time series in the neighboring clusters. ",
                                            "A large positive silhouette (Si) indicates time series that are well clustered.",
                                            "A negative Si indicates time series that are closer to ",
                                            "a neighboring cluster, and are placed in the wrong cluster.</p>")
                    )
dmattek's avatar
dmattek committed
39 40 41 42 43 44 45


# UI ----
clustValidUI <- function(id, label = "Validation") {
  ns <- NS(id)
  
  tagList(
dmattek's avatar
dmattek committed
46 47 48 49 50 51
    h4(
      "Cluster validation using ",
      a("factoextra", 
        href="https://cran.r-project.org/web/packages/factoextra/",
        title="External link")
    ),
dmattek's avatar
dmattek committed
52 53 54 55
    actionLink(ns("alLearnMore"), "Learn more"),
    br(),
    br(),
    fluidRow(
dmattek's avatar
dmattek committed
56

57
      column(4,
dmattek's avatar
dmattek committed
58 59 60 61 62 63
             selectInput(
               ns("selectDiss"),
               label = ("Dissimilarity measure"),
               choices = list("Euclidean" = "euclidean",
                              "Manhattan" = "manhattan",
                              "Maximum"   = "maximum",
64 65
                              "Canberra"  = "canberra",
                              "DTW"       = "DTW"),
66
               selected = "euclidean"
dmattek's avatar
dmattek committed
67
             ),
68
             bsAlert("alertAnchorClValidNAsPresent")
dmattek's avatar
dmattek committed
69
             ),
70
      column(4,
dmattek's avatar
dmattek committed
71 72 73 74 75 76 77 78 79 80 81 82
             selectInput(
               ns("selectLinkage"),
               label = ("Linkage method"),
               choices = list(
                 "Average"  = "average",
                 "Complete" = "complete",
                 "Single"   = "single",
                 "Centroid" = "centroid",
                 "Ward"     = "ward.D",
                 "Ward D2"  = "ward.D2",
                 "McQuitty" = "mcquitty"
               ),
83
               selected = "average"
dmattek's avatar
dmattek committed
84 85 86 87 88 89 90 91
               )
             )
    ),
    
    br(),
    tabsetPanel(
      tabPanel("Relative",
               br(),
dmattek's avatar
dmattek committed
92 93
               p("Determine and visualise the optimal number of clusters. ",
                 actionLink(ns("alLearnMoreRel"), "Learn more")),
dmattek's avatar
dmattek committed
94 95 96 97 98 99 100
               fluidRow(
                 column(2, 
                        actionButton(ns('butPlotRel'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidMaxClust'),
dmattek's avatar
dmattek committed
101
                          'Maximum number of clusters to consider',
dmattek's avatar
dmattek committed
102 103 104 105 106 107 108 109 110 111 112 113
                          min = 2,
                          max = 20,
                          value = 10,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
               withSpinner(plotOutput(ns('outPlotSilhAvg'))),
               br(),
dmattek's avatar
dmattek committed
114
               withSpinner(plotOutput(ns('outPlotWss')))
dmattek's avatar
dmattek committed
115 116 117 118
               
      ),
      tabPanel("Internal",
               br(),
dmattek's avatar
dmattek committed
119 120
               p("Validate a given data partitioning. ",
                 actionLink(ns("alLearnMoreInt"), "Learn more")),
dmattek's avatar
dmattek committed
121 122 123 124 125 126 127
               fluidRow(
                 column(2,
                        actionButton(ns('butPlotInt'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidNclust'),
128
                          'Number of clusters to evaluate',
dmattek's avatar
dmattek committed
129 130 131 132 133 134 135 136 137 138
                          min = 2,
                          max = 20,
                          value = 1,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
139
               withSpinner(plotOutput(ns('outPlotTree'))),
dmattek's avatar
dmattek committed
140
               br(),
141 142 143
               withSpinner(plotOutput(ns('outPlotSilhForCut'))),
               br(),
               withSpinner(plotOutput(ns('outPlotClPCA')))
dmattek's avatar
dmattek committed
144 145 146 147 148 149
      )
    )
  )
}

# SERVER ----
150
clustValid <- function(input, output, session, in.dataWide) {
dmattek's avatar
dmattek committed
151 152 153

  ns = session$ns
  
154 155 156 157 158 159 160 161 162 163 164 165
  # Return the number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnNclust = reactive({
    return(input$slClValidNclust)
  }) %>% debounce(MILLIS)
  
  # Return max number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnMaxNclust = reactive({
    return(input$slClValidMaxClust)
  }) %>% debounce(MILLIS)
  
dmattek's avatar
dmattek committed
166 167
  # calculate distance matrix for further clustering
  # time series arranged in rows with columns corresponding to time points
168 169
  calcDist <- reactive({
    cat(file = stderr(), 'clustValid:calcDist \n')
dmattek's avatar
dmattek committed
170
    
171
    loc.dm = in.dataWide()
dmattek's avatar
dmattek committed
172 173 174 175 176 177 178 179
    
    if (is.null(loc.dm)) {
      return(NULL)
    }
    
    # Throw some warnings if NAs present in the dataset.
    # DTW cannot compute distance when NA's are preset.
    # Other distance measures can be calculated but caution is required with interpretation.
180 181
    # NAs in the wide format can result from explicit NAs in the measurment column or
    # from missing rows that cause NAs to appear when convertinf from long to wide (dcast)
dmattek's avatar
dmattek committed
182
    if(sum(is.na(loc.dm)) > 0) {
183 184 185 186 187 188 189 190 191 192
      if (input$selectDiss == "DTW") {
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresentDTW", title = "Error",
                    content = helpText.clValid[["alertClValidNAsPresentDTW"]], 
                    append = FALSE,
                    style = "danger")
        closeAlert(session, 'alertClValidNAsPresent')
        
        return(NULL)
        
      } else {
193 194
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", title = "Warning",
                    content = helpText.clValid[["alertClValidNAsPresent"]], 
dmattek's avatar
dmattek committed
195 196
                    append = FALSE, 
                    style = "warning")
197 198
        closeAlert(session, 'alertClValidNAsPresentDTW')
      }
dmattek's avatar
dmattek committed
199
    } else {
200
      closeAlert(session, 'alertClValidNAsPresentDTW')
201
      closeAlert(session, 'alertClValidNAsPresent')
202
    }    
dmattek's avatar
dmattek committed
203 204
    
    
205 206
    # calculate distance matrix
    return(proxy::dist(loc.dm, method = input$selectDiss))
dmattek's avatar
dmattek committed
207 208
  })
  
209
  # calculate dendrogram for a chosen number of clusters and the linkage method
dmattek's avatar
dmattek committed
210 211 212
  calcDendCut = reactive({
    cat(file = stderr(), 'clustValid:calcDendCut \n')
    
213
    loc.dist = calcDist()
dmattek's avatar
dmattek committed
214
    
215
    if (is.null(loc.dist)) {
dmattek's avatar
dmattek committed
216 217 218
      return(NULL)
    }
    
219 220
    return(factoextra::hcut(x = loc.dist,
                            k = returnNclust(),
221 222 223 224 225
                              FUNcluster = "hclust",
                              hc_method = input$selectLinkage, 
                              graph = FALSE))
  })
  
dmattek's avatar
dmattek committed
226
  # Plotting ----
dmattek's avatar
dmattek committed
227 228 229 230 231 232
  # Function instead of reactive as per:
  # http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
  # This function is used to plot and to downoad a pdf
  
  # plot average silhouette
  plotSilhAvg <- function() {
233
    cat(file = stderr(), 'plotSilhAvg: in\n')
dmattek's avatar
dmattek committed
234
    
235 236 237 238 239 240 241 242 243 244 245 246
    # make the f-n dependent on the button click
    locBut = input$butPlotRel

    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
    validate(
      need(!is.null(loc.dist), "Nothing to plot. Load data first!")
    )    

    loc.p = LOCnbclust(loc.dist,
247
                                     method = "silhouette",
248
                                     k.max = returnMaxNclust(),
249 250 251 252 253
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Average silhouette width") +
      ggtitle("Optimal number of clusters from silhouette analysis") +
dmattek's avatar
dmattek committed
254 255 256 257 258
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
259 260 261 262 263
    return(loc.p)
  }

  # plot Ws
  plotWss <- function() {
264
    cat(file = stderr(), 'plotWss: in\n')
dmattek's avatar
dmattek committed
265
    
266
    # make the f-n dependent on the button click
267
    locBut = input$butPlotRel
dmattek's avatar
dmattek committed
268
    
269 270 271 272 273 274 275
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
    validate(
      need(!is.null(loc.dist), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
276
    
277
    loc.p = LOCnbclust(loc.dist,
278
                                     method = "wss",
279
                                     k.max = returnMaxNclust(),
280 281 282 283 284
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Total within cluster sum of squares") +
      ggtitle("Within cluster sum of squares for different cluster numbers") +
dmattek's avatar
dmattek committed
285 286 287 288 289
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
290 291 292 293 294 295
    
    return(loc.p)
  }

  # plot dendrogram tree
  plotTree <- function() {
296
    cat(file = stderr(), 'plotTree: in\n')
dmattek's avatar
dmattek committed
297
    
298
    # make the f-n dependent on the button click
299
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
300
    
301 302 303 304 305 306 307
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.part = isolate(calcDendCut())
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
308
    
309
    loc.p = factoextra::fviz_dend(loc.part, 
dmattek's avatar
dmattek committed
310 311
                                  show_labels = F,
                                  rect = T,
312 313
                                  xlab = "Time series", 
                                  main = "Dendrogram") +
dmattek's avatar
dmattek committed
314 315 316 317 318 319 320 321 322 323 324 325
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
    
    return(loc.p)
  }
  
  
  # PCA visualization of partitioning methods 
  plotClPCA <- function() {
326
    cat(file = stderr(), 'plotTree: in\n')
dmattek's avatar
dmattek committed
327
    
328
    # make the f-n dependent on the button click
329
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
330
    
331 332 333 334 335 336 337 338 339 340 341 342 343 344
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.part = isolate(calcDendCut())
    loc.dm = in.dataWide()
    print(sum(is.na(loc.dm)))
    
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
      need(!is.null(loc.dm),   "Nothing to plot. Load data first!"),
      need(sum(is.na(loc.dm)), "Cannot calculate PCA in the presence of missing data and/or NAs.")
    )    
    
    if (sum(is.na(loc.dm)) > 0)
dmattek's avatar
dmattek committed
345 346
      return(NULL)
    
347
    loc.p = factoextra::fviz_cluster(loc.part, 
348
                                     data = loc.dm,
349
                                     geom = "point",
350
                                     elipse.type = "convex", 
351
                                     main = "Principle components"
352 353 354 355 356 357
                                     )+
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
358 359 360 361 362 363
    
    return(loc.p)
  }
  
  # plot silhouetts for a particular dendrogram cut
  plotSilhForCut <- function() {
364
    cat(file = stderr(), 'plotSilhForCut: in\n')
dmattek's avatar
dmattek committed
365
    
366
    # make the f-n dependent on the button click
367
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
368
    
369 370 371 372 373 374 375
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.part = isolate(calcDendCut())
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
376
    
377 378 379
    loc.p = factoextra::fviz_silhouette(loc.part, 
                                        print.summary = FALSE, 
                                        main = "Silhouette") +
dmattek's avatar
dmattek committed
380 381 382 383 384 385 386
      xlab("Time series") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND) +
      theme(axis.text.x = element_blank())
dmattek's avatar
dmattek committed
387 388 389 390
    
    return(loc.p)
  }
  
dmattek's avatar
dmattek committed
391
  # Plot rendering ----
dmattek's avatar
dmattek committed
392 393
  # Display silhouette
  output$outPlotSilhAvg <- renderPlot({
394 395
    loc.p = plotSilhAvg()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
396 397
      return(NULL)
    
398
    return(loc.p)
dmattek's avatar
dmattek committed
399 400 401 402 403
  })

  
  # Display wss
  output$outPlotWss <- renderPlot({
404 405 406
    loc.p = plotWss()
    if(is.null(loc.p))
      return(NULL)
dmattek's avatar
dmattek committed
407
    
408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
    return(loc.p)
  })
  
  # Display PCA of clustering
  output$outPlotClPCA <- renderPlot({
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotClPCA()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
423 424
      return(NULL)
    
425
    return(loc.p)
dmattek's avatar
dmattek committed
426 427 428 429
  })
  
  # Display tree
  output$outPlotTree <- renderPlot({
430 431 432 433 434 435 436 437 438 439
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotTree()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
440 441
      return(NULL)
    
442
    return(loc.p)
dmattek's avatar
dmattek committed
443 444 445 446
  })
  
  # Display silhouette for a dendrogram cut
  output$outPlotSilhForCut <- renderPlot({
447 448 449 450 451 452 453 454 455 456
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotSilhForCut()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
457 458
      return(NULL)
    
459
    return(loc.p)
dmattek's avatar
dmattek committed
460 461 462 463 464 465 466 467
  })
  
  # Pop-overs ----
  addPopover(session, 
             ns("alLearnMore"),
             title = "Classes of cluster validation",
             content = helpText.clValid[["alLearnMore"]],
             trigger = "click")
dmattek's avatar
dmattek committed
468 469 470 471 472 473 474 475 476 477 478 479
  
  addPopover(session, 
             ns("alLearnMoreRel"),
             title = "Relative validation",
             content = helpText.clValid[["alLearnMoreRel"]],
             trigger = "click")
  
  addPopover(session, 
             ns("alLearnMoreInt"),
             title = "Internal validation",
             content = helpText.clValid[["alLearnMoreInt"]],
             trigger = "click")
dmattek's avatar
dmattek committed
480 481 482
}