tabClValid.R 16.4 KB
Newer Older
dmattek's avatar
dmattek committed
1 2 3 4 5 6
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This module is a tab for hierarchical clustering (base R hclust + dist)

7 8
helpText.clValid = c(alertClValidNAsPresent = paste0("NAs present. The selected distance measure will work, ",
                                              "however caution is recommended. Consider interpolation of NAs and missing data in the left panel."),
dmattek's avatar
dmattek committed
9 10
                    alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ",
                                         "is an <b>unsupervised</b> machine learning method for partitioning ",
dmattek's avatar
dmattek committed
11 12 13 14
                                         "dataset into a set of groups or clusters. The procedure will return clusters ",
                                         "even if the data <b>does not</b> contain any! ",
                                         "Therefore, it’s necessary to ",
                                         "assess clustering tendency before the analysis, and ",
dmattek's avatar
dmattek committed
15 16 17 18 19 20 21 22 23 24
                                         "validate the quality of the result after clustering.<p>"
                                         ),
                    alLearnMoreRel = paste0("<p>Determine the optimal number of clusters by inspecting ",
                                            "the average silhouette width and the total within cluster sum of squares (WSS) ",
                                            "for a range of cluster numbers.</p>", 
                                            "<p><b>Silhouette analysis</b> estimates the average distance between clusters. ",
                                            "Larger silhouette widths indicate better.<p>",
                                            "<p><b>WSS</b> evaluates the compactness of clusters. ",
                                            "Compact clusters achieve low WSS values. ",
                                            "Look for the <i>knee</i> in the plot of WSS as function of cluster numbers.</p>"),
25 26 27 28 29 30
                    alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting ",
                                            "principle components, the dendrogram, ",
                                            "and the silhouette for a given number of clusters.</p>",
                                            "<p>Each point in the scatter plot of 2 principle components corresponds to a single time series. ",
                                            "Points are coloured by cluster numbers. Compact, well separated clusters ",
                                            "indicate good partitioning.</p>",
dmattek's avatar
dmattek committed
31 32 33 34 35 36 37
                                            "<p>The height of dendrogram branches indicates how well clusters are separated.</p>",
                                            "<p>The silhouette plot displays how close each time series in one cluster ", 
                                            "is to time series in the neighboring clusters. ",
                                            "A large positive silhouette (Si) indicates time series that are well clustered.",
                                            "A negative Si indicates time series that are closer to ",
                                            "a neighboring cluster, and are placed in the wrong cluster.</p>")
                    )
dmattek's avatar
dmattek committed
38 39 40 41 42 43 44


# UI ----
clustValidUI <- function(id, label = "Validation") {
  ns <- NS(id)
  
  tagList(
dmattek's avatar
dmattek committed
45 46 47 48 49 50
    h4(
      "Cluster validation using ",
      a("factoextra", 
        href="https://cran.r-project.org/web/packages/factoextra/",
        title="External link")
    ),
dmattek's avatar
dmattek committed
51 52 53 54
    actionLink(ns("alLearnMore"), "Learn more"),
    br(),
    br(),
    fluidRow(
dmattek's avatar
dmattek committed
55

56
      column(4,
dmattek's avatar
dmattek committed
57 58 59 60 61 62
             selectInput(
               ns("selectDiss"),
               label = ("Dissimilarity measure"),
               choices = list("Euclidean" = "euclidean",
                              "Manhattan" = "manhattan",
                              "Maximum"   = "maximum",
63 64
                              "Canberra"  = "canberra"),
               selected = "euclidean"
dmattek's avatar
dmattek committed
65
             ),
66
             bsAlert("alertAnchorClValidNAsPresent")
dmattek's avatar
dmattek committed
67
             ),
68
      column(4,
dmattek's avatar
dmattek committed
69 70 71 72 73 74 75 76 77 78 79 80
             selectInput(
               ns("selectLinkage"),
               label = ("Linkage method"),
               choices = list(
                 "Average"  = "average",
                 "Complete" = "complete",
                 "Single"   = "single",
                 "Centroid" = "centroid",
                 "Ward"     = "ward.D",
                 "Ward D2"  = "ward.D2",
                 "McQuitty" = "mcquitty"
               ),
81
               selected = "average"
dmattek's avatar
dmattek committed
82 83 84 85 86 87 88 89
               )
             )
    ),
    
    br(),
    tabsetPanel(
      tabPanel("Relative",
               br(),
dmattek's avatar
dmattek committed
90 91
               p("Determine and visualise the optimal number of clusters. ",
                 actionLink(ns("alLearnMoreRel"), "Learn more")),
dmattek's avatar
dmattek committed
92 93 94 95 96 97 98
               fluidRow(
                 column(2, 
                        actionButton(ns('butPlotRel'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidMaxClust'),
dmattek's avatar
dmattek committed
99
                          'Maximum number of clusters to consider',
dmattek's avatar
dmattek committed
100 101 102 103 104 105 106 107 108 109 110 111
                          min = 2,
                          max = 20,
                          value = 10,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
               withSpinner(plotOutput(ns('outPlotSilhAvg'))),
               br(),
dmattek's avatar
dmattek committed
112
               withSpinner(plotOutput(ns('outPlotWss')))
dmattek's avatar
dmattek committed
113 114 115 116
               
      ),
      tabPanel("Internal",
               br(),
dmattek's avatar
dmattek committed
117 118
               p("Validate a given data partitioning. ",
                 actionLink(ns("alLearnMoreInt"), "Learn more")),
dmattek's avatar
dmattek committed
119 120 121 122 123 124 125
               fluidRow(
                 column(2,
                        actionButton(ns('butPlotInt'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidNclust'),
126
                          'Number of clusters to evaluate',
dmattek's avatar
dmattek committed
127 128 129 130 131 132 133 134 135 136
                          min = 2,
                          max = 20,
                          value = 1,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
137
               withSpinner(plotOutput(ns('outPlotClPCA'))),
dmattek's avatar
dmattek committed
138
               br(),
139
               withSpinner(plotOutput(ns('outPlotTree'))),
dmattek's avatar
dmattek committed
140 141
               br(),
               withSpinner(plotOutput(ns('outPlotSilhForCut')))
dmattek's avatar
dmattek committed
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
      )
    )
  )
}

# SERVER ----
clustValid <- function(input, output, session, in.data4clust) {

  ns = session$ns
  
  # calculate distance matrix for further clustering
  # time series arranged in rows with columns corresponding to time points
  userFitDistHier <- reactive({
    cat(file = stderr(), 'clustValid:userFitDistHier \n')
    
    loc.dm = in.data4clust()
    
    if (is.null(loc.dm)) {
      return(NULL)
    }
    
    # Throw some warnings if NAs present in the dataset.
    # DTW cannot compute distance when NA's are preset.
    # Other distance measures can be calculated but caution is required with interpretation.
166
    print(sum(is.na(loc.dm)))
dmattek's avatar
dmattek committed
167
    if(sum(is.na(loc.dm)) > 0) {
168 169
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", title = "Warning",
                    content = helpText.clValid[["alertClValidNAsPresent"]], 
dmattek's avatar
dmattek committed
170 171 172
                    append = FALSE, 
                    style = "warning")
    } else {
173
      closeAlert(session, 'alertClValidNAsPresent')
dmattek's avatar
dmattek committed
174 175 176 177 178 179 180 181 182 183 184
    }
    
    # calculate distance matrix
    
    return(dist(loc.dm, method = input$selectPlotHierDiss))
  })
  
  
  calcDendCut = reactive({
    cat(file = stderr(), 'clustValid:calcDendCut \n')
    
185
    loc.dm = returnDMwithChecks()
dmattek's avatar
dmattek committed
186
    
187
    if (is.null(loc.dm)) {
dmattek's avatar
dmattek committed
188 189 190
      return(NULL)
    }
    
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
    return(factoextra::eclust(x = loc.dm, 
                              FUNcluster = "hclust",
                              k = input$slClValidNclust, 
                              hc_method = input$selectLinkage, 
                              hc_metric = input$selectDiss,
                              graph = FALSE))
  })
  
  # Return a matrix with time series in wide format
  # If data contains NAs (from explicit NAs or due to missing time points, 
  # or due to missing time points after outlier removal),
  # some warnings are thrown. E.g. DTW cannot caluclate distance if NAs are present.
  returnDMwithChecks = reactive({
    cat(file = stderr(), 'clustValid:returnDMwithChecks \n')
    
    loc.dm = in.data4clust()
    
    if (is.null(loc.dm)) {
      return(NULL)
    }
    
    # Throw some warnings if NAs present in the dataset.
    # DTW cannot compute distance when NA's are preset.
    # Other distance measures can be calculated but caution is required with interpretation.
    print(sum(is.na(loc.dm)))
    
    if(sum(is.na(loc.dm)) > 0) {
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", 
                    title = "Warning",
                    content = helpText.clValid[["alertClValidNAsPresent"]], 
                    append = FALSE, 
                    style = "warning")
    } else {
      closeAlert(session, 'alertClValidNAsPresent')
    }
    
    return(loc.dm)
dmattek's avatar
dmattek committed
228 229
  })
  
dmattek's avatar
dmattek committed
230
  # Plotting ----
dmattek's avatar
dmattek committed
231 232 233 234 235 236 237
  # Function instead of reactive as per:
  # http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
  # This function is used to plot and to downoad a pdf
  
  # plot average silhouette
  plotSilhAvg <- function() {

238 239 240 241 242 243
    locBut = input$butPlotRel
    if (locBut == 0) {
      cat(file = stderr(), 'plotSilhAvg: Go button not pressed\n')
      
      return(NULL)
    }
dmattek's avatar
dmattek committed
244
    
245 246
    loc.dm = returnDMwithChecks()
    if (is.null(loc.dm)) {
dmattek's avatar
dmattek committed
247 248 249
      return(NULL)
    }
    
250 251 252 253 254 255 256 257 258
    loc.p = factoextra::fviz_nbclust(loc.dm,
                                     hcut, 
                                     method = "silhouette",
                                     k.max = input$slClValidMaxClust,
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Average silhouette width") +
      ggtitle("Optimal number of clusters from silhouette analysis") +
dmattek's avatar
dmattek committed
259 260 261 262 263
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
264 265 266 267 268 269
    return(loc.p)
  }

  # plot Ws
  plotWss <- function() {
    
270 271 272 273 274 275
    locBut = input$butPlotRel
    if (locBut == 0) {
      cat(file = stderr(), 'plotWss: Go button not pressed\n')
      
      return(NULL)
    }
dmattek's avatar
dmattek committed
276
    
277 278
    loc.dm = returnDMwithChecks()
    if (is.null(loc.dm)) {
dmattek's avatar
dmattek committed
279 280 281
      return(NULL)
    }
    
282 283 284 285 286 287 288 289 290
    loc.p = factoextra::fviz_nbclust(loc.dm,
                                     hcut, 
                                     method = "wss",
                                     k.max = input$slClValidMaxClust,
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Total within cluster sum of squares") +
      ggtitle("Within cluster sum of squares for different cluster numbers") +
dmattek's avatar
dmattek committed
291 292 293 294 295
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
296 297 298 299 300 301 302
    
    return(loc.p)
  }

  # plot dendrogram tree
  plotTree <- function() {
    
303 304 305 306 307 308
    locBut = input$butPlotInt
    if (locBut == 0) {
      cat(file = stderr(), 'plotTree: Go button not pressed\n')
      
      return(NULL)
    }
dmattek's avatar
dmattek committed
309
    
310
    loc.part = calcDendCut()
dmattek's avatar
dmattek committed
311 312 313 314
    if (is.null(loc.part)) {
      return(NULL)
    }
    
315
    loc.p = factoextra::fviz_dend(loc.part, 
dmattek's avatar
dmattek committed
316 317
                                  show_labels = F,
                                  rect = T,
318 319
                                  xlab = "Time series", 
                                  main = "Dendrogram") +
dmattek's avatar
dmattek committed
320 321 322 323 324 325 326 327 328 329 330 331 332
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
    
    return(loc.p)
  }
  
  
  # PCA visualization of partitioning methods 
  plotClPCA <- function() {
    
333 334 335 336 337 338
    locBut = input$butPlotInt
    if (locBut == 0) {
      cat(file = stderr(), 'plotClPCA: Go button not pressed\n')
      
      return(NULL)
    }
dmattek's avatar
dmattek committed
339
    
340
    loc.part = calcDendCut()
dmattek's avatar
dmattek committed
341
    if (is.null(loc.part)) {
dmattek's avatar
dmattek committed
342 343 344
      return(NULL)
    }
    
345 346 347 348 349
    loc.p = factoextra::fviz_cluster(loc.part, 
                                     geom = "point",
                                     elipse.type = "norm", 
                                     main = "Principle components"
                                     )
dmattek's avatar
dmattek committed
350 351 352 353 354 355 356
    
    return(loc.p)
  }
  
  # plot silhouetts for a particular dendrogram cut
  plotSilhForCut <- function() {
    
357 358 359 360 361 362
    locBut = input$butPlotInt
    if (locBut == 0) {
      cat(file = stderr(), 'plotSilhForCut: Go button not pressed\n')
      
      return(NULL)
    }
dmattek's avatar
dmattek committed
363
    
364
    loc.part = calcDendCut()
dmattek's avatar
dmattek committed
365
    if (is.null(loc.part)) {
dmattek's avatar
dmattek committed
366 367 368
      return(NULL)
    }
    
369 370 371
    loc.p = factoextra::fviz_silhouette(loc.part, 
                                        print.summary = FALSE, 
                                        main = "Silhouette") +
dmattek's avatar
dmattek committed
372 373 374 375 376 377 378
      xlab("Time series") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND) +
      theme(axis.text.x = element_blank())
dmattek's avatar
dmattek committed
379 380 381 382
    
    return(loc.p)
  }
  
dmattek's avatar
dmattek committed
383
  # Plot rendering ----
dmattek's avatar
dmattek committed
384 385
  # Display silhouette
  output$outPlotSilhAvg <- renderPlot({
386 387
    loc.p = plotSilhAvg()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
388 389
      return(NULL)
    
390
    return(loc.p)
dmattek's avatar
dmattek committed
391 392 393 394 395
  })

  
  # Display wss
  output$outPlotWss <- renderPlot({
396 397 398
    loc.p = plotWss()
    if(is.null(loc.p))
      return(NULL)
dmattek's avatar
dmattek committed
399
    
400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
    return(loc.p)
  })
  
  # Display PCA of clustering
  output$outPlotClPCA <- renderPlot({
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotClPCA()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
415 416
      return(NULL)
    
417
    return(loc.p)
dmattek's avatar
dmattek committed
418 419 420 421
  })
  
  # Display tree
  output$outPlotTree <- renderPlot({
422 423 424 425 426 427 428 429 430 431
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotTree()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
432 433
      return(NULL)
    
434
    return(loc.p)
dmattek's avatar
dmattek committed
435 436 437 438
  })
  
  # Display silhouette for a dendrogram cut
  output$outPlotSilhForCut <- renderPlot({
439 440 441 442 443 444 445 446 447 448
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotSilhForCut()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
449 450
      return(NULL)
    
451
    return(loc.p)
dmattek's avatar
dmattek committed
452 453 454 455 456 457 458 459
  })
  
  # Pop-overs ----
  addPopover(session, 
             ns("alLearnMore"),
             title = "Classes of cluster validation",
             content = helpText.clValid[["alLearnMore"]],
             trigger = "click")
dmattek's avatar
dmattek committed
460 461 462 463 464 465 466 467 468 469 470 471
  
  addPopover(session, 
             ns("alLearnMoreRel"),
             title = "Relative validation",
             content = helpText.clValid[["alLearnMoreRel"]],
             trigger = "click")
  
  addPopover(session, 
             ns("alLearnMoreInt"),
             title = "Internal validation",
             content = helpText.clValid[["alLearnMoreInt"]],
             trigger = "click")
dmattek's avatar
dmattek committed
472 473 474
}