tabClValid.R 17.7 KB
Newer Older
dmattek's avatar
dmattek committed
1
2
3
4
5
6
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This module is a tab for hierarchical clustering (base R hclust + dist)

7
helpText.clValid = c(alertClValidNAsPresent = paste0("NAs present. The selected distance measure will work, ",
8
9
                                              "however PCA will not be avaliable."),
                     alertClValidNAsPresentDTW = paste0("NAs present. DTW distance measure will NOT work."),
dmattek's avatar
dmattek committed
10
11
                    alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ",
                                         "is an <b>unsupervised</b> machine learning method for partitioning ",
dmattek's avatar
dmattek committed
12
13
14
15
                                         "dataset into a set of groups or clusters. The procedure will return clusters ",
                                         "even if the data <b>does not</b> contain any! ",
                                         "Therefore, it’s necessary to ",
                                         "assess clustering tendency before the analysis, and ",
dmattek's avatar
dmattek committed
16
17
18
19
20
21
22
23
24
25
                                         "validate the quality of the result after clustering.<p>"
                                         ),
                    alLearnMoreRel = paste0("<p>Determine the optimal number of clusters by inspecting ",
                                            "the average silhouette width and the total within cluster sum of squares (WSS) ",
                                            "for a range of cluster numbers.</p>", 
                                            "<p><b>Silhouette analysis</b> estimates the average distance between clusters. ",
                                            "Larger silhouette widths indicate better.<p>",
                                            "<p><b>WSS</b> evaluates the compactness of clusters. ",
                                            "Compact clusters achieve low WSS values. ",
                                            "Look for the <i>knee</i> in the plot of WSS as function of cluster numbers.</p>"),
26
27
28
29
30
31
                    alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting ",
                                            "principle components, the dendrogram, ",
                                            "and the silhouette for a given number of clusters.</p>",
                                            "<p>Each point in the scatter plot of 2 principle components corresponds to a single time series. ",
                                            "Points are coloured by cluster numbers. Compact, well separated clusters ",
                                            "indicate good partitioning.</p>",
dmattek's avatar
dmattek committed
32
33
34
35
36
37
38
                                            "<p>The height of dendrogram branches indicates how well clusters are separated.</p>",
                                            "<p>The silhouette plot displays how close each time series in one cluster ", 
                                            "is to time series in the neighboring clusters. ",
                                            "A large positive silhouette (Si) indicates time series that are well clustered.",
                                            "A negative Si indicates time series that are closer to ",
                                            "a neighboring cluster, and are placed in the wrong cluster.</p>")
                    )
dmattek's avatar
dmattek committed
39
40
41
42
43
44
45


# UI ----
clustValidUI <- function(id, label = "Validation") {
  ns <- NS(id)
  
  tagList(
dmattek's avatar
dmattek committed
46
47
48
49
50
51
    h4(
      "Cluster validation using ",
      a("factoextra", 
        href="https://cran.r-project.org/web/packages/factoextra/",
        title="External link")
    ),
dmattek's avatar
dmattek committed
52
53
54
55
    actionLink(ns("alLearnMore"), "Learn more"),
    br(),
    br(),
    fluidRow(
dmattek's avatar
dmattek committed
56

57
      column(4,
dmattek's avatar
dmattek committed
58
59
60
61
62
63
             selectInput(
               ns("selectDiss"),
               label = ("Dissimilarity measure"),
               choices = list("Euclidean" = "euclidean",
                              "Manhattan" = "manhattan",
                              "Maximum"   = "maximum",
64
65
                              "Canberra"  = "canberra",
                              "DTW"       = "DTW"),
66
               selected = "euclidean"
dmattek's avatar
dmattek committed
67
             ),
68
             bsAlert("alertAnchorClValidNAsPresent")
dmattek's avatar
dmattek committed
69
             ),
70
      column(4,
dmattek's avatar
dmattek committed
71
72
73
74
75
76
77
78
79
80
81
82
             selectInput(
               ns("selectLinkage"),
               label = ("Linkage method"),
               choices = list(
                 "Average"  = "average",
                 "Complete" = "complete",
                 "Single"   = "single",
                 "Centroid" = "centroid",
                 "Ward"     = "ward.D",
                 "Ward D2"  = "ward.D2",
                 "McQuitty" = "mcquitty"
               ),
83
               selected = "average"
dmattek's avatar
dmattek committed
84
85
86
87
88
89
90
91
               )
             )
    ),
    
    br(),
    tabsetPanel(
      tabPanel("Relative",
               br(),
dmattek's avatar
dmattek committed
92
93
               p("Determine and visualise the optimal number of clusters. ",
                 actionLink(ns("alLearnMoreRel"), "Learn more")),
dmattek's avatar
dmattek committed
94
95
96
97
98
99
100
               fluidRow(
                 column(2, 
                        actionButton(ns('butPlotRel'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidMaxClust'),
dmattek's avatar
dmattek committed
101
                          'Maximum number of clusters to consider',
dmattek's avatar
dmattek committed
102
103
104
105
106
107
108
109
110
111
112
113
                          min = 2,
                          max = 20,
                          value = 10,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
               withSpinner(plotOutput(ns('outPlotSilhAvg'))),
               br(),
dmattek's avatar
dmattek committed
114
               withSpinner(plotOutput(ns('outPlotWss')))
dmattek's avatar
dmattek committed
115
116
117
118
               
      ),
      tabPanel("Internal",
               br(),
dmattek's avatar
dmattek committed
119
120
               p("Validate a given data partitioning. ",
                 actionLink(ns("alLearnMoreInt"), "Learn more")),
dmattek's avatar
dmattek committed
121
122
123
124
125
126
127
               fluidRow(
                 column(2,
                        actionButton(ns('butPlotInt'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidNclust'),
128
                          'Number of clusters to evaluate',
dmattek's avatar
dmattek committed
129
130
131
132
133
134
135
136
137
138
                          min = 2,
                          max = 20,
                          value = 1,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
139
               withSpinner(plotOutput(ns('outPlotTree'))),
dmattek's avatar
dmattek committed
140
               br(),
141
142
143
               withSpinner(plotOutput(ns('outPlotSilhForCut'))),
               br(),
               withSpinner(plotOutput(ns('outPlotClPCA')))
dmattek's avatar
dmattek committed
144
145
146
147
148
149
      )
    )
  )
}

# SERVER ----
150
clustValid <- function(input, output, session, in.dataWide) {
dmattek's avatar
dmattek committed
151
152
153

  ns = session$ns
  
154
155
156
157
158
159
160
161
162
163
164
165
  # Return the number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnNclust = reactive({
    return(input$slClValidNclust)
  }) %>% debounce(MILLIS)
  
  # Return max number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnMaxNclust = reactive({
    return(input$slClValidMaxClust)
  }) %>% debounce(MILLIS)
  
dmattek's avatar
dmattek committed
166
167
  # calculate distance matrix for further clustering
  # time series arranged in rows with columns corresponding to time points
168
169
  calcDist <- reactive({
    cat(file = stderr(), 'clustValid:calcDist \n')
dmattek's avatar
dmattek committed
170
    
171
    loc.dm = in.dataWide()
dmattek's avatar
dmattek committed
172
173
174
175
176
177
178
179
    
    if (is.null(loc.dm)) {
      return(NULL)
    }
    
    # Throw some warnings if NAs present in the dataset.
    # DTW cannot compute distance when NA's are preset.
    # Other distance measures can be calculated but caution is required with interpretation.
180
181
    # NAs in the wide format can result from explicit NAs in the measurment column or
    # from missing rows that cause NAs to appear when convertinf from long to wide (dcast)
dmattek's avatar
dmattek committed
182
    if(sum(is.na(loc.dm)) > 0) {
183
184
185
186
187
188
189
190
191
192
      if (input$selectDiss == "DTW") {
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresentDTW", title = "Error",
                    content = helpText.clValid[["alertClValidNAsPresentDTW"]], 
                    append = FALSE,
                    style = "danger")
        closeAlert(session, 'alertClValidNAsPresent')
        
        return(NULL)
        
      } else {
193
194
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", title = "Warning",
                    content = helpText.clValid[["alertClValidNAsPresent"]], 
dmattek's avatar
dmattek committed
195
196
                    append = FALSE, 
                    style = "warning")
197
198
        closeAlert(session, 'alertClValidNAsPresentDTW')
      }
dmattek's avatar
dmattek committed
199
    } else {
200
      closeAlert(session, 'alertClValidNAsPresentDTW')
201
      closeAlert(session, 'alertClValidNAsPresent')
202
    }    
dmattek's avatar
dmattek committed
203
204
    
    
205
206
    # calculate distance matrix
    return(proxy::dist(loc.dm, method = input$selectDiss))
dmattek's avatar
dmattek committed
207
208
  })
  
209
  # calculate dendrogram for a chosen number of clusters and the linkage method
dmattek's avatar
dmattek committed
210
211
212
  calcDendCut = reactive({
    cat(file = stderr(), 'clustValid:calcDendCut \n')
    
213
    loc.dist = calcDist()
dmattek's avatar
dmattek committed
214
    
215
    if (is.null(loc.dist)) {
dmattek's avatar
dmattek committed
216
217
218
      return(NULL)
    }
    
219
220
    return(factoextra::hcut(x = loc.dist,
                            k = returnNclust(),
221
222
223
224
225
                              FUNcluster = "hclust",
                              hc_method = input$selectLinkage, 
                              graph = FALSE))
  })
  
dmattek's avatar
dmattek committed
226
  # Plotting ----
dmattek's avatar
dmattek committed
227
228
229
230
231
232
  # Function instead of reactive as per:
  # http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
  # This function is used to plot and to downoad a pdf
  
  # plot average silhouette
  plotSilhAvg <- function() {
233
    cat(file = stderr(), 'plotSilhAvg: in\n')
dmattek's avatar
dmattek committed
234
    
235
236
237
238
239
240
241
242
243
244
245
246
    # make the f-n dependent on the button click
    locBut = input$butPlotRel

    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
    validate(
      need(!is.null(loc.dist), "Nothing to plot. Load data first!")
    )    

    loc.p = LOCnbclust(loc.dist,
247
                                     method = "silhouette",
248
                                     k.max = returnMaxNclust(),
249
250
251
252
253
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Average silhouette width") +
      ggtitle("Optimal number of clusters from silhouette analysis") +
dmattek's avatar
dmattek committed
254
255
256
257
258
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
259
260
261
262
263
    return(loc.p)
  }

  # plot Ws
  plotWss <- function() {
264
    cat(file = stderr(), 'plotWss: in\n')
dmattek's avatar
dmattek committed
265
    
266
    # make the f-n dependent on the button click
267
    locBut = input$butPlotRel
dmattek's avatar
dmattek committed
268
    
269
270
271
272
273
274
275
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
    validate(
      need(!is.null(loc.dist), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
276
    
277
    loc.p = LOCnbclust(loc.dist,
278
                                     method = "wss",
279
                                     k.max = returnMaxNclust(),
280
281
282
283
284
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Total within cluster sum of squares") +
      ggtitle("Within cluster sum of squares for different cluster numbers") +
dmattek's avatar
dmattek committed
285
286
287
288
289
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
290
291
292
293
294
295
    
    return(loc.p)
  }

  # plot dendrogram tree
  plotTree <- function() {
296
    cat(file = stderr(), 'plotTree: in\n')
dmattek's avatar
dmattek committed
297
    
298
    # make the f-n dependent on the button click
299
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
300
    
301
302
303
304
305
306
307
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.part = isolate(calcDendCut())
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
308
    
309
    loc.p = factoextra::fviz_dend(loc.part, 
dmattek's avatar
dmattek committed
310
311
                                  show_labels = F,
                                  rect = T,
312
313
                                  xlab = "Time series", 
                                  main = "Dendrogram") +
dmattek's avatar
dmattek committed
314
315
316
317
318
319
320
321
322
323
324
325
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
    
    return(loc.p)
  }
  
  
  # PCA visualization of partitioning methods 
  plotClPCA <- function() {
326
    cat(file = stderr(), 'plotTree: in\n')
dmattek's avatar
dmattek committed
327
    
328
    # make the f-n dependent on the button click
329
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
330
    
331
332
333
334
335
336
337
338
339
340
341
342
343
344
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.part = isolate(calcDendCut())
    loc.dm = in.dataWide()
    print(sum(is.na(loc.dm)))
    
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
      need(!is.null(loc.dm),   "Nothing to plot. Load data first!"),
      need(sum(is.na(loc.dm)), "Cannot calculate PCA in the presence of missing data and/or NAs.")
    )    
    
    if (sum(is.na(loc.dm)) > 0)
dmattek's avatar
dmattek committed
345
346
      return(NULL)
    
347
    loc.p = factoextra::fviz_cluster(loc.part, 
348
                                     data = loc.dm,
349
                                     geom = "point",
350
                                     elipse.type = "convex", 
351
                                     main = "Principle components"
352
353
354
355
356
357
                                     )+
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
358
359
360
361
362
363
    
    return(loc.p)
  }
  
  # plot silhouetts for a particular dendrogram cut
  plotSilhForCut <- function() {
364
    cat(file = stderr(), 'plotSilhForCut: in\n')
dmattek's avatar
dmattek committed
365
    
366
    # make the f-n dependent on the button click
367
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
368
    
369
370
371
372
373
374
375
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.part = isolate(calcDendCut())
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
376
    
377
378
379
    loc.p = factoextra::fviz_silhouette(loc.part, 
                                        print.summary = FALSE, 
                                        main = "Silhouette") +
dmattek's avatar
dmattek committed
380
381
382
383
384
385
386
      xlab("Time series") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND) +
      theme(axis.text.x = element_blank())
dmattek's avatar
dmattek committed
387
388
389
390
    
    return(loc.p)
  }
  
dmattek's avatar
dmattek committed
391
  # Plot rendering ----
dmattek's avatar
dmattek committed
392
393
  # Display silhouette
  output$outPlotSilhAvg <- renderPlot({
394
395
    loc.p = plotSilhAvg()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
396
397
      return(NULL)
    
398
    return(loc.p)
dmattek's avatar
dmattek committed
399
400
401
402
403
  })

  
  # Display wss
  output$outPlotWss <- renderPlot({
404
405
406
    loc.p = plotWss()
    if(is.null(loc.p))
      return(NULL)
dmattek's avatar
dmattek committed
407
    
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
    return(loc.p)
  })
  
  # Display PCA of clustering
  output$outPlotClPCA <- renderPlot({
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotClPCA()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
423
424
      return(NULL)
    
425
    return(loc.p)
dmattek's avatar
dmattek committed
426
427
428
429
  })
  
  # Display tree
  output$outPlotTree <- renderPlot({
430
431
432
433
434
435
436
437
438
439
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotTree()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
440
441
      return(NULL)
    
442
    return(loc.p)
dmattek's avatar
dmattek committed
443
444
445
446
  })
  
  # Display silhouette for a dendrogram cut
  output$outPlotSilhForCut <- renderPlot({
447
448
449
450
451
452
453
454
455
456
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotSilhForCut()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
457
458
      return(NULL)
    
459
    return(loc.p)
dmattek's avatar
dmattek committed
460
461
462
463
464
465
466
467
  })
  
  # Pop-overs ----
  addPopover(session, 
             ns("alLearnMore"),
             title = "Classes of cluster validation",
             content = helpText.clValid[["alLearnMore"]],
             trigger = "click")
dmattek's avatar
dmattek committed
468
469
470
471
472
473
474
475
476
477
478
479
  
  addPopover(session, 
             ns("alLearnMoreRel"),
             title = "Relative validation",
             content = helpText.clValid[["alLearnMoreRel"]],
             trigger = "click")
  
  addPopover(session, 
             ns("alLearnMoreInt"),
             title = "Internal validation",
             content = helpText.clValid[["alLearnMoreInt"]],
             trigger = "click")
dmattek's avatar
dmattek committed
480
481
482
}