tabClValid.R 19.3 KB
Newer Older
dmattek's avatar
dmattek committed
1
2
3
4
5
6
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This module is a tab for hierarchical clustering (base R hclust + dist)

7
helpText.clValid = c(alertClValidNAsPresent = paste0("NAs present. The selected distance measure will work, ",
8
9
                                              "however PCA will not be avaliable."),
                     alertClValidNAsPresentDTW = paste0("NAs present. DTW distance measure will NOT work."),
dmattek's avatar
dmattek committed
10
11
                    alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ",
                                         "is an <b>unsupervised</b> machine learning method for partitioning ",
12
                                         "dataset into a set of groups called clusters. The procedure will return clusters ",
dmattek's avatar
dmattek committed
13
14
15
                                         "even if the data <b>does not</b> contain any! ",
                                         "Therefore, it’s necessary to ",
                                         "assess clustering tendency before the analysis, and ",
dmattek's avatar
dmattek committed
16
17
18
19
20
                                         "validate the quality of the result after clustering.<p>"
                                         ),
                    alLearnMoreRel = paste0("<p>Determine the optimal number of clusters by inspecting ",
                                            "the average silhouette width and the total within cluster sum of squares (WSS) ",
                                            "for a range of cluster numbers.</p>", 
21
22
23
24
                                            "<p><b>Silhouette analysis</b> first computes how close each trajectory is with others in the cluster it is assigned to, ",
                                            "this is then compared to closeness with trajectories in other clusters. ",
                                            "Larger average silhouette widths usually indicate better clustering. To make sure averaging does not hide a locally bad",
                                            "clustering, this should be inspected along with the silhouette plot in the \"Internal\" tab.<p>",
dmattek's avatar
dmattek committed
25
26
                                            "<p><b>WSS</b> evaluates the compactness of clusters. ",
                                            "Compact clusters achieve low WSS values. ",
27
                                            "Look for the <i>elbow</i> in the plot of WSS as function of cluster numbers.</p>"),
28
                    alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting ",
29
                                            "principal components, the dendrogram, ",
30
                                            "and the silhouette for a given number of clusters.</p>",
31
                                            "<p><b>Principal components:</b> Each point in the scatter plot corresponds to a single time series in the first 2 PCs space. ",
32
                                            "Points are coloured by cluster numbers. Compact, well separated clusters ",
33
34
35
36
37
38
39
                                            "indicate good partitioning. The percentage of total variance carried by each PC is indicated.</p>",
                                            "<p><b>Dendrogram:</b> The height of branches indicates how well clusters are separated.</p>",
                                            "<p><b>Silhouette plot:</b> The plot indicates for each series whether it is on average closer to series within its cluster ",
                                            "or to series in other clusters. Each bar represents the <a href=https://en.wikipedia.org/wiki/Silhouette_(clustering) title=\"External link\">silhouette score</a> ",
                                            "(Si) for one series. The height of the bars varies ",
                                            "between 1 (the series is much closer to series in its cluster) and -1 (the series is much closer to series in an other cluster). ",
                                            "Hence, large positive values of Si are usually associated with better clustering, while negative values are associated with worse clustering.")
dmattek's avatar
dmattek committed
40
                    )
dmattek's avatar
dmattek committed
41
42
43
44
45
46
47


# UI ----
clustValidUI <- function(id, label = "Validation") {
  ns <- NS(id)
  
  tagList(
dmattek's avatar
dmattek committed
48
49
50
51
52
53
    h4(
      "Cluster validation using ",
      a("factoextra", 
        href="https://cran.r-project.org/web/packages/factoextra/",
        title="External link")
    ),
dmattek's avatar
dmattek committed
54
55
56
57
    actionLink(ns("alLearnMore"), "Learn more"),
    br(),
    br(),
    fluidRow(
dmattek's avatar
dmattek committed
58

59
      column(4,
dmattek's avatar
dmattek committed
60
61
62
63
64
65
             selectInput(
               ns("selectDiss"),
               label = ("Dissimilarity measure"),
               choices = list("Euclidean" = "euclidean",
                              "Manhattan" = "manhattan",
                              "Maximum"   = "maximum",
66
67
                              "Canberra"  = "canberra",
                              "DTW"       = "DTW"),
68
               selected = "euclidean"
dmattek's avatar
dmattek committed
69
             ),
70
             bsAlert("alertAnchorClValidNAsPresent")
dmattek's avatar
dmattek committed
71
             ),
72
      column(4,
dmattek's avatar
dmattek committed
73
74
75
76
77
78
79
80
81
82
83
84
             selectInput(
               ns("selectLinkage"),
               label = ("Linkage method"),
               choices = list(
                 "Average"  = "average",
                 "Complete" = "complete",
                 "Single"   = "single",
                 "Centroid" = "centroid",
                 "Ward"     = "ward.D",
                 "Ward D2"  = "ward.D2",
                 "McQuitty" = "mcquitty"
               ),
85
               selected = "average"
dmattek's avatar
dmattek committed
86
87
88
89
90
91
92
93
               )
             )
    ),
    
    br(),
    tabsetPanel(
      tabPanel("Relative",
               br(),
dmattek's avatar
dmattek committed
94
95
               p("Determine and visualise the optimal number of clusters. ",
                 actionLink(ns("alLearnMoreRel"), "Learn more")),
dmattek's avatar
dmattek committed
96
97
98
99
100
101
102
               fluidRow(
                 column(2, 
                        actionButton(ns('butPlotRel'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidMaxClust'),
dmattek's avatar
dmattek committed
103
                          'Maximum number of clusters to consider',
dmattek's avatar
dmattek committed
104
105
106
107
108
109
110
111
112
113
114
115
                          min = 2,
                          max = 20,
                          value = 10,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
               withSpinner(plotOutput(ns('outPlotSilhAvg'))),
               br(),
dmattek's avatar
dmattek committed
116
               withSpinner(plotOutput(ns('outPlotWss')))
dmattek's avatar
dmattek committed
117
118
119
120
               
      ),
      tabPanel("Internal",
               br(),
dmattek's avatar
dmattek committed
121
122
               p("Validate a given data partitioning. ",
                 actionLink(ns("alLearnMoreInt"), "Learn more")),
dmattek's avatar
dmattek committed
123
124
125
126
127
128
129
               fluidRow(
                 column(2,
                        actionButton(ns('butPlotInt'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidNclust'),
130
                          'Number of clusters to evaluate',
dmattek's avatar
dmattek committed
131
132
133
134
135
136
137
138
139
140
                          min = 2,
                          max = 20,
                          value = 1,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
141
               withSpinner(plotOutput(ns('outPlotTree'))),
dmattek's avatar
dmattek committed
142
               br(),
143
144
145
               withSpinner(plotOutput(ns('outPlotSilhForCut'))),
               br(),
               withSpinner(plotOutput(ns('outPlotClPCA')))
dmattek's avatar
dmattek committed
146
147
148
149
150
151
      )
    )
  )
}

# SERVER ----
152
clustValid <- function(input, output, session, in.dataWide) {
dmattek's avatar
dmattek committed
153
154
155

  ns = session$ns
  
156
157
158
159
160
161
162
163
164
165
166
167
  # Return the number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnNclust = reactive({
    return(input$slClValidNclust)
  }) %>% debounce(MILLIS)
  
  # Return max number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnMaxNclust = reactive({
    return(input$slClValidMaxClust)
  }) %>% debounce(MILLIS)
  
dmattek's avatar
dmattek committed
168
169
  # calculate distance matrix for further clustering
  # time series arranged in rows with columns corresponding to time points
170
171
  calcDist <- reactive({
    cat(file = stderr(), 'clustValid:calcDist \n')
dmattek's avatar
dmattek committed
172
    
173
    loc.dm = in.dataWide()
dmattek's avatar
dmattek committed
174
175
176
177
178
179
180
181
    
    if (is.null(loc.dm)) {
      return(NULL)
    }
    
    # Throw some warnings if NAs present in the dataset.
    # DTW cannot compute distance when NA's are preset.
    # Other distance measures can be calculated but caution is required with interpretation.
182
183
    # NAs in the wide format can result from explicit NAs in the measurment column or
    # from missing rows that cause NAs to appear when convertinf from long to wide (dcast)
dmattek's avatar
dmattek committed
184
    if(sum(is.na(loc.dm)) > 0) {
185
186
187
188
189
190
191
192
193
194
      if (input$selectDiss == "DTW") {
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresentDTW", title = "Error",
                    content = helpText.clValid[["alertClValidNAsPresentDTW"]], 
                    append = FALSE,
                    style = "danger")
        closeAlert(session, 'alertClValidNAsPresent')
        
        return(NULL)
        
      } else {
195
196
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", title = "Warning",
                    content = helpText.clValid[["alertClValidNAsPresent"]], 
dmattek's avatar
dmattek committed
197
198
                    append = FALSE, 
                    style = "warning")
199
200
        closeAlert(session, 'alertClValidNAsPresentDTW')
      }
dmattek's avatar
dmattek committed
201
    } else {
202
      closeAlert(session, 'alertClValidNAsPresentDTW')
203
      closeAlert(session, 'alertClValidNAsPresent')
204
    }    
dmattek's avatar
dmattek committed
205
206
    
    
207
208
    # calculate distance matrix
    return(proxy::dist(loc.dm, method = input$selectDiss))
dmattek's avatar
dmattek committed
209
210
  })
  
211
  # calculate dendrogram for a chosen number of clusters and the linkage method
dmattek's avatar
dmattek committed
212
213
214
  calcDendCut = reactive({
    cat(file = stderr(), 'clustValid:calcDendCut \n')
    
215
    loc.dist = calcDist()
dmattek's avatar
dmattek committed
216
    
217
    if (is.null(loc.dist)) {
dmattek's avatar
dmattek committed
218
219
220
      return(NULL)
    }
    
221
222
223
224
225
226
    return(LOChcut(x = loc.dist,
                   k = returnNclust(),
                   hc_func = "hclust",
                   hc_method = input$selectLinkage,
                   hc_metric = input$selectDiss
                   ))    
227
228
  })
  
dmattek's avatar
dmattek committed
229
  # Plotting ----
dmattek's avatar
dmattek committed
230
231
232
233
234
235
  # Function instead of reactive as per:
  # http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
  # This function is used to plot and to downoad a pdf
  
  # plot average silhouette
  plotSilhAvg <- function() {
236
    cat(file = stderr(), 'plotSilhAvg: in\n')
dmattek's avatar
dmattek committed
237
    
238
239
240
241
242
243
244
245
246
247
248
249
    # make the f-n dependent on the button click
    locBut = input$butPlotRel

    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
    validate(
      need(!is.null(loc.dist), "Nothing to plot. Load data first!")
    )    

    loc.p = LOCnbclust(loc.dist,
250
                                     method = "silhouette",
251
                                     k.max = returnMaxNclust(),
252
253
254
255
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Average silhouette width") +
256
      ggtitle("Average silhouette width for different cluster numbers") +
dmattek's avatar
dmattek committed
257
258
259
260
261
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
262
263
264
265
266
    return(loc.p)
  }

  # plot Ws
  plotWss <- function() {
267
    cat(file = stderr(), 'plotWss: in\n')
dmattek's avatar
dmattek committed
268
    
269
    # make the f-n dependent on the button click
270
    locBut = input$butPlotRel
dmattek's avatar
dmattek committed
271
    
272
273
274
275
276
277
278
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
    validate(
      need(!is.null(loc.dist), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
279
    
280
    loc.p = LOCnbclust(loc.dist,
281
                                     method = "wss",
282
                                     k.max = returnMaxNclust(),
283
284
285
286
287
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Total within cluster sum of squares") +
      ggtitle("Within cluster sum of squares for different cluster numbers") +
dmattek's avatar
dmattek committed
288
289
290
291
292
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
293
294
295
    
    return(loc.p)
  }
dmattek's avatar
dmattek committed
296
297
298
  
  # PCA visualization of partitioning methods 
  plotClPCA <- function() {
299
    cat(file = stderr(), 'plotTree: in\n')
dmattek's avatar
dmattek committed
300
    
301
    # make the f-n dependent on the button click
302
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
303
    
304
    # until clicking the Plot button
305
    loc.part = calcDendCut()
306
307
    loc.dm = in.dataWide()
    print(sum(is.na(loc.dm)))
308

309
310
311
312
313
314
315
316
    
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
      need(!is.null(loc.dm),   "Nothing to plot. Load data first!"),
      need(sum(is.na(loc.dm)), "Cannot calculate PCA in the presence of missing data and/or NAs.")
    )    
    
    if (sum(is.na(loc.dm)) > 0)
dmattek's avatar
dmattek committed
317
318
      return(NULL)
    
319
    loc.p = factoextra::fviz_cluster(loc.part, 
320
                                     data = loc.dm,
321
                                     geom = "point",
322
                                     elipse.type = "convex", 
323
                                     main = "Principal components"
324
325
326
327
328
329
                                     )+
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376

    
    # Retrieve association of cluster and colours and use it for dendrogram for color matching between dend, silhouette and PCA plot
    temp = ggplot_build(loc.p)
    map_individual = as.data.table(temp$data[[1]][, c("colour", "shape")])
    map_cluster = map_individual[, .SD[1], by = shape]
    map_cluster[, cluster := 1:nrow(map_cluster)]

    return(list(plot = loc.p, mapping_individual = map_individual, mapping_cluster = map_cluster))
  }
  
  
  # plot dendrogram tree
  plotTree <- function() {
    cat(file = stderr(), 'plotTree: in\n')
    
    # make the f-n dependent on the button click
    locBut = input$butPlotInt
    
    # Check if required data exists
    loc.part = calcDendCut()
    # Rerun the PCA plot to obtain clour mapping of clusters in PCA and silhouette plot and match it with dendrogram colors.
    loc.map = plotClPCA()
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
      need(!is.null(loc.map), "Nothing to plot. Load data first!")
    )    
    
    # Determine cluster order of occurence from left to right in the dendrogram
    # This is necessary because fviz_dend colors clusters from left to right,
    # whereas fviz_silhouette and fviz_cluster use the order of cluster first occurence in the list of individuals.
    loc.mapClus = loc.map$mapping_cluster
    ord.clusDend = unique(loc.part$cluster[loc.part$order])
    col.clusDend = loc.mapClus[, colour][ord.clusDend]
        
    loc.p = factoextra::fviz_dend(loc.part,
                                  k = returnNclust(),
                                  k_colors = col.clusDend,
                                  show_labels = F,
                                  rect = T,
                                  xlab = "Time series",
                                  main = "Dendrogram") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE,
                     in.font.axis.text = PLOTFONTAXISTEXT,
                     in.font.axis.title = PLOTFONTAXISTITLE,
                     in.font.strip = PLOTFONTFACETSTRIP,
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
377
378
379
380
    
    return(loc.p)
  }
  
381
382
  
  # plot silhouettes for a particular dendrogram cut
dmattek's avatar
dmattek committed
383
  plotSilhForCut <- function() {
384
    cat(file = stderr(), 'plotSilhForCut: in\n')
dmattek's avatar
dmattek committed
385
    
386
    # make the f-n dependent on the button click
387
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
388
    
389
    # until clicking the Plot button
390
    loc.part = calcDendCut()
391
392
393
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
394
    
395
396
397
    loc.p = factoextra::fviz_silhouette(loc.part, 
                                        print.summary = FALSE, 
                                        main = "Silhouette") +
dmattek's avatar
dmattek committed
398
399
400
401
402
403
404
      xlab("Time series") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND) +
      theme(axis.text.x = element_blank())
dmattek's avatar
dmattek committed
405
406
407
408
    
    return(loc.p)
  }
  
dmattek's avatar
dmattek committed
409
  # Plot rendering ----
dmattek's avatar
dmattek committed
410
411
  # Display silhouette
  output$outPlotSilhAvg <- renderPlot({
412
413
    loc.p = plotSilhAvg()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
414
415
      return(NULL)
    
416
    return(loc.p)
dmattek's avatar
dmattek committed
417
418
419
420
421
  })

  
  # Display wss
  output$outPlotWss <- renderPlot({
422
423
424
    loc.p = plotWss()
    if(is.null(loc.p))
      return(NULL)
dmattek's avatar
dmattek committed
425
    
426
427
428
429
430
431
432
433
434
435
436
437
438
439
    return(loc.p)
  })
  
  # Display PCA of clustering
  output$outPlotClPCA <- renderPlot({
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotClPCA()
440
    loc.p = loc.p$plot
441
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
442
443
      return(NULL)
    
444
    return(loc.p)
dmattek's avatar
dmattek committed
445
446
447
448
  })
  
  # Display tree
  output$outPlotTree <- renderPlot({
449
450
451
452
453
454
455
456
457
458
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotTree()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
459
460
      return(NULL)
    
461
    return(loc.p)
dmattek's avatar
dmattek committed
462
463
464
465
  })
  
  # Display silhouette for a dendrogram cut
  output$outPlotSilhForCut <- renderPlot({
466
467
468
469
470
471
472
473
474
475
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotSilhForCut()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
476
477
      return(NULL)
    
478
    return(loc.p)
dmattek's avatar
dmattek committed
479
480
481
482
483
484
485
486
  })
  
  # Pop-overs ----
  addPopover(session, 
             ns("alLearnMore"),
             title = "Classes of cluster validation",
             content = helpText.clValid[["alLearnMore"]],
             trigger = "click")
dmattek's avatar
dmattek committed
487
488
489
490
491
492
493
494
495
496
497
498
  
  addPopover(session, 
             ns("alLearnMoreRel"),
             title = "Relative validation",
             content = helpText.clValid[["alLearnMoreRel"]],
             trigger = "click")
  
  addPopover(session, 
             ns("alLearnMoreInt"),
             title = "Internal validation",
             content = helpText.clValid[["alLearnMoreInt"]],
             trigger = "click")
dmattek's avatar
dmattek committed
499
500
501
}