tabClValid.R 20 KB
Newer Older
dmattek's avatar
dmattek committed
1
2
3
4
5
6
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This module is a tab for hierarchical clustering (base R hclust + dist)

7
helpText.clValid = c(alertClValidNAsPresent = paste0("NAs present. The selected distance measure will work, ",
8
9
                                              "however PCA will not be avaliable."),
                     alertClValidNAsPresentDTW = paste0("NAs present. DTW distance measure will NOT work."),
dmattek's avatar
dmattek committed
10
11
                    alLearnMore = paste0("<p><a href=http://www.sthda.com/english/wiki/print.php?id=241 title=\"External link\">Clustering</a> ",
                                         "is an <b>unsupervised</b> machine learning method for partitioning ",
12
                                         "dataset into a set of groups called clusters. The procedure will return clusters ",
dmattek's avatar
dmattek committed
13
14
15
                                         "even if the data <b>does not</b> contain any! ",
                                         "Therefore, it’s necessary to ",
                                         "assess clustering tendency before the analysis, and ",
dmattek's avatar
dmattek committed
16
17
18
19
20
                                         "validate the quality of the result after clustering.<p>"
                                         ),
                    alLearnMoreRel = paste0("<p>Determine the optimal number of clusters by inspecting ",
                                            "the average silhouette width and the total within cluster sum of squares (WSS) ",
                                            "for a range of cluster numbers.</p>", 
21
22
23
24
                                            "<p><b>Silhouette analysis</b> first computes how close each trajectory is with others in the cluster it is assigned to, ",
                                            "this is then compared to closeness with trajectories in other clusters. ",
                                            "Larger average silhouette widths usually indicate better clustering. To make sure averaging does not hide a locally bad",
                                            "clustering, this should be inspected along with the silhouette plot in the \"Internal\" tab.<p>",
dmattek's avatar
dmattek committed
25
26
                                            "<p><b>WSS</b> evaluates the compactness of clusters. ",
                                            "Compact clusters achieve low WSS values. ",
27
                                            "Look for the <i>elbow</i> in the plot of WSS as function of cluster numbers.</p>"),
28
                    alLearnMoreInt = paste0("<p>Evaluate the goodness of a clustering structure by inspecting ",
29
                                            "principal components, the dendrogram, ",
30
                                            "and the silhouette for a given number of clusters.</p>",
31
                                            "<p><b>Principal components:</b> Each point in the scatter plot corresponds to a single time series in the first 2 PCs space. ",
32
                                            "Points are coloured by cluster numbers. Compact, well separated clusters ",
33
34
35
36
37
38
39
                                            "indicate good partitioning. The percentage of total variance carried by each PC is indicated.</p>",
                                            "<p><b>Dendrogram:</b> The height of branches indicates how well clusters are separated.</p>",
                                            "<p><b>Silhouette plot:</b> The plot indicates for each series whether it is on average closer to series within its cluster ",
                                            "or to series in other clusters. Each bar represents the <a href=https://en.wikipedia.org/wiki/Silhouette_(clustering) title=\"External link\">silhouette score</a> ",
                                            "(Si) for one series. The height of the bars varies ",
                                            "between 1 (the series is much closer to series in its cluster) and -1 (the series is much closer to series in an other cluster). ",
                                            "Hence, large positive values of Si are usually associated with better clustering, while negative values are associated with worse clustering.")
dmattek's avatar
dmattek committed
40
                    )
dmattek's avatar
dmattek committed
41
42
43
44
45
46
47


# UI ----
clustValidUI <- function(id, label = "Validation") {
  ns <- NS(id)
  
  tagList(
dmattek's avatar
dmattek committed
48
49
50
51
52
53
    h4(
      "Cluster validation using ",
      a("factoextra", 
        href="https://cran.r-project.org/web/packages/factoextra/",
        title="External link")
    ),
dmattek's avatar
dmattek committed
54
55
56
57
    actionLink(ns("alLearnMore"), "Learn more"),
    br(),
    br(),
    fluidRow(
dmattek's avatar
dmattek committed
58

59
      column(4,
dmattek's avatar
dmattek committed
60
61
62
63
64
65
             selectInput(
               ns("selectDiss"),
               label = ("Dissimilarity measure"),
               choices = list("Euclidean" = "euclidean",
                              "Manhattan" = "manhattan",
                              "Maximum"   = "maximum",
66
67
                              "Canberra"  = "canberra",
                              "DTW"       = "DTW"),
68
               selected = "euclidean"
dmattek's avatar
dmattek committed
69
             ),
70
             bsAlert("alertAnchorClValidNAsPresent")
dmattek's avatar
dmattek committed
71
             ),
72
      column(4,
dmattek's avatar
dmattek committed
73
74
75
76
77
78
79
80
81
82
83
84
             selectInput(
               ns("selectLinkage"),
               label = ("Linkage method"),
               choices = list(
                 "Average"  = "average",
                 "Complete" = "complete",
                 "Single"   = "single",
                 "Centroid" = "centroid",
                 "Ward"     = "ward.D",
                 "Ward D2"  = "ward.D2",
                 "McQuitty" = "mcquitty"
               ),
85
               selected = "average"
dmattek's avatar
dmattek committed
86
87
88
89
90
91
92
93
               )
             )
    ),
    
    br(),
    tabsetPanel(
      tabPanel("Relative",
               br(),
dmattek's avatar
dmattek committed
94
95
               p("Determine and visualise the optimal number of clusters. ",
                 actionLink(ns("alLearnMoreRel"), "Learn more")),
dmattek's avatar
dmattek committed
96
97
98
99
100
101
102
               fluidRow(
                 column(2, 
                        actionButton(ns('butPlotRel'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidMaxClust'),
dmattek's avatar
dmattek committed
103
                          'Maximum number of clusters to consider',
dmattek's avatar
dmattek committed
104
105
106
107
108
109
110
111
112
113
114
115
                          min = 2,
                          max = 20,
                          value = 10,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
               withSpinner(plotOutput(ns('outPlotSilhAvg'))),
               br(),
dmattek's avatar
dmattek committed
116
               withSpinner(plotOutput(ns('outPlotWss')))
dmattek's avatar
dmattek committed
117
118
119
120
               
      ),
      tabPanel("Internal",
               br(),
dmattek's avatar
dmattek committed
121
122
               p("Validate a given data partitioning. ",
                 actionLink(ns("alLearnMoreInt"), "Learn more")),
dmattek's avatar
dmattek committed
123
124
125
126
127
128
129
               fluidRow(
                 column(2,
                        actionButton(ns('butPlotInt'), 'Validate!')
                        ),
                 column(6,
                        sliderInput(
                          ns('slClValidNclust'),
130
                          'Number of clusters to evaluate',
dmattek's avatar
dmattek committed
131
132
133
134
135
136
137
138
139
140
                          min = 2,
                          max = 20,
                          value = 1,
                          step = 1,
                          ticks = TRUE,
                          round = TRUE
                        )
                        )
               ),
               br(),
141
               withSpinner(plotOutput(ns('outPlotTree'))),
dmattek's avatar
dmattek committed
142
               br(),
143
144
145
               withSpinner(plotOutput(ns('outPlotSilhForCut'))),
               br(),
               withSpinner(plotOutput(ns('outPlotClPCA')))
dmattek's avatar
dmattek committed
146
147
148
149
150
151
      )
    )
  )
}

# SERVER ----
152
clustValid <- function(input, output, session, in.dataWide) {
dmattek's avatar
dmattek committed
153
154
155

  ns = session$ns
  
156
157
158
159
160
161
162
163
164
165
166
167
  # Return the number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnNclust = reactive({
    return(input$slClValidNclust)
  }) %>% debounce(MILLIS)
  
  # Return max number of clusters from the slider 
  # and delay by a constant in milliseconds defined in auxfunc.R
  returnMaxNclust = reactive({
    return(input$slClValidMaxClust)
  }) %>% debounce(MILLIS)
  
dmattek's avatar
dmattek committed
168
169
  # calculate distance matrix for further clustering
  # time series arranged in rows with columns corresponding to time points
170
171
  calcDist <- reactive({
    cat(file = stderr(), 'clustValid:calcDist \n')
dmattek's avatar
dmattek committed
172
    
173
    loc.dm = in.dataWide()
dmattek's avatar
dmattek committed
174
175
176
177
178
179
180
181
    
    if (is.null(loc.dm)) {
      return(NULL)
    }
    
    # Throw some warnings if NAs present in the dataset.
    # DTW cannot compute distance when NA's are preset.
    # Other distance measures can be calculated but caution is required with interpretation.
182
183
    # NAs in the wide format can result from explicit NAs in the measurment column or
    # from missing rows that cause NAs to appear when convertinf from long to wide (dcast)
dmattek's avatar
dmattek committed
184
    if(sum(is.na(loc.dm)) > 0) {
185
186
187
188
189
190
191
192
193
194
      if (input$selectDiss == "DTW") {
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresentDTW", title = "Error",
                    content = helpText.clValid[["alertClValidNAsPresentDTW"]], 
                    append = FALSE,
                    style = "danger")
        closeAlert(session, 'alertClValidNAsPresent')
        
        return(NULL)
        
      } else {
195
196
        createAlert(session, "alertAnchorClValidNAsPresent", "alertClValidNAsPresent", title = "Warning",
                    content = helpText.clValid[["alertClValidNAsPresent"]], 
dmattek's avatar
dmattek committed
197
198
                    append = FALSE, 
                    style = "warning")
199
200
        closeAlert(session, 'alertClValidNAsPresentDTW')
      }
dmattek's avatar
dmattek committed
201
    } else {
202
      closeAlert(session, 'alertClValidNAsPresentDTW')
203
      closeAlert(session, 'alertClValidNAsPresent')
204
    }    
dmattek's avatar
dmattek committed
205
206
    
    
207
208
    # calculate distance matrix
    return(proxy::dist(loc.dm, method = input$selectDiss))
dmattek's avatar
dmattek committed
209
210
  })
  
211
  # calculate dendrogram for a chosen number of clusters and the linkage method
dmattek's avatar
dmattek committed
212
213
214
  calcDendCut = reactive({
    cat(file = stderr(), 'clustValid:calcDendCut \n')
    
215
    loc.dist = calcDist()
dmattek's avatar
dmattek committed
216
    
217
    if (is.null(loc.dist)) {
dmattek's avatar
dmattek committed
218
219
220
      return(NULL)
    }
    
221
222
223
224
225
226
    return(LOChcut(x = loc.dist,
                   k = returnNclust(),
                   hc_func = "hclust",
                   hc_method = input$selectLinkage,
                   hc_metric = input$selectDiss
                   ))    
227
228
  })
  
dmattek's avatar
dmattek committed
229
  # Plotting ----
dmattek's avatar
dmattek committed
230
231
232
233
234
235
  # Function instead of reactive as per:
  # http://stackoverflow.com/questions/26764481/downloading-png-from-shiny-r
  # This function is used to plot and to downoad a pdf
  
  # plot average silhouette
  plotSilhAvg <- function() {
236
    cat(file = stderr(), 'plotSilhAvg: in\n')
dmattek's avatar
dmattek committed
237
    
238
239
240
241
242
243
244
    # make the f-n dependent on the button click
    locBut = input$butPlotRel

    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
dmattek's avatar
dmattek committed
245
    
246
    validate(
dmattek's avatar
dmattek committed
247
248
      need(!is.null(loc.dist), "Nothing to plot. Load data first!"),
      need(returnMaxNclust() <  nrow(loc.dist), "Maximum number of clusters to conisder should be smaller than the number of time series.")
249
250
251
    )    

    loc.p = LOCnbclust(loc.dist,
252
                                     method = "silhouette",
253
                                     k.max = returnMaxNclust(),
254
255
256
257
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Average silhouette width") +
258
      ggtitle("Average silhouette width for different cluster numbers") +
dmattek's avatar
dmattek committed
259
260
261
262
263
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
264
265
266
267
268
    return(loc.p)
  }

  # plot Ws
  plotWss <- function() {
269
    cat(file = stderr(), 'plotWss: in\n')
dmattek's avatar
dmattek committed
270
    
271
    # make the f-n dependent on the button click
272
    locBut = input$butPlotRel
dmattek's avatar
dmattek committed
273
    
274
275
276
277
    # Check if required data exists
    # Thanks to isolate all mods in the left panel are delayed 
    # until clicking the Plot button
    loc.dist = isolate(calcDist())
dmattek's avatar
dmattek committed
278
    
279
    validate(
dmattek's avatar
dmattek committed
280
281
      need(!is.null(loc.dist), "Nothing to plot. Load data first!"),
      need(returnMaxNclust() <  nrow(loc.dist), "Maximum number of clusters to conisder should be smaller than the number of time series.")
282
    )    
dmattek's avatar
dmattek committed
283
    
284
    loc.p = LOCnbclust(loc.dist,
285
                                     method = "wss",
286
                                     k.max = returnMaxNclust(),
287
288
289
290
291
                                     hc_metric = input$selectDiss,
                                     hc_method = input$selectLinkage) +
      xlab("Number of clusters") +
      ylab("Total within cluster sum of squares") +
      ggtitle("Within cluster sum of squares for different cluster numbers") +
dmattek's avatar
dmattek committed
292
293
294
295
296
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
297
298
299
    
    return(loc.p)
  }
dmattek's avatar
dmattek committed
300
301
302
  
  # PCA visualization of partitioning methods 
  plotClPCA <- function() {
303
    cat(file = stderr(), 'plotTree: in\n')
dmattek's avatar
dmattek committed
304
    
305
    # make the f-n dependent on the button click
306
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
307
    
308
    # until clicking the Plot button
309
    loc.part = calcDendCut()
310
311
312
313
314
315
316
317
318
    loc.dm = in.dataWide()
    
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
      need(!is.null(loc.dm),   "Nothing to plot. Load data first!"),
      need(sum(is.na(loc.dm)), "Cannot calculate PCA in the presence of missing data and/or NAs.")
    )    
    
    if (sum(is.na(loc.dm)) > 0)
dmattek's avatar
dmattek committed
319
320
      return(NULL)
    
321
322
323
    loc.pal = ifelse(returnNclust() <= 10, "Color Blind", "Tableau 20")
    loc.col = ggthemes::tableau_color_pal(loc.pal)(n = returnNclust())
    
324
    loc.p = factoextra::fviz_cluster(loc.part, 
325
                                     data = loc.dm,
326
                                     geom = "point",
327
                                     elipse.type = "convex", 
328
                                     main = "Principal components"
329
330
331
332
333
                                     )+
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
334
335
336
                     in.font.legend = PLOTFONTLEGEND) +
      scale_fill_manual(values = loc.col) +
      scale_colour_manual(values = loc.col)
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357

    
    # Retrieve association of cluster and colours and use it for dendrogram for color matching between dend, silhouette and PCA plot
    temp = ggplot_build(loc.p)
    map_individual = as.data.table(temp$data[[1]][, c("colour", "shape")])
    map_cluster = map_individual[, .SD[1], by = shape]
    map_cluster[, cluster := 1:nrow(map_cluster)]

    return(list(plot = loc.p, mapping_individual = map_individual, mapping_cluster = map_cluster))
  }
  
  
  # plot dendrogram tree
  plotTree <- function() {
    cat(file = stderr(), 'plotTree: in\n')
    
    # make the f-n dependent on the button click
    locBut = input$butPlotInt
    
    # Check if required data exists
    loc.part = calcDendCut()
dmattek's avatar
dmattek committed
358
    
359
360
    # Rerun the PCA plot to obtain clour mapping of clusters in PCA and silhouette plot and match it with dendrogram colors.
    loc.map = plotClPCA()
dmattek's avatar
dmattek committed
361
    
362
363
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!"),
dmattek's avatar
dmattek committed
364
      need(!is.null(loc.map),  "Cannot assign colours to clusters. Possible NAs in the dataset!")
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
    )    
    
    # Determine cluster order of occurence from left to right in the dendrogram
    # This is necessary because fviz_dend colors clusters from left to right,
    # whereas fviz_silhouette and fviz_cluster use the order of cluster first occurence in the list of individuals.
    loc.mapClus = loc.map$mapping_cluster
    ord.clusDend = unique(loc.part$cluster[loc.part$order])
    col.clusDend = loc.mapClus[, colour][ord.clusDend]
        
    loc.p = factoextra::fviz_dend(loc.part,
                                  k = returnNclust(),
                                  k_colors = col.clusDend,
                                  show_labels = F,
                                  rect = T,
                                  xlab = "Time series",
                                  main = "Dendrogram") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE,
                     in.font.axis.text = PLOTFONTAXISTEXT,
                     in.font.axis.title = PLOTFONTAXISTITLE,
                     in.font.strip = PLOTFONTFACETSTRIP,
                     in.font.legend = PLOTFONTLEGEND)
dmattek's avatar
dmattek committed
386
387
388
389
    
    return(loc.p)
  }
  
390
391
  
  # plot silhouettes for a particular dendrogram cut
dmattek's avatar
dmattek committed
392
  plotSilhForCut <- function() {
393
    cat(file = stderr(), 'plotSilhForCut: in\n')
dmattek's avatar
dmattek committed
394
    
395
    # make the f-n dependent on the button click
396
    locBut = input$butPlotInt
dmattek's avatar
dmattek committed
397
    
398
    # until clicking the Plot button
399
    loc.part = calcDendCut()
400
401
402
    validate(
      need(!is.null(loc.part), "Nothing to plot. Load data first!")
    )    
dmattek's avatar
dmattek committed
403
    
404
405
406
    loc.pal = ifelse(returnNclust() <= 10, "Color Blind", "Tableau 20")
    loc.col = ggthemes::tableau_color_pal(loc.pal)(n = returnNclust())
    
407
408
409
    loc.p = factoextra::fviz_silhouette(loc.part, 
                                        print.summary = FALSE, 
                                        main = "Silhouette") +
dmattek's avatar
dmattek committed
410
411
412
413
414
415
      xlab("Time series") +
      LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                     in.font.axis.text = PLOTFONTAXISTEXT, 
                     in.font.axis.title = PLOTFONTAXISTITLE, 
                     in.font.strip = PLOTFONTFACETSTRIP, 
                     in.font.legend = PLOTFONTLEGEND) +
416
417
418
      theme(axis.text.x = element_blank()) +
      scale_fill_manual(values = loc.col) +
      scale_colour_manual(values = loc.col)
dmattek's avatar
dmattek committed
419
420
421
422
    
    return(loc.p)
  }
  
dmattek's avatar
dmattek committed
423
  # Plot rendering ----
dmattek's avatar
dmattek committed
424
425
  # Display silhouette
  output$outPlotSilhAvg <- renderPlot({
426
427
    loc.p = plotSilhAvg()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
428
429
      return(NULL)
    
430
    return(loc.p)
dmattek's avatar
dmattek committed
431
432
433
434
435
  })

  
  # Display wss
  output$outPlotWss <- renderPlot({
436
437
438
    loc.p = plotWss()
    if(is.null(loc.p))
      return(NULL)
dmattek's avatar
dmattek committed
439
    
440
441
442
443
444
445
446
447
448
449
450
451
452
453
    return(loc.p)
  })
  
  # Display PCA of clustering
  output$outPlotClPCA <- renderPlot({
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotClPCA()
454
    loc.p = loc.p$plot
455
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
456
457
      return(NULL)
    
458
    return(loc.p)
dmattek's avatar
dmattek committed
459
460
461
462
  })
  
  # Display tree
  output$outPlotTree <- renderPlot({
463
464
465
466
467
468
469
470
471
472
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotTree()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
473
474
      return(NULL)
    
475
    return(loc.p)
dmattek's avatar
dmattek committed
476
477
478
479
  })
  
  # Display silhouette for a dendrogram cut
  output$outPlotSilhForCut <- renderPlot({
480
481
482
483
484
485
486
487
488
489
    # This is required to avoid
    # "Warning: Error in <Anonymous>: cannot open file 'Rplots.pdf'"
    # When running on a server. Based on:
    # https://github.com/ropensci/plotly/issues/494
    # if (names(dev.cur()) != "null device")
    #   dev.off()
    # pdf(NULL)
    
    loc.p = plotSilhForCut()
    if(is.null(loc.p))
dmattek's avatar
dmattek committed
490
491
      return(NULL)
    
492
    return(loc.p)
dmattek's avatar
dmattek committed
493
494
495
496
497
498
499
500
  })
  
  # Pop-overs ----
  addPopover(session, 
             ns("alLearnMore"),
             title = "Classes of cluster validation",
             content = helpText.clValid[["alLearnMore"]],
             trigger = "click")
dmattek's avatar
dmattek committed
501
502
503
504
505
506
507
508
509
510
511
512
  
  addPopover(session, 
             ns("alLearnMoreRel"),
             title = "Relative validation",
             content = helpText.clValid[["alLearnMoreRel"]],
             trigger = "click")
  
  addPopover(session, 
             ns("alLearnMoreInt"),
             title = "Internal validation",
             content = helpText.clValid[["alLearnMoreInt"]],
             trigger = "click")
dmattek's avatar
dmattek committed
513
514
515
}