selOutliers.R 5.96 KB
Newer Older
dmattek's avatar
dmattek committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This is the module of a Shiny web application.
# Outlier identification, selection

# UI-remove-outliers ----
modSelOutliersUI = function(id, label = "Outlier Selection") {
  ns <- NS(id)
  
  tagList(
    h4(
      "Remove outliers"
    ),
    fluidRow(
      column(2, 
             numericInput(ns('numOutliersPerc'),
                         label = '% of data',
                         min = 0,
                         max = 100,
                         value = 0, 
                         step = 0.05, width = '100px')
      ),
      column(2, 
             radioButtons(ns('rbOutliersType'), 
                          label = 'From', 
                          choices = c('top' = 'top', 'middle' = 'mid', 'bottom' = 'bot'))
             ),
      column(3,
             sliderInput(ns('slOutliersGapLen'),
                         label = 'Remove tracks with gaps equal to or longer than',
                         min = 1,
                         max = 10,
                         value = 1, 
                         step = 1)
      ),
      column(3,
             downloadButton(ns('downOutlierCSV'), label = 'CSV with outlier IDs'),
             htmlOutput(ns("txtOutliersPerc"))
      )
    )
  )
}

# Server-remove-outliers ----
modSelOutliers = function(input, output, session, in.data) {

  # reactive counter to hold number of tracks before and after outlier removal
  nCellsCounter <- reactiveValues(
    nCellsOrig = 0,
    nCellsAfter = 0,
    nOutlierTpts = 0
  )
  
  # reactive vector with cell ids
  vOut = reactiveValues(
    id = NULL
  )

  # Display number of tracks and outliers  
  output$txtOutliersPerc <- renderText({ 
    cat(file = stderr(), 'modSelOutliers: txtOutliersPerc\n')
    
      sprintf('<b>%d total tracks<br>%d outlier tracks<br>%d outlier points</b><br>', 
            nCellsCounter[['nCellsOrig']], 
            nCellsCounter[['nCellsOrig']] - nCellsCounter[['nCellsAfter']],
            nCellsCounter[['nOutlierTpts']])
    })
  
  # button for downloading CSV with ids of removed tracks
  output$downOutlierCSV <- downloadHandler(
    filename = FCSVOUTLIERS,
    content = function(file) {
      loc.dt = vOut[['id']]
      
      if (is.null(loc.dt))
        return(NULL)
      else
        write.csv(unique(loc.dt[, (COLID), with = F]), file, row.names = FALSE, quote = F)
    }
  )
  
# Identify outliers and remove them from dt
  dtReturn = reactive({ 
    cat(file = stderr(), 'modSelOutliers: dtReturn\n')
    
    loc.out = in.data()
    
    if (is.null(loc.out)) {
      return(NULL)
    }

    # Remove outliers if the slider with percentage of data is smaller than 100
    if (input$numOutliersPerc < 100) {
      
      # store the number of trajectories before prunning
      nCellsCounter[['nCellsOrig']] = length(unique(loc.out[['id']]))

      # scale all points (independently per track)      
      loc.out[, y.sc := scale(get(COLY))]  

      # Identify outlier points
      # In the UI, user selectes percentage of data to remove from the bottom, middle, or top part.
      # loc.outpts stores outlier points
      switch(input$rbOutliersType,
        'top' = {loc.outpts = loc.out[ y.sc > quantile(y.sc, 1 - input$numOutliersPerc * 0.01, na.rm = T)]},
        'mid' = {loc.outpts = loc.out[ y.sc < quantile(y.sc, input$numOutliersPerc * 0.005, na.rm = T) | 
                                     y.sc > quantile(y.sc, 1 - input$numOutliersPerc * 0.005, na.rm = T)]},
        'bot' = {loc.outpts = loc.out[ y.sc < quantile(y.sc, input$numOutliersPerc * 0.01, na.rm = T)]}
      )
      
      
      if (input$slOutliersGapLen > 1) {
        # remove tracks with gaps longer than the value set in slOutliersGapLen
        # shorter gaps are interpolated linearly
        
        # add index column per trajecory
        loc.out[, (COLIDX) := 1:.N, by = c(COLID)]
        
        # remove single outlier points (anti-join)
        # From: https://stackoverflow.com/a/46333620/1898713
        loc.out = loc.out[!loc.outpts, on = names(loc.outpts)]
        
        # calculate diff on index column to see the length of gaps due to removed points
        # the value of that column corresponds to the gap length (hence the "-1")
        loc.out[, (COLIDXDIFF) := c(1, diff(get(COLIDX))) - 1, by = c(COLID)]

        # get track ids where the max gap is equal to or longer than the threshold
        loc.idgaps = loc.out[, max(get(COLIDXDIFF)), by = c(COLID)][V1 >= input$slOutliersGapLen, get(COLID)]
        
        # remove outlier tracks with gaps longer than the value set in slOutliersGapLen
        loc.out = loc.out[!(get(COLID) %in% unique(loc.idgaps))]
        
        # fill removed outliers with NA's
        loc.out = loc.out[setkeyv(loc.out[, .(seq(min(get(COLIDX), na.rm = T), max(get(COLIDX), na.rm = T), 1)), by = c(COLGR, COLID)], c(COLGR, COLID, 'V1'))]

        # interpolate gaps with NAs
        if( (COLPOSX %in% names(loc.out)) & (COLPOSY %in% names(loc.out)) )
          s.cols = c(COLY, COLPOSX, COLPOSY)
        else
          s.cols = c(COLY)
        
        # Here, the missing part in interpolation of mid.in column (for highlighting trajectories)
        loc.out[, (s.cols) := lapply(.SD, na.interpolation), by = c(COLID), .SDcols = s.cols]
        
        # clean
        loc.out[, c(COLIDX, COLIDXDIFF) := NULL]
        
      } else {
        # remove outlier tracks with gaps of length 1 time point
        loc.out = loc.out[!(get(COLID) %in% unique(loc.outpts[[COLID]]))]
      }

      # clean
      loc.out[, y.sc := NULL]
      
      # count number of trajectories after removing outlier tracks
      nCellsCounter[['nCellsAfter']] = length(unique(loc.out[[COLID]]))
      
      # count number of outlier points
      nCellsCounter[['nOutlierTpts']] = length(loc.outpts[[COLID]])
      
      
      # store a vector of outlier timepoints with the corresponding IDs
      vOut[['id']] = loc.outpts
    }
    
    # return cleaned dt
    return(loc.out)
    
  })
  
  return(dtReturn)
}