Upgrade to new Gitlab Version 13.9 on Saturday 19th April 20:00. Expect an interruption of about 30 to 60 minutes

selOutliers.R 11.6 KB
Newer Older
dmattek's avatar
dmattek committed
1 2 3 4
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
dmattek's avatar
dmattek committed
5
# This is a module of a Shiny web application.
dmattek's avatar
dmattek committed
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# Outlier identification, selection

# UI-remove-outliers ----
modSelOutliersUI = function(id, label = "Outlier Selection") {
  ns <- NS(id)
  
  tagList(
    h4(
      "Remove outliers"
    ),
    fluidRow(
      column(2, 
             numericInput(ns('numOutliersPerc'),
                         label = '% of data',
                         min = 0,
                         max = 100,
                         value = 0, 
23
                         step = 0.05, width = '100px'),
24
             checkboxInput(ns('chBtrajInter'), 'Interpolate gaps', value = F)
dmattek's avatar
dmattek committed
25 26 27 28
      ),
      column(2, 
             radioButtons(ns('rbOutliersType'), 
                          label = 'From', 
Maciej Dobrzynski's avatar
Maciej Dobrzynski committed
29
                          choices = c('top' = 'top', 'top & bottom' = 'mid', 'bottom' = 'bot'))
dmattek's avatar
dmattek committed
30 31 32 33 34 35 36 37 38 39 40 41 42
             ),
      column(3,
             sliderInput(ns('slOutliersGapLen'),
                         label = 'Remove tracks with gaps equal to or longer than',
                         min = 1,
                         max = 10,
                         value = 1, 
                         step = 1)
      ),
      column(3,
             downloadButton(ns('downOutlierCSV'), label = 'CSV with outlier IDs'),
             htmlOutput(ns("txtOutliersPerc"))
      )
43 44 45
    ),
    checkboxInput(ns('chBplotDist'), 'Plot data distribution', value = F),
    uiOutput(ns('uiDistPlot'))
dmattek's avatar
dmattek committed
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
  )
}

# Server-remove-outliers ----
modSelOutliers = function(input, output, session, in.data) {

  # reactive counter to hold number of tracks before and after outlier removal
  nCellsCounter <- reactiveValues(
    nCellsOrig = 0,
    nCellsAfter = 0,
    nOutlierTpts = 0
  )
  
  # reactive vector with cell ids
  vOut = reactiveValues(
    id = NULL
  )

64 65
  
  
dmattek's avatar
dmattek committed
66 67
  # Display number of tracks and outliers  
  output$txtOutliersPerc <- renderText({ 
68
    cat(file = stdout(), 'modSelOutliers: txtOutliersPerc\n')
dmattek's avatar
dmattek committed
69
    
Maciej Dobrzynski's avatar
Maciej Dobrzynski committed
70
      sprintf('<b>%d total track(s)<br>%d removed track(s)<br>%d removed point(s)</b><br>', 
dmattek's avatar
dmattek committed
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
            nCellsCounter[['nCellsOrig']], 
            nCellsCounter[['nCellsOrig']] - nCellsCounter[['nCellsAfter']],
            nCellsCounter[['nOutlierTpts']])
    })
  
  # button for downloading CSV with ids of removed tracks
  output$downOutlierCSV <- downloadHandler(
    filename = FCSVOUTLIERS,
    content = function(file) {
      loc.dt = vOut[['id']]
      
      if (is.null(loc.dt))
        return(NULL)
      else
        write.csv(unique(loc.dt[, (COLID), with = F]), file, row.names = FALSE, quote = F)
    }
  )
  
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
  # Plot of value distribution
  output$uiDistPlot <- renderUI({
    ns <- session$ns
    
    if (input$chBplotDist) {

      locDT = in.data()
      
      if (is.null(locDT)) {
        return(NULL)
      }

      output$densPlot = renderPlot({

        # main density plot
        locP = ggplot(locDT, aes_string(x = COLY)) +
          geom_density()
        
        # Shade regions of the density plot according to
        # value set in input$numOutliersPerc.
        
        # extract data from density plot
        locDTtmp = as.data.table(ggplot_build(locP)$data[[1]])
        
        # shade region on the right
        if (input$rbOutliersType == 'top') {
          
          # find position of the right boundary
          locQuantR = quantile(locDT[[COLY]], 
                               1 - input$numOutliersPerc * 0.01, 
                               na.rm = T, 
                               type = 3)
          
          # select only those points of the density plot right to the right boundary
          locDTtmpSub = locDTtmp[x > locQuantR]
          
          # add shaded RIGHT region to the plot
          if (nrow(locDTtmpSub) > 0 )
            locP = locP + 
            geom_area(data = locDTtmpSub, aes(x=x, y=y), fill="red") +
            geom_vline(xintercept = locQuantR, linetype = 'dashed', color = 'red')
        } else 
          # shade region on the left
          if (input$rbOutliersType == 'bot') {
            
            # find position of the right boundary
            locQuantL = quantile(locDT[[COLY]], 
                                 input$numOutliersPerc * 0.01, 
                                 na.rm = T, 
                                 type = 3)
            
            # select only those points of the density plot left to the left boundary
            locDTtmpSub = locDTtmp[x < locQuantL]
            
            # add shaded LEFT region to the plot
            if (nrow(locDTtmpSub) > 0 )
              locP = locP + 
              geom_area(data = locDTtmpSub, aes(x=x, y=y), fill="red") +
              geom_vline(xintercept = locQuantL, linetype = 'dashed', color = 'red')
            
          } else 
            # shade region on the left
            if (input$rbOutliersType == 'mid') {
              
              # find position of the right boundary
              locQuantR = quantile(locDT[[COLY]], 
                                   1 - input$numOutliersPerc * 0.005, 
                                   na.rm = T, 
                                   type = 3)
              
              # find position of the left boundary
              locQuantL = quantile(locDT[[COLY]], 
                                   input$numOutliersPerc * 0.005, 
                                   na.rm = T, 
                                   type = 3)
              
              # select only those points of the density plot left or right of the boundaries
              locDTtmpSubL = locDTtmp[x < locQuantL]
              locDTtmpSubR = locDTtmp[x > locQuantR]
              
              # add shaded LEFT region to the plot
              if (nrow(locDTtmpSubL) > 0 )
                locP = locP + 
                geom_area(data = locDTtmpSubL, aes(x=x, y=y), fill="red") +
                geom_vline(xintercept = locQuantL, linetype = 'dashed', color = 'red')
              
              
              if (nrow(locDTtmpSubR) > 0 )
                locP = locP + 
                geom_area(data = locDTtmpSubR, aes(x=x, y=y), fill="red") +
                geom_vline(xintercept = locQuantR, linetype = 'dashed', color = 'red')
            }
        
        locP = locP +
          xlab('Measurement value') +
          LOCggplotTheme(in.font.base = PLOTFONTBASE, 
                         in.font.axis.text = PLOTFONTAXISTEXT, 
                         in.font.axis.title = PLOTFONTAXISTITLE, 
                         in.font.strip = PLOTFONTFACETSTRIP, 
                         in.font.legend = PLOTFONTLEGEND)
        
        return(locP)
        
      })
      
    } else
      return(NULL)
    
    plotOutput(ns('densPlot'))
  })
  
dmattek's avatar
dmattek committed
200 201
# Identify outliers and remove them from dt
  dtReturn = reactive({ 
202
    cat(file = stdout(), 'modSelOutliers: dtReturn\n')
dmattek's avatar
dmattek committed
203 204 205 206 207 208 209
    
    loc.out = in.data()
    
    if (is.null(loc.out)) {
      return(NULL)
    }

210 211 212
    # store the number of trajectories before prunning
    nCellsCounter[['nCellsOrig']] = length(unique(loc.out[['id']]))
    
213 214
    # Remove outliers if the field with percentage of data to remove is greater than 0
    if (input$numOutliersPerc > 0) {
dmattek's avatar
dmattek committed
215
      
216
      # scale all measurement points      
dmattek's avatar
dmattek committed
217 218 219 220 221
      loc.out[, y.sc := scale(get(COLY))]  

      # Identify outlier points
      # In the UI, user selectes percentage of data to remove from the bottom, middle, or top part.
      # loc.outpts stores outlier points
222
      # warning: quantile type = 3: SAS definition: nearest even order statistic.
dmattek's avatar
dmattek committed
223
      switch(input$rbOutliersType,
224 225 226 227
        'top' = {loc.outpts = loc.out[ y.sc > quantile(y.sc, 1 - input$numOutliersPerc * 0.01, na.rm = T, type = 3)]},
        'mid' = {loc.outpts = loc.out[ y.sc < quantile(y.sc, input$numOutliersPerc * 0.005, na.rm = T, type = 3) | 
                                     y.sc > quantile(y.sc, 1 - input$numOutliersPerc * 0.005, na.rm = T, type = 3)]},
        'bot' = {loc.outpts = loc.out[ y.sc < quantile(y.sc, input$numOutliersPerc * 0.01, na.rm = T, type = 3)]}
dmattek's avatar
dmattek committed
228 229
      )
      
230 231 232 233 234
      if (DEB) {
        cat(file = stdout(), 'selOutliers.dtReturn: Outlier points:\n')
        print(loc.outpts)
      }
        
dmattek's avatar
dmattek committed
235 236 237 238 239 240
      if (input$slOutliersGapLen > 1) {
        # remove tracks with gaps longer than the value set in slOutliersGapLen
        # shorter gaps are interpolated linearly
        
        # add index column per trajecory
        loc.out[, (COLIDX) := 1:.N, by = c(COLID)]
241

dmattek's avatar
dmattek committed
242 243 244
        # remove single outlier points (anti-join)
        # From: https://stackoverflow.com/a/46333620/1898713
        loc.out = loc.out[!loc.outpts, on = names(loc.outpts)]
245

dmattek's avatar
dmattek committed
246 247 248 249 250 251 252
        # calculate diff on index column to see the length of gaps due to removed points
        # the value of that column corresponds to the gap length (hence the "-1")
        loc.out[, (COLIDXDIFF) := c(1, diff(get(COLIDX))) - 1, by = c(COLID)]

        # get track ids where the max gap is equal to or longer than the threshold
        loc.idgaps = loc.out[, max(get(COLIDXDIFF)), by = c(COLID)][V1 >= input$slOutliersGapLen, get(COLID)]
        
253 254 255 256 257 258
        if (DEB) {
          cat(file = stdout(), '\nselOutliers.dtReturn: Track IDs with max gap >= threshold:\n')
          if (length(loc.idgaps) > 0)
            print(loc.idgaps) else
              cat("none\n")
        }
dmattek's avatar
dmattek committed
259
        
260 261 262 263
        # remove outlier tracks with gaps longer than the value set in slOutliersGapLen
        if (length(loc.idgaps) > 0)
          loc.out = loc.out[!(get(COLID) %in% unique(loc.idgaps))]

dmattek's avatar
dmattek committed
264 265
        # clean
        loc.out[, c(COLIDX, COLIDXDIFF) := NULL]
266 267 268 269 270 271 272 273 274

        # interpolate gaps due to outliers
        if (input$chBtrajInter) {
          # fill removed outliers with NA's
          setkeyv(loc.out, c(COLGR, COLID, COLRT))
          loc.out = loc.out[setkeyv(loc.out[, .(seq(min(get(COLRT), na.rm = T), max(get(COLRT), na.rm = T), 1)), by = c(COLGR, COLID)], c(COLGR, COLID, 'V1'))]

          # x-check: print all rows with NA's
          if (DEB) {
275
            cat(file = stdout(), '\nselOutliers.dtReturn: Rows with NAs to interpolate:\n')
276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295
            print(loc.out[rowSums(is.na(loc.out)) > 0, ])
          }
          
          # NA's may be already present in the dataset'.
          # Interpolate (linear) them with na.interpolate as well
          if( (COLPOSX %in% names(loc.out)) & (COLPOSY %in% names(loc.out)) )
            s.cols = c(COLY, COLPOSX, COLPOSY)
          else
            s.cols = c(COLY)
          
          
          # Apparently the loop is faster than lapply+SDcols
          for(col in s.cols) {
            # Interpolated columns should be of type numeric (float)
            # This is to ensure that interpolated columns are of porper type.
            data.table::set(loc.out, j = col, value = as.numeric(loc.out[[col]]))
            
            loc.out[, (col) := na.interpolation(get(col)), by = c(COLID)]        
          }
        } 
dmattek's avatar
dmattek committed
296 297
      } else {
        # remove outlier tracks with gaps of length 1 time point
298
        # !(input$slOutliersGapLen > 1)
dmattek's avatar
dmattek committed
299 300 301 302 303
        loc.out = loc.out[!(get(COLID) %in% unique(loc.outpts[[COLID]]))]
      }

      # clean
      loc.out[, y.sc := NULL]
304

dmattek's avatar
dmattek committed
305 306 307
      
      # store a vector of outlier timepoints with the corresponding IDs
      vOut[['id']] = loc.outpts
308 309 310 311 312
    } else {
      # no outlier removal
      # !(input$numOutliersPerc > 0)
      loc.outpts = NULL
      vOut = NULL
dmattek's avatar
dmattek committed
313
    }
314 315 316 317 318 319 320

    # count number of trajectories after removing outlier tracks
    nCellsCounter[['nCellsAfter']] = length(unique(loc.out[[COLID]]))
    
    # count number of outlier points
    nCellsCounter[['nOutlierTpts']] = length(loc.outpts[[COLID]])
    cat(sprintf("%d outlier tpts\n", nCellsCounter[['nOutlierTpts']]))
dmattek's avatar
dmattek committed
321 322
    
    # return cleaned dt
323 324 325
    if (nrow(loc.out) < 1)
      return(NULL) else
        return(loc.out)
dmattek's avatar
dmattek committed
326 327 328 329 330
    
  })
  
  return(dtReturn)
}