Commit 03f7ea27 authored by Maciej Dobrzynski's avatar Maciej Dobrzynski

Merge branch 'feature-outlier-new'

Implements new outlier selection method
parents 7c389fb2 8fc29800
......@@ -26,6 +26,7 @@ Following packages need to be installed in order to run the app locally:
* MASS
* robust
* pracma
* Hmisc
Install packages using `install.packages('name_of_the_package_from_the_list_above')` command in RStudio command line.
......@@ -36,27 +37,7 @@ install.packages(c("shiny", "shinyjs",
"dendextend", "dendextend", "RColorBrewer",
"sparcl", "dtw",
"imputeTS",
"MASS", "robust", "pracma"))
```
**Note**
As of July 2018, *sparcl* has been removed from CRAN. If you don't need sparse hierarchical clustering, you can simply comment the line `library(sparcl)` in `server.R` file or install it from the [archive](https://cran.r-project.org/web/packages/sparcl/index.html):
```
install_url('https://cran.r-project.org/src/contrib/Archive/sparcl/sparcl_1.0.3.tar.gz')
```
**Additionally**, a time series analysis package needs to be installed from [GitHub](https://github.com/dmattek/tca-package):
```
install_github("dmattek/tca-package")
```
The `install_url` and `install_github` functions are available in *devtools* package:
```
install.packages("devtools")
library(devtools)
"MASS", "robust", "pracma", "Hmisc"))
```
## Input file
......
source('modules/auxfunc.R')
source('modules/selOutliers.R')
source('modules/downPlot.R')
source('modules/downCellIDsCls.R')
source('modules/downCSV.R')
source('modules/dispStats.R')
source('modules/dispTrackStats.R')
source('modules/trajPlot.R')
......
......@@ -14,6 +14,10 @@ require(Hmisc) # for CI calculation
# Global parameters ----
# if true, additional output printed to R console
DEB = T
# font sizes in pts for plots
PLOTFONTBASE = 12
PLOTFONTAXISTEXT = 12
......@@ -24,6 +28,28 @@ PLOTFONTLEGEND = 12
# default number of facets in plots
PLOTNFACETDEFAULT = 3
# internal column names
COLRT = 'realtime'
COLY = 'y'
COLID = 'id'
COLIDUNI = 'trackObjectsLabelUni'
COLGR = 'group'
COLIN = 'mid.in'
COLOBJN = 'obj.num'
COLPOSX = 'pos.x'
COLPOSY = 'pos.y'
COLIDX = 'IDX'
COLIDXDIFF = 'IDXdiff'
# file names
FCSVOUTLIERS = 'outliers.csv'
FCSVTCCLEAN = 'tCoursesSelected_clean.csv'
FPDFTCMEAN = "tCoursesMeans.pdf"
FPDFTCSINGLE = "tCourses.pdf"
FPDFBOXAUC = 'boxplotAUC.pdf'
FPDFBOXTP = 'boxplotTP.pdf'
FPDFSCATTER = 'scatter.pdf'
# Colour definitions ----
rhg_cols <- c(
"#771C19",
......@@ -176,7 +202,7 @@ LOCcalcTrajCI = function(in.dt, in.col.meas, in.col.by = NULL, in.type = c('norm
#' @export
#' @import data.table
LOCgenTraj <- function(in.ntpts = 60, in.ntracks = 10, in.nfov = 6, in.nwells = 1, in.addna = NULL) {
LOCgenTraj <- function(in.ntpts = 60, in.ntracks = 10, in.nfov = 6, in.nwells = 1, in.addna = NULL, in.addout = NULL) {
x.rand.1 = c(rnorm(in.ntpts * in.ntracks * in.nfov * 1/3, 0.5, 0.1), rnorm(in.ntpts * in.ntracks * in.nfov * 1/3, 1, 0.2), rnorm(in.ntpts * in.ntracks * in.nfov * 1/3, 2, 0.5))
x.rand.2 = c(rnorm(in.ntpts * in.ntracks * in.nfov * 1/3, 0.25, 0.1), rnorm(in.ntpts * in.ntracks * in.nfov * 1/3, 0.5, 0.2), rnorm(in.ntpts * in.ntracks * in.nfov * 1/3, 1, 0.2))
......@@ -188,6 +214,13 @@ LOCgenTraj <- function(in.ntpts = 60, in.ntracks = 10, in.nfov = 6, in.nwells =
x.rand.2[round(runif(in.addna) * locTabLen)] = NA
}
# add outliers for testing
if (!is.null(in.addout)) {
locTabLen = length(x.rand.1)
x.rand.1[round(runif(in.addout) * locTabLen)] = 10
x.rand.2[round(runif(in.addout) * locTabLen)] = 10
}
x.arg = rep(seq(1, in.ntpts), in.ntracks * in.nfov)
dt.nuc = data.table(Metadata_Well = rep(LETTERS[1:in.nwells], each = in.ntpts * in.nfov * in.ntracks / in.nwells),
......@@ -225,7 +258,7 @@ LOCgenTraj <- function(in.ntpts = 60, in.ntracks = 10, in.nfov = 6, in.nwells =
LOCnormTraj = function(in.dt,
in.meas.col,
in.rt.col = 'RealTime',
in.rt.col = COLRT,
in.rt.min = 10,
in.rt.max = 20,
in.by.cols = NULL,
......
......@@ -17,7 +17,7 @@ modTrackStats = function(input, output, session,
ns <- session$ns
output$uiTabStats = renderUI({
cat(file = stderr(), 'UI uiTabStats\n')
cat(file = stderr(), 'modTrackStats: uiTabStats\n')
ns <- session$ns
if(input$chbTabStats) {
......@@ -33,7 +33,7 @@ modTrackStats = function(input, output, session,
# unused at the moment
calcStats = reactive({
cat(file = stderr(), 'tabBoxPlot: calsStats\n')
cat(file = stderr(), 'modTrackStats: calsStats\n')
loc.dt = in.data()
if (is.null(loc.dt))
......@@ -53,6 +53,9 @@ modTrackStats = function(input, output, session,
# Print number of tracks
output$txtNtracks = renderText({
cat(file = stderr(), 'modTrackStats: txtNtracks\n')
loc.dt = in.data()
loc.dt = in.data()
if (is.null(loc.dt))
......@@ -66,7 +69,7 @@ modTrackStats = function(input, output, session,
# Print a table with Track IDs assigned to multiple objects in a frame
output$outTabStats = DT::renderDataTable(server = FALSE, {
cat(file = stderr(), 'tabBoxPlot: outTabStats\n')
cat(file = stderr(), 'modTrackStats: outTabStats\n')
loc.dt = in.data()
if (is.null(loc.dt))
......
# RShiny module for downloading cellIDs with cluster numbers
# Use:
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This is the module of a Shiny web application.
# Download CSV with data
#
# Usage:
# in ui.R
# downPlotUI('uniqueID', "your_label")
# downCSV('uniqueID', "your_label")
#
# in server.R
# callModule(downPlot, "uniqueID", 'fname.pdf', input_plot_to_save)
# callModule(downCSV, "uniqueID", 'fname.csv', input_data_to_save)
downCellClUI <- function(id, label = "Download Data") {
# UI-download-csv ----
downCsvUI <- function(id, label = "Download Data") {
ns <- NS(id)
tagList(
......@@ -17,7 +23,8 @@ downCellClUI <- function(id, label = "Download Data") {
)
}
downCellCl <- function(input, output, session, in.fname, in.data) {
# SERVER-download-csv ----
downCsv <- function(input, output, session, in.fname, in.data) {
output$downCellCl <- downloadHandler(
filename = function() {
......
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This is the module of a Shiny web application.
# Outlier identification, selection
# UI-remove-outliers ----
modSelOutliersUI = function(id, label = "Outlier Selection") {
ns <- NS(id)
tagList(
h4(
"Remove outliers"
),
fluidRow(
column(2,
numericInput(ns('numOutliersPerc'),
label = '% of data',
min = 0,
max = 100,
value = 0,
step = 0.05, width = '100px'),
checkboxInput(ns('chBtrajInter'), 'Interpolate gaps?', value = F)
),
column(2,
radioButtons(ns('rbOutliersType'),
label = 'From',
choices = c('top' = 'top', 'middle' = 'mid', 'bottom' = 'bot'))
),
column(3,
sliderInput(ns('slOutliersGapLen'),
label = 'Remove tracks with gaps equal to or longer than',
min = 1,
max = 10,
value = 1,
step = 1)
),
column(3,
downloadButton(ns('downOutlierCSV'), label = 'CSV with outlier IDs'),
htmlOutput(ns("txtOutliersPerc"))
)
)
)
}
# Server-remove-outliers ----
modSelOutliers = function(input, output, session, in.data) {
# reactive counter to hold number of tracks before and after outlier removal
nCellsCounter <- reactiveValues(
nCellsOrig = 0,
nCellsAfter = 0,
nOutlierTpts = 0
)
# reactive vector with cell ids
vOut = reactiveValues(
id = NULL
)
# Display number of tracks and outliers
output$txtOutliersPerc <- renderText({
cat(file = stdout(), 'modSelOutliers: txtOutliersPerc\n')
sprintf('<b>%d total track(s)<br>%d outlier track(s)<br>%d outlier point(s)</b><br>',
nCellsCounter[['nCellsOrig']],
nCellsCounter[['nCellsOrig']] - nCellsCounter[['nCellsAfter']],
nCellsCounter[['nOutlierTpts']])
})
# button for downloading CSV with ids of removed tracks
output$downOutlierCSV <- downloadHandler(
filename = FCSVOUTLIERS,
content = function(file) {
loc.dt = vOut[['id']]
if (is.null(loc.dt))
return(NULL)
else
write.csv(unique(loc.dt[, (COLID), with = F]), file, row.names = FALSE, quote = F)
}
)
# Identify outliers and remove them from dt
dtReturn = reactive({
cat(file = stdout(), 'modSelOutliers: dtReturn\n')
loc.out = in.data()
if (is.null(loc.out)) {
return(NULL)
}
# store the number of trajectories before prunning
nCellsCounter[['nCellsOrig']] = length(unique(loc.out[['id']]))
# Remove outliers if the field with percentage of data to remove is greater than 0
if (input$numOutliersPerc > 0) {
# scale all measurement points
loc.out[, y.sc := scale(get(COLY))]
# Identify outlier points
# In the UI, user selectes percentage of data to remove from the bottom, middle, or top part.
# loc.outpts stores outlier points
# warning: quantile type = 3: SAS definition: nearest even order statistic.
switch(input$rbOutliersType,
'top' = {loc.outpts = loc.out[ y.sc > quantile(y.sc, 1 - input$numOutliersPerc * 0.01, na.rm = T, type = 3)]},
'mid' = {loc.outpts = loc.out[ y.sc < quantile(y.sc, input$numOutliersPerc * 0.005, na.rm = T, type = 3) |
y.sc > quantile(y.sc, 1 - input$numOutliersPerc * 0.005, na.rm = T, type = 3)]},
'bot' = {loc.outpts = loc.out[ y.sc < quantile(y.sc, input$numOutliersPerc * 0.01, na.rm = T, type = 3)]}
)
if (DEB) {
cat(file = stdout(), 'selOutliers.dtReturn: Outlier points:\n')
print(loc.outpts)
}
if (input$slOutliersGapLen > 1) {
# remove tracks with gaps longer than the value set in slOutliersGapLen
# shorter gaps are interpolated linearly
# add index column per trajecory
loc.out[, (COLIDX) := 1:.N, by = c(COLID)]
# remove single outlier points (anti-join)
# From: https://stackoverflow.com/a/46333620/1898713
loc.out = loc.out[!loc.outpts, on = names(loc.outpts)]
# calculate diff on index column to see the length of gaps due to removed points
# the value of that column corresponds to the gap length (hence the "-1")
loc.out[, (COLIDXDIFF) := c(1, diff(get(COLIDX))) - 1, by = c(COLID)]
# get track ids where the max gap is equal to or longer than the threshold
loc.idgaps = loc.out[, max(get(COLIDXDIFF)), by = c(COLID)][V1 >= input$slOutliersGapLen, get(COLID)]
if (DEB) {
cat(file = stdout(), '\nselOutliers.dtReturn: Track IDs with max gap >= threshold:\n')
if (length(loc.idgaps) > 0)
print(loc.idgaps) else
cat("none\n")
}
# remove outlier tracks with gaps longer than the value set in slOutliersGapLen
if (length(loc.idgaps) > 0)
loc.out = loc.out[!(get(COLID) %in% unique(loc.idgaps))]
# clean
loc.out[, c(COLIDX, COLIDXDIFF) := NULL]
# interpolate gaps due to outliers
if (input$chBtrajInter) {
# fill removed outliers with NA's
setkeyv(loc.out, c(COLGR, COLID, COLRT))
loc.out = loc.out[setkeyv(loc.out[, .(seq(min(get(COLRT), na.rm = T), max(get(COLRT), na.rm = T), 1)), by = c(COLGR, COLID)], c(COLGR, COLID, 'V1'))]
# x-check: print all rows with NA's
if (DEB) {
cat(file = stdout(), '\nselOutliers.dtReturn: Rows with NAs to interpolate:\n')
print(loc.out[rowSums(is.na(loc.out)) > 0, ])
}
# NA's may be already present in the dataset'.
# Interpolate (linear) them with na.interpolate as well
if( (COLPOSX %in% names(loc.out)) & (COLPOSY %in% names(loc.out)) )
s.cols = c(COLY, COLPOSX, COLPOSY)
else
s.cols = c(COLY)
# Apparently the loop is faster than lapply+SDcols
for(col in s.cols) {
# Interpolated columns should be of type numeric (float)
# This is to ensure that interpolated columns are of porper type.
data.table::set(loc.out, j = col, value = as.numeric(loc.out[[col]]))
loc.out[, (col) := na.interpolation(get(col)), by = c(COLID)]
}
}
} else {
# remove outlier tracks with gaps of length 1 time point
# !(input$slOutliersGapLen > 1)
loc.out = loc.out[!(get(COLID) %in% unique(loc.outpts[[COLID]]))]
}
# clean
loc.out[, y.sc := NULL]
# store a vector of outlier timepoints with the corresponding IDs
vOut[['id']] = loc.outpts
} else {
# no outlier removal
# !(input$numOutliersPerc > 0)
loc.outpts = NULL
vOut = NULL
}
# count number of trajectories after removing outlier tracks
nCellsCounter[['nCellsAfter']] = length(unique(loc.out[[COLID]]))
# count number of outlier points
nCellsCounter[['nOutlierTpts']] = length(loc.outpts[[COLID]])
cat(sprintf("%d outlier tpts\n", nCellsCounter[['nOutlierTpts']]))
# return cleaned dt
if (nrow(loc.out) < 1)
return(NULL) else
return(loc.out)
})
return(dtReturn)
}
\ No newline at end of file
#
# Time Course Inspector: Shiny app for plotting time series data
# Author: Maciej Dobrzynski
#
# This is the module of a Shiny web application.
# Hierarchical clustering
# UI
# UI ----
clustHierUI <- function(id, label = "Hierarchical CLustering") {
ns <- NS(id)
......@@ -173,7 +179,7 @@ clustHierUI <- function(id, label = "Hierarchical CLustering") {
)
}
# SERVER
# SERVER ----
clustHier <- function(input, output, session, in.data4clust, in.data4trajPlot, in.data4stimPlot) {
output$uiPlotHierClAss = renderUI({
......
......@@ -29,7 +29,7 @@ tabScatterPlotUI <- function(id, label = "Comparing t-points") {
4,
uiOutput(ns('uiSelTptX')),
uiOutput(ns('uiSelTptY')),
checkboxInput(ns('chBfoldChange'), 'Y-axis displays difference between two t-points')
checkboxInput(ns('chBfoldChange'), 'Display difference between two t-points on Y-axis')
),
column(
4,
......
This diff is collapsed.
......@@ -53,7 +53,7 @@ shinyUI(fluidPage(
uiOutput('uiButLoadStim'),
tags$hr(),
checkboxInput('chBtrajInter', 'Interpolate NAs and missing data?', value = T),
checkboxInput('chBtrajInter', 'Interpolate NAs and missing data?', value = F),
helpPopup(
title = 'Interpolation of NAs and missing data',
content = help.text[3],
......@@ -62,7 +62,7 @@ shinyUI(fluidPage(
),
uiOutput('varSelTimeFreq'),
checkboxInput('chBtrackUni', 'Create unique TrackLabel', T),
checkboxInput('chBtrackUni', 'Create unique TrackLabel', F),
helpPopup(
title = 'Create unique cell ID',
content = help.text[2],
......@@ -73,7 +73,7 @@ shinyUI(fluidPage(
uiOutput('varSelTrackLabel'),
tags$hr(),
checkboxInput('chBgroup', 'Dataset contains grouping column (e.g. treatment, condition)', TRUE),
checkboxInput('chBgroup', 'Dataset contains grouping column (e.g. treatment, condition)', F),
uiOutput('varSelGroup'),
uiOutput('varSelTime'),
uiOutput('varSelMeas1'),
......@@ -101,9 +101,6 @@ shinyUI(fluidPage(
uiOutput('uiChBnormRobust'),
uiOutput('uiChBnormGroup'),
tags$hr(),
checkboxInput('chBoutliers', 'Remove outliers', FALSE),
uiOutput('uiSlOutliers'),
uiOutput("uiTxtOutliers"),
downloadButton('downloadDataClean', 'Download mod\'d data')
),
......@@ -115,7 +112,7 @@ shinyUI(fluidPage(
"Plot time series: means per group or individual"
),
br(),
modSelOutliersUI('returnOutlierIDs'),
tabsetPanel(
tabPanel("Means",
br(),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment