Commit 1ac81d2a authored by Jerome Pasquier's avatar Jerome Pasquier
Browse files

initial commit, analyses 2021-11-18

parents
data/*
data-raw/*
documents/*
results/*
sessions/*
heures_668.csv
TODO.txt
!*/.keep
library(ggplot2)
library(magrittr)
library(pander)
library(parallel)
library(pROC)
library(readxl)
library(writexl)
options(mc.cores = detectCores() - 1)
# Set working directory
setwd("~/Projects/Consultations/Pitteloud Nelly (Puberté)")
# Output directory
outdir <- paste0("results/analyses_", format(Sys.Date(), "%Y%m%d"))
if (!dir.exists(outdir)) dir.create(outdir)
# Load data
f <- "data-raw/HEL_Database_october2021_updated_withoutnames_stats.xlsx"
dta <- as.data.frame(read_xlsx(f, na = c("", "NA")))
rownames(dta) <- dta$Pt
rm(f)
# Rename variables
names(dta) <- sub("^1st", "First", names(dta))
names(dta) <- sub("/", "_", names(dta))
# Predictors
X <- names(dta)[!(grepl("^(Pt|Dx)$", names(dta)))]
# Recoding
for (j in which(sapply(dta, class) == "character")) {
if (all(is.na(dta[[j]]) | dta[[j]] %in% c("N", "Y"))) {
dta[[j]] <- c(N = FALSE, Y = TRUE)[dta[[j]]]
}
}
dta$Olfaction <- factor(dta$Olfaction, c("Normosmic", "Anosmic", "Hyposmia"))
dta$Olfaction2 <- dta$Olfaction
levels(dta$Olfaction2) <- c("Normosmic", rep("Anosmic/Hyposmia", 2))
#X[X == "Olfaction"] <- "Olfaction2"
rm(j)
# New variables
dta$Partial_CHH <- ifelse(grepl("^Partial CHH", dta$Dx), 1, ifelse(
grepl("^CDGP", dta$Dx), 0, NA))
dta$Complete_CHH <- ifelse(grepl("^Complete CHH", dta$Dx), 1, ifelse(
grepl("^CDGP", dta$Dx), 0, NA))
dta$CHH <- ifelse(grepl("CHH", dta$Dx), 1,
ifelse(grepl("^CDGP", dta$Dx), 0, NA))
# Responses
Y <- c("CHH", "Partial_CHH", "Complete_CHH")
# AUC - List of models
f <- file.path(outdir, "AUC_tables_model_list.rds")
if (file.exists(f)) {
M <- readRDS(f)
} else {
M <- unlist(recursive = FALSE, lapply(Y, function(y) {
print(y)
clps <- function(z) paste(z, collapse = " / ")
x_list <- unlist(lapply(1:3, function(k) combn(X, k, simplify = FALSE)),
recursive = FALSE)
x_list <- Reduce(append, lapply(x_list, function(x) {
if (any(x == "Olfaction")) {
x <- list(x, sub("^Olfaction$", "Olfaction2", x))
} else {
x <- list(x)
}
return(x)
}))
mclapply(x_list, function(x) {
#print(paste(c(y, x), collapse = " "))
sdta <- na.omit(dta[c(y, x)])
fml <- paste(y, "~", paste(x, collapse = " + "))
n <- nrow(sdta)
npos <- sum(sdta[[y]] == 1)
nneg <- sum(sdta[[y]] == 0)
resp <- sdta[[y]]
e <- paste0("glm(", fml, ", family = binomial, data = sdta)")
r <- evals(e, env = environment(), graph.dir = "/tmp")[[1]]
fit.wrn <- unique(r$msg$warnings)
fit.wrn <- if (is.null(fit.wrn)) NA else clps(fit.wrn)
fit.err <- unique(r$msg$errors)
fit.err <- if (is.null(fit.err)) NA else clps(fit.err)
if (is.na(fit.err)) {
fit <- r$result
e <- "predict(fit, type = 'response')"
r <- evals(e, env = environment(), graph.dir = "/tmp")[[1]]
pred.wrn <- unique(r$msg$warnings)
pred.wrn <- if (is.null(pred.wrn)) NA else clps(pred.wrn)
pred.err <- unique(r$msg$errors)
pred.err <- if (is.null(pred.err)) NA else clps(pred.err)
if (is.na(pred.err)) {
prob <- r$result
} else {
prob <- NULL
}
z <- suppressMessages(roc(resp, prob, direction = "<"))
auc <- as.numeric(z$auc)
r <- evals("ci.auc(z)", env = environment(), graph.dir = "/tmp")[[1]]
auc.ci <- r$result
auc.ci.wrn <- unique(r$msg$warnings)
auc.ci.wrn <- if (is.null(auc.ci.wrn)) NA else clps(auc.ci.wrn)
loo <- lapply(1:nrow(sdta), function(i) {
e <- "glm(fml, family = binomial, data = sdta[-i, ])"
r <- evals(e, env = environment(), graph.dir = "/tmp")[[1]]
fit.wrn <- unique(r$msg$warnings)
fit.wrn <- if (is.null(fit.wrn)) NA else clps(fit.wrn)
fit.err <- unique(r$msg$errors)
fit.err <- if (is.null(fit.err)) NA else clps(fit.err)
if (is.na(fit.err)) {
fit <- r$result
e <- paste("predict(fit, newdata = sdta[i, , drop = FALSE],",
"type = 'response')")
r <- evals(e, env = environment(), graph.dir = "/tmp")[[1]]
pred.wrn <- unique(r$msg$warnings)
pred.wrn <- if (is.null(pred.wrn)) NA else clps(pred.wrn)
pred.err <- unique(r$msg$errors)
pred.err <- if (is.null(pred.err)) NA else clps(pred.err)
if (is.na(pred.err)) {
prob <- r$result
} else {
prob <- NA
}
} else {
prob <- NA
}
list(prob = prob, fit.wrn = fit.wrn, fit.err = fit.err,
pred.wrn = pred.wrn, pred.err = pred.err, fit = fit)
})
loo.prob <- sapply(loo, function(z) z$prob)
loo.fit.wrn <- na.omit(unique(sapply(loo, function(z) z$fit.wrn)))
loo.fit.wrn <- if (length(loo.fit.wrn) == 0) NA else clps(loo.fit.wrn)
loo.fit.err <- na.omit(unique(sapply(loo, function(z) z$fit.err)))
loo.fit.err <- if (length(loo.fit.err) == 0) NA else clps(loo.fit.err)
loo.pred.wrn <- na.omit(unique(sapply(loo, function(z) z$pred.wrn)))
loo.pred.wrn <- if (length(loo.pred.wrn) == 0) NA else clps(loo.pred.wrn)
loo.pred.err <- na.omit(unique(sapply(loo, function(z) z$pred.err)))
loo.pred.err <- if (length(loo.pred.err) == 0) NA else clps(loo.pred.err)
if (is.na(loo.fit.err) & is.na(loo.pred.err)) {
z <- suppressMessages(roc(resp, loo.prob, direction = "<"))
loo.auc <- as.numeric(z$auc)
r <- evals("ci.auc(z)", env = environment(), graph.dir = "/tmp")[[1]]
loo.auc.ci <- r$result
w <- unique(r$msg$warnings)
loo.auc.ci.wrn <- if (is.null(w)) NA else clps(w)
} else {
loo.auc <- NA
loo.auc.ci <- rep(NA, 3)
loo.auc.ci.wrn <- NA
}
} else {
prob <- NULL
auc <- NA
auc.ci <- rep(NA, 3)
auc.ci.wrn <- NA
loo.prob <- NULL
loo.fit.wrn <- NA
loo.fit.err <- NA
loo.pred.wrn <- NA
loo.pred.err <- NA
loo.auc <- NA
loo.auc.ci <- rep(NA, 3)
loo.auc.ci.wrn <- NA
}
list(
row = data.frame(
Response = y,
Predictor.1 = x[1],
Type.1 = class(sdta[[x[1]]]),
Predictor.2 = x[2],
Type.2 = class(sdta[[x[2]]]),
Predictor.3 = x[3],
Type.3 = class(sdta[[x[3]]]),
N = n,
Npos = npos,
Nneg = nneg,
AUC = auc,
AUC.lwr = auc.ci[1],
AUC.upr = auc.ci[3],
LOO.AUC = loo.auc,
LOO.AUC.lwr = loo.auc.ci[1],
LOO.AUC.upr = loo.auc.ci[3],
fit.wrn = fit.wrn,
fit.err = fit.err,
pred.wrn = pred.wrn,
pred.err = pred.err,
auc.ci.wrn = auc.ci.wrn,
loo.fit.wrn = loo.fit.wrn,
loo.fit.err = loo.fit.err,
loo.pred.wrn = loo.pred.wrn,
loo.pred.err = loo.pred.err,
loo.auc.ci.wrn = loo.auc.ci.wrn
),
resp = resp,
prob = prob,
loo.prob = loo.prob
)
})
}))
for (i in 1:length(M)) {
M[[i]]$model <- i
M[[i]]$row <- cbind(Model = i, M[[i]]$row)
}
rm(i)
saveRDS(M, f)
}
rm(f)
# AUC - Table
tbl <- do.call(rbind, lapply(M, function(z) z$row))
lapply(setNames(Y, Y), function(y) {
tbl <- tbl[tbl$Response == y, ]
tbl[order(tbl$LOO.AUC, decreasing = TRUE), ]
}) %>%
write_xlsx(file.path(outdir, "AUC_tables.xlsx"))
# Help function
get_fit <- function(m) {
y <- m$row$Response
x <- m$row[paste0("Predictor.", 1:3)]
x <- x[!is.na(x)]
dta <- na.omit(dta[c(y, x)])
fml <- as.formula(paste(y, "~", paste(x, collapse = " + ")))
do.call("glm", list(formula = fml, family = quote(binomial),
data = quote(dta)))
}
# Save image
save.image(file = file.path(outdir, "AUC_tables_workspace.rda"))
###############################################################################
# resp <- sdta[[y]]
# prob <- predict(fit, type = "response")
#
# prob_loocv <- sapply(1:nrow(dta), function(i) {
# fit <- glm(fml, family = "binomial", data = dta[-i, ])
# predict(fit, newdata = dta[i, , drop = FALSE], type = "response")
# })
#
# fml <- CHH ~ Age_1st + First_Ht + First_BMI + First_TV
# fit <- glm(fml, family = "binomial", data = dta)
# x <- predict(fit, type = "response")
# r <- roc(response = dta$CHH, predictor = x)
# x <- sapply(1:nrow(dta), function(i) {
# fit <- glm(fml, family = "binomial", data = dta[-i, ])
# predict(fit, newdata = dta[i, , drop = FALSE], type = "response")
# })
# roc(response = dta$CHH, predictor = x)
#
# library(caret)
# dta$y <- factor(paste0("X", dta$CHH))
# fml_2 <- update(fml, y ~ .)
# ctrl <- trainControl(method="LOOCV",
# summaryFunction=twoClassSummary,
# classProbs=T,
# savePredictions = T)
# fit <- train(fml_2, data=dta,
# method="glm", family = "binomial",
# trControl=ctrl)
# fit
---
title: "Retard pubértaire - AUC"
author: Jérôme Pasquier
date: 18 novembre 2021
output: xaringan::moon_reader
---
```{r, echo=FALSE, message=FALSE}
library(dplyr)
library(DT)
library(ggplot2)
library(knitr)
library(kableExtra)
library(pROC)
library(xaringanExtra)
# Options
opts_chunk$set(echo = FALSE, dpi=300, fig.width=4.5, fig.height=4.5,
out.width="500px", out.height="500px")
xaringanExtra::use_panelset()
# Data
load("../results/analyses_20211118/AUC_tables_workspace.rda")
# Help functions
pred.loo <- function(fit) {
d <- fit$model
fam <- as.character(fit$call$family)
sapply(1:nrow(d), function(i) {
m <- glm(fit$call$formula, family = fam, data = d[-i, ])
predict(m, newdata = d[i, , drop = FALSE], type = "response")
})
}
plot_roc <- function(fit) {
r0 <- roc(fit$model[[1]], predict(fit, type = "response"), direction = "<")
r1 <- roc(fit$model[[1]], pred.loo(fit), direction = "<")
ggroc(list(All = r0, LOO = r1)) +
annotate("text", x = .5, y = .25, hjust =0,
label = paste0("Area under the curve\nAll: ", signif(r0$auc, 4),
"\nLOO: ", signif(r1$auc, 4))) +
theme_bw() +
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
#legend.position="bottom",
legend.title=element_blank()) +
scale_colour_manual(values = c("black", "red"))
}
```
```{css, echo=FALSE}
pre {
max-width: 100%;
overflow-y: scroll-x;
}
.small .remark-code {
font-size: 70% !important;
}
```
<!-- https://privefl.github.io/R-presentation/rmarkdown.html#33 -->
## Exemple de prédiction parfaite
- Variable réponse : CHH
- Variable explicatives : Olfaction, GnRH\_Test\_LH\_base, First\_T
```{r}
y <- "CHH"
x <- c("Olfaction", "GnRH_Test_LH_base", "First_T")
dta[c(y, x)] %>%
kable %>%
kable_styling("striped", full_width = FALSE) %>%
scroll_box(height = "400px")
sdta <- na.omit(dta[c(y, x)])
```
Nombre d'observations complètes : `r nrow(sdta)`
---
## Modèle de régression logistique :
.small[
```{r}
fml <- as.formula(paste(y, "~", paste(x, collapse = " + ")))
fit <- do.call("glm", list(formula = fml, family = quote(binomial),
data = quote(sdta)))
summary(fit)
```
]
---
## Prédictions du modèle
```{r}
cbind(
fit$model,
CombLin = predict(fit),
Proba = predict(fit, type = "response")
) %>%
kable %>%
kable_styling("striped", full_width = FALSE) %>%
scroll_box(height = "500px")
```
---
## Courbe ROC
```{r, message=FALSE}
r <- roc(fit$model$CHH, predict(fit, type = "response"), direction = "<")
ggroc(r) +
geom_abline(intercept = 1, slope = 1, color = "grey60") +
annotate("text", x = .25, y = .25,
label = paste("Area under the curve:", signif(r$auc, 4))) +
theme_bw() +
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
```
---
## Exemple de prédiction parfaite (2)
- Variable réponse : Complete CHH
- Variable explicatives : First\_INB
```{r}
y <- "Complete_CHH"
x <- "First_INB"
dta[c("Complete_CHH", "First_INB")] %>%
filter(!is.na(Complete_CHH)) %>%
kable %>%
kable_styling("striped", full_width = FALSE) %>%
scroll_box(height = "400px")
```
Nombre d'observations complètes :
`r sum(!is.na(dta$Complete_CHH) & !is.na(dta$First_INB))`
---
## Modèle de régression logistique :
.small[
```{r}
fit <- glm(Complete_CHH ~ First_INB, family = binomial, data = dta)
summary(fit)
```
]
---
## First\_INB ~ Complete\_CHH
```{r}
dta %>%
filter(!is.na(Complete_CHH), !is.na(First_INB)) %>%
mutate(Complete_CHH = factor(Complete_CHH)) %>%
ggplot(aes(x = Complete_CHH, First_INB)) +
geom_boxplot()
```
---
## Prédictions du modèle
```{r}
cbind(
fit$model,
CombLin = predict(fit),
Proba = predict(fit, type = "response")
) %>%
kable %>%
kable_styling("striped", full_width = FALSE) %>%
scroll_box(height = "500px")
```
---
## Meilleurs prédicteurs
```{r}
Y <- c(chh = "CHH", pchh = "Partial_CHH", cchh = "Complete_CHH")
tbls1 <- lapply(Y, function(y) {
tbl %>%
filter(Response == y, is.na(Predictor.2)) %>%
select(Predictor.1, N, ends_with("AUC", ignore.case = FALSE)) %>%
rename(Predictor = Predictor.1) %>%
arrange(desc(AUC)) %>%
datatable(rownames = FALSE, options = list(dom = "tp", pageLength = 8)) %>%
formatRound(columns = grep("AUC$", names(tbl), value = TRUE), digits=3)
})
```
.panelset[
.panel[.panel-name[CHH]
```{r}
tbls1$chh
```
]
.panel[.panel-name[Partial CHH]
```{r}
tbls1$pchh
```
]
.panel[.panel-name[Complete CHH]
```{r}
tbls1$cchh
```
]
]
---
## Validation croisée : Leave-one-out
- On écarte la première observation de l'échantillon
- On estime le modèle sur les observations restantes
- On prédit, à l'aide du modèle estimé, la probabilité que l'observation
écartée soit positive
- On répète les trois étapes précédentes pour chaque observation
- On obtient une probabilité d'être positif pour chaque observation
- On calcule une courbe ROC et une aire sous la courbe à l'aide de ce vecteur
de probabilité
---
## Modèle intéressant - CHH
- Variable réponse : CHH
- Variable explicatives : Olfaction, Age\_1st
```{r}
y <- "CHH"
x <- c("Olfaction", "Age_1st")
dta[c(y, x)] %>%
kable %>%
kable_styling("striped", full_width = FALSE) %>%
scroll_box(height = "400px")
sdta <- na.omit(dta[c(y, x)])
```
Nombre d'observations complètes : `r nrow(sdta)`
---
## Régression logistique - CHH
.small[
```{r, message=FALSE}
fits <- list(
glm(CHH ~ Olfaction2 + Age_1st, family = binomial, data = dta),
glm(CHH ~ Olfaction2, family = binomial, data = dta),
glm(CHH ~ Age_1st, family = binomial, data = dta)
)
summary(fits[[1]])
```
]
---
## Courbes ROC - CHH
.panelset[
.panel[.panel-name[Olfaction + Age\_1st]
```{r, message=FALSE, fig.width=6, out.width="600px", out.height="400px"}
plot_roc(fits[[1]])
```
]
.panel[.panel-name[Olfaction]
```{r, message=FALSE, fig.width=6, out.width="600px", out.height="400px"}
plot_roc(fits[[2]])
```
]
.panel[.panel-name[Age\_1st]
```{r, message=FALSE, fig.width=6, out.width="600px", out.height="400px"}
plot_roc(fits[[3]])
```
]
]
---
## Modèle intéressant - Partial CHH
- Variable réponse : Partial\_CHH
- Variable explicatives : Olfaction, Age\_1st
```{r}
y <- "Partial_CHH"
x <- c("Olfaction", "Age_1st")
dta[c(y, x)] %>%
kable %>%
kable_styling("striped", full_width = FALSE) %>%
scroll_box(height = "400px")
sdta <- na.omit(dta[c(y, x)])
```
Nombre d'observations complètes : `r nrow(sdta)`
---
## Régression logistique - Partial CHH
.small[
```{r, message=FALSE}
fits <- list(
glm(Partial_CHH ~ Olfaction2 + Age_1st, family = binomial, data = dta),
glm(Partial_CHH ~ Olfaction2, family = binomial, data = dta),
glm(Partial_CHH ~ Age_1st, family = binomial, data = dta)
)
summary(fits[[1]])
```
]
---
## Courbes ROC - Partial CHH
.panelset[