lapply(c("ggplot2", "ggthemes", "sp", "sf", "loo"), library, character.only=TRUE) ### theme for graphics theme_set(theme_bw(base_size = 20)) ### clean up rm(list = ls()) getwd() WorkDir <- paste(getwd(), "dauphin_v2", sep = "/") DataDir <- paste(WorkDir, "data", sep = "/") OutDir <- paste(WorkDir, "output", sep = "/") ### load the data load(paste(DataDir, "Filtered_data.RData", sep = "/")) ### view the different objects in your environment ls() ### dataframe of observations head(obs) ### Exploratory data analysis ggplot(data = obs, aes(x = nombre) ) + geom_histogram(breaks = 0:300) + ylab("Effectif") + xlab("Nombre de dauphins") + theme(plot.title = element_text(lineheight = 0.8, face = "bold"), axis.text = element_text(size = 10) ) obs$nombreShift <- obs$nombre - 1 dauphin2.glm = glm(nombreShift ~ scale(DepthGebco) + scale(Slope) + scale(DistCot), family = "poisson", data = obs ) summary(dauphin2.glm) dauphin2.glm$coefficients library(R2jags) model.jags <- ' model{ # Prior intercept ~ dnorm(0.0, hyperparam_inter) slope[1] ~ dnorm(0.0, hyperparam_slope[1]) slope[2] ~ dnorm(0.0, hyperparam_slope[2]) slope[3] ~ dnorm(0.0, hyperparam_slope[3]) # Likelihood for (i in 1:n_obs) { lambda[i] <- exp(intercept + slope[1] * X1[i] + slope[2] * X2[i] + slope[3] * X3[i]) Y[i] ~ dpois(lambda[i]) } } ' data.jags <- list(n_obs = nrow(obs), Y = obs$nombreShift, X1 = as.vector(scale(obs$DepthGebco)), X2 = as.vector(scale(obs$Slope)), X3 = as.vector(scale(obs$DistCot)), hyperparam_inter = 1.0, hyperparam_slope = 1/rep(log(2)/2, 3)^2 ) # Inits function # On triche un peu: on va prendre les estimations de glm inits.jags <- function(idchain, glm) { list(intercept = rnorm(1, glm$coefficients[1], sqrt(10 * diag(vcov(glm)))[1]), slope = rnorm(3, glm$coefficients[2:4], sqrt(10 * diag(vcov(glm)))[2:4]) ) } # parameter of inferential interest params.jags <- c("intercept", "slope") # MCMC settings ni <- 2000 # TOTAL Number of iterations including Burn in nb <- 1000 #Number of iterations for Burn in nt <- 1 nc <- 3 # Load/check model, load data and inits # Start Gibbs sampler temp <- jags(data = data.jags, inits = lapply(1:nc, inits.jags, glm = dauphin2.glm), param = params.jags, model.file = textConnection(model.jags), n.chains = nc, n.burnin = nb, n.iter = ni, n.thin = nt ) out = temp$BUGSoutput # Inferences # After Burn in period let's work inference assuming ergodic regime is reached print(out$summary, dig = 2) library(coda) gelman.diag(x=out, confidence = 0.95, transform=FALSE, autoburnin=TRUE) gelman.plot(x=out) traceplot(out) library(tidyverse) other = tibble( param = paste0("slope[",1:3,"]"), class = "slope", value = dauphin2.glm$coefficients[2:4], type = "mle" ) %>% rbind(list("intercept", "intercept", dauphin2.glm$coefficients[1], "mle")) out$sims.matrix[,-1] %>% as_tibble() %>% tidyr::gather(param, value) %>% mutate(class = stringr::str_replace(param, "\\[\\d+\\]","")) %>% ggplot(aes_string(x="value", fill="class")) + geom_density(alpha=0.5) + facet_grid(param ~ .) 
### compare posterior densities with the maximum likelihood estimates
library(tidyverse)
other <- tibble(param = paste0("slope[", 1:3, "]"),
                class = "slope",
                value = dauphin2.glm$coefficients[2:4],
                type = "mle"
                ) %>%
  rbind(list("intercept", "intercept", dauphin2.glm$coefficients[1], "mle"))

out$sims.matrix[, -1] %>%
  as_tibble() %>%
  tidyr::gather(param, value) %>%
  mutate(class = stringr::str_replace(param, "\\[\\d+\\]", "")) %>%
  ggplot(aes_string(x = "value", fill = "class")) +
  geom_density(alpha = 0.5) +
  facet_grid(param ~ .) +
  geom_vline(data = other,
             aes_string(xintercept = "value", color = "type", linetype = "type"),
             size = 0.8) +
  scale_colour_manual(values = c("red", "purple")) +
  guides(fill = FALSE) +
  labs(x = "parameter value", y = "density")
# try facet_grid(~param, scales = "free")

### WAIC computation
## we need the pointwise log-likelihood under the model
library(loo)
y_rep_1 <- log_lik1 <- array(NA, dim = c(nc * (ni - nb) / nt, nrow(obs)))
for(i in 1:nrow(obs)) {
  linpred <- out$sims.list$intercept +
    out$sims.list$slope[, 1] * data.jags$X1[i] +
    out$sims.list$slope[, 2] * data.jags$X2[i] +
    out$sims.list$slope[, 3] * data.jags$X3[i]
  log_lik1[, i] <- dpois(data.jags$Y[i], lambda = exp(linpred), log = TRUE)
  # posterior predictive draws, shifted back to the observed scale (counts start at 1)
  y_rep_1[, i] <- rpois(nrow(y_rep_1), lambda = exp(linpred)) + 1
}; rm(i, linpred)
loo::waic(log_lik1)
loo::loo(log_lik1)

### posterior predictive checks
y_rep_poisson_1 <- y_rep_1
dim(y_rep_poisson_1)
hist(y_rep_poisson_1[, 10], las = 1, xlab = quote(y[10]^rep), main = "PPC for the tenth datum")
# compare on the observed scale: y_rep_1 already includes the +1 shift
abline(v = obs$nombre[10], lty = 2, col = "red", lwd = 2)
round(mean(ifelse(y_rep_poisson_1[, 10] > (obs$nombre[10]), 1, 0)), dig = 3)
# span the full range so hist() does not fail when a replicate exceeds 100
hist(y_rep_poisson_1[1, ], las = 1, main = "Iteration 1", breaks = 0:max(100, y_rep_poisson_1[1, ]))

# rootograms: compare the observed count histogram with its posterior predictive expectation
rootogram <- function(countdata, y_rep, min_obs = 1) {
  f_histogram <- function(x, max_obs) {
    table(factor(x, levels = min_obs:max_obs))
  }
  max_obs <- max(countdata, as.numeric(y_rep))
  dd <- ggplot(data.frame(mids = (min_obs:max_obs + (min_obs + 1):(max_obs + 1)) / 2,
                          y_obs = as.numeric(f_histogram(x = countdata, max_obs = max_obs)),
                          y_rep = apply(apply(y_rep, 1, f_histogram, max_obs = max_obs), 1, mean)
                          ),
               aes(x = mids, y = y_rep)
               ) +
    geom_rect(aes(xmin = mids - 0.5, xmax = mids + 0.5, ymax = y_obs, ymin = 0),
              fill = "lightgrey", color = grey(0.6), alpha = 0.5) +
    geom_line(aes(x = mids, y = y_rep), color = "black", size = 1.0) +
    scale_y_continuous(name = "Count") +
    scale_x_continuous(name = "Nb of dolphins", breaks = c(0, seq(1, max_obs, max_obs / 50))) +
    guides(size = "none") +
    theme(plot.title = element_text(lineheight = 0.8, face = "bold"),
          axis.text = element_text(size = 12),
          strip.text = element_text(size = 12),
          strip.background = element_rect(fill = grey(0.95))
          )
  return(dd)
}

# y_rep_1 is already on the observed scale (the +1 shift was applied in the loop above)
rootogram_model_1 <- rootogram(countdata = obs$nombre, y_rep = y_rep_1)
rootogram_model_1 + coord_cartesian(xlim = c(1, 50))
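## Optional overall check (not in the original script): a Bayesian p-value for
## overdispersion, comparing the variance-to-mean ratio of each replicated data set
## with that of the observed counts; values near 0 or 1 signal misfit. A minimal
## sketch reusing y_rep_1 and obs$nombre from above.
disp_obs <- var(obs$nombre) / mean(obs$nombre)
disp_rep <- apply(y_rep_1, 1, function(y) var(y) / mean(y))
round(mean(disp_rep >= disp_obs), 3)  # Bayesian p-value for the dispersion statistic
hist(disp_rep, las = 1, main = "Replicated variance/mean ratio", xlab = "dispersion ratio")
abline(v = disp_obs, lty = 2, col = "red", lwd = 2)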
### marginal covariate effects on the linear predictor and on the response scale
plot_relationships <- function(data_df, jagsfit, cov_name) {
  X <- data_df[, cov_name]
  x_scale <- apply(X, 2, function(x) { c(mean(x), sd(x)) })
  x_range <- apply(apply(X, 2, scale), 2, range)
  stdX <- cbind(rep(1, 1e4),
                apply(x_range, 2, function(vec) { seq(vec[1], vec[2], length.out = 1e4) })
                )
  beta <- cbind(jagsfit$sims.list$intercept,
                jagsfit$sims.list$slope
                )
  if(ncol(stdX) != ncol(beta)) {
    stop("Please check model and covariable names:\n\tdimension mismatch between slope parameters and covariables")
  } else {
    dd <- data.frame(x = numeric(0), y = numeric(0),
                     lower = numeric(0), upper = numeric(0),
                     what = character(0), covariable = character(0)
                     )
    for(j in 1:length(cov_name)) {
      Xmat <- cbind(rep(1, 1e4),
                    matrix(0, nrow = 1e4, ncol = length(cov_name))
                    )
      Xmat[, j + 1] <- stdX[, j + 1]
      linpred <- beta %*% t(Xmat)
      dd <- rbind(dd,
                  data.frame(x = Xmat[, j + 1] * x_scale[2, cov_name[j]] + x_scale[1, cov_name[j]],
                             y = apply(linpred, 2, mean),
                             lower = apply(linpred, 2, stats::quantile, prob = 0.025),
                             upper = apply(linpred, 2, stats::quantile, prob = 0.975),
                             what = rep("linear predictor", 1e4),
                             covariable = rep(cov_name[j], 1e4)
                             ),
                  data.frame(x = Xmat[, j + 1] * x_scale[2, cov_name[j]] + x_scale[1, cov_name[j]],
                             y = apply(exp(linpred), 2, mean),
                             lower = apply(exp(linpred), 2, stats::quantile, prob = 0.025),
                             upper = apply(exp(linpred), 2, stats::quantile, prob = 0.975),
                             what = rep("response", 1e4),
                             covariable = rep(cov_name[j], 1e4)
                             )
                  )
    }
    dd$what <- factor(as.character(dd$what), levels = c("response", "linear predictor"))
    dd$covariable <- factor(as.character(dd$covariable), levels = cov_name)
    the_plot <- ggplot(data = dd, aes(x = x, y = y)) +
      geom_ribbon(aes(x = x, ymin = lower, ymax = upper),
                  fill = "lightgrey", color = grey(0.6), alpha = 0.5) +
      geom_line(aes(x = x, y = y), color = "black", size = 1.0) +
      scale_y_continuous(name = "y") +
      scale_x_continuous(name = "Covariable") +
      facet_grid(what ~ covariable, scales = "free") +
      theme(plot.title = element_text(lineheight = 0.8, face = "bold"),
            axis.text = element_text(size = 12),
            strip.text = element_text(size = 12),
            strip.background = element_rect(fill = grey(0.95))
            )
    return(the_plot)
  }
}

plot_relationships(data_df = obs, jagsfit = out, cov_name = c("DepthGebco", "Slope", "DistCot"))

### Poisson vs. negative binomial: simulate from both with the same mean
n <- 1e3      # number of data points to simulate
lambda <- 5   # process mean
omega <- 3    # overdispersion parameter
ggplot(data = data.frame(x = c(rpois(n, lambda),
                               MASS::rnegbin(n, mu = lambda, theta = lambda / (omega - 1))
                               ),
                         distribution = rep(c("Poisson", "NegBin"), each = n)
                         ),
       aes(x = x, group = distribution)
       ) +
  geom_histogram(breaks = 0:50) +
  facet_wrap(~distribution, ncol = 1)

### negative binomial GLM
model.jags <- '
model{
  # Priors
  intercept ~ dnorm(0.0, hyperparam_inter)
  slope[1] ~ dnorm(0.0, hyperparam_slope[1])
  slope[2] ~ dnorm(0.0, hyperparam_slope[2])
  slope[3] ~ dnorm(0.0, hyperparam_slope[3])
  inv_overdispersion ~ dunif(0, 1)
  overdispersion <- 1/inv_overdispersion
  # Likelihood
  for (i in 1:n_obs) {
    lambda[i] <- exp(intercept + slope[1] * X1[i] + slope[2] * X2[i] + slope[3] * X3[i])
    r[i] <- lambda[i] / (overdispersion - 1)
    Y[i] ~ dnegbin(inv_overdispersion, r[i])
  }
}
'

data.jags <- list(n_obs = nrow(obs),
                  Y = obs$nombreShift,
                  X1 = as.vector(scale(obs$DepthGebco)),
                  X2 = as.vector(scale(obs$Slope)),
                  X3 = as.vector(scale(obs$DistCot)),
                  hyperparam_inter = 1.0,
                  hyperparam_slope = 1 / rep(log(2) / 2, 3)^2
                  )

# Inits function
inits.jags <- function(idchain, glm) {
  list(intercept = rnorm(1, glm$coefficients[1], sqrt(10 * diag(vcov(glm)))[1]),
       slope = rnorm(3, glm$coefficients[2:4], sqrt(10 * diag(vcov(glm)))[2:4]),
       inv_overdispersion = runif(1, 0, 1)
       )
}

params.jags <- c("intercept", "slope", "overdispersion")

# Start Gibbs sampler
temp <- jags(data = data.jags,
             inits = lapply(1:nc, inits.jags, glm = dauphin2.glm),
             param = params.jags,
             model.file = textConnection(model.jags),
             n.chains = nc, n.burnin = nb, n.iter = ni, n.thin = nt
             )
out3 <- temp$BUGSoutput

# Inferences
# After the burn-in period, work on inference assuming the ergodic regime is reached
print(out3$summary, dig = 2)

library(coda)
out3.mcmc <- as.mcmc(temp)  # convert to an mcmc.list for coda's diagnostics
gelman.diag(x = out3.mcmc, confidence = 0.95, transform = FALSE, autoburnin = TRUE)
gelman.plot(x = out3.mcmc)
traceplot(out3.mcmc)

### WAIC computation
## we need the pointwise log-likelihood under the model
y_rep_3 <- log_lik3 <- array(NA, dim = c(nc * (ni - nb) / nt, nrow(obs)))
for(i in 1:nrow(obs)) {
  overdispersion <- drop(out3$sims.list$overdispersion)
  linpred <- out3$sims.list$intercept +
    out3$sims.list$slope[, 1] * data.jags$X1[i] +
    out3$sims.list$slope[, 2] * data.jags$X2[i] +
    out3$sims.list$slope[, 3] * data.jags$X3[i]
  log_lik3[, i] <- dnbinom(data.jags$Y[i],
                           prob = 1 / overdispersion,
                           size = exp(linpred) / (overdispersion - 1),
                           log = TRUE)
  y_rep_3[, i] <- rnbinom(nrow(y_rep_3),
                          prob = 1 / overdispersion,
                          size = exp(linpred) / (overdispersion - 1)
                          ) + 1
}; rm(i, linpred, overdispersion)
loo::waic(log_lik3)
loo::loo(log_lik3)
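## Optional diagnostic (not in the original script): PSIS-LOO is only trustworthy when the
## Pareto-k values are small. A minimal sketch that stores the loo object and inspects its
## k diagnostics; loo_3 is a name introduced here only.
loo_3 <- loo::loo(log_lik3)
loo::pareto_k_table(loo_3)                 # how many observations fall in each k range
loo::pareto_k_ids(loo_3, threshold = 0.7)  # indices of observations with unstable weights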
rootogram_model_3 <- rootogram(countdata = obs$nombre, y_rep = y_rep_3)
rootogram_model_3 + coord_cartesian(xlim = c(1, 50))

plot_relationships(data_df = obs, jagsfit = out3, cov_name = c("DepthGebco", "Slope", "DistCot"))

### posterior of the overdispersion parameter, and comparison with its induced prior
hist(out3$sims.list$overdispersion, las = 1, bty = 'n',
     xlab = quote(omega), ylab = "frequency", main = "Posterior")
ggplot(data = data.frame(x = c(1 / runif(length(out3$sims.list$overdispersion), 0.0, 1.0),
                               out3$sims.list$overdispersion
                               ),
                         distribution = rep(c("prior", "posterior"),
                                            each = length(out3$sims.list$overdispersion))
                         ),
       aes(x = x, group = distribution)
       ) +
  geom_histogram(breaks = c(0:100)) +
  facet_wrap(~distribution, ncol = 1) +
  coord_cartesian(xlim = c(0, 30))

### negative binomial GLMM with a year random effect on the intercept
model.jags <- '
model{
  # Priors
  intercept ~ dnorm(0.0, hyperparam_inter)
  slope[1] ~ dnorm(0.0, hyperparam_slope[1])
  slope[2] ~ dnorm(0.0, hyperparam_slope[2])
  slope[3] ~ dnorm(0.0, hyperparam_slope[3])
  inv_overdispersion ~ dunif(0, 1)
  overdispersion <- 1/inv_overdispersion
  sigma_year ~ dnorm(0.0, 1.0)T(0.0,)
  tau_alpha <- pow(sigma_year, -2)
  for(j in 1:n_year) {
    alpha[j] ~ dnorm(intercept, tau_alpha)
  }
  # Likelihood
  for (i in 1:n_obs) {
    lambda[i] <- exp(alpha[YEAR[i]] + slope[1] * X1[i] + slope[2] * X2[i] + slope[3] * X3[i])
    r[i] <- lambda[i] / (overdispersion - 1)
    Y[i] ~ dnegbin(inv_overdispersion, r[i])
  }
}
'

data.jags <- list(n_obs = nrow(obs),
                  n_year = length(unique(obs$an)),
                  Y = obs$nombreShift,
                  X1 = as.vector(scale(obs$DepthGebco)),
                  X2 = as.vector(scale(obs$Slope)),
                  X3 = as.vector(scale(obs$DistCot)),
                  YEAR = obs$an - min(obs$an) + 1,
                  hyperparam_inter = 1.0,
                  hyperparam_slope = 1 / rep(log(2) / 2, 3)^2
                  )

# Inits function
inits.jags <- function(idchain, glm) {
  list(intercept = rnorm(1, glm$coefficients[1], sqrt(10 * diag(vcov(glm)))[1]),
       slope = rnorm(3, glm$coefficients[2:4], sqrt(10 * diag(vcov(glm)))[2:4]),
       inv_overdispersion = runif(1, 0, 1),
       sigma_year = abs(rnorm(1)),
       alpha = rnorm(length(unique(obs$an)))
       )
}

params.jags <- c("intercept", "slope", "overdispersion", "alpha", "sigma_year")

# Start Gibbs sampler
temp <- jags(data = data.jags,
             inits = lapply(1:nc, inits.jags, glm = dauphin2.glm),
             param = params.jags,
             model.file = textConnection(model.jags),
             n.chains = nc, n.burnin = nb, n.iter = ni, n.thin = nt
             )
out4 <- temp$BUGSoutput

# Inferences
# After the burn-in period, work on inference assuming the ergodic regime is reached
print(out4$summary, dig = 2)

library(coda)
out4.mcmc <- as.mcmc(temp)  # convert to an mcmc.list for coda's diagnostics
gelman.diag(x = out4.mcmc, confidence = 0.95, transform = FALSE, autoburnin = TRUE)
gelman.plot(x = out4.mcmc)
traceplot(out4.mcmc)

### WAIC computation
## we need the pointwise log-likelihood under the model
y_rep_4 <- log_lik4 <- array(NA, dim = c(nc * (ni - nb) / nt, nrow(obs)))
for(i in 1:nrow(obs)) {
  overdispersion <- drop(out4$sims.list$overdispersion)
  linpred <- out4$sims.list$slope[, 1] * data.jags$X1[i] +
    out4$sims.list$slope[, 2] * data.jags$X2[i] +
    out4$sims.list$slope[, 3] * data.jags$X3[i] +
    out4$sims.list$alpha[, data.jags$YEAR[i]]
  log_lik4[, i] <- dnbinom(data.jags$Y[i],
                           prob = 1 / overdispersion,
                           size = exp(linpred) / (overdispersion - 1),
                           log = TRUE
                           )
  y_rep_4[, i] <- rnbinom(nrow(y_rep_4),
                          prob = 1 / overdispersion,
                          size = exp(linpred) / (overdispersion - 1)
                          ) + 1
}; rm(i, linpred, overdispersion)
loo::waic(log_lik4)
loo::loo(log_lik4)

## compare the histogram of the data with the histogram expected under the model
rootogram_model_4 <- rootogram(countdata = obs$nombre, y_rep = y_rep_4)
rootogram_model_4 + coord_cartesian(xlim = c(1, 50))
### a bit better than the previous model...
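## Optional look at the year effects (not in the original script): the model adds a random
## intercept per year, but they are never displayed. A minimal sketch of the posterior medians
## and 95% intervals of alpha[j]; the year labels follow the YEAR = an - min(an) + 1 coding
## used in data.jags.
alpha_sims <- out4$sims.list$alpha
alpha_summary <- data.frame(year = min(obs$an) + seq_len(ncol(alpha_sims)) - 1,
                            med = apply(alpha_sims, 2, median),
                            lower = apply(alpha_sims, 2, quantile, probs = 0.025),
                            upper = apply(alpha_sims, 2, quantile, probs = 0.975)
                            )
ggplot(alpha_summary, aes(x = year, y = med)) +
  geom_pointrange(aes(ymin = lower, ymax = upper)) +
  geom_hline(yintercept = mean(out4$sims.list$intercept), linetype = 2) +
  labs(x = "Year", y = "alpha[j] (log scale)")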
plot_relationships(data_df = obs, jagsfit = out4, cov_name = c("DepthGebco", "Slope", "DistCot"))

### how does the year random effect change the estimated overdispersion?
ggplot(data = data.frame(x = c(out3$sims.list$overdispersion,
                               out4$sims.list$overdispersion
                               ),
                         distribution = rep(c("Without year effect", "With year random effect"),
                                            each = length(out3$sims.list$overdispersion)
                                            )
                         ),
       aes(x = x, group = distribution)
       ) +
  geom_histogram(breaks = seq(10, 25, 0.1)) +
  facet_wrap(~distribution, ncol = 1) +
  coord_cartesian(xlim = c(12, 22)) +
  labs(x = "overdispersion parameter", y = "count")

## relative reduction in overdispersion once the year effect is included
1 - round(mean(out4$sims.list$overdispersion / out3$sims.list$overdispersion), 2)
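## Optional wrap-up (not in the original script): put the three models side by side with
## loo::loo_compare(), reusing the pointwise log-likelihood matrices computed above;
## waic_1, waic_3 and waic_4 are names introduced here only.
waic_1 <- loo::waic(log_lik1)
waic_3 <- loo::waic(log_lik3)
waic_4 <- loo::waic(log_lik4)
loo::loo_compare(waic_1, waic_3, waic_4)  # best model first, elpd differences below it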