Licensed under the Creative Commons attribution-noncommercial license, http://creativecommons.org/licenses/by-nc/3.0/. Please share and remix noncommercially, mentioning its origin.
CC-BY_NC


Produced with R version 3.2.3 and pomp version 1.3.1.2.


Objectives

  1. To discuss forecasting and fitted values, both as ways to assess model fit and as potentially the major goal of time series analysis.

  2. To teach some forecasting methods based on POMP models.

  3. To demonstrate a forecasting analysis using POMP methods and the pomp computational environment.


17.1 Some theory for forecasting

\[\begin{eqnarray} f_{X_{n+p}|Y_{1:n}}(x_{n+p}|{y_{1:n}^*}{\, ; \,}\hat\theta) &=& \int f_{X_n|Y_{1:n}}(x_n|{y_{1:n}^*}{\, ; \,}\hat\theta) f_{X_{n+p}|X_n}(x_{n+p}{{\, | \,}}x_n{\, ; \,}\hat\theta) \, dx_{n} \\ &=& \int f_{X_n|Y_{1:n}}(x_n|{y_{1:n}^*}{\, ; \,}\hat\theta) \prod_{k=1}^p f_{X_{n+k}|X_{n+k-1}}(x_{n+k}{{\, | \,}}x_{n+k-1}{\, ; \,}\hat\theta) \, dx_{n}\dots dx_{n+p-1} \end{eqnarray}\]




17.2 Some comments on โ€œfitted valuesโ€ for time series

sp500_table <- read.table("sp500.csv",sep=",",header=TRUE)
Nt <- 1000
sp500 <- sp500_table$Close[Nt:1] # data are in reverse order in sp500.csv
sp500_date <- strptime(sp500_table$Date[Nt:1],"%Y-%m-%d")
sp500_time <-  as.numeric(format(sp500_date, format="%Y")) + as.numeric(format(sp500_date, format="%j"))/365
sp500_arma <- arima(log(sp500),order=c(1,0,1))
sp500_arma_fitted <- log(sp500)-resid(sp500_arma)
plot(sp500_time,sp500,log="y",type="l",xlab="Date")
lines(sp500_time,exp(sp500_arma_fitted),col="red")

plot(sp500_time,sp500,log="y",type="l",xlim=range(tail(sp500_time,100)),xlab="Date")
lines(sp500_time,exp(sp500_arma_fitted),col="red")

sp500_arma
## 
## Call:
## arima(x = log(sp500), order = c(1, 0, 1))
## 
## Coefficients:
##          ar1     ma1  intercept
##       0.9989  0.0209     7.4622
## s.e.  0.0012  0.0324     0.1565
## 
## sigma^2 estimated as 6.703e-05:  log likelihood = 3383.15,  aic = -6758.29
cor(sp500,sp500_arma_fitted)
## [1] 0.9970619
sd(log(sp500)[2:length(sp500)]-lag(log(sp500))[1:(length(sp500)-1)])
## [1] 0.008186035
sd(resid(sp500_arma))
## [1] 0.008183854




17.3 Case study: An emerging infectious disease outbreak

base_url <- "http://kingaa.github.io/sbied/"
read.csv(paste0(base_url,"data/ebola_data.csv"),stringsAsFactors=FALSE,
         colClasses=c(date="Date")) -> dat
sapply(dat,class)
##        week        date     country       cases 
##   "integer"      "Date" "character"   "numeric"
head(dat)
##   week       date country cases
## 1    1 2014-01-05  Guinea 2.244
## 2    2 2014-01-12  Guinea 2.244
## 3    3 2014-01-19  Guinea 0.073
## 4    4 2014-01-26  Guinea 5.717
## 5    5 2014-02-02  Guinea 3.954
## 6    6 2014-02-09  Guinea 5.444
## Population sizes in Guinea, Liberia, and Sierra Leone (census 2014)
populations <- c(Guinea=10628972,Liberia=4092310,SierraLeone=6190280)
dat %>%
  ggplot(aes(x=date,y=cases,group=country,color=country))+
  geom_line()




17.3.1 An SEIR model with gamma-distributed latent and infectious periods

  • Many of the early modeling efforts used variants on the simple SEIR model.

  • Here, weโ€™ll focus on a variant that attempts a more accurate description of the duration of the latent period.

  • Specifically, this model assumes that the amount of time an infection remains latent is \[\mathrm{LP} \sim \mathrm{Gamma}\left(m,\frac{1}{m\,\alpha}\right),\] where \(m\) is an integer.

  • This means that the latent period has expectation \(1/\alpha\) and variance \(1/(m\,\alpha)\). In this document, weโ€™ll fix \(m=3\).

  • We implement Gamma distributions using the so-called linear chain trick.




17.3.2 Process model simulator

rSim <- Csnippet('
  double lambda, beta;
  double *E = &E1;
  beta = R0 * gamma; // Transmission rate
  lambda = beta * I / N; // Force of infection
  int i;

  // Transitions
  // From class S
  double transS = rbinom(S, 1.0 - exp(- lambda * dt)); // No of infections
  // From class E
  double transE[nstageE]; // No of transitions between classes E
  for(i = 0; i < nstageE; i++){
    transE[i] = rbinom(E[i], 1.0 - exp(-nstageE * alpha * dt));
  }
  // From class I
  double transI = rbinom(I, 1.0 - exp(-gamma * dt)); // No of transitions I->R

  // Balance the equations
  S -= transS;
  E[0] += transS - transE[0];
  for(i=1; i < nstageE; i++) {
    E[i] += transE[i-1] - transE[i];
  }
  I += transE[nstageE-1] - transI;
  R += transI;
  N_EI += transE[nstageE-1]; // No of transitions from E to I
  N_IR += transI; // No of transitions from I to R
')




17.3.3 Deterministic skeleton

  • The deterministic skeleton is an ODE.
skel <- Csnippet('
  double lambda, beta;
  const double *E = &E1;
  double *DE = &DE1;
  beta = R0 * gamma; // Transmission rate
  lambda = beta * I / N; // Force of infection
  int i;

  // Balance the equations
  DS = - lambda * S;
  DE[0] = lambda * S - nstageE * alpha * E[0];
  for (i=1; i < nstageE; i++)
    DE[i] = nstageE * alpha * (E[i-1]-E[i]);
  DI = nstageE * alpha * E[nstageE-1] - gamma * I;
  DR = gamma * I;
  DN_EI = nstageE * alpha * E[nstageE-1];
  DN_IR = gamma * I;
')




17.3.4 Measurement model: overdispersed count data

  • \(C_t | H_t\) is negative binomial with \({\mathbb{E}}[C_t|H_t] = \rho\,H_t\) and \({\mathrm{Var}}[C_t|H_t] = \rho\,H_t\,(1+k\,\rho\,H_t)\).
dObs <- Csnippet('
  double f;
  if (k > 0.0)
    f = dnbinom_mu(nearbyint(cases),1.0/k,rho*N_EI,1);
  else
    f = dpois(nearbyint(cases),rho*N_EI,1);
  lik = (give_log) ? f : exp(f);
')

rObs <- Csnippet('
  if (k > 0) {
    cases = rnbinom_mu(1.0/k,rho*N_EI);
  } else {
    cases = rpois(rho*N_EI);
  }')




17.3.5 Parameter transformations

toEst <- Csnippet('
  const double *IC = &S_0;
  double *TIC = &TS_0;
  TR0 = log(R0);
  Trho = logit(rho);
  Tk = log(k);
  to_log_barycentric(TIC,IC,4);
')

fromEst <- Csnippet('
  const double *IC = &S_0;
  double *TIC = &TS_0;
  TR0 = exp(R0);
  Trho = expit(rho);
  Tk = exp(k);
  from_log_barycentric(TIC,IC,4);
')
  • The following function constructs a pomp object to hold the data for any one of the countries.
ebolaModel <- function (country=c("Guinea", "SierraLeone", "Liberia"),
                        timestep = 0.1, nstageE = 3) {

  ctry <- match.arg(country)
  pop <- unname(populations[ctry])
  nstageE <- as.integer(nstageE)

  globs <- paste0("static int nstageE = ",nstageE,";")

  dat <- subset(dat,country==ctry,select=-country)

  ## Create the pomp object
  dat %>% 
    extract(c("week","cases")) %>%
    pomp(
      times="week",
      t0=min(dat$week)-1,
      globals=globs,
      statenames=c("S","E1","I","R","N_EI","N_IR"),
      zeronames=c("N_EI","N_IR"),
      paramnames=c("N","R0","alpha","gamma","rho","k",
                   "S_0","E_0","I_0","R_0"),
      nstageE=nstageE,
      dmeasure=dObs, rmeasure=rObs,
      rprocess=discrete.time.sim(step.fun=rSim, delta.t=timestep),
      skeleton=skel, skeleton.type="vectorfield",
      toEstimationScale=toEst,
      fromEstimationScale=fromEst,
      initializer=function (params, t0, nstageE, ...) {
        all.state.names <- c("S",paste0("E",1:nstageE),"I","R","N_EI","N_IR")
        comp.names <- c("S",paste0("E",1:nstageE),"I","R")
        x0 <- setNames(numeric(length(all.state.names)),all.state.names)
        frac <- c(params["S_0"],rep(params["E_0"]/nstageE,nstageE),params["I_0"],params["R_0"])
        x0[comp.names] <- round(params["N"]*frac/sum(frac))
        x0
      }
    ) -> po
}

ebolaModel("Guinea") -> gin
ebolaModel("SierraLeone") -> sle
ebolaModel("Liberia") -> lbr




17.4 Parameter estimates for Ebola

options(stringsAsFactors=FALSE)
profs <- read.csv(paste0(base_url,"/ebola/ebola-profiles.csv"))
require(reshape2)
require(plyr)
require(magrittr)
require(ggplot2)
theme_set(theme_bw())

profs %>% 
  melt(id=c("profile","country","loglik")) %>%
  subset(variable==profile) %>%
  ddply(~country,mutate,dll=loglik-max(loglik)) %>%
  ddply(~country+profile+value,subset,loglik==max(loglik)) %>% 
  ggplot(mapping=aes(x=value,y=dll))+
  geom_point(color='red')+
  geom_hline(yintercept=-0.5*qchisq(p=0.99,df=1))+
  facet_grid(country~profile,scales='free')+
  labs(y=expression(l))




17.5 Diagnostics for the Ebola analysis

library(pomp)
library(plyr)
library(reshape2)
library(magrittr)
options(stringsAsFactors=FALSE)

profs %>%
  subset(country=="Guinea") %>%
  subset(loglik==max(loglik),
         select=-c(loglik,loglik.se,country,profile)) %>%
  unlist() -> coef(gin)

simulate(gin,nsim=20,as.data.frame=TRUE,include.data=TRUE) %>% 
  mutate(date=min(dat$date)+7*(time-1),
         is.data=ifelse(sim=="data","yes","no")) %>% 
  ggplot(aes(x=date,y=cases,group=sim,color=is.data,
         alpha=is.data))+
  geom_line()+
  guides(color=FALSE,alpha=FALSE)+
  scale_color_manual(values=c(no=gray(0.6),yes='red'))+
  scale_alpha_manual(values=c(no=0.5,yes=1))

growth.rate <- function (y) {
  cases <- y["cases",]
  fit <- lm(log1p(cases)~seq_along(cases))
  unname(coef(fit)[2])
}
probe(gin,probes=list(r=growth.rate),nsim=500) %>% plot()

growth.rate.plus <- function (y) {
  cases <- y["cases",]
  fit <- lm(log1p(cases)~seq_along(cases))
  c(r=unname(coef(fit)[2]),sd=sd(residuals(fit)))
}
probe(gin,probes=list(growth.rate.plus),
      nsim=500) %>% plot()

log1p.detrend <- function (y) {
  cases <- y["cases",]
  y["cases",] <- as.numeric(residuals(lm(log1p(cases)~seq_along(cases))))
  y
}

probe(gin,probes=list(
  growth.rate.plus,
  probe.quantile(var="cases",prob=c(0.25,0.75)),
  probe.acf(var="cases",lags=c(1,2,3),type="correlation",
            transform=log1p.detrend)
),nsim=500) %>% plot()




17.5.1 Exercise: the SEIR model for the Sierra Leone outbreak

  • Apply probes to investigate the extent to which the model is an adequate description of the data from the Sierra Leone outbreak.

  • Have a look at the probes provided with pomp: ?basic.probes.

  • Try also to come up with some informative probes of your own. Discuss the implications of your findings.




17.6 Forecasting Ebola

require(pomp)
require(plyr)
require(reshape2)
require(magrittr)
options(stringsAsFactors=FALSE)

set.seed(988077383L)

## forecast horizon
horizon <- 13

profs %>%
  subset(country=="SierraLeone") %>%
  subset(loglik==max(loglik),
         select=-c(loglik,loglik.se,country,profile)) %>%
  unlist() -> mle

## Weighted quantile function
wquant <- function (x, weights, probs = c(0.025,0.5,0.975)) {
  idx <- order(x)
  x <- x[idx]
  weights <- weights[idx]
  w <- cumsum(weights)/sum(weights)
  rval <- approx(w,x,probs,rule=1)
  rval$y
}

profs %>% 
  subset(country=="SierraLeone",
         select=-c(country,profile,loglik.se)) %>%
  subset(loglik>max(loglik)-0.5*qchisq(df=1,p=0.99)) %>%
  melt(variable.name="parameter") %>%
  ddply(~parameter,summarize,
        min=min(value),max=max(value)) %>%
  subset(parameter!="loglik") %>%
  melt(measure=c("min","max")) %>%
  acast(parameter~variable) -> ranges

params <- sobolDesign(lower=ranges[,'min'],
                      upper=ranges[,'max'],
                      nseq=20)
plot(params)

require(foreach)
require(doMC)
require(iterators)

registerDoMC(cores=4)

set.seed(887851050L,kind="L'Ecuyer")

foreach(p=iter(params,by='row'),
        .inorder=FALSE,
        .combine=rbind,
        .options.multicore=list(preschedule=TRUE,set.seed=TRUE)
) %dopar%
{
  require(pomp)
  
  M1 <- ebolaModel("SierraLeone")
  pf <- pfilter(M1,params=unlist(p),Np=2000,save.states=TRUE)
  pf$saved.states %>% tail(1) %>% melt() %>% 
    dcast(rep~variable,value.var="value") %>%
    ddply(~rep,summarize,S_0=S,E_0=E1+E2+E3,I_0=I,R_0=R) %>%
    melt(id="rep") %>% acast(variable~rep) -> x
  
  pp <- parmat(unlist(p),ncol(x))
  
  simulate(M1,params=pp,obs=TRUE) %>%
    melt() %>%
    mutate(time=time(M1)[time],
           period="calibration",
           loglik=logLik(pf)) -> calib

    M2 <- M1
  time(M2) <- max(time(M1))+seq_len(horizon)
  timezero(M2) <- max(time(M1))
  
  pp[rownames(x),] <- x
  
  simulate(M2,params=pp,obs=TRUE) %>%
    melt() %>%
    mutate(time=time(M2)[time],
           period="projection",
           loglik=logLik(pf)) -> proj
  
  rbind(calib,proj)
} %>% subset(variable=="cases",select=-variable) %>%
  mutate(weight=exp(loglik-mean(loglik))) %>%
  arrange(time,rep) -> sims

ess <- with(subset(sims,time==max(time)),weight/sum(weight))
ess <- 1/sum(ess^2); ess
## [1] 10126.79
sims %>% ddply(~time+period,summarize,prob=c(0.025,0.5,0.975),
               quantile=wquant(value,weights=weight,probs=prob)) %>%
  mutate(prob=mapvalues(prob,from=c(0.025,0.5,0.975),
                        to=c("lower","median","upper"))) %>%
  dcast(period+time~prob,value.var='quantile') %>%
  mutate(date=min(dat$date)+7*(time-1)) -> simq
simq %>% ggplot(aes(x=date))+
  geom_ribbon(aes(ymin=lower,ymax=upper,fill=period),alpha=0.3,color=NA)+
  geom_line(aes(y=median,color=period))+
  geom_point(data=subset(dat,country=="SierraLeone"),
             mapping=aes(x=date,y=cases),color='black')+
  labs(y="cases")




Acknowledgment

These notes draw on material developed for a short course on Simulation-based Inference for Epidemiological Dynamics by Aaron King and Edward Ionides, taught at the University of Washington Summer Institute in Statistics and Modeling in Infectious Diseases, 2015.


References

King, A. A., M. Domenech de Cellรจs, F. M. G. Magpantay, and P. Rohani. 2015. Avoidable errors in the modelling of outbreaks of emerging pathogens, with special reference to ebola. Proc. R. Soc. Lond. B 282:20150347.