--- title: Non-linear latent variable models and error-in-variable models author: Klaus Kähler Holst date: "`r Sys.Date()`" output: rmarkdown::html_vignette: fig_caption: yes fig_width: 6 fig_height: 4 vignette: > %\VignetteIndexEntry{Non-linear latent variable models and error-in-variable models} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} bibliography: ref.bib --- ```{r include=FALSE } knitr::opts_chunk$set( collapse = TRUE, comment = "#>", dev = "svg", fig.ext = "svg" ) ## library("svglite") mets <- lava:::versioncheck('mets', 1) fullVignette <- Sys.getenv("_R_FULL_VIGNETTE_") %in% c("1","TRUE") ``` ```{r load, results="hide",message=FALSE,warning=FALSE } library("lava") ``` We consider the measurement models given by $$X_{j} = u_{1} + \epsilon_{j}^{x}, \quad j=1,2,3$$ $$Y_{j} = u_{2} + \epsilon_{j}^{y}, \quad j=1,2,3$$ and with a structural model given by $$u_{2} = f(u_{1}) + Z + \zeta_{2}$$ $$u_{1} = Z + \zeta_{1}$$ with iid measurement errors $\epsilon_{j}^{x},\epsilon_{j}^{y},\zeta_{1},\zeta_{2}\sim\mathcal{N}(0,1), j=1,2,3.$ and standard normal distributed covariate $Z$. To simulate from this model we use the following syntax: ```{r sim } f <- function(x) cos(1.25*x) + x - 0.25*x^2 m <- lvm(x1+x2+x3 ~ u1, y1+y2+y3 ~ u2, latent=~u1+u2) regression(m) <- u1+u2 ~ z functional(m, u2~u1) <- f d <- sim(m, n=200, seed=42) # Default is all parameters are 1 ``` ```{r } ## plot(m, plot.engine="visNetwork") plot(m) ``` We refer to [@holst_budtzjorgensen_2013] for details on the syntax for model specification. 
# Estimation

To estimate the parameters using the two-stage estimator described in
[@holst_budtzjorgensen_2020], the first step is now to specify the
measurement models

```{r specifymodels }
m1 <- lvm(x1+x2+x3 ~ u1, u1 ~ z, latent=~u1)
m2 <- lvm(y1+y2+y3 ~ u2, u2 ~ z, latent=~u2)
```

Next, we specify a quadratic relationship between the two latent variables

```{r }
nonlinear(m2, type="quadratic") <- u2 ~ u1
```

and the model can then be estimated using the two-stage estimator

```{r twostage1 }
e1 <- twostage(m1, m2, data=d)
e1
```

We see a clear statistically significant effect of the second order term
(`u2~u1_2`). For comparison we can also estimate the full MLE of the linear
model:

```{r linear_mle }
e0 <- estimate(regression(m1%++%m2, u2~u1), d)
estimate(e0,keep="^u2~[a-z]",regex=TRUE) ## Extract coef. matching reg.ex.
```

Next, we calculate predictions from the quadratic model using the estimated
parameter coefficients

$$ \mathbb{E}_{\widehat{\theta}_{2}}(u_{2} \mid u_{1}, Z=0), $$

```{r pred1 }
newd <- expand.grid(u1=seq(-4, 4, by=0.1), z=0)
pred1 <- predict(e1, newdata=newd, x=TRUE)
head(pred1)
```

To obtain a potentially better fit we next proceed with a natural cubic
spline

```{r spline_twostage }
kn <- seq(-3,3,length.out=5)
nonlinear(m2, type="spline", knots=kn) <- u2 ~ u1
e2 <- twostage(m1, m2, data=d)
e2
```

Confidence limits can be obtained via the Delta method using the `estimate`
method:

```{r spline_ci }
p <- cbind(u1=newd$u1,
           estimate(e2,f=function(p) predict(e2,p=p,newdata=newd))$coefmat)
head(p)
```

The fitted function can be obtained with the following code:

```{r figpred2 }
plot(I(u2-z) ~ u1, data=d, col=Col("black",0.5), pch=16,
     xlab=expression(u[1]), ylab=expression(u[2]), xlim=c(-4,4))
lines(Estimate ~ u1, data=as.data.frame(p), col="darkblue", lwd=5)
confband(p[,1], lower=p[,4], upper=p[,5], polygon=TRUE,
         border=NA, col=Col("darkblue",0.2))
```

# Cross-validation

A more formal comparison of the different models can be obtained by
cross-validation. Here we specify linear, quadratic and cubic spline models
with 4 and 6 degrees of freedom.

```{r spline_several }
m2a <- nonlinear(m2, type="linear", u2~u1)
m2b <- nonlinear(m2, type="quadratic", u2~u1)
kn1 <- seq(-3,3,length.out=5)
kn2 <- seq(-3,3,length.out=8)
m2c <- nonlinear(m2, type="spline", knots=kn1, u2~u1)
m2d <- nonlinear(m2, type="spline", knots=kn2, u2~u1)
```

To assess the model fit, the average RMSE is estimated with 5-fold
cross-validation repeated two times

```{r cv_fit, cache=TRUE, eval=fullVignette }
## Scale models in stage 2 to allow for a fair RMSE comparison
d0 <- d
for (i in endogenous(m2))
  d0[,i] <- scale(d0[,i], center=TRUE, scale=TRUE)
## Repeated 5-fold cross-validation.
## Each element of 'ff' is a fitter: data -> fitted two-stage model.
## Note: the optimizer option is spelled 'constrain' (a misspelling such as
## 'contrain' would be silently ignored by the control list).
ff <- lapply(list(linear=m2a, quadratic=m2b, spline4=m2c, spline6=m2d),
             function(m)
               function(data,...)
                 twostage(m1, m, data=data, stderr=FALSE,
                          control=list(start=coef(e0), constrain=TRUE)))
fit.cv <- lava:::cv(ff, data=d, K=5, rep=2,
                    mc.cores=parallel::detectCores(), seed=1)
```

```{r results="hide", echo=FALSE }
## To save time building the vignettes on CRAN, we cache time consuming computations
if (fullVignette) {
  fit.cv$fit <- NULL
  saveRDS(fit.cv, "data/nonlinear_fitcv.rds")
} else {
  fit.cv <- readRDS("data/nonlinear_fitcv.rds")
}
```

```{r }
fit.cv$coef
```

Here the RMSE is in favour of the splines model with 4 degrees of freedom:

```{r multifit }
fit <- lapply(list(m2a,m2b,m2c,m2d),
              function(x) {
                e <- twostage(m1, x, data=d)
                pr <- cbind(u1=newd$u1, predict(e, newdata=newd$u1, x=TRUE))
                return(list(estimate=e, predict=as.data.frame(pr)))
              })

plot(I(u2-z) ~ u1, data=d, col=Col("black",0.5), pch=16,
     xlab=expression(eta[1]), ylab=expression(eta[2]), xlim=c(-4,4))
col <- c("orange","darkred","darkgreen","darkblue")
lty <- c(3,4,1,5)
for (i in seq_along(fit)) {
  ## spell out 'predict' (avoid '$' partial matching via '$pr')
  with(fit[[i]]$predict, lines(u2 ~ u1, col=col[i], lwd=4, lty=lty[i]))
}
legend("bottomright",
       c("linear","quadratic","spline(df=4)","spline(df=6)"),
       col=col, lty=lty, lwd=3)
```

For convenience, the function `twostageCV` can be
used to do the cross-validation (also for choosing the mixture distribution
via the `nmix` argument, see the section below). For example,

```{r twostageCV, cache=TRUE, eval=fullVignette }
set.seed(1)
selmod <- twostageCV(m1, m2, data=d, df=2:4, nmix=1:2,
                     nfolds=5, rep=2, mc.cores=parallel::detectCores())
```

```{r results="hide", echo=FALSE }
## To save time building the vignettes on CRAN, we cache time consuming computations
if (fullVignette) {
  saveRDS(summary(selmod), "data/nonlinear_selmod.rds")
} else {
  selmod <- readRDS("data/nonlinear_selmod.rds")
}
```

applies 5-fold cross-validation repeated two times to select the best
splines with degrees of freedom varying from 2-4 (the linear model is
automatically included)

```{r }
selmod
```

# Specification of general functional forms

Next, we show how to specify a general functional relation of multiple
different latent or exogenous variables. This is achieved via the
`predict.fun` argument. To illustrate this we include interactions between
the latent variable $u_{1}$ and a dichotomized version of the covariate $z$

```{r }
d$g <- (d$z<0)*1 ## Group variable
mm1 <- regression(m1, ~g) # Add grouping variable as exogenous variable (effect specified via 'predict.fun')
mm2 <- regression(m2, u2~ u1+u2+u1:g+u2:g+z)
pred <- function(mu,var,data,...) {
    cbind("u1"=mu[,1],"u2"=mu[,1]^2+var[1],
          "u1:g"=mu[,1]*data[,"g"],"u2:g"=(mu[,1]^2+var[1])*data[,"g"])
}
ee1 <- twostage(mm1, model2=mm2, data=d, predict.fun=pred)
estimate(ee1,keep="u2~u",regex=TRUE)
```

A formal test shows no statistically significant effect of this interaction

```{r }
summary(estimate(ee1,keep="(:g)", regex=TRUE))
```

# Mixture models

Lastly, we demonstrate how the distributional assumptions of the stage 1
model can be relaxed by letting the conditional distribution of the latent
variable given covariates follow a Gaussian mixture distribution.
The following code explicitly defines the parameter constraints of the model
by setting the intercept of the first indicator variable, $x_{1}$, to zero
and the factor loading parameter of the same variable to one.

```{r }
m1 <- baptize(m1)  ## Label all parameters
intercept(m1, ~x1+u1) <- list(0,NA) ## Set intercept of x1 to zero. Remove the label of u1
regression(m1,x1~u1) <- 1 ## Factor loading fixed to 1
```

The mixture model may then be estimated using the `mixture` method (note,
this requires the `mets` package to be installed), where the parameter names
shared across the different mixture components given in the `list` will be
constrained to be identical in the mixture model. Thus, only the intercept
of $u_{1}$ is allowed to vary between the mixtures.

```{r mixture1, cache=TRUE, eval=fullVignette }
set.seed(1)
em0 <- mixture(m1, k=2, data=d)
```

To decrease the risk of using a local maximizer of the likelihood we can
rerun the estimation with different random starting values

```{r estmixture, cache=TRUE,warnings=FALSE,messages=FALSE,eval=FALSE }
em0 <- NULL
ll <- c()
for (i in 1:5) {
    set.seed(i)
    em <- mixture(m1, k=2, data=d, control=list(trace=0))
    ll <- c(ll,logLik(em))
    if (is.null(em0) || logLik(em0)