/**********************************************************************/ /****************** LINEAR REGRESSION IMPUTATION MACRO ****************/ /**********************************************************************/ /* */ /* The following macro, LINREGIMPUTE_Z, imputes missing data as draws */ /* from the posterior predictive distribution implied by a linear */ /* regression model under an uninformative prior. */ /* */ /* LINREGIMPUTE_Z differs from LINREG IMPUTE by providing subroutines */ /* for automated model selection and parameter draw restriction. */ /* The automated model selection subroutine can be enabled using the */ /* &select option; parameter draws can be restricted to central */ /* regions of the posterior distribution using the &z option. */ /* */ /* ------------------------------------------------------------------ */ /* USAGE: %linregimpute_z(datain,dataout,y,yout,x,log=0,select=1,z=3) */ /* where: datain = input sas dataset */ /* dataout = output sas dataset (can be the same as datain) */ /* y = name of variable in datain to be imputed under a */ /* linear regression model */ /* yout = name of variable in dataout where LINREGIMPUTE */ /* writes the imputed values of y (can be y) */ /* x = names of variables in datain to use to condition */ /* the masking regression */ /* log = 1 indicates that &y is in natural log form. In */ /* case, LINREGIMPUTE returns the transformed */ /* imputed value: &yout = exp(&yhat) */ /* 0 (default) indicates that &y is not in log form */ /* and LINREGMASK returns &yout without */ /* transformation. */ /* select = 1 perform model selection based on Bayesian */ /* Information Criterion (BIC). */ /* 0 no model selection (all elements of &x are used*/ /* to condition the masking regression). */ /* z = p where p>0 rejects parameter draws if at least */ /* one element of the parameter (beta) vector is */ /* more than p standard deviations from its */ /* expected value. */ /* ------------------------------------------------------------------ */ /* RETURNS: A vector of imputed values &yout in an output data set */ /* called &dataout. The imputed values are resticted draws */ /* from the posterior predictive distribution implied by the */ /* regression model under an uniformative prior. The draw */ /* restriction is controlled by the parameter z. */ /* ------------------------------------------------------------------ */ /* NOTES: Unlike the masking macros, LINREGIMPUTE *creates* the output*/ /* data set &dataout. It is a copy of &datain with a new */ /* variable called &yout, which contains the imputed values */ /* of &y. To fill the missing values of &y with imputed */ /* values just set &y=&yout. Similarly, to replace &datain */ /* with the completed data, set &datain=&dataout */ /**********************************************************************/ /* written by: Simon D. Woodcock CISER, Dept. of Economics Cornell University 201 Caldwell Hall Ithaca, NY 14850 sdw9@cornell.edu */ %MACRO linregimpute_z(datain,dataout,y,yout,x,log=0,select=0,z=3); * add indexing variable to &datain so we can reassemble the completed data; data &datain; set &datain; imputeindex = _n_; run; * subset &datain into complete and incomplete observations; data complete; set &datain; if &y ~= .; run; data incomplete; set &datain; if &y = .; sigma2star = .; run; * clean up &datain; data &datain (drop = imputeindex); set &datain; run; * perform ols regression and collect output; proc reg data=complete outest=results covout tableout outsscp=sscp; model &y = &x / sse edf; title "linregimpute_z regression results"; title2 "input dataset: &datain., complete set of conditioning variables"; run; * model selection subroutine; %if &select.=1 %then %do; proc iml; use results; read all var {&x.} where(_TYPE_ = 'T') into t; xnames = {&x.}; use complete; read all var {&y.} into yvec; nobs = nrow(yvec); nvar = ncol(xnames); free yvec; keepvars = j(1,nvar,0); do i = 1 to nvar; if t[1,i] ^= . then do; if t[1,i]**2 > log(nobs) then keepvars[1,i]=1; end; end; if min(keepvars) = 1 then do; print "All variables kept"; end; else do; l = max(length(xnames)); * add dummy entry to list of variable names with length > l; dummy = 'a'; do i = 1 to l; dummy = dummy||'a'; end; dummy = rowcat(dummy); xnames = xnames||dummy; xdrop = j(1,nvar,''); xdrop = rowcat(xnames[,loc(keepvars=0)]); xkeep = rowcat(xnames[,loc(keepvars)]); print "The following conditioning variables were dropped from the predictive equation"; print "because they did not meet model selection criterion (BIC):"; print xdrop; print "Final set of conditioning variables is:"; print xkeep; call symput("x",xkeep); end; quit; * perform ols regression on reduced set of conditioning variables and collect output; proc reg data=complete outest=results covout tableout outsscp=sscp; model &y. = &x. / sse; title "linregimpute_z regression results"; title2 "input dataset: &datain., reduced set of conditioning variables"; run; %end; * generate masked data as draws from the normal approximation to the posterior * predictive distribution under an uninformative prior ; proc iml; edit incomplete; read all var {&x} into slopes; nmiss = nrow(slopes); xmat = j(nmiss,1,1)||slopes; free slopes; nvar = ncol(xmat); do i = 1 to nmiss; /* verify x is complete */ do k = 1 to nvar; if xmat[i,k] = . then do; print "ERROR: the conditioning matrix x cannot contain missing values"; print "offending dataset is &datain"; abort; end; end; end; use results; read var {_SSE_} into sse; read var {_EDF_} into df; read all var {Intercept &x} where(_TYPE_ = 'PARMS') into beta; /* now we need to drop columns of &x, rows of beta, and rows/columns of xpx, if xpx is not full rank */ find all where( (_TYPE_ = 'COV' ) & (Intercept ^= 0) ) into keepcols; tmp = nrow(keepcols); keepcols = keepcols - j(tmp,1,6); use sscp; select = 1:nvar; read point select var {Intercept &x} into xpx; xpx = xpx[keepcols,keepcols]; beta = beta[,keepcols]; beta = beta`; xmat = xmat[,keepcols]; nvar = ncol(xmat); /* and continue ... */ xpxi = inv(xpx); T = root(xpxi); z = j(nvar,1,0); /* pre-allocate vector z */ numdraw = 0; rankv = nvar; /* xpxi known to be full rank */ z_stats = j(nvar,1,0); /* pre-allocate vector of z-stats */ do until(max(abs(z_stats)) <= &z.); u = 2*rangam(0,df/2); /* draw chi-square(df) deviate */ sigma2star = sse/u; /* perturb sigma */ sigmastar = sqrt(sigma2star); do i = 1 to nvar; /* draw nvar normal deviates */ z[i,1] = rannor(0); end; betastar = beta + T*sigmastar*z; /* perturb beta */ st_errors=sqrt(vecdiag(sigma2star*xpxi)); z_stats=(betastar-beta)/st_errors; numdraw = numdraw +1; end; print "Total number of parameter draws:"; print numdraw; v = j(nmiss,1,0); /* pre-allocate vector v */ do i = 1 to nmiss; /* draw nmiss normal deviates */ v[i,1] = rannor(0); end; yimputed = xmat*betastar + sigmastar*v; /* draw imputed y */ setout incomplete; &yout = yimputed; replace all var{&yout}; /* fill imputed values */ replace all var{sigma2star}; /* add sigma2star to incomplete */ /* for undoing log transform */ quit; * do log transform if &log = 1; %if &log = 1 %then %do; data incomplete; set incomplete; &yout = exp(&yout); run; %end; * reassemble completed data in original order from &datain; data &dataout (drop = imputeindex sigma2star); set complete incomplete; by imputeindex; run; * clean up workspace; proc datasets library = work nolist; delete complete incomplete results sscp; run; quit; %MEND;