/**********************************************************************/ /*********** MULTINOMIAL LOGISTIC REGRESSION IMPUTATION MACRO *********/ /**********************************************************************/ /* */ /* USAGE: %mlogitimpute_z(datain,dataout,y,yout,x,z=10,conv=1-e8) */ /* where: datain = input sas dataset */ /* dataout = output sas dataset (can be the same as datain) */ /* y = name of variable in datain to be imputed under a */ /* linear regression model */ /* yout = name of variable in dataout where MLOGITIMPUTE */ /* writes the imputed values of y (can be y) */ /* x = names of variables in datain to use to condition */ /* the masking regression */ /* z = p where p>0 restricts parameter draws so that */ /* no elements of the parameter vector are more than*/ /* p standard deviations from their expected value. */ /* conv = 1e-8 is default convergence criterion for catmod */ /* ------------------------------------------------------------------ */ /* RETURNS: A vector of imputed values &yout in an output data set */ /* called &dataout. The imputed values are draws from the */ /* posterior predictive distribution implied by the */ /* regression model under an uniformative prior */ /* ------------------------------------------------------------------ */ /* NOTES: Unlike the masking macros, MLOGITIMPUTE *creates* the output*/ /* data set &dataout. It is a copy of &datain with a new */ /* variable called &yout, which contains the imputed values */ /* of &y. To fill the missing values of &y with imputed */ /* values just set &y=&yout. Similarly, to replace &datain */ /* with the completed data, set &datain=&dataout */ /**********************************************************************/ /* written by: Simon D. Woodcock CISER, Dept. of Economics Cornell University 201 Caldwell Hall Ithaca, NY 14850 sdw9@cornell.edu */ %MACRO mlogitimpute_z(datain,dataout,y,yout,x,z=10,conv=1e-8); * add indexing variable to &datain so we can reassemble the completed data; data &datain; set &datain; imputeindex = _n_; run; * subset &datain into complete and incomplete observations; data complete; set &datain.; if &y ~= .; run; data incomplete; set &datain.; if &y. = .; run; * clean up &datain; data &datain. (drop = imputeindex); set &datain.; run; * perform logistic regression and collect output; proc catmod data=complete; direct &x.; model &y. = &x. / nodesign noprofile epsilon=&conv.; response / outest = results; title "multinomial logit imputation regression, data set is &datain"; run; quit; * generate masked data as draws from the normal approximation to the posterior * predictive distribution under an uninformative prior ; proc iml; edit incomplete; read all var {&x} into slopes; nmiss = nrow(slopes); xmat = j(nmiss,1,1)||slopes; nvar = ncol(xmat); do i = 1 to nmiss; /* verify x is complete */ do k = 1 to nvar; if xmat[i,k] = . then do; print "ERROR: the conditioning matrix x cannot contain missing values"; abort; end; end; end; use results; read into beta; beta = beta`; numparm = nrow(beta); ncat = (numparm / nvar); select = 2:numparm+1; read point select into covb; T = root(covb); z = j(numparm,1,0); /* pre-allocate vector z */ z_stats = j(numparm,1,0); numdraw = 0; do until(max(abs(z_stats)) <= &z.); z = normal(z); /* draw vector of normal deviates */ betastar = beta + T*z; /* perturb beta */ st_errors=sqrt(vecdiag(covb)); z_stats=(betastar-beta)/st_errors; numdraw = numdraw +1; /* results = beta||betastar||st_errors||z_stats; print results; */ end; print "Total number of parameter draws:"; print numdraw; * put betastar in (nvar x ncat) matrix format; betastar_mat = j(nvar,ncat,0); do j = 1 to ncat; do i = 1 to nvar; select = (i-1)*ncat + j; betastar_mat[i,j] = betastar[select,1]; end; end; * obtain predicted probabilities; exb = exp(xmat*betastar_mat); sum_exb = exb*j(ncat,1,1); /* take row sums of exb */ P = j(nmiss,ncat,0); /* pre-allcoate matrix P */ do j = 1 to ncat; /* obtain predicted probabilities */ P[,j] = exb[,j] / ( j(nmiss,1,1) + sum_exb ); end; cumP = j(nmiss,ncat+1,0); /* pre-allocate matrix cumP */ do i = 1 to nmiss; /* compute cumulative probabilities */ cumP[i,] = cusum(P[i,])||1; end; * draw masked values; u = j(nmiss,1,0); /* pre-allocate vector u */ tmp = j(nmiss,ncat+1,0); /* pre-allocate matrix tmp */ do i = 1 to nmiss; /* draw nobs uniform deviates */ u[i,1] = ranuni(0); do j = 1 to ncat+1; tmp[i,j] = u[i,1] <= cumP[i,j]; end; end; yimputed = (ncat+2)*j(nmiss,1) - tmp*j(ncat+1,1,1); /* draw imputed y */ setout incomplete; &yout = yimputed; replace all var{&yout}; /* fill imputed values */ quit; * reassemble completed data in original order from &datain; data &dataout (drop = imputeindex); set complete incomplete; by imputeindex; run; * clean up workspace; proc datasets library = work nolist; delete complete incomplete results; run; quit; %MEND;