/* Purposeful Selection Macro Beta Version 1.1 September 2007 Programmers: Zoran Bursac Heath Gauss Keith Williams Dave Hosmer Macro variables: DATASET - we have just placed ours in the WORK library, but you could use it from anywhere, just specify that in the macro call i.e. SASUSER.YOUR_DATASET_NAME OUTCOME - binary variable, ideally coded as 0 or 1. Macro is set up to use descending option, or to model the probability of Y=1 so be aware of that. COVARIATES - either binary variables coded as 0 or 1, or continuous variables. While you could insert dummy variables it will only retain the significant ones or the confounders so you would have to force the other not-retained dummies back in after the selection is complete. We have not tested this or designed the macro to handle it yet, so use at your own risk. PVALUEI - inclusion criteria for covariates into the multivariable model. This p-value is the result of the univariate test between Y and each X separately and it creates a subset of candidate variables for inclusion into the multivariable model. We recommend setting this liberally to .25 because if we set it lower we could miss potentially important variables. PVALUER - once candidate variables are fitted in the multivariable model this becomes their retention criteria. We recommend setting this to 0.1 . CHBETA - this is % change in parameter estimates that we consider confounding once any X variable is removed from the model. In our simulations we have found that setting this to 15 (15% change) seems to give us optimal results. You can also test other levels like 20 or 25 and compare the findings. PVALUENC - this is the newest macro variable we implemented before JSM 2007 and it is the inclusion criteria for non-candidate variables i.e. the variables that did not make it into the initial multivariable model. Through our simulation studies we found that setting this to 0.15 gives us the optimal inclusion/retention results. 
Retention criteria for non-candidate variables is preset in the macro at 0.1 level.
NOTE(review): in the code below the noncandidate reduction loop compares against
the PVALUER argument (macro variable P in MVFit), not a hard-coded 0.1.

User instructions:
1. Place your data set in the WORK library and recode your variables to match
   the instructions above.
2. Run this macro.
3. Call the macro as follows:
   %PurposefulSelection (YOUR_DATASET_NAME, YOUR_Y, YOUR_X1 YOUR_X2 YOUR_X3 YOUR_XN, 0.25, 0.1, 15, 0.15);

Note that the LOG screen will have more notes than you would ideally like to see.
While we may suppress some of this in the future we haven't yet. On the positive
side it will give you more step by step information on what has happened along
the way. OUTPUT screen, like other selection procedures, will also give you step
by step analysis results. Last output should be your "final" main effects model.
Be warned to carefully examine your model and determine why the selected
variables are there. Compare your findings with other available selection
procedures.

If you use this macro for work to be published please use the following citation:
Bursac Z, Gauss CH, Williams DK, Hosmer DW. (2007). A Purposeful Selection of
Variables Macro for Logistic Regression. SAS Global Forum Proceedings,
Paper 173: 1-5.
The related paper can be found at:
http://www2.sas.com/proceedings/forum2007/173-2007.pdf
*/

/*---------------------------------------------------------------------------
  ScanVar - split the covariate list and store it in WORK.COVARIATES.

  Parameter:
    c - covariate names; tokens are separated by blanks (a right parenthesis
        is also treated as a delimiter by the %SCAN call).

  Effects:
    - sets the GLOBAL macro variable N to the number of covariates found
      (0 when the list is empty);
    - when N > 0, creates WORK.COVARIATES with one row per covariate in the
      character variable COVNAME (upper-cased, blanks removed);
    - writes progress messages and its local symbol table to the log.
---------------------------------------------------------------------------*/
%macro ScanVar (c);
    %global N;

    %if %length(&c) = 0 %then %do;
        /* Nothing to scan: report it and publish N=0 so callers can stop
           cleanly instead of generating an invalid ARRAY statement. */
        %let n = 0;
        %put ERROR: No covariates given. Expecting at least one covariate;
    %end;
    %else %do;
        %let covcnt = 1;
        %let covnames = ;

        %do %while (%scan(&c,&covcnt,%STR( %))) ne );
            /* quoted token, later used as an ARRAY initial value */
            %let c&covcnt="%scan(&c,&covcnt,%STR( %)))";
            %put covariate %eval(&covcnt) is %upcase(&&c&covcnt) ;     /* prints to log */
            %if &covcnt=1 %then %let covnames=%upcase(&&c&covcnt);     /* first name starts the list */
            %else %let covnames=&covnames,%upcase(&&c&covcnt);         /* later names are comma-appended */
            %let covcnt=%eval(&covcnt+1);
        %end;

        %let n=%eval(&covcnt-1 );                /* &n = number of covariates */
        %put Number of covariates = &n ;

        /* CREATE DATASET OF covariates: WORK.covariates */
        data covariates;
            keep covname;
            array word $40 w1-w&n (&covnames);
            do i=1 to &n ;
                covname = compress(word(i));
                output ;
            end;
        run;
    %end;

    %put _local_;
%mend ScanVar;

/*---------------------------------------------------------------------------
  UniFit - fit one logistic model per covariate and flag the candidates.

  Parameter:
    pvc - p-value threshold; a covariate with ProbChiSq <= pvc gets
          FIRSTPASS=1, otherwise FIRSTPASS=0.

  Reads:
    - WORK.COVARIATES and the global macro variable N (both set by ScanVar);
    - macro variables DATASET and OUTCOME, which are NOT parameters here: they
      are resolved from the calling macro's scope (PurposefulSelection).

  Creates:
    WORK.CANDIDATES - the stacked ParameterEstimates ODS tables of all
    single-covariate models, intercept rows removed, plus FIRSTPASS.
---------------------------------------------------------------------------*/
%macro UniFit (pvc);
    %local i;

    %put n is &n ;

    /* number the covariates (ORDER) and count them (&startn) */
    data covariates ;
        set covariates nobs=numobs ;
        order = _N_;
        call symput ('startn', left(put(numobs,8.))) ;
    run ;
    %put Number of variables in the model = &&startn ;      /* write to log */

    %if &startn < &n %then %do;
        /* no variables to test : STOP */
        %put ERROR: NO VARIABLES ;
    %end ;
    %else %do;
        /* copy each covariate name into macro variables c1..c&startn */
        data covariates ;
            set covariates ;
            temp = compress('c'||order) ;
            call symput (temp , covname) ;
            drop temp ;
        run ;

        %put MODEL VARIABLES: ;                              /* write to log */
        %do i = 1 %to &startn ;
            %put %upcase(&&c&i);
        %end;

        /* the PROC LOGISTIC call is assembled as: &log1 <covariate> &log2 */
        %let log1 = %str(proc logistic data= &dataset descending ; model %quote(&outcome) = ) ;
        %let log2 = %str(/scale=none aggregate; run;) ;

        /* RUN LOGISTIC REGRESSION, one covariate at a time */
        title 'Univariate logistic model for each X';
        %do i = 1 %to &startn ;
            ods output ParameterEstimates=uf&i;              /* output results: WORK.uf1...ufn */
            /* ods listing close; */
            &log1 &&c&i &log2
        %end;

        /* Stack the per-covariate outputs into WORK.candidates. This used to
           run even after the ERROR branch above, when no uf data sets exist. */
        data candidates;
            set uf1;
        run;

        proc datasets nolist;
            %do i = 2 %to &startn ;
                append base=candidates data=uf&i;
                run;
            %end;
            delete %do i = 1 %to &startn ; uf&i %end; ;
            run;
        quit;

        /* clean WORK.candidates and flag the significant covariates */
        data candidates;
            set candidates;
            if variable='Intercept' then delete;
            if probchisq <= &pvc then firstpass=1;
            else firstpass=0;
        run;
    %end;
%mend UniFit;

/*---------------------------------------------------------------------------
  MVFit - multivariable stage of the selection.

  Parameters:
    d    - input data set (used as DATA= on every PROC LOGISTIC)
    o    - response variable of the MODEL statements
    pvc  - univariate inclusion p-value (only echoed to the log here; the
           FIRSTPASS flag in WORK.CANDIDATES was computed by UniFit)
    p    - retention p-value: both reduction loops run while the largest
           p-value under inspection is > p
    b    - percent change in any remaining estimate at or above which a
           removed variable is put back
    pvnc - inclusion p-value for the noncandidates (FIRSTPASS=0) when each is
           added, one at a time, to the preliminary model

  Reads:  WORK.CANDIDATES and the global macro variable N.
  Writes: several WORK data sets (mf, sortestmf, estred, estchg, prelim,
          noncandidates, mfnc, ...) and, when a final model is fitted, its
          ParameterEstimates table in WORK.PRELIMEFF.
---------------------------------------------------------------------------*/
%macro MVFit (d,o,pvc,p,b,pvnc);
    %local i;

    %put n is &n ;
    %put pvc is &pvc ;

    /* INTO leaves a macro variable untouched when the query returns no rows,
       so start from an empty value. */
    %let sigvars=;
    proc sql noprint;
        select variable into: sigvars separated by ' '
            from candidates (where=(firstpass=1)) ;
        %let numsigvars=&sqlobs;
    quit;
    %put Significant variables are: &sigvars;
    %put Number of significant variables is: &numsigvars;

    %if &numsigvars > 0 %then %do;
        /* fit the logistic model with significant candidates */
        title 'Logistic model with candidate variables significant at ';
        ods output ParameterEstimates=mf;                    /* output results: WORK.mf */
        proc logistic data=&d descending;
            model &o=&sigvars / scale=none aggregate;
        run;

        data mf;
            set mf;
            if variable='Intercept' then delete;
        run;

        /* rank the candidates: row 1 has the largest p-value */
        proc sort data=mf out=sortestmf;
            by descending probchisq;
        run;
        data sortestmf;
            set sortestmf;
            count=_N_;
            count2=_N_;
        run;
        data _NULL_;
            set sortestmf;
            if _N_=1 then call symput('pvaluecheck',left(probchisq));
        run;

        %let varcount=1;
        %let counter=1;
        %let pvaluerank=1;
        %put pvaluecheck: &pvaluecheck;

        %if (&numsigvars = &n) and (%sysevalf(&pvaluecheck <= &p)) %then %do;
            title 'Final main effects model - all covariates significant ';
            ods output ParameterEstimates=prelimeff;
            ods listing;
            proc logistic data=&d descending;
                model &o=&sigvars / scale=none aggregate;
            run;
        %end;
        /* ods listing close; */

        %let numvars=0;

        %if (&numsigvars <= &n) %then %do;

            /* A single candidate that fails retention has no reduced model to
               compare against (the reduced variable list would be empty and
               the macro reference unresolved). Apply the same rule the loop
               uses for the last remaining variable: drop it and stop. */
            %if (&numsigvars = 1) and (%sysevalf(&pvaluecheck > &p)) %then %do;
                %let pvaluecheck=0;
                %let sigvars= ;
            %end;

            %do %while (%sysevalf(&pvaluecheck > &p));
                /* variable list without the variable currently under test */
                proc sql noprint;
                    select variable
                        into :reducedsigvars separated by ' '
                        from sortestmf (where=(count2 ne &counter));
                    %let numvars2=&sqlobs;
                quit;
                %put reduced significant variables: &reducedsigvars;

                title 'Logistic regression: reduced model or confounder back';
                ods output parameterestimates = estred;
                proc logistic data=&d descending;
                    model &o = &reducedsigvars;
                run;

                /* percent change of every estimate, full vs reduced model */
                data estfull2;
                    set mf;
                    EstimateF=Estimate;
                    keep Variable EstimateF;
                run;
                data estred2;
                    set estred;
                    EstimateR=Estimate;
                    keep Variable EstimateR;
                run;
                proc sort data=estfull2; by variable; run;
                proc sort data=estred2; by variable; run;
                data estchg;
                    merge estfull2 estred2;
                    by variable;
                    if variable='Intercept' then delete;
                    EstPercentChg=100*abs((EstimateR - EstimateF)/EstimateF);
                run;
                proc sort data=estchg out=sortestchg;
                    by descending EstPercentChg;
                run;
                data sortestchgrev;
                    set sortestchg;
                    count=_N_;
                run;
                data _NULL_;
                    set sortestchgrev;
                    if _N_=1 then call symput('betapctchangecheck',left(EstPercentChg));
                run;
                %put change in beta is: &betapctchangecheck;
                %put beta is: &b;

                %let pvaluerank = %eval(&pvaluerank + 1);

                %if %sysevalf(&betapctchangecheck >= &b) %then %do;
                    /* confounder: keep it and move on to the next largest p-value */
                    data _NULL_;
                        set sortestmf;
                        if count2=&pvaluerank then call symput('pvaluecheck',left(probchisq));
                    run;
                    %put next p-value is: &pvaluecheck;
                    %let counter = %eval(&counter + 1);
                    %let varcount = %eval(&varcount + 1);
                %end; /* end change >= &b */
                %else %if %sysevalf(&betapctchangecheck < &b) %then %do;
                    /* not a confounder: drop it and refit */
                    data sortestmf;
                        set sortestmf;
                        if count=&varcount then delete;
                    run;
                    proc sql noprint;
                        select variable
                            into :sigvars separated by ' '
                            from sortestmf;
                        %let numvars=&sqlobs;
                    quit;
                    %put significant variables are &sigvars;
                    %put number of significant variables &numvars;

                    title 'LR: New and updated model';
                    ods output parameterestimates = mf;
                    proc logistic data=&d descending;
                        model &o = &sigvars;
                    run; quit;

                    data mf;
                        set mf;
                        if variable='Intercept' then delete;
                    run;
                    proc sort data=mf out=sortestmf;
                        by descending probchisq;
                    run;
                    data sortestmf;
                        set sortestmf;
                        count=_N_;
                        count2=_N_;
                    run;
                    data _NULL_;
                        set sortestmf;
                        if _N_=1 then call symput('pvaluecheck',left(probchisq));
                    run;
                    %put pvaluecheck is: &pvaluecheck;

                    /* if all covariates are confounders force the exit and retain them */
                    %if (&numvars < &varcount) %then %do;
                        %let pvaluecheck=0;
                        %put NOTE: all covariates nonsignifcant but confounders;
                    %end;

                    %let varcount=1;
                    %let counter=1;
                    %let pvaluerank=1;
                    %put sigvars1 are &sigvars;

                    /* if it is the last variable and still nonsignificant force the exit */
                    %if (&numvars=1) and (%sysevalf(&pvaluecheck > &p)) %then %do;
                        %let pvaluecheck=0;
                        %let sigvars= ;
                    %end;
                    %put sigvars2 are &sigvars;
                    %put pvaluecheck is &pvaluecheck;
                %end; /* end change < &b */
            %end; /* end do while loop */

            %put sigvars are &sigvars;
            %put numsigvars is &numsigvars;
            %put numvars is &numvars;
            %put n is &n;

            %if (&sigvars ne ) and (%sysevalf(&numvars = 1)) %then %do;
                title 'Final main effects model - one significant candidate';
                ods output ParameterEstimates=prelimeff;
                proc logistic data=&d descending;
                    model &o = &sigvars / scale=none aggregate lackfit;
                run; quit;
            %end;

            %if (&sigvars ne ) and (&numsigvars < &n) %then %do;
                title 'Preliminary main effects model';
                ods output ParameterEstimates=prelim;
                proc logistic data=&d descending;
                    model &o = &sigvars / scale=none aggregate lackfit;
                run; quit;

                data prelim;
                    set prelim;
                    if variable='Intercept' then delete;
                run;
                proc sql noprint;
                    select variable
                        into :prelimsigvars separated by ' '
                        from prelim;
                    %let numprelimsigvars=&sqlobs;
                quit;
                %put significant variables are &prelimsigvars;
                %put number of significant variables &numprelimsigvars;

                /* noncandidates = covariates that failed the univariate screen */
                data noncandidates;
                    set candidates;
                    if firstpass=0;
                run;
                data noncandidates;
                    set noncandidates nobs=numobs;
                    counternc=_N_;                                       /* counter of noncandidates */
                    call symput ('ncvarcount', left(put(numobs,8.))) ;   /* number of noncandidates */
                run ;
                %put Number of noncandidates: &&ncvarcount ;            /* write to log */

                /* copy each noncandidate name into nc1..nc&ncvarcount */
                data noncandidates;
                    set noncandidates;
                    temp = compress('nc'||counternc) ;
                    call symput (temp , variable) ;
                    drop temp ;
                run;
                %put Noncandidate variables are: ;                       /* write to log */
                %do i = 1 %to &ncvarcount ;
                    %put %upcase(&&nc&i);
                %end;

                /* preliminary main effects plus one noncandidate at a time */
                %let log1 = %str(proc logistic data= &d descending ; model %quote(&o) = &sigvars) ;
                %let log2 = %str(/scale=none aggregate; run;) ;

                title 'One at the time noncandidate models';
                %do i = 1 %to &ncvarcount ;
                    ods output ParameterEstimates=ncfit&i;               /* WORK.ncfit1...ncfitn */
                    &log1 &&nc&i &log2
                %end;

                /* keep the last row of each fit (BY DF / LAST.DF, as before) */
                %do i = 1 %to &ncvarcount ;
                    proc sort data=ncfit&i; by df; run;
                    data allnoncandidates&i;
                        set ncfit&i;
                        by df;
                        if last.df ;
                    run;
                %end;

                data signoncandidates;
                    set allnoncandidates1;
                run;
                proc datasets nolist;
                    %do i = 2 %to &ncvarcount ;
                        append base=signoncandidates data=allnoncandidates&i;
                        run;
                    %end;
                    delete %do i = 1 %to &ncvarcount ; allnoncandidates&i ncfit&i %end; ;
                    run;
                quit;

                data signoncandidates1;
                    set signoncandidates;
                    if probchisq<=&pvnc;
                run;

                %let signc=;     /* INTO does not assign when no row qualifies */
                proc sql noprint;
                    select variable into: signc separated by ' '
                        from signoncandidates1;
                    %let numsignc=&sqlobs;
                quit;
                %put Significant noncandidates are: &signc;
                %put Number of significant noncandidates is: &numsignc;

                %if &numsignc > 0 %then %do;
                    /* fit the model with preliminary main effects and significant noncandidates */
                    title 'Preliminary main effects and significant noncandidates ';
                    ods output ParameterEstimates=mfnc;
                    proc logistic data=&d descending;
                        model &o = &sigvars &signc / scale=none aggregate lackfit;
                    run;

                    data mfncrev;
                        set mfnc;
                        if variable='Intercept' then delete;
                    run;
                    /* the first &numprelimsigvars rows are the preliminary main
                       effects; only the noncandidate rows are ranked below */
                    data mfncrev;
                        set mfncrev;
                        if _N_ <= &numprelimsigvars then delete;
                    run;
                    proc sort data=mfncrev out=sortmfncrev;
                        by descending probchisq;
                    run;
                    data sortmfncrev;
                        set sortmfncrev nobs=numobs;
                        count=_N_;
                        call symput('numsigncleft',left(numobs));
                    run;
                    %put Number of significant noncandidates left is: &numsigncleft ;
                    data _NULL_;
                        set sortmfncrev;
                        if _N_=1 then call symput('pvaluechecknc',left(probchisq));
                    run;

                    %let varcountnc=1;
                    %let counternc=1;
                    %let pvalueranknc=1;
                    %put pvaluechecknc: &pvaluechecknc;

                    /* do while loop for non candidate variable reduction */
                    %do %while (%sysevalf(&pvaluechecknc > &p));

                        %if &numsigncleft = 1 %then %do;
                            /* only one noncandidate left: the reduced model is the preliminary model */
                            title 'Logistic regression nc : reduced model or confounder back';
                            ods output ParameterEstimates=estrednc;
                            proc logistic data=&d descending;
                                model &o = &sigvars / scale=none aggregate;
                            run;
                        %end;

                        %if &numsigncleft > 1 %then %do;
                            proc sql noprint;
                                select variable
                                    into :reducedsigvarsnc separated by ' '
                                    from sortmfncrev (where=(count ne &counternc));
                                %let numvarsnc2=&sqlobs;
                            quit;
                            %put reduced significant nc variables: &reducedsigvarsnc;
                            %put number of reduced significant nc variables: &numvarsnc2;

                            title 'Logistic regression nc: reduced model or confounder back';
                            ods output parameterestimates = estrednc;
                            proc logistic data=&d descending;
                                model &o = &sigvars &reducedsigvarsnc;
                            run;
                        %end;

                        /* percent change of every estimate, full vs reduced model */
                        data estfull2;
                            set mfnc;
                            EstimateF=Estimate;
                            keep Variable EstimateF;
                        run;
                        data estred2;
                            set estrednc;
                            EstimateR=Estimate;
                            keep Variable EstimateR;
                        run;
                        proc sort data=estfull2; by variable; run;
                        proc sort data=estred2; by variable; run;
                        data estchg;
                            merge estfull2 estred2;
                            by variable;
                            if variable='Intercept' then delete;
                            EstPercentChg=100*abs((EstimateR - EstimateF)/EstimateF);
                        run;
                        proc sort data=estchg out=sortestchg;
                            by descending EstPercentChg;
                        run;
                        data sortestchgrev;
                            set sortestchg;
                            count=_N_;
                        run;
                        data _NULL_;
                            set sortestchgrev;
                            if _N_=1 then call symput('betapctchangechecknc',left(EstPercentChg));
                        run;
                        %put change in nc beta is: &betapctchangechecknc;
                        %put beta is: &b;

                        %let pvalueranknc = %eval(&pvalueranknc + 1);

                        %if %sysevalf(&betapctchangechecknc >= &b) %then %do;
                            %if &numsigncleft > 1 %then %do;
                                /* confounder: keep it, inspect the next largest p-value */
                                data _NULL_;
                                    set sortmfncrev;
                                    if count=&pvalueranknc then call symput('pvaluechecknc',left(probchisq));
                                run;
                                %put next p-value is: &pvaluechecknc;
                                %let counternc = %eval(&counternc + 1);
                                %let varcountnc = %eval(&varcountnc + 1);
                            %end;
                            %else %do;
                                %let pvaluechecknc=0;    /* single confounder: keep it and stop */
                            %end;
                        %end; /* end change >= &b */
                        %else %if %sysevalf(&betapctchangechecknc < &b) %then %do;
                            data sortmfncrev;
                                set sortmfncrev;
                                if count=&varcountnc then delete;
                            run;
                            %let numsigncleft= %eval(&numsigncleft-1);

                            /* Reset first: when the last noncandidate has just been
                               removed the query returns no rows and INTO would leave
                               the removed name in &signc, so the final model would
                               still include it. */
                            %let signc=;
                            proc sql noprint;
                                select variable
                                    into :signc separated by ' '
                                    from sortmfncrev;
                                %let numsigncvars=&sqlobs;
                            quit;
                            %put significant nc variables are &signc;
                            %put number of significant nc variables &numsigncvars;

                            %if &numsigncvars > 0 %then %do;
                                title 'LR nc: New and updated model';
                                ods output parameterestimates = mfnc;
                                proc logistic data=&d descending;
                                    model &o = &sigvars &signc;
                                run; quit;

                                data mfncrev;
                                    set mfnc;
                                    if variable='Intercept' then delete;
                                run;
                                data mfncrev;
                                    set mfncrev;
                                    if _N_ <= &numprelimsigvars then delete;
                                run;
                                proc sort data=mfncrev out=sortmfncrev;
                                    by descending probchisq;
                                run;
                                data sortmfncrev;
                                    set sortmfncrev;
                                    count=_N_;
                                run;
                                data _NULL_;
                                    set sortmfncrev;
                                    if _N_=1 then call symput('pvaluechecknc',left(probchisq));
                                run;
                                %put pvaluechecknc is: &pvaluechecknc;

                                %let varcountnc=1;
                                %let counternc=1;
                                %let pvalueranknc=1;
                            %end; /* end number of significant nc in the updated model > 0 */
                            %else %do;
                                /* 0 noncandidates left */
                                %let pvaluechecknc=0;    /* force end of do loop */
                            %end;
                        %end; /* end change < &b */
                    %end; /* end do while loop for non candidates reduction */

                    ods listing;
                    %if &signc ne %then %do;
                        /* final model */
                        title 'Final main effects model - some signficant noncandidates ';
                        ods output ParameterEstimates=prelimeff;
                        proc logistic data=&d descending;
                            model &o = &sigvars &signc/ scale=none aggregate lackfit;
                        run;
                    %end;
                    %else %do;
                        ods listing;
                        /* final model */
                        title 'Final main effects model - no signficant noncandidates ';
                        ods output ParameterEstimates=prelimeff;
                        proc logistic data=&d descending;
                            model &o = &sigvars/ scale=none aggregate lackfit;
                        run;
                    %end;
                %end; /* end significant non candidates > 0 */
                %else %do;
                    %put There is no significant noncandidates. ;
                    ods listing;
                    /* final model */
                    title 'Final main effects - no significant noncandidates ';
                    ods output ParameterEstimates=prelimeff;
                    proc logistic data=&d descending;
                        model &o = &sigvars / scale=none aggregate lackfit;
                    run;
                %end;
            %end; /* end if for there is at least one significant candidate */
            %else %do;
                %put NOTE: 1) All candidates significant at &pvc level were not significant in the multivariate model at &p level (ie no final model) or 2) All covariates were significant - final model was printed out ;
            %end;
        %end; /* end if for number of significant candidates <= total number of covariates */
    %end; /* end if for significant candidates > 0 */
    %else %do;
        %put NOTE: No significant covariates were identified. ;
    %end;
%mend MVFit;

/*---------------------------------------------------------------------------
  PurposefulSelection - main macro; calls the sub-macros in order.

  Parameters (positional):
    dataset    - input data set
    outcome    - response variable
    covariates - blank-separated covariate names
    pvaluei    - univariate inclusion p-value        (passed to UniFit / MVFit)
    pvaluer    - multivariable retention p-value     (MVFit parameter p)
    chbeta     - percent-change-in-estimate threshold (MVFit parameter b)
    pvaluenc   - noncandidate inclusion p-value      (MVFit parameter pvnc)

  After the run the intermediate WORK data sets are deleted; WORK.PRELIMEFF is
  not in the delete list and is left in place.
---------------------------------------------------------------------------*/
%macro PurposefulSelection (dataset, outcome, covariates, pvaluei, pvaluer, chbeta, pvaluenc);
    %ScanVar (&covariates);

    /* ScanVar publishes N=0 for an empty covariate list; with nothing to
       model, skip the remaining stages instead of producing follow-on errors. */
    %if &n > 0 %then %do;
        %UniFit (&pvaluei);
        %MVFit (&dataset, &outcome, &pvaluei, &pvaluer, &chbeta, &pvaluenc);
    %end;

    %put _LOCAL_;
    %put _GLOBAL_;

    %if &n > 0 %then %do;
        proc datasets nolist;
            delete candidates covariates estchg estfull2 estred estred2 mf mfnc
                   mfncrev noncandidates prelim signoncandidates signoncandidates1
                   sortestchg sortestchgrev sortestmf sortmfncrev estrednc;
        run;
        quit;
    %end;
%mend PurposefulSelection;