公告

[公告]
2014/01/17
由於已經是faculty的關係,不太有足夠時間寫部落格。因此更新的速度會相當緩慢。再加上近幾年來SAS GLOBAL FORUM沒有出現讓我覺得驚艷的技術文件,所以能分享的文章相對也減少許多。若有人推薦值得分享的SAS技術文件,請利用『問題討論區』告知。

2013/07/19
臉書留言板的功能因為有不明原因故障,因此特此移除。而intensedebate的留言板因管理不易,也一併移除。目前已經開啟內建的 G+ 留言系統,所以請有需要留言的朋友,可直接至『問題討論區』裡面留言。


2010年7月23日 星期五

Dropping Automatically Variables with Only Missing Values

Link: http://support.sas.com/resources/papers/proceedings10/048-2010.pdf

在進行資料分析前,有些人會習慣把一些含有missing data的樣本給清除掉,雖然這不會影響到分析結果,因為大部分的SAS程序都是用CCA(Complete Case Analysis)來處理含有missing data的數據,不過若要用手動的方法來清掉missing data的話,在遇到龐大數量的變數時,需要消耗很多時間在輸入遍數名稱上。美國人口普查局的Selvaratnam Sridharma發表了一個macro程序於SAS Global Forum 2010,讓這個程式撰寫的過程只需要幾秒鐘的時間就可以完成。


這個macro如下所示:
%DROPMISS(DSIN, DSOUT, NODROP);
裡面只需要定義兩個macro參數:

  • DSIN: 原始資料的名稱
  • DSOUT: 新資料的名稱
  • NODROP: 不要處理missing data的變數名稱,此為optional選項。

其中,DSIN和DSOUT應該不用多做解釋。NODROP如果沒有指定特定的變數名稱的話,這個macro就會針對所有在DSIN所指定的資料裡面的變數進行missing data的處理。但如果想要讓這個macro不處理所有文字型變數或數值型變數,則可以用 __NUMERIC_ 或_CHARACTER_ 來限制。如下所示:
%DROPMISS (DSIN=olddata,DSOUT=newdata, nodrop= _NUMERIC_ );
%DROPMISS (DSIN=oldedata,DSOUT=newdata, nodrop= _CHARACTER_ );
此macro的原始碼如下:
/******************/
options nomprint noSYMBOLGEN MLOGIC;
/****************************/
%macro DROPMISS( DSNIN /* name of input SAS dataset
*/
, DSNOUT /* name of output SAS dataset
*/
, NODROP= /* [optional] variables to be omitted from dropping even if
they have only missing values */
) ;
/* PURPOSE: To find both Character and Numeric the variables that have only
missing values and drop them if
* they are not in &NONDROP
*
* NOTE: if there are no variables in the dataset, produce no variables
processing code
*
*
* EXAMPLE OF USE:
* %DROPMISS( DSNIN, DSNOUT )
* %DROPMISS( DSNIN, DSNOUT, NODROP=A B C D--H X1-X100 )
* %DROPMISS( DSNIN, DSNOUT, NODROP=_numeric_ )
* %DROPMISS( DSNIN, DSNOUT, NOdrop=_character_ )
*/
%local I ;
%if "&DSNIN" = "&DSNOUT"
%then %do ;
%put /------------------------------------------------\ ;
%put | ERROR from DROPMISS: | ;
%put | Input Dataset has same name as Output Dataset. | ;
%put | Execution terminating forthwith. | ;
%put \------------------------------------------------/ ;
%goto L9999 ;
%end ;
/*###################################################################*/
/* begin executable code
/*####################################################################/
/*===================================================================*/
/* Create dataset of variable names that have only missing values
/* exclude from the computation all names in &NODROP
/*===================================================================*/
proc contents data=&DSNIN( drop=&NODROP ) memtype=data noprint out=_cntnts_( keep=
name type ) ; run ;
%let N_CHAR = 0 ;
%let N_NUM = 0 ;
data _null_ ;
set _cntnts_ end=lastobs nobs=nobs ;

if nobs = 0 then stop ;
n_char + ( type = 2 ) ;
n_num + ( type = 1 ) ;
/* create macro vars containing final # of char, numeric variables */
if lastobs
then do ;
call symput( 'N_CHAR', left( put( n_char, 5. ))) ;
call symput( 'N_NUM' , left( put( n_num , 5. ))) ;
end ;
run ;
/*===================================================================*/
/* if there are no variables in dataset, stop further processing
/*===================================================================*/
%if %eval( &N_NUM + &N_CHAR ) = 0
%then %do ;
%put /----------------------------------\ ;
%put | ERROR from DROPMISS: | ;
%put | No variables in dataset. | ;
%put | Execution terminating forthwith. | ;
%put \----------------------------------/ ;
%goto L9999 ;
%end ;
/*===================================================================*/
/* put global macro names into global symbol table for later retrieval
/*===================================================================*/
%LET NUM0 =0;
%LET CHAR0 = 0;
%IF &N_NUM >0 %THEN %DO;
%do I = 1 %to &N_NUM ;
%global NUM&I ;
%end ;
%END;
%if &N_CHAR > 0 %THEN %DO;
%do I = 1 %to &N_CHAR ;
%global CHAR&I ;
%end ;
%END;
/*===================================================================*/
/* create macro vars containing variable names
/* efficiency note: could compute n_char, n_num here, but must declare macro names
to be
global b4 stuffing them
/*
/*===================================================================*/
proc sql noprint ;
%if &N_CHAR > 0 %then %str( select name into :CHAR1 - :CHAR&N_CHAR from
_cntnts_ where type = 2 ; ) ;
%if &N_NUM > 0 %then %str( select name into :NUM1 - :NUM&N_NUM from
_cntnts_ where type = 1 ; ) ;
quit ;
/*===================================================================*/
/* Determine the variables that are missing

/*
/*===================================================================*/
%IF &N_CHAR > 1 %THEN %DO;
%let N_CHAR_1 = %EVAL(&N_CHAR - 1);
%END;
Proc sql ;
select %do I= 1 %to &N_NUM; max (&&NUM&I) , %end; %IF &N_CHAR > 1 %THEN %DO;
%do I= 1 %to &N_CHAR_1; max(&&CHAR&I), %END; %end; MAX(&&CHAR&N_CHAR)
into
%do I= 1 %to &N_NUM; :NUMMAX&I , %END; %IF &N_CHAR > 1 %THEN %DO;
%do I= 1 %to &N_CHAR_1; :CHARMAX&I,%END; %END; :CHARMAX&N_CHAR
from &DSNIN;
quit;
/*===================================================================*/
/* initialize DROP_NUM, DROP_CHAR global macro vars
/*===================================================================*/
%let DROP_NUM = ;
%let DROP_CHAR = ;
%if &N_NUM > 0 %THEN %DO;
DATA _NULL_;
%do I = 1 %to &N_NUM ;
%IF &&NUMMAX&I =. %THEN %DO;
%let DROP_NUM = &DROP_NUM %qtrim( &&NUM&I ) ;
%END;
%end ;
RUN;
%END;
%IF &N_CHAR > 0 %THEN %DO;
DATA _NULL_;
%do I = 1 %to &N_CHAR ;
%IF "%qtrim(&&CHARMAX&I)" eq "" %THEN %DO;
%let DROP_CHAR = &DROP_CHAR %qtrim( &&CHAR&I ) ;
%END;
%end ;
RUN;
%END;
/*===================================================================*/
/* Create output dataset
/*===================================================================*/
data &DSNOUT ;
%if &DROP_CHAR ^= %then %str(DROP &DROP_CHAR ; ) ; /* drop char variables
that
have only missing values */
%if &DROP_NUM ^= %then %str(DROP &DROP_NUM ; ) ; /* drop num variables
that
have only missing values */
set &DSNIN ;
%if &DROP_CHAR ^= or &DROP_NUM ^= %then %do;

%put /----------------------------------\ ;
%put | Variables dropped are &DROP_CHAR &DROP_NUM | ;
%put \----------------------------------/ ;
%end;
%if &DROP_CHAR = and &DROP_NUM = %then %do;
%put /----------------------------------\ ;
%put | No variables are dropped |;
%put \----------------------------------/ ;
%end;
run ;
%L9999:
%mend DROPMISS ;

CONTACT INFORMATION

Selvaratnam Sridharma
Economic Planning and Coordination Division
U.S. Bureau of the Census
Address
Washington, DC 20233-6100
301-763-6774
Email: selvaratnam.sridharma@census.gov

沒有留言:

張貼留言

要問問題的人請在文章下方的intensedebate欄位留言,請勿使用blogger預設的意見表單。今後用blogger意見表單留言的人我就不回應了。

CODE { display: block; /* fixes a strange ie margin bug */ font-family: Courier New; font-size: 8pt; overflow:auto; background: #f0f0f0 url(http://klcintw.images.googlepages.com/Code_BG.gif) left top repeat-y; border: 1px solid #ccc; padding: 10px 10px 10px 21px; max-height:200px; height:200px; // for IE6 line-height: 1.2em; }