公告

[公告]
2014/01/17
由於已經是faculty的關係,不太有足夠時間寫部落格。因此更新的速度會相當緩慢。再加上近幾年來SAS GLOBAL FORUM沒有出現讓我覺得驚艷的技術文件,所以能分享的文章相對也減少許多。若有人推薦值得分享的SAS技術文件,請利用『問題討論區』告知。

2013/07/19
臉書留言板的功能因為有不明原因故障,因此特此移除。而intensedebate的留言板因管理不易,也一併移除。目前已經開啟內建的 G+ 留言系統,所以請有需要留言的朋友,可直接至『問題討論區』裡面留言。


2011年3月15日 星期二

ROC Hard? No, ROC Made Easy!


SAS 裡面並沒有內建繪製 ROC 曲線的程序和語法,這和目前其他統計軟體都已經內建繪製 ROC 曲線的功能的情況比較起來,顯得相當奇怪。使用者在 SAS 的環境底下,必須先用 PROC LOGISTIC 程序將模型估計出來,再將一些報表另存新的檔案,然後才能根據這些新的資料來繪製 ROC 曲線並且計算 AUC 面積。SAS 官網有一段教學,詳情可參照 http://support.sas.com/kb/25/018.html,但繪製出來的圖形不甚美觀。Kriss Harris 在 SAS Global Forum 2010 發表了一篇技術文件,提供了一個相當實用的巨集程式 %ROC_CUTOFF 讓使用者可以在 SAS 裡面迅速且美觀地畫出 ROC 曲線。

首先,利用一段模擬的資料來當作本篇的範例:
data blood_for_roc;
CALL STREAMINIT(400);  
do i = 1 to 40;
Sputum_Eosinophils = RAND('NORMAL',3, 1); 
cht = 1;
output;
end;
do i = 41 to 80;
Sputum_Eosinophils = RAND('NORMAL',10, 5);  
cht = 2;
output;
end;
run;
%ROC_CUTOFF 的格式如下所示:
%macro roc_cutoff(DATASET = blood_for_roc ,
                  OUTCOME = cht , 
                  OUTCOME_LEV = 2 /* Number */ , 
                  XVAR = Sputum_Eosinophils,
                  XVAR_LABEL = Sputum Eosinophils,
                  AUC_TABLE = Y,
                  CONFUSION = N,
                  CUTOFF = );
每個參數的定義如下:
  • DATASET = SAS 資料集名稱
  • OUTCOME = 類別型的依變數名稱
  • OUTCOME_LEV = 依變數內的類別數
  • XVAR = 連續型的應變數名稱
  • XVAR_LABEL = 應變數的標籤
  • AUC_TABLE =  要不要顯示 AUC 表 (Y/N)
  • CONFUSION =  要不要顯示 confusion 表 (Y/N)
  • CUTOFF = 指定 cutoff 的數字(可留空白不填)
以模擬的資料來跑 ROC:
%macro roc_cutoff(DATASET = blood_for_roc ,
                  OUTCOME = cht , 
                  OUTCOME_LEV = 2 /* Number */ , 
                  XVAR = Sputum_Eosinophils,
                  XVAR_LABEL = Sputum Eosinophils,
                  AUC_TABLE = Y,
                  CONFUSION = N,
                  CUTOFF = );
結果如下所示:

若將CUTOFF 設定成 5,結果如下所示:

若將 AUC_TABLE 和 CONFUSION 都設定為 N,則一開頭的兩個表格就不會被秀出來:

原始碼如下:
%macro roc_cutoff(DATASET = blood_for_roc ,
                  OUTCOME = cht , 
                  OUTCOME_LEV = 2 /* Number */ , 
                  XVAR = Sputum_Eosinophils,
                  XVAR_LABEL = Sputum Eosinophils,
                  AUC_TABLE = Y,
                  CONFUSION = N,
                  CUTOFF = );
;*****************************************************************%
%* GETTING ROC OUTPUT FROM MODEL *;
;*****************************************************************%
%LET OUTCOME_LEV = %SYSFUNC(TRANWRD(&OUTCOME_LEV,%STR(%"),));
%LET OUTCOME_LEV = %SYSFUNC(TRANWRD(&OUTCOME_LEV,%STR(%'),)); 
ods output Association = roc_association (WHERE = (UPCASE(LABEL2) = "C"));
proc logistic data=&dataset outest = outest;
model &OUTCOME. (event = "&OUTCOME_LEV.") = &XVAR./EXPB outroc=ROCData;
output out = roctest p = p ;
run;
DATA ROCData;
SET ROCData; 
INDEX =1;
RUN;
DATA Outest (KEEP=INTERCEPT &XVAR. INDEX);
SET Outest; 
INDEX =1;
RUN;
/* Calculating X value */
DATA _merge;
MERGE ROCData Outest;
BY INDEX;
SPEC = 1-_1MSPEC_;
YI = (_SENSIT_ + SPEC) - 1;
N = _POS_+ _NEG_+ _FALPOS_+ _FALNEG_;
TA = (_POS_ + _NEG_)/N;
X_VALUE = (LOG(_PROB_/(1-_PROB_))-INTERCEPT)/&XVAR.;
IF (_NEG_+_FALNEG_) NE 0 THEN DO;
NPV = _NEG_/(_NEG_ +_FALNEG_);
END;
IF (_POS_+_FALPOS_) NE 0 THEN DO;
PPV = _POS_/(_POS_ +_FALPOS_);
END;
IF
((_POS_+_FALPOS_)*(_POS_+_FALNEG_)*(_NEG_+_FALPOS_)*(_NEG_+_FALNEG_)) NE 0
THEN DO;
MCC = ((_POS_*_NEG_)-(_FALPOS_*_FALNEG_))/
SQRT(((_POS_+_FALPOS_)*(_POS_+_FALNEG_)*(_NEG_+_FALPOS_)*(_NEG_+_FALNEG_)));
END;
RUN;
/* Merging in the original data, so that this can be plotted to */
proc sql;
create table merge2 as
select a.*, b.&XVAR. as XVAR1, b.&OUTCOME., b.p,
case when b.&OUTCOME. = &OUTCOME_LEV. then 3
else 4 end as Index1
from _merge as a right join Roctest as b
on a._PROB_ = b.p ;
quit;
data merge3;
set merge2;
string_check = anyalpha(&OUTCOME.);
if string_check = 0 then &OUTCOME._char = strip(put(&OUTCOME., best.))
;
else &OUTCOME. = &OUTCOME._char;
run;
;*****************************************************************%
%* CALCULATING X Value CUT-OFFS for Particular Specificities and 
Sensitivities and Vice Versa *;
;*****************************************************************%
proc sql;
create table One_Hundred_PC_Sens as
select max(X_VALUE) as max_x format 10.2
from _merge
where _SENSIT_ = 1;
quit;
proc sql;
create table sens_spec as
select *, abs(_SENSIT_ - SPEC) as abs_sens_spec, min(calculated abs_sens_spec) as min
from _merge;
quit;

proc sql;
create table sens_spec_intersection as
select X_VALUE format 10.2
from sens_spec 
where abs_sens_spec = min;
quit;
proc sql;
create table One_Hundred_PC_Spec as
select min(X_VALUE) as min_x format 10.2
from _merge
where spec = 1;
quit;
/* Fetching the values */
%let dsid=%sysfunc(open(Roc_association,i));
%let num_AUC=%sysfunc(varnum(&dsid,nValue2));
%let rc=%sysfunc(fetch(&dsid,1));
%let AUC=%sysfunc(getvarn(&dsid,&num_AUC));
%let AUC_format=%sysfunc(putn(&AUC,bestd10.2));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(One_Hundred_PC_Sens,i));
%let num_Sens=%sysfunc(varnum(&dsid,max_x));
%let rc=%sysfunc(fetch(&dsid,1));
%let Sens=%sysfunc(getvarn(&dsid,&num_Sens));
%let Sens_format=%sysfunc(putn(&Sens,bestd10.2));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(One_Hundred_PC_Spec,i));
%let num_Spec=%sysfunc(varnum(&dsid,min_x));
%let rc=%sysfunc(fetch(&dsid,1));
%let Spec=%sysfunc(getvarn(&dsid,&num_Spec));
%let Spec_format=%sysfunc(putn(&Spec,bestd10.2));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(sens_spec_intersection,i));
%let num_section=%sysfunc(varnum(&dsid,X_VALUE));
%let rc=%sysfunc(fetch(&dsid,1));
%let section=%sysfunc(getvarn(&dsid,&num_section));
%let rc=%sysfunc(close(&dsid));
%if &CUTOFF = %NRSTR() %then %do;
%let section2 = §ion;
%let section_format=%sysfunc(putn(§ion2,bestd10.2));
%let rc=%sysfunc(close(&dsid));
%end;
%else %do;
%let section2 = &CUTOFF;
%let section_format = &CUTOFF;
%end;
;*****************************************************************%
%* 2*2 Tables *;
;*****************************************************************%
data freq_input;
set merge3;
if XVAR1 ne .;
if XVAR1 <=  §ion2 then new_result = 1;
else new_result = 0;
run;
ods output  CrossTabFreqs =  CrossTabFreqs;
proc freq data = freq_input;
tables new_result * &OUTCOME._char / NOROW NOCOL;
run;
data crossTabFreqs_ref;
set CrossTabFreqs;
if &OUTCOME._char ne "";
if new_result ne .;
run;

data TP;
set CrossTabFreqs_ref;
if new_result = 0 and &OUTCOME._char = &OUTCOME_LEV.;
run;
data FP;
set CrossTabFreqs_ref;
if new_result = 0 and &OUTCOME._char ne &OUTCOME_LEV.;
run;
data FN;
set CrossTabFreqs_ref;
if new_result = 1 and &OUTCOME._char = &OUTCOME_LEV.;
run;
data TN;
set CrossTabFreqs_ref;
if new_result = 1 and &OUTCOME._char ne &OUTCOME_LEV.;
run;
/* Fetching the values */
%let dsid=%sysfunc(open(TP,i));
%let num_TP=%sysfunc(varnum(&dsid,Frequency));
%let rc=%sysfunc(fetch(&dsid,1));
%let TP=%sysfunc(getvarn(&dsid,&num_TP));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(FP,i));
%let num_FP=%sysfunc(varnum(&dsid,Frequency));
%let rc=%sysfunc(fetch(&dsid,1));
%let FP=%sysfunc(getvarn(&dsid,&num_FP));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(FN,i));
%let num_FN=%sysfunc(varnum(&dsid,Frequency));
%let rc=%sysfunc(fetch(&dsid,1));
%let FN=%sysfunc(getvarn(&dsid,&num_FN));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(TN,i));
%let num_TN=%sysfunc(varnum(&dsid,Frequency));
%let rc=%sysfunc(fetch(&dsid,1));
%let TN=%sysfunc(getvarn(&dsid,&num_TN));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(TP,i));
%let num_TP_per=%sysfunc(varnum(&dsid,Percent));
%let rc=%sysfunc(fetch(&dsid,1));
%let TP_per=%sysfunc(getvarn(&dsid,&num_TP_per));
%let TP_format=%sysfunc(putn(&TP_per,best3.0));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(FP,i));
%let num_FP_per=%sysfunc(varnum(&dsid,Percent));
%let rc=%sysfunc(fetch(&dsid,1));
%let FP_per=%sysfunc(getvarn(&dsid,&num_FP_per));
%let FP_format=%sysfunc(putn(&FP_per,best3.0));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(FN,i));
%let num_FN_per=%sysfunc(varnum(&dsid,Percent));
%let rc=%sysfunc(fetch(&dsid,1));
%let FN_per=%sysfunc(getvarn(&dsid,&num_FN_per));
%let FN_format=%sysfunc(putn(&FN_per,best3.0));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(TN,i));
%let num_TN_per=%sysfunc(varnum(&dsid,Percent));
%let rc=%sysfunc(fetch(&dsid,1));
%let TN_per=%sysfunc(getvarn(&dsid,&num_TN_per));
%let TN_format=%sysfunc(putn(&TN_per,best3.0));
%let rc=%sysfunc(close(&dsid));
/* Selecting distinct OUTCOME values */

proc sql;
create table distinct_values as
select distinct &OUTCOME.
from &dataset;
quit; 
data LEVEL NOTLEVEL;
  set distinct_values;
   if  &OUTCOME. =  UPCASE(&OUTCOME_LEV.) then output LEVEL;
else OUTPUT NOTLEVEL;   
run;
/* Turning it to character if it's not */
data level_char;
set level;
string_check = anyalpha(&OUTCOME.);
if string_check = 0 then &OUTCOME._char2 = strip(put(&OUTCOME., best.))
;
else &OUTCOME. = &OUTCOME._char2;
run;
data notlevel_char;
set notlevel;
string_check = anyalpha(&OUTCOME.);
if string_check = 0 then &OUTCOME._char2 = strip(put(&OUTCOME., best.))
;
else &OUTCOME. = &OUTCOME._char2;
run;
/* Fetching the distinct levels */
%let dsid=%sysfunc(open(level_char,i));
%let num_level=%sysfunc(varnum(&dsid,&OUTCOME._char2));
%let rc=%sysfunc(fetch(&dsid,1));
%let level=%sysfunc(getvarc(&dsid,&num_level));
%let rc=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(notlevel_char,i));
%let num_notlevel=%sysfunc(varnum(&dsid,&OUTCOME._char2));
%let rc=%sysfunc(fetch(&dsid,1));
%let notlevel=%sysfunc(getvarc(&dsid,&num_notlevel));
%let rc=%sysfunc(close(&dsid));
/* Plot */
ods html style = statistical;
proc template;
define statgraph sgplot;
mvar OUTCOME_TITLE OUTCOME_LEV OUTCOME AUC Sens Section2 Specif level notlevel confusion 
auc_table
XVAR_LABEL  TP FP FN TN TP_per FP_per FN_per TN_per Sensitivity Specificity; 
dynamic XVAR1 YVAR1;
begingraph;
EntryTitle "Operating Characterisitcs for " OUTCOME_TITLE " = " OUTCOME_LEV /;
if (confusion = "Y") 
EntryFootnote textattrs=(color = red )"*" textattrs=()"Confusion Matrix is for the "
XVAR_LABEL " at the cutoff = " textattrs=(color = blue ) Section2 ; 
*EntryFootnote "At cutoff, Sensitivity = " textattrs=(color = blue ) Sensitivity " and 
Specificity = " textattrs=(color = blue ) Specificity  ;
endif;
layout lattice / columns=1 rows=2  rowgutter=2px columngutter=2px
                        rowweights=(.7 .3 );
       columnaxes;
         columnaxis / label=XVAR_LABEL;
endcolumnaxes;
    sidebar / align=right; 
      layout gridded / border = false; 
layout overlay;

DiscreteLegend "SERIES" "SERIES1" / location = outside ACROSS = 1 down 
= 2; 
endlayout; 
layout overlay;
*DiscreteLegend "SERIES3" / location = outside;
endlayout; 
endlayout; 
     
    endsidebar; 
layout overlay / cycleattrs=true xaxisopts = (Label = XVAR_LABEL) yaxisopts=( 
Label="Proportion" type=linear linearopts=( tickvaluelist=( 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8
0.9 1 ) viewmin=0 viewmax=1 ) );
    StepPlot X=X_VALUE Y=SPEC / primary=true LegendLabel="Specificity" NAME="SERIES";
StepPlot X=X_VALUE Y=_SENSIT_ / LegendLabel="Sensitivity" NAME="SERIES1";
Referenceline x =  Section2 / LINEATTRS = (color = black pattern = 2);
if (confusion = "Y") 
/* 2 by 2 */
layout gridded /  columns=3 rows = 3
         halign = CENTER valign = top border=true 
opaque=true backgroundcolor=GraphWalls:color location = outside; 
entry halign=center " " / textattrs=GraphValueText; 
       entry halign=center level  / textattrs=(WEIGHT=bold) /* Setting the titles to bold */  
 ;
entry halign=center notlevel  / textattrs=(WEIGHT=bold); 
   entry halign=center "Event" / textattrs=(WEIGHT=bold) ; 
entry halign=center TP textattrs=(STYLE=ITALIC color = blue )"(" TP_per "%)" /* 
Formatting the percentages */ ; 
entry halign=center FP textattrs=(STYLE=ITALIC color = blue )"(" FP_per "%)" ; 
entry halign=center "Non Event" / textattrs=(WEIGHT=bold); 
      entry halign=center FN  textattrs=(STYLE=ITALIC color = blue )"(" FN_per "%)"; 
       entry halign=center TN textattrs=(STYLE=ITALIC color = blue )"(" TN_per "%)"; 
    
endlayout;
endif;
if (auc_table = "Y") 
layout gridded / columns=4 rows = 1
         halign = center valign = top border=true 
opaque=true backgroundcolor=GraphWalls:color location = outside; 
entry halign=center "AUC " / textattrs=(WEIGHT=bold); 
       entry halign=center "100% Sensitivity: " / textattrs=(WEIGHT=bold); 
entry halign=center "Intersection: "  / textattrs=(WEIGHT=bold); 
   entry halign=center "100% Specificity: " / textattrs=(WEIGHT=bold) ; 
entry halign=center AUC; 
      entry halign=center Sens; 
       entry halign=center Section2; 
entry halign=center Specif ; 
endlayout;

endif;
*DiscreteLegend "SERIES" "SERIES1" / location = outside;
endlayout;
layout lattice / columns=1 rows=2  rowgutter=2px columngutter=2px
                        rowweights=(.5 .5 ) COLUMNDATARANGE = Union;
columnaxes;
         columnaxis / label=XVAR_LABEL;
       endcolumnaxes;
    layout gridded / AUTOALIGN=NONE  columns=3 rows = 3
         halign = right valign = top border=true 
opaque=true backgroundcolor=GraphWalls:color location = outside; 
endlayout;
layout overlay /   yaxisopts=( Label=" " );
Scatterplot x=XVAR1 y = OUTCOME / group = OUTCOME INDEX = Index1 NAME = 
"SERIES3";
Referenceline x =  Section2 / LINEATTRS = (color = black pattern = 2);
endlayout;
layout overlay /   yaxisopts=( Label=" " );
Boxplot y = XVAR1 x = OUTCOME / ORIENT = horizontal;
Referenceline x =  Section2 / LINEATTRS = (color = black pattern = 2);
endlayout;
endlayout;
endlayout;
endgraph;
end;
run;
%let OUTCOME_TITLE = &OUTCOME;
%let OUTCOME_LEV = &OUTCOME_LEV;
%let OUTCOME =  &OUTCOME._char;
%let AUC =  &AUC_format;
%let Sens =  &Sens_format;
%let Section2 = &Section_format;
%let Specif =  &Spec_format;
%let level = &level;
%let notlevel = ¬level;
%let confusion = &confusion;
%let auc_table = &auc_table;
%let XVAR_LABEL = &XVAR_LABEL;
%let TP = &TP;
%let FP = &FP;
%let FN = &FN;
%let TN = &TN;
%let TP_per = &TP_format;
%let FP_per = &FP_format;
%let FN_per = &FN_format;
%let TN_per = &TN_format;
ods listing sge = on;
proc sgrender data=merge3 template=sgplot;
dynamic XVAR1 = "XVAR1" YVAR1 = "YVAR1";
run;
ods listing close;
proc sql;
drop table roc_association, Roctest, ROCData, Outest, _merge, merge2, merge3, 
One_Hundred_PC_Sens, 
sens_spec, sens_spec_intersection, One_Hundred_PC_Spec, freq_input,
CrossTabFreqs, crossTabFreqs_ref, TP, FP, FN, TN, distinct_values, 
LEVEL, NOTLEVEL, level_char, notlevel_char;
quit;
%mend roc_cutoff;


CONTACT INFORMATION
Your comments and questions are valued and encouraged. Please contact the author at:
Kriss Harris
GlaxoSmithKline
Mail Stop : 1B106003
Gunnels Wood Road
Stevenage SG1 2NY
Phone: 01438 76 4904
Email: Kriss.5.Harris@gsk.com
Web: http://www.krissharris.co.uk

沒有留言:

張貼留言

要問問題的人請在文章下方的intensedebate欄位留言,請勿使用blogger預設的意見表單。今後用blogger意見表單留言的人我就不回應了。

CODE { display: block; /* fixes a strange ie margin bug */ font-family: Courier New; font-size: 8pt; overflow:auto; background: #f0f0f0 url(http://klcintw.images.googlepages.com/Code_BG.gif) left top repeat-y; border: 1px solid #ccc; padding: 10px 10px 10px 21px; max-height:200px; height:200px; // for IE6 line-height: 1.2em; }