******************************************************************************************************************************** ** OUTLIER-IDENTIFICATION USING ORDER-ALPHA-EFFICIENCY (oaoutlier) ************************************************************* ******************************************************************************************************************************** *! version 1.0.0 2011-03-02 ht *! author Harald Tauchmann * Outlier-identification based on order-alpha efficiency analysis capture program drop oaoutlier program oaoutlier, rclass version 11.1 if !replay() { quietly { return clear local cmd "oaoutlier" local cmdline "`cmd' `*'" syntax varlist(max=1) [if] [in], INPuts(varlist num) OUTputs(varlist num) [ORT(string)] [NALpha(int -999)] [LEVel(real 0)] [noPLOt] [noROUgh] [noSMOoth] [noBIC] [SMOOTHER(string)] [DOTs] ** MANAGE MORE local moreold `c(more)' set more off ** NUMBER OF INPUTS and OUTPUTS local nin = wordcount("`inputs'") local nout = wordcount("`outputs'") ** TEMPORARY FILES, MATRICES, and VARIABLES tempname _empmat _superm _supers _superm _oaresult _resu _resub tempvar _tempid _strid _xmax _xmin _in_exclu _out_exclu _obsno _perc _cent _superi _share _alpha _twd_share _twdr_share _twdrs_share _oint0 _oint1 _oint2 _oint3 _oint4 _oint5 _empty _idl _idh _idli _idhi _idli_sq _idhi_sq forvalues jj = 1(1)`nin' { tempvar _xi_`jj' } forvalues jj = 1(1)`nout' { tempvar _xo_`jj' } ** CHECK INPUT-OUTPUT-SPECIFICATION forvalues ii = 1(1)`nin' { local iin = word("`inputs'",`ii') forvalues jj = 1(1)`nout' { local jout = word("`outputs'",`jj') if "`jout'" == "`iin'" { display as error "Error: variable `jout' is input and output" local ioerror = 1 continue, break } } } if "`ioerror'" == "1" { exit 103 } ** CHECK FOR DUPLICATES IN INPUT-LIST local firstinp = word("`inputs'",1) local inputsnew "`firstinp'" if `nin' > 1 { forvalues ii = 2(1)`nin' { local duperror = 0 local prior = `ii'-1 local iin = word("`inputs'",`ii') forvalues jj = 1(1)`prior' { local jin = word("`inputs'",`jj') if "`iin'" == "`jin'" { local duperror = 1 } } if `duperror' != 1 { local inputsnew "`inputsnew' `iin'" } } } if "`inputs'" != "`inputsnew'" { local inputs "`inputsnew'" local nin = wordcount("`inputs'") display as error "Warning: duplicates in list of inputs dropped" } ** CHECK FOR DUPLICATES IN OUTPUT-LIST local firstout = word("`outputs'",1) local outputsnew "`firstout'" if `nout' > 1 { forvalues ii = 2(1)`nout' { local duperror = 0 local prior = `ii'-1 local iout = word("`outputs'",`ii') forvalues jj = 1(1)`prior' { local jout = word("`outputs'",`jj') if "`iout'" == "`jout'" { local duperror = 1 } } if `duperror' != 1 { local outputsnew "`outputsnew' `iout'" } } } if "`outputs'" != "`outputsnew'" { local outputs "`outputsnew'" local nout = wordcount("`outputs'") display as error "Warning: duplicates in list of outputs dropped" } ** CHECK SPECIFICATION OF ORIentation if "`ort'" != "output" & "`ort'" != "input" & "`ort'" != "" { display as error "Error: option ort incorrectly specified" exit 198 } local ort2 "`ort'" if "`ort'" == "" { local ort2 "input" } ** TEMPORARY DMU-IDENTIFIER gen `_tempid' = _n ** PRESERVE ORIGINAL DATA-FILE preserve ** SELECT RELEVANT SAMPLE if "`if'" != "" { keep `if' } if "`in'" != "" { keep `in' } ** DROP DMUs WITH MISSING INFORMATION foreach jj of varlist `inputs' `outputs' { drop if `jj' <= 0 | `jj' >=. } drop if missing(`varlist') ** SAMPLE_SIZE FOR EFFICIENCY ANALYSIS local samps = _N ** CHECK FOR EMPTY SAMPLE if `samps' == 0 { display as error "Error: no observations" restore exit 2000 } ** CHECK IDENTIFIER FOR DUPLICATES duplicates report `varlist' if r(N) > r(unique_value) { display as error "Error: variable `varlist' does not uniquely identify dmus" restore exit 498 } ** CHECK NUMBER OF PERCENTILES if `nalpha' == -999 { local nperc = `samps'+1 } if `nalpha' <= 1 & `nalpha' != -999 { local nperc = `samps'+1 display as error "Warning: nalpha < 2, value set to default" } if `nalpha' > `samps'+1 { local nperc = `samps'+1 display as error "Warning: nalpha > N, value set to default" } if `nalpha' > 1 & `nalpha' <= `samps'+1 { local nperc = `nalpha'+1 } ** ORDER-ALPA EFFICIENCY SCORES if "`ort'" == "output" { forvalues ii = 1(1)`samps' { capture drop `_xmin' local jj = 0 foreach vv of varlist `outputs' { local jj = 1+ `jj' capture drop `_xo_`jj'' gen `_xo_`jj'' = `vv'/`vv'[`ii'] if `jj' == 1 { local xrlo "`_xo_`jj''" } else { local xrlo "`xrlo', `_xo_`jj''" } } if `jj' == 1 { gen `_xmin' = `xrlo' } else { gen `_xmin' = min(`xrlo') } capture drop `_in_exclu' gen `_in_exclu' = 0 foreach vv of varlist `inputs' { replace `_in_exclu' = 1 if `vv' > `vv'[`ii'] } ** INVERT OUTPUT-ORIENTED EFFICIENCY replace `_xmin' = (`_xmin')^-1 ** GENERETE PERCENTILES capture drop `_perc' `_cent' pctile `_perc' = `_xmin' if `_in_exclu' == 0, nquantiles(`nperc') genp(`_cent') capture drop `_superi' gen `_superi' = `_perc' > 1 replace `_superi' =. if `_perc' ==. replace `_cent' = 100 -`_cent' sort `_cent' if `ii' == 1 { mkmat `_cent' if `_cent'<., matrix(`_centm') mkmat `_superi' if `_superi'<., matrix(`_supers') } else { mkmat `_superi' if `_superi'<., matrix(`_superm') matrix `_supers' = `_supers'+ `_superm' } sort `_tempid' if "`dots'" == "dots" { if `ii'/50 == round(`ii'/50) | `ii' == `samps' { noisily: display as text ". `ii'" } else { if `ii' == 1 { noisily: display _newline as text "looping through data:" } noisily: display as text "." _continue } } } } else { forvalues ii = 1(1)`samps' { capture drop `_xmax' local jj = 0 foreach vv of varlist `inputs' { local jj = 1+ `jj' capture drop `_xi_`jj'' gen `_xi_`jj'' = `vv'/`vv'[`ii'] if `jj' == 1 { local xrli "`_xi_`jj''" } else { local xrli "`xrli', `_xi_`jj''" } } if `jj' == 1 { gen `_xmax' = `xrli' } else { gen `_xmax' = max(`xrli') } capture drop `_out_exclu' gen `_out_exclu' = 0 foreach vv of varlist `outputs' { replace `_out_exclu' = 1 if `vv' < `vv'[`ii'] } ** GENERETE PERCENTILES capture drop `_perc' `_cent' pctile `_perc' = `_xmax' if `_out_exclu' == 0, nquantiles(`nperc') genp(`_cent') capture drop `_superi' gen `_superi' = `_perc' > 1 replace `_superi' =. if `_perc' ==. replace `_cent' = 100 -`_cent' sort `_cent' if `ii' == 1 { mkmat `_cent' if `_cent'<., matrix(`_centm') mkmat `_superi' if `_superi'<., matrix(`_supers') } else { mkmat `_superi' if `_superi'<., matrix(`_superm') matrix `_supers' = `_supers'+ `_superm' } sort `_tempid' ** LOOP-DOTS if "`dots'" == "dots" { if `ii'/50 == round(`ii'/50) | `ii' == `samps' { noisily: display as text ". `ii'" } else { if `ii' == 1 { noisily: display _newline as text "looping through data:" } noisily: display as text "." _continue } } } } ** RESULTS BACK TO VARIABLES matrix `_supers' = (1/`samps')*`_supers' svmat `_supers', names(`_share') svmat `_cent', names(`_alpha') keep `_share' `_alpha' drop if `_share' ==. | `_alpha' ==. ** OUTLIER DETECTION ** TWICE-DIFFERENCED SERIES capture drop `_twd_share' gen `_twd_share' = `_share'[_n+1]+`_share'[_n-1]-2*`_share' regress `_twd_share' `_alpha' capture drop `_twdr_share' predict `_twdr_share', residuals if "`rough'" != "norough" { sum `twdr_share' capture drop `_oint0' gen `_oint0' = `_twdr_share'/r(sd) < invnormal(1-(100-`level')/200) & `_twdr_share'[_n-1]/r(sd) >= 0 /*invnormal(1-(100-`level')/200)*/ & `_twdr_share'[_n-1]<. sum `_twdr_share' if `_oint0' == 1 local minval = r(min) sum `_alpha' if `_oint0' == 1 & round(`_twdr_share',10^-12) == round(`minval',10^-12) local arough1 = r(mean) sum `_twdr_share' if `_oint0' == 1 & `_twdr_share' > `minval' local minval = r(min) sum `_alpha' if `_oint0' == 1 & round(`_twdr_share',10^-12)==round(`minval',10^-12) if r(mean) < 0.75*`arough1' { local arough2 = r(mean) sum `_twdr_share' if `_oint0' == 1 & `_twdr_share' > `minval' local minval = r(min) sum `_alpha' if `_oint0' == 1 & round(`_twdr_share',10^-12)==round(`minval',10^-12) if r(mean) < 0.75*`arough1' { local arough3 = r(mean) } } } if "`smooth'" != "nosmooth" { local kk = 0 if "`smoother'" == "" { local csmooth "3 3RSREH 3RSR5REH 3RSR5R7REH 3RSR5R7R9REH" } else { local csmooth "`smoother'" } local nsmooth = wordcount("`csmooth'") foreach smo in `csmooth' { local kk = `kk'+1 capture drop `_twdrs_share' smooth `smo' `_twdr_share', generate(`_twdrs_share') sum `_twdrs_share' capture drop `_oint`kk'' gen `_oint`kk'' = `_twdrs_share'/r(sd) < invnormal(1-(100-`level')/200) /*0*/ & `_twdrs_share'[_n-1]/r(sd) >= 0 /*invnormal(1-(100-`level')/200)*/ & `_twdrs_share'[_n-1]<. sum `_twdrs_share' if `_oint`kk'' == 1 local oN = r(N) local min1 = r(min) if `oN' > 1 { sum `_twdrs_share' if `_oint`kk'' == 1 & `_twdrs_share'>`min1' local min2 = r(min) } if `oN' > 2 { sum `_twdrs_share' if `_oint`kk'' == 1 & `_twdrs_share'>`min2' local min3 = r(min) } if `oN' == 0 { continue, break } if `oN' > 3 & `kk' < `nsmooth' { macro drop _asmooth1 sum `_alpha' if `_oint`kk'' == 1 & round(`_twdrs_share',10^-12) == round(`min1',10^-12) local asmooth1 = r(min) continue } if `oN' == 1 { macro drop _asmooth1 sum `_alpha' if `_oint`kk'' == 1 & round(`_twdrs_share',10^-12) == round(`min1',10^-12) local asmooth1 = r(min) } if `oN' == 2 { macro drop _asmooth1 macro drop _asmooth2 sum `_alpha' if `_oint`kk'' == 1 & round(`_twdrs_share',10^-12) == round(`min1',10^-12) local asmooth1 = r(min) sum `_alpha' if `_oint`kk'' == 1 & round(`_twdrs_share',10^-12) == round(`min2',10^-12) if round(`asmooth1',10^-12) != round(r(min),10^-12) { local asmooth2 = r(min) } } if `oN' == 3 | (`oN' > 3 & `kk' == `nsmooth') { macro drop _asmooth1 macro drop _asmooth2 macro drop _asmooth3 sum `_alpha' if `_oint`kk'' == 1 & round(`_twdrs_share',10^-12) == round(`min1',10^-12) local asmooth1 = r(min) sum `_alpha' if `_oint`kk'' == 1 & round(`_twdrs_share',10^-12) == round(`min2',10^-12) local asmooth2 = r(min) sum `_alpha' if `_oint`kk'' == 1 & round(`_twdrs_share',10^-12) == round(`min3',10^-12) local asmooth3 = r(min) } } } ** GLOBAL APPROACH TO OUTLIER DETECTION if "`bic'" != "nobic" { forvalues ii = 1(1)`samps' { capture drop `_idl' `_idh' `_idli' `_idhi' `_idli_sq' `_idhi_sq' local alph = `_alpha'[`ii'] gen `_idl' = 0 replace `_idl' = 1 if `_alpha' < `alph' gen `_idh' = 1- `_idl' gen `_idli' = `_idl'*`_alpha' gen `_idhi' = `_idh'*`_alpha' gen `_idli_sq' = `_idli'*`_idli' gen `_idhi_sq' = `_idhi'*`_idhi' forvalues kk = 1(1)4 { if `kk' == 1 { local sqreg "" } if `kk' == 2 { local sqreg "`_idli_sq'" } if `kk' == 3 { local sqreg "`_idhi_sq'" } if `kk' == 4 { local sqreg "`_idli_sq' `_idhi_sq'" } regress `_share' `_idl' `_idh' `_idli' `_idhi' `sqreg', noconst estat ic mat `_resu' = r(S) mat `_resub' = `_resu' if `_resu'[1,6] < `_resub'[1,6] & `kk' > 1 { mat `_resub' = `_resu' } } if `ii' == 1 { local abic1 = `alph' local bic = `_resub'[1,6] } else { if `_resub'[1,6] < `bic' { local abic1 = `alph' local bic = `_resub'[1,6] } } } } ** ORDER-ALPHA RESULTS AS MATRIX matrix `_oaresult' = (`_cent',`_supers') matrix colnames `_oaresult' = alpha share local oarow = rowsof(`_oaresult') ** MATCHING ALPHA TO SHARES and RETURNING RESULTS foreach type in smooth rough bic { foreach jj in 1 2 3 { if "`a`type'`jj''" != "" & "`a`type'`jj''" != "." { forvalues ll = 1(1)`oarow' { local verg = `_oaresult'[`ll',1] display "`a`type'`jj''" if round(`a`type'`jj'',10^-12) == round(`verg',10^-12) { local s`type'`jj' = `_oaresult'[`ll',2] } } } } } ** RETURN REMAINING RESULTS foreach kk in a s { foreach type in smooth rough bic { forvalues jj = 1(1)3 { if "``kk'`type'`jj''" != "" & "``kk'`type'`jj''" != "." { if `jj' == 1 | (`jj' == 2 & "``kk'`type'`jj''" != "``kk'`type'1'") | (`jj' == 3 & "``kk'`type'`jj''" != "``kk'`type'1'" & "``kk'`type'`jj''" != "``kk'`type'2'") { return scalar `kk'`type'`jj' = ``kk'`type'`jj'' } else { macro drop _`kk'`type'`jj' } } } } } return matrix oaresult = `_oaresult' if "`ort'" == "output" { return local ort "output" } else { return local ort "input" } return local smoother "`smo'" return local title "Order-alpha based outlier detection" return local cmdline `cmdline' return local cmd `cmd' ** GRAPH RESULTS if "`plot'" != "noplot" { local xline "" foreach jj in 1 2 3 { if "`jj'" == "1" { local style "solid" } if "`jj'" == "2" { local style "dash" } if "`jj'" == "3" { local style "dot" } foreach type in smooth rough bic { if "`type'" == "smooth" { local colour "blue" } if "`type'" == "rough" { local colour "red" } if "`type'" == "bic" { local colour "midgreen" } if "`a`type'`jj''" != "" & "`a`type'`jj''" != "." { local xline "`xline' xline(`a`type'`jj'', lpattern(`style') lwidth(medium) lcolor(`colour'))" } } } gen `_empty' =. local legline "" local legbox `"legend("' local legcol = 0 local legnum = 1 local legord "" if "`smooth'" != "nosmooth" { local legnum = `legnum' +1 local legnums `"`legnum'"' local legord `"`legord' `legnums'"' local legname `""smoothed""' local legline "`legline' (line `_empty' `_alpha', lpattern(solid) lwidth(medium) lcolor(blue))" local legbox `"`legbox'label(`legnums' `legname') "' local legcol = `legcol' + 1 } if "`rough'" != "norough" { local legnum = `legnum' +1 local legnums `"`legnum'"' local legord `"`legord' `legnums'"' local legname `""rough""' local legline "`legline' (line `_empty' `_alpha', lpattern(solid) lwidth(medium) lcolor(red))" local legbox `"`legbox'label(`legnums' `legname') "' local legcol = `legcol' + 1 } if "`bic'" != "nobic" { local legnum = `legnum' +1 local legnums `"`legnum'"' local legord `"`legord' `legnums'"' local legname `""bic""' local legline "`legline' (line `_empty' `_alpha', lpattern(solid) lwidth(medium) lcolor(midgreen))" local legbox `"`legbox'label(`legnums' `legname') "' local legcol = `legcol' + 1 } if "`smooth'" == "nosmooth" & "`rough'" == "norough" & "`bic'" == "nobic" { twoway (line `_share' `_alpha', lpattern(solid) lwidth(medthick) lcolor(black)), ytitle("share of super-efficient dmus") xtitle("alpha") ylabel(0.2(0.2)1, grid) xlabel(0(10)100) } else { twoway (line `_share' `_alpha', lpattern(solid) lwidth(medthick) lcolor(black)) `legline', /* */ `xline' ytitle("share of super-efficient dmus") xtitle("alpha") ylabel(0.2(0.2)1, grid) xlabel(0(10)100) `legbox' order(`legord') /* */ size(small) cols(`legcol') subtitle("suggested points of discontinuity", justification(center) margin(zero) size(small))) } } ** RESTORE ORG-DATA restore } set more `moreold' } ** DISPLAY RESULTS if "`bic'" != "nobic" | "`rough'" != "norough" | "`smooth'" != "nosmooth" { display _newline as text "Order-alpha based Outlier Detection (suggested points of discontinuity):" _newline foreach type in smooth rough bic { foreach jj in 1 2 3 { if "`jj'" == "1" { local num "Prime" } if "`jj'" == "2" { local num "Second-rate" } if "`jj'" == "3" { local num "Third-rate" } if "`a`type'`jj''" != "" & "`a`type'`jj''" != "." { display as text "`num' suggestion by criterion " as result "`type'" as text " at alpha "_column(55)"= " as result %-06.4g `a`type'`jj'' as text "(share of outliers = " as result %-06.4g `s`type'`jj'' as text")" } } } } else { display _newline as text "Order-alpha based Outlier Detection: no criterion for detecting discontinuities specified" } end exit 0