
Friday, April 25, 2014

Example 2014.5: Simple mean imputation

We're both users of multiple imputation for missing data. We believe it is the most practical principled method for incorporating the most information into data analysis. In fact, one of our more successful collaborations is a review of software for multiple imputation.

But, for me at least, there are times when a simpler form of imputation may be useful. For example, it may be desirable to calculate the mean of the observed values and substitute it for any missing values. Typically it would be unwise to attempt to use a data set completed in this way for formal inference, but it could be convenient under deadline pressure or for a very informal overview of the data.

Nick disagrees. He finds it hard to imagine any setting in which he would ever use such a primitive approach. He passes on to the reader the sage advice he received in graduate school: that making up data in such an ad-hoc fashion might be construed as dereliction or even misconduct. Use of single imputation approaches (which yield bias in many settings and attenuate estimates of variance) seems hard to justify in 2014. But one of the hallmarks of our partnership is that we can agree to disagree on an absolute ban, while jointly advising the reader to proceed with great caution.

SAS
In SAS, it would be possible to approach this using proc means to find the means and then add them back into the data set in a data step. But there is a simpler way, using proc standard.
proc standard data=indata out=outdata replace; 
run;
This will replace the values of all missing numeric variables in the indata data set with the mean of the observed values, and save the result in a new data set, outdata. To restrict the operation to specific variables, use a var statement.

R
There may be a function designed to do this in R, but it's simple enough using the features of the language. We provide an option using the bracket ([) extractor operator and another using the ifelse() function. The latter may be more approachable for those less familiar with R.
df = data.frame(x = 1:20, y = c(1:10,rep(NA,10)))
df$y[is.na(df$y)] = mean(df$y, na.rm=TRUE)
# alternative
df = transform(df, y = ifelse(is.na(y), mean(y, na.rm=TRUE), y))
 
In the first example, we identify the elements of y that are NA and replace them with the mean. In the second, we test each element of y: if it is NA, we replace it with the mean; otherwise we keep the original value.
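The same idea translates directly to other languages; here is a minimal Python sketch (our illustration, not part of the original R/SAS comparison) that mirrors the ifelse() version:

```python
# Mean-impute missing values in a list, mirroring the R ifelse() idiom.
def mean_impute(values):
    """Replace None entries with the mean of the observed values."""
    observed = [v for v in values if v is not None]
    m = sum(observed) / len(observed)
    return [m if v is None else v for v in values]

# Same shape as the R example: y = c(1:10, rep(NA, 10))
y = list(range(1, 11)) + [None] * 10
y_imputed = mean_impute(y)
print(y_imputed[10])  # each missing entry becomes mean(1:10) = 5.5
```

As in the R version, every missing entry is replaced by the single observed mean, so the variance of the completed variable is artificially deflated; hence the caution above.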

An unrelated note about aggregators: We love aggregators! Aggregators collect blogs that have similar coverage for the convenience of readers, and for blog authors they offer a way to reach new audiences. SAS and R is aggregated by R-bloggers, PROC-X, and statsblogs with our permission, and by at least 2 other aggregating services which have never contacted us. If you read this on an aggregator that does not credit the blogs it incorporates, please come visit us at SAS and R. We answer comments there and offer direct subscriptions if you like our content. In addition, no one is allowed to profit by this work under our license; if you see advertisements on this page, the aggregator is violating the terms by which we publish our work.

Monday, May 21, 2012

Example 9.32: Multiple testing simulation

In examples 9.30 and 9.31 we explored corrections for multiple testing and then extracting p-values adjusted by the Benjamini and Hochberg (or FDR) procedure. In this post we'll develop a simulation to explore the impact of "strong" and "weak" control of the family-wise error rate offered in multiple comparison corrections. Loosely put, weak control procedures may fail when some of the null hypotheses are actually false, in that the remaining (true) nulls may be rejected more than the nominal proportion of times.

For our simulation, we'll develop flexible code to generate some p-values from false nulls and others from true nulls. We'll assume that the true nulls have p-values distributed uniform (0,1); the false nulls will have p-values distributed uniform with a user-determined maximum. We'll also allow the number of tests overall and the number of false nulls to be set.
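To make the setup concrete, here is a language-neutral sketch of the same simulation in Python (our illustration; the Benjamini-Hochberg adjustment is implemented by hand rather than taken from any package, so treat it as a sketch of the p.adjust logic):

```python
import random

def bh_adjust(pvals):
    """Benjamini-Hochberg (FDR) adjusted p-values, as in R's p.adjust(p, "fdr")."""
    n = len(pvals)
    # Walk from the largest p-value down, carrying the running minimum
    # of p * n / rank so the adjusted values stay monotone.
    order = sorted(range(n), key=lambda i: pvals[i], reverse=True)
    adjusted = [0.0] * n
    running_min = 1.0
    for k, i in enumerate(order):
        rank = n - k  # rank of pvals[i] among the sorted p-values
        running_min = min(running_min, pvals[i] * n / rank)
        adjusted[i] = running_min
    return adjusted

def sim_family(ntests=20, nfalse=10, howfalse=0.001):
    """One family: nfalse false nulls, uniform (0, howfalse); the rest uniform (0, 1)."""
    p = ([random.random() * howfalse for _ in range(nfalse)] +
         [random.random() for _ in range(ntests - nfalse)])
    adjusted = bh_adjust(p)
    return sum(a < 0.05 for a in adjusted) / ntests  # proportion rejected

random.seed(42)
props = [sim_family() for _ in range(1000)]
```

With these defaults the 10 false nulls are essentially always rejected, so any prop above 0.5 signals rejection of one or more true nulls, which is exactly what the weak/strong control comparison is probing.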

SAS
In SAS, a macro does the job. It accepts the user parameters described above, then generates false and true nulls for each desired simulation. With the data created, we can use proc multtest to apply the FDR procedure, with the ODS system saving the results. Note how the by statement allows us to replicate the analysis for each simulated set of p-values without creating a separate data set for each one. (Also note that we do not use proc sort before that by statement-- this can be risky, but works fine here.)

%macro fdr(nsims=1, ntests = 20, nfalse=10, howfalse=.01);
ods select none;
data test;
  do sim = 1 to &nsims;
    do i = 1 to &ntests;
      raw_p = uniform(0) *
        ( ((i le &nfalse) * &howfalse ) + ((i gt &nfalse) * 1 ) );
      output;
    end;
  end;
run;

ods output pvalues = __pv;
proc multtest inpvalues=test fdr;
  by sim;
run;

With the results in hand, (still within the macro) we need to do some massaging to make the results usable. First we'll recode the rejections (assuming a 0.05 alpha level) so that non-rejections are 0 and rejections are 1/number of tests. That way we can just sum across the results to get the proportion of rejections. Next, we transform the data to get each simulation in a row (section 1.5.4). (The data output from proc multtest has nsims*ntests rows. After transposing, there are nsims rows.) Finally, we can sum across the rows to get the proportion of tests rejected in each simulated family of tests. The results are shown in a table made with proc freq.

data __pv1;
  set __pv;
  if falsediscoveryrate lt 0.05 then fdrprop = 1/&ntests;
  else fdrprop = 0;
run;

proc transpose data = __pv1 (keep = sim fdrprop) out = pvals_a;
  by sim;
run;

data pvals;
  set pvals_a;
  prop = sum(of col1 - col&ntests);
run;
ods select all;

proc freq data = pvals;
  tables prop;
run;
%mend fdr;

%fdr(nsims = 1000, ntests = 20, nfalse = 10, howfalse=.001);

                                     Cumulative    Cumulative
 prop    Frequency     Percent      Frequency       Percent
 -------------------------------------------------------------
  0.5         758       75.80            758         75.80
 0.55         210       21.00            968         96.80
  0.6          27        2.70            995         99.50
 0.65           5        0.50           1000        100.00

So one or more true nulls were rejected in about 24% of the simulated families, which seems like a lot. Multiple comparison procedures with "strong" control of the familywise error rate will reject any true nulls only 5% of the time. Building this simulation as a macro facilitates exploring the effects of the multiple comparison procedures in a variety of settings.
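The 24% figure can be read directly off the table: it is the total percent of families with prop above 0.5, i.e., families in which more than the 10 false nulls were rejected. A quick arithmetic check (our addition, in Python):

```python
# Percent column from the proc freq table above, keyed by prop.
percents = {0.5: 75.80, 0.55: 21.00, 0.6: 2.70, 0.65: 0.50}

# Families with prop > 0.5 rejected at least one true null.
excess = sum(pct for prop, pct in percents.items() if prop > 0.5)
print(round(excess, 1))  # 24.2
```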

R
As in example 9.31, the R code is rather simpler, though perhaps a bit opaque. To make the p-values, we generate them first for all of the tests with false nulls, then for all of the tests with true nulls. The matrix() function reads these in by column by default, meaning that the first nfalse columns get the nsims*nfalse false-null p-values. The apply() function generates the FDR p-values for each row of the data set. The t() function just transposes the resulting matrix so that we get back a row for each simulation. As in the SAS version, we'll count each rejection as 1/ntests and each non-rejection as 0; we do this with the ifelse() function. Then we sum across the simulations with another call to apply() and show the results with a simple table.

checkfdr = function(nsims=1, ntests=100, nfalse=0, howfalse=0.001) {
  raw_p = matrix(c(runif(nfalse * nsims) * howfalse,
                   runif((ntests - nfalse) * nsims)), nrow=nsims)
  fdr = t(apply(raw_p, 1, p.adjust, "fdr"))
  reject = ifelse(fdr < .05, 1/ntests, 0)
  prop = apply(reject, 1, sum)
  prop.table(table(prop))
}

> checkfdr(nsims=1000, ntests=20, nfalse=10, howfalse=.001)
prop
0.5 0.55 0.6 0.65
0.755 0.210 0.032 0.003

The results are reassuringly similar to those from SAS. In this R code, it's particularly simple to try a different test-- just replace "fdr" in the p.adjust() call. Here's the result with the Hochberg test, which has strong control.

checkhoch = function(nsims=1, ntests=100, nfalse=0, howfalse=0.001) {
  pvals = matrix(c(runif(nfalse * nsims) * howfalse,
                   runif((ntests - nfalse) * nsims)), nrow=nsims)
  hochberg = t(apply(pvals, 1, p.adjust, "hochberg"))
  reject = ifelse(hochberg < .05, 1/ntests, 0)
  prop = apply(reject, 1, sum)
  prop.table(table(prop))
}

> checkhoch(nsims=1000, ntests=20, nfalse=10, howfalse=.001)
prop
0.5 0.55 0.6
0.951 0.046 0.003

With this procedure one or more of the true nulls is rejected an appropriate 4.9% of the time. For the most part, we feel more comfortable using multiple testing procedures with "strong control".



Monday, April 30, 2012

Example 9.29: the perils of for loops

A recent exchange on the R-sig-teaching list featured a discussion of how best to teach new students R. The initial post included an exercise to write a function that, given n, will draw n rows of a triangle made up of "*", noting that for a beginner this may require two for loops. For example, in pseudo-code:

for i = 1 to n
for j = 1 to i
print "*"

Unfortunately, as several folks (including Richard M. Heiberger and R. Michael Weylandt) noted, for loops in general are not the best way to take full advantage of R. In this entry, we review two solutions they proposed which fit within the R philosophy.

Richard's solution uses the outer() function to generate a 5x5 matrix of logical values indicating whether the row number is greater than or equal to the column number. Next the ifelse() function is used to replace TRUE with "*" and FALSE with a space.

> ifelse(outer(1:5, 1:5, `>=`), "*", " ")
[,1] [,2] [,3] [,4] [,5]
[1,] "*" " " " " " " " "
[2,] "*" "*" " " " " " "
[3,] "*" "*" "*" " " " "
[4,] "*" "*" "*" "*" " "
[5,] "*" "*" "*" "*" "*"
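The outer() trick is not R-specific; the same row/column comparison can be sketched in Python with a nested comprehension (our illustration):

```python
# Build the triangle by comparing row and column indices,
# as outer(1:5, 1:5, `>=`) does in the R solution above.
n = 5
tri = [["*" if col <= row else " " for col in range(1, n + 1)]
       for row in range(1, n + 1)]
for row in tri:
    print(" ".join(row))
```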

Michael's solution uses the lapply() function to call a function repeatedly for different values of n. This returns a list rather than a matrix, but accomplishes the same task.

> lapply(1:5, function(x) cat(rep("*", x), "\n"))
*
* *
* * *
* * * *
* * * * *

While this exercise is of little practical value, it does illustrate some important points, and provides a far more efficient as well as elegant way of accomplishing the task. For those interested in more, another resource is the R Inferno project of Patrick Burns.

SAS
We demonstrate a SAS data step solution mainly to call out some useful features and cautions. In all likelihood a proc iml matrix-based solution would be more elegant.

data test;
  array star [5] $ star1 - star5;
  do i = 1 to 5;
    star[i] = "*";
    output;
  end;
run;

proc print noobs; var star1 - star5; run;

star1 star2 star3 star4 star5

*
* *
* * *
* * * *
* * * * *

In particular, note the $ in the array statement, which allows the variables to contain characters; by default variables created by an array statement are numeric. In addition, note the reference to a sequentially suffixed list of variables using the single hyphen shortcut; this would help in generalizing to n rows. Finally, note that we were able to avoid a second do loop (SAS' primary iterative looping syntax) mainly by luck-- the most recently generated value of a variable is saved by default. This can cause trouble, in general, but here it keeps all the previous "*"s when moving on to the next row.





Monday, June 13, 2011

Example 8.40: Side-by-side histograms


It's often useful to compare histograms for some key variable, stratified by levels of some other variable. There are several ways to display something like this. The simplest may be to plot the two histograms in separate panels.

SAS
In SAS, the most direct and generalizable approach is through the sgpanel procedure.


proc sgpanel data = 'c:\book\help.sas7bdat';
  panelby female;
  histogram cesd;
run;

The results are shown above.


R
In R, the lattice package provides a similarly direct approach.

ds = read.csv("http://www.math.smith.edu/r/data/help.csv")
ds$gender = ifelse(ds$female==1, "female", "male")
library(lattice)
histogram(~ cesd | gender, data=ds)

The results are shown below.

Tuesday, January 18, 2011

Example 8.21: latent class analysis

Latent class analysis is a technique used to classify observations based on patterns of categorical responses. Collins and Lanza's book, "Latent Class and Latent Transition Analysis," provides a readable introduction, while the UCLA ATS center has an online statistical computing seminar on the topic.

We consider an example analysis from the HELP dataset, where we wish to classify subjects based on their observed (manifest) status on the following variables: 1) on street or in shelter in past 180 days [homeless], 2) CESD score above 20, 3) received substance abuse treatment [satreat], or 4) linked to primary care [linkstatus]. We arbitrarily specify a three class solution.

SAS
Support for this method in SAS is available through the proc lca and proc lta add-on routines created and distributed by the Methodology Center at Penn State University. While it's customary in R to use researcher-written routines, it's less so for SAS; the machinery which allows independently written procs thus has the potential to mislead users. It bears explicitly stating that third-party procs probably don't have the same level of robustness or support as those distributed by SAS Institute.

The proc lca code assumes that the data exist in the dataset ds. The current coding of 0's and 1's needs to be changed to 1's and 2's.

data ds_0; set "c:\book\help.sas7bdat"; run;

data ds; set ds_0;
  homeless = homeless + 1;
  cesdcut = (cesd > 20) + 1;
  satreat = satreat + 1;
  linkstatus = linkstatus + 1;
run;

The call to the LCA procedure specifies the number of classes, the variables to include, the number of categories per variable, and information about the starting values and random starts. It's highly recommended to run a "large" number of random starts to ensure that the true maximum likelihood estimate is reached (the 20 we used is likely too few for more complex models).

proc lca data=ds;
  title '3 class model';
  nclass 3;
  items homeless cesdcut satreat linkstatus;
  categories 2 2 2 2;
  seed 42;
  nstarts 20;
run;

The output begins with diagnostic information, and indicates that 40% of the seeds were associated with the best fitting model.

Data Summary, Model Information, and Fit Statistics (EM
Algorithm)

Number of subjects in dataset: 431
Number of subjects in analysis: 431

Number of measurement items: 4
Response categories per item: 2 2 2 2
Number of groups in the data: 1
Number of latent classes: 3
Rho starting values were randomly generated (seed = 42).

No parameter restrictions were specified (freely estimated).

Seed selected for best fitted model: 1486228051
Percentage of seeds associated with best fitted model: 40.00%

The model converged in 3241 iterations.

Maximum number of iterations: 5000
Convergence method: maximum absolute deviation (MAD)
Convergence criterion: 0.000001000

A number of fit statistics are provided to help with model comparison (e.g. number of classes, constraints in more complex models).

=============================================
Fit statistics:
=============================================
Log-likelihood: -1032.48
G-squared: 1.22
AIC: 29.22
BIC: 86.15
CAIC: 100.15
Adjusted BIC: 41.72
Entropy R-sqd.: 0.94
Degrees of freedom: 1

The results indicate that 22% of subjects are in class 1, just 8% in class 2, and 70% in class 3.

Parameter Estimates
Gamma estimates (class membership probabilities):
Class: 1 2 3
0.2163 0.0785 0.7052

The next set of output describes the classes. The prevalence for each level of each variable is described for each class. The last response category is redundant (equal to 1 minus the sum of the other probabilities).

Rho estimates (item response probabilities):
Response category 1:
Class: 1 2 3
homeless : 0.2703 1.0000 0.5625
cesdcut : 0.1154 0.4214 0.1678
satreat : 0.0004 0.0000 1.0000
linkstatus : 0.6029 1.0000 0.5855

Response category 2:
Class: 1 2 3
homeless : 0.7297 0.0000 0.4375
cesdcut : 0.8846 0.5786 0.8322
satreat : 0.9996 1.0000 0.0000
linkstatus : 0.3971 0.0000 0.4145
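As a sanity check on the redundancy noted above, the two response-category probabilities for each item must sum to one. Here are the class 1 values copied from the output, verified in Python (our addition):

```python
# Rho estimates for class 1, copied from the proc lca output above.
category1 = {"homeless": 0.2703, "cesdcut": 0.1154,
             "satreat": 0.0004, "linkstatus": 0.6029}
category2 = {"homeless": 0.7297, "cesdcut": 0.8846,
             "satreat": 0.9996, "linkstatus": 0.3971}

for item in category1:
    total = category1[item] + category2[item]
    assert abs(total - 1.0) < 1e-6  # category 2 is 1 minus category 1
```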

Members of class 1 were primarily homeless subjects with a larger proportion of high scores on the CESD, with substance abuse treatment history, and 40% of whom linked to primary care. Class 2 (the smallest group) was comprised of non-homeless subjects with lower CESD scores, substance abuse treatment, but no linkage. Class 3 was 44% homeless, had high levels of CESD, did not report substance abuse treatment, and 41% linked to primary care.

R

We begin by reading in the data. Then we use the within() function (section 1.3.1) to add the dichotomized CESD variable to the data frame.

ds = read.csv("http://www.math.smith.edu/r/data/help.csv")
ds = within(ds, cesdcut <- ifelse(cesd > 20, 1, 0))


The poLCA package supports estimation of latent class models in R. The poLCA() function, like proc lca, can incorporate polytomous categorical variables; also like proc lca, it requires the variables to be coded starting with positive integers. We specify 10 repetitions (with random starting values).

library(poLCA)
res2 = poLCA(cbind(homeless=homeless+1,
                   cesdcut=cesdcut+1, satreat=satreat+1,
                   linkstatus=linkstatus+1) ~ 1,
             maxiter=50000, nclass=3,
             nrep=10, data=ds)

This generates the following output:

Model 1: llik = -1032.889 ... best llik = -1032.889
Model 2: llik = -1032.889 ... best llik = -1032.889
Model 3: llik = -1032.484 ... best llik = -1032.484
Model 4: llik = -1032.889 ... best llik = -1032.484
Model 5: llik = -1032.889 ... best llik = -1032.484
Model 6: llik = -1032.484 ... best llik = -1032.484
Model 7: llik = -1032.484 ... best llik = -1032.484
Model 8: llik = -1032.889 ... best llik = -1032.484
Model 9: llik = -1032.889 ... best llik = -1032.484
Model 10: llik = -1032.889 ... best llik = -1032.484
Conditional item response (column) probabilities,
by outcome variable, for each class (row)

$homeless
Pr(1) Pr(2)
class 1: 0.2703 0.7297
class 2: 1.0000 0.0000
class 3: 0.5625 0.4375

$cesdcut
Pr(1) Pr(2)
class 1: 0.1154 0.8846
class 2: 0.4213 0.5787
class 3: 0.1678 0.8322

$satreat
Pr(1) Pr(2)
class 1: 0 1
class 2: 0 1
class 3: 1 0

$linkstatus
Pr(1) Pr(2)
class 1: 0.6029 0.3971
class 2: 1.0000 0.0000
class 3: 0.5855 0.4145

Estimated class population shares
0.2162 0.0785 0.7053

Predicted class memberships (by modal posterior prob.)
0.181 0.1137 0.7053

=========================================================
Fit for 3 latent classes:
=========================================================
number of observations: 431
number of estimated parameters: 14
residual degrees of freedom: 1
maximum log-likelihood: -1032.484

AIC(3): 2092.967
BIC(3): 2149.893
G^2(3): 1.221830 (Likelihood ratio/deviance statistic)
X^2(3): 1.233247 (Chi-square goodness of fit)

The results are consistent with those found in proc lca. We note that, also similar to proc lca, the global maximum likelihood estimate was reached only 3 times out of 10; this can be discerned by examining the log-likelihoods of the 10 model fits. It's always a good idea to run a large number of replications to ensure that the global maximum likelihood estimate has been reached.
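The 3-out-of-10 count can be confirmed by tallying the log-likelihoods printed for the 10 replications (values copied from the poLCA output above; a trivial Python check of our own):

```python
# Log-likelihoods from the 10 poLCA replications above.
lliks = [-1032.889, -1032.889, -1032.484, -1032.889, -1032.889,
         -1032.484, -1032.484, -1032.889, -1032.889, -1032.889]

best = max(lliks)
print(best, lliks.count(best))  # -1032.484 reached in 3 of 10 replications
```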