# logistic_sample_size.R - A sample-size calculator for logistic regression
# Version 1.0 (2009-06-12)
#
# Copyright (C) 2009 Jaymie Strecker
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# For a copy of the GNU General Public License, see
# .
#
# This program implements sample-size formulas by:
# Hsieh, F. Y., Bloch, D. A., and Larsen, M. D. "A simple method of
# sample size calculation for linear and logistic
# regression". Statistics in Medicine, 17:14, 1998, p. 1623-1634.
args <- commandArgs();
args.start <- which(args == "--args")[1] + 1;
if (is.na(args.start) || length(args) != args.start) {
cat("Arguments: .\n");
cat(" Data file should contain a data frame named 'my.data' and, optionally, a vector called 'effectSizes'.\n");
cat(" Each column of my.data should hold a list of values (reals or 2-level factors) for one study variable. Each row should represent a data point. The last column is for the response variable and should contain only 0s and 1s (not as factors).\n");
cat(" If effectSizes is present, it should hold the effect sizes for which the sample size is to be calculated. Otherwise, default effect sizes will be used.\n");
quit(status = 1);
}
my.file <- args[args.start];
load(my.file);
indeps <- names(my.data)[1:(length(names(my.data))-1)];
dep <- names(my.data)[length(names(my.data))];
Y <- my.data[[dep]];
if (! exists("effectSizes")) {
effectSizes <- 0.05 * 1:10;
}
alpha <- 0.05 / length(indeps);
beta <- 0.20;
Zalpha <- qnorm(1 - alpha/2);
Zbeta <- qnorm(1 - beta);
my.sizes <- array(dim = c(length(indeps), length(effectSizes)));
for (i in 1:length(indeps)) {
X1 <- my.data[[indeps[i]]];
if (is.factor(X1)) {
if (length(levels(X1)) != 2) {
stop(paste("Variable ", indeps[i], " has ", length(levels(X1)),
" levels, not 2", sep=""));
}
P1 <- sum(Y[X1 == levels(X1)[1]]) / sum(X1 == levels(X1)[1]);
P2 <- sum(Y[X1 == levels(X1)[2]]) / sum(X1 == levels(X1)[2]);
B <- sum(X1 == levels(X1)[2]) / length(X1);
P <- (1 - B)*P1 + B*P2;
} else {
X1.glm <- glm(formula = as.formula(paste("Y", "X1", sep=" ~ ")),
family = binomial(link=logit));
P1 <- predict(X1.glm, newdata = data.frame(X1 = mean(X1)),
type = "response");
}
inds <- 1:length(indeps);
inds <- inds[inds != i];
for (j in 1:length(effectSizes)) {
betaStar <- effectSizes[j];
rhs <- paste(indeps[inds], collapse=" + ");
if (is.factor(X1)) {
n1 <- ( Zalpha * (P*(1 - P)/B)^(1/2) +
Zbeta * (P1*(1 - P1) + P2*(1 - P2)*(1 - B)/B)^(1/2) )^2 /
( (P1 - P2)^2 * (1 - B) );
X1.cont <- X1 == levels(X1)[2];
X1.Rmod <- lm(formula = as.formula(paste("X1.cont", rhs, sep=" ~ ")),
data = my.data);
} else {
n1 <- ( (Zalpha + Zbeta)^2 ) / ( P1*(1 - P1)*betaStar^2 );
X1.Rmod <- lm(formula = as.formula(paste(indeps[i], rhs, sep=" ~ ")),
data = my.data);
}
Rsquared <- summary(X1.Rmod)$r.squared;
my.sizes[i,j] <- round(n1/(1 - Rsquared));
}
}
rownames(my.sizes) <- indeps;
colnames(my.sizes) <- effectSizes;
my.maxes <- array(dim = c(1, length(effectSizes)));
for (i in 1:length(effectSizes)) {
my.maxes[1,i] <- max(my.sizes[ ,i], na.rm = TRUE);
}
rownames(my.maxes) <- c("max");
colnames(my.maxes) <- effectSizes;
cat("\nDependent variable: ");
cat(dep); cat("\n\n");
cat("Sample size for each independent at each effect size:\n");
print(my.sizes); cat("\n\n");
cat("Maximum sample size at each effect size:\n");
print(my.maxes); cat("\n");