# logistic_sample_size.R - A sample-size calculator for logistic regression # Version 1.0 (2009-06-12) # # Copyright (C) 2009 Jaymie Strecker # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # For a copy of the GNU General Public License, see # . # # This program implements sample-size formulas by: # Hsieh, F. Y., Bloch, D. A., and Larsen, M. D. "A simple method of # sample size calculation for linear and logistic # regression". Statistics in Medicine, 17:14, 1998, p. 1623-1634. args <- commandArgs(); args.start <- which(args == "--args")[1] + 1; if (is.na(args.start) || length(args) != args.start) { cat("Arguments: .\n"); cat(" Data file should contain a data frame named 'my.data' and, optionally, a vector called 'effectSizes'.\n"); cat(" Each column of my.data should hold a list of values (reals or 2-level factors) for one study variable. Each row should represent a data point. The last column is for the response variable and should contain only 0s and 1s (not as factors).\n"); cat(" If effectSizes is present, it should hold the effect sizes for which the sample size is to be calculated. Otherwise, default effect sizes will be used.\n"); quit(status = 1); } my.file <- args[args.start]; load(my.file); indeps <- names(my.data)[1:(length(names(my.data))-1)]; dep <- names(my.data)[length(names(my.data))]; Y <- my.data[[dep]]; if (! exists("effectSizes")) { effectSizes <- 0.05 * 1:10; } alpha <- 0.05 / length(indeps); beta <- 0.20; Zalpha <- qnorm(1 - alpha/2); Zbeta <- qnorm(1 - beta); my.sizes <- array(dim = c(length(indeps), length(effectSizes))); for (i in 1:length(indeps)) { X1 <- my.data[[indeps[i]]]; if (is.factor(X1)) { if (length(levels(X1)) != 2) { stop(paste("Variable ", indeps[i], " has ", length(levels(X1)), " levels, not 2", sep="")); } P1 <- sum(Y[X1 == levels(X1)[1]]) / sum(X1 == levels(X1)[1]); P2 <- sum(Y[X1 == levels(X1)[2]]) / sum(X1 == levels(X1)[2]); B <- sum(X1 == levels(X1)[2]) / length(X1); P <- (1 - B)*P1 + B*P2; } else { X1.glm <- glm(formula = as.formula(paste("Y", "X1", sep=" ~ ")), family = binomial(link=logit)); P1 <- predict(X1.glm, newdata = data.frame(X1 = mean(X1)), type = "response"); } inds <- 1:length(indeps); inds <- inds[inds != i]; for (j in 1:length(effectSizes)) { betaStar <- effectSizes[j]; rhs <- paste(indeps[inds], collapse=" + "); if (is.factor(X1)) { n1 <- ( Zalpha * (P*(1 - P)/B)^(1/2) + Zbeta * (P1*(1 - P1) + P2*(1 - P2)*(1 - B)/B)^(1/2) )^2 / ( (P1 - P2)^2 * (1 - B) ); X1.cont <- X1 == levels(X1)[2]; X1.Rmod <- lm(formula = as.formula(paste("X1.cont", rhs, sep=" ~ ")), data = my.data); } else { n1 <- ( (Zalpha + Zbeta)^2 ) / ( P1*(1 - P1)*betaStar^2 ); X1.Rmod <- lm(formula = as.formula(paste(indeps[i], rhs, sep=" ~ ")), data = my.data); } Rsquared <- summary(X1.Rmod)$r.squared; my.sizes[i,j] <- round(n1/(1 - Rsquared)); } } rownames(my.sizes) <- indeps; colnames(my.sizes) <- effectSizes; my.maxes <- array(dim = c(1, length(effectSizes))); for (i in 1:length(effectSizes)) { my.maxes[1,i] <- max(my.sizes[ ,i], na.rm = TRUE); } rownames(my.maxes) <- c("max"); colnames(my.maxes) <- effectSizes; cat("\nDependent variable: "); cat(dep); cat("\n\n"); cat("Sample size for each independent at each effect size:\n"); print(my.sizes); cat("\n\n"); cat("Maximum sample size at each effect size:\n"); print(my.maxes); cat("\n");