Rで統計的仮説検定:F検定からt検定への2段階検定のシミュレーション

Rでデータサイエンス

F検定からt検定への2段階検定のシミュレーション

  1. F検定対立仮説 : alternative hypothesis: true ratio of variances is not equal to 1
  2. t検定対立仮説 : alternative hypothesis: true difference in means is not equal to 0

シミュレーション

fun_sim <- function(n_x, n_y, mean_x, mean_y, sd_x_sample, sd_y_sample, iter, conf.level, seed) {
    set.seed(seed = seed)
    t.test.p.value.by.ftest <- t.test.p.value.by.actual <- vector()
    for (iii in seq(iter)) {
        sd_x <- sample(x = sd_x_sample, size = 1)
        sd_y <- sample(x = sd_y_sample, size = 1)
        x <- rnorm(n = n_x, mean = mean_x, sd = sd_x)
        y <- rnorm(n = n_y, mean = mean_y, sd = sd_y)
        # F検定
        var.test.p.value <- var.test(x = x, y = y, ratio = 1, alternative = "two.sided", conf.level = conf.level)$p.value
        var.equal <- (var.test.p.value > (1 - conf.level))
        # F検定の結果(p値)による
        t.test.p.value.by.ftest[iii] <- t.test(x = x, y = y, alternative = "two.sided", paired = F, var.equal = var.equal, conf.level = conf.level)$p.value
        # 実際の分散による
        t.test.p.value.by.actual[iii] <- t.test(x = x, y = y, alternative = "two.sided", paired = F, var.equal = (sd_x == sd_y), conf.level = conf.level)$p.value
    }
    return(list(t.test.p.value.by.ftest = t.test.p.value.by.ftest, t.test.p.value.by.actual = t.test.p.value.by.actual))
}

平均値が同一の例

seed <- 20230507
iter <- 10000
n_x <- 30
n_y <- 30
sd_x_sample <- c(2, 5)
sd_y_sample <- c(2, 5)
conf.level <- 0.95
mean_x <- 10
mean_y <- 10
tmp <- fun_sim(n_x = n_x, n_y = n_y, mean_x = mean_x, mean_y = mean_y, sd_x_sample = sd_x_sample, sd_y_sample = sd_y_sample, iter = iter, conf.level = conf.level, seed = seed)
t.test.p.value.by.ftest <- tmp$t.test.p.value.by.ftest
t.test.p.value.by.actual <- tmp$t.test.p.value.by.actual
F検定の結果(p値)で分散均一・不均一を判断した場合のt検定
1 - sum(t.test.p.value.by.ftest > (1 - conf.level))/iter
[1] 0.0485
実際の分散によるt検定
1 - sum(t.test.p.value.by.actual > (1 - conf.level))/iter
[1] 0.0485

平均値が異なる例

mean_x <- 10
mean_y <- 15
tmp <- fun_sim(n_x = n_x, n_y = n_y, mean_x = mean_x, mean_y = mean_y, sd_x_sample = sd_x_sample, sd_y_sample = sd_y_sample, iter = iter, conf.level = conf.level, seed = seed)
t.test.p.value.by.ftest <- tmp$t.test.p.value.by.ftest
t.test.p.value.by.actual <- tmp$t.test.p.value.by.actual
F検定の結果(p値)で分散均一・不均一を判断した場合のt検定
1 - sum(t.test.p.value.by.ftest < (1 - conf.level))/iter
[1] 0.008
実際の分散によるt検定
1 - sum(t.test.p.value.by.actual < (1 - conf.level))/iter
[1] 0.0079

t.test {stats}のコード

getS3method(f = "t.test", class = "default")
function (x, y = NULL, alternative = c("two.sided", "less", "greater"), 
    mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95, 
    ...) 
{
    alternative <- match.arg(alternative)
    if (!missing(mu) && (length(mu) != 1 || is.na(mu))) 
        stop("'mu' must be a single number")
    if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) || 
        conf.level < 0 || conf.level > 1)) 
        stop("'conf.level' must be a single number between 0 and 1")
    if (!is.null(y)) {
        dname <- paste(deparse1(substitute(x)), "and", deparse1(substitute(y)))
        if (paired) 
            xok <- yok <- complete.cases(x, y)
        else {
            yok <- !is.na(y)
            xok <- !is.na(x)
        }
        y <- y[yok]
    }
    else {
        dname <- deparse1(substitute(x))
        if (paired) 
            stop("'y' is missing for paired test")
        xok <- !is.na(x)
        yok <- NULL
    }
    x <- x[xok]
    if (paired) {
        x <- x - y
        y <- NULL
    }
    nx <- length(x)
    mx <- mean(x)
    vx <- var(x)
    if (is.null(y)) {
        if (nx < 2) 
            stop("not enough 'x' observations")
        df <- nx - 1
        stderr <- sqrt(vx/nx)
        if (stderr < 10 * .Machine$double.eps * abs(mx)) 
            stop("data are essentially constant")
        tstat <- (mx - mu)/stderr
        method <- if (paired) 
            "Paired t-test"
        else "One Sample t-test"
        estimate <- setNames(mx, if (paired) 
            "mean difference"
        else "mean of x")
    }
    else {
        ny <- length(y)
        if (nx < 1 || (!var.equal && nx < 2)) 
            stop("not enough 'x' observations")
        if (ny < 1 || (!var.equal && ny < 2)) 
            stop("not enough 'y' observations")
        if (var.equal && nx + ny < 3) 
            stop("not enough observations")
        my <- mean(y)
        vy <- var(y)
        method <- paste(if (!var.equal) 
            "Welch", "Two Sample t-test")
        estimate <- c(mx, my)
        names(estimate) <- c("mean of x", "mean of y")
        if (var.equal) {
            df <- nx + ny - 2
            v <- 0
            if (nx > 1) 
                v <- v + (nx - 1) * vx
            if (ny > 1) 
                v <- v + (ny - 1) * vy
            v <- v/df
            stderr <- sqrt(v * (1/nx + 1/ny))
        }
        else {
            stderrx <- sqrt(vx/nx)
            stderry <- sqrt(vy/ny)
            stderr <- sqrt(stderrx^2 + stderry^2)
            df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 
                1))
        }
        if (stderr < 10 * .Machine$double.eps * max(abs(mx), 
            abs(my))) 
            stop("data are essentially constant")
        tstat <- (mx - my - mu)/stderr
    }
    if (alternative == "less") {
        pval <- pt(tstat, df)
        cint <- c(-Inf, tstat + qt(conf.level, df))
    }
    else if (alternative == "greater") {
        pval <- pt(tstat, df, lower.tail = FALSE)
        cint <- c(tstat - qt(conf.level, df), Inf)
    }
    else {
        pval <- 2 * pt(-abs(tstat), df)
        alpha <- 1 - conf.level
        cint <- qt(1 - alpha/2, df)
        cint <- tstat + c(-cint, cint)
    }
    cint <- mu + cint * stderr
    names(tstat) <- "t"
    names(df) <- "df"
    names(mu) <- if (paired) 
        "mean difference"
    else if (!is.null(y)) 
        "difference in means"
    else "mean"
    attr(cint, "conf.level") <- conf.level
    rval <- list(statistic = tstat, parameter = df, p.value = pval, 
        conf.int = cint, estimate = estimate, null.value = mu, 
        stderr = stderr, alternative = alternative, method = method, 
        data.name = dname)
    class(rval) <- "htest"
    rval
}
<bytecode: 0x0000023df5c3a438>
<environment: namespace:stats>

最終更新

Sys.time()
[1] "2024-04-07 04:40:52 JST"

R、Quarto、Package

R.Version()$version.string
[1] "R version 4.3.3 (2024-02-29 ucrt)"
quarto::quarto_version()
[1] '1.4.542'
packageVersion(pkg = "tidyverse")
[1] '2.0.0'

著者