Rで統計的仮説検定:分散分析:二元配置分散分析:バランスケース

Rでデータサイエンス

分散分析:二元配置分散分析:バランスケース

サンプル

library(dplyr)
seed <- 20230901
set.seed(seed = seed)
f1 <- rep(c("x", "y"), times = c(20, 20)) %>%
    as.factor()
f2 <- rep(c("a", "b", "c", "d"), 10) %>%
    as.factor()
m1 <- ifelse(f1 == "x", 15, 23)
m2 <- ifelse(f2 == "a", 1, ifelse(f2 == "b", 1.75, ifelse(f2 == "c", 1.95, 2.25)))
df <- data.frame(f1, f2, m1, m2)
df$m <- df$m1 * df$m2 + rnorm(n = nrow(df))
df$value <- sapply(X = df$m, FUN = function(x) rnorm(n = 1, mean = x, sd = 10))
df
   f1 f2 m1   m2        m     value
1   x  a 15 1.00 14.57591 15.586698
2   x  b 15 1.75 24.76703 31.784648
3   x  c 15 1.95 28.50603 51.066508
4   x  d 15 2.25 34.17910 27.398679
5   x  a 15 1.00 15.29687 35.679410
6   x  b 15 1.75 25.91275 18.683544
7   x  c 15 1.95 27.79418 34.876149
8   x  d 15 2.25 34.30550 34.986304
9   x  a 15 1.00 15.84223 27.965686
10  x  b 15 1.75 27.02978 15.943180
11  x  c 15 1.95 28.55509 31.659975
12  x  d 15 2.25 32.97901 28.581968
13  x  a 15 1.00 13.86567 22.052890
14  x  b 15 1.75 25.97982 32.087304
15  x  c 15 1.95 28.05549 33.839756
16  x  d 15 2.25 32.82247 33.015406
17  x  a 15 1.00 15.06985 19.930193
18  x  b 15 1.75 26.39754 42.762575
19  x  c 15 1.95 28.80750 29.406095
20  x  d 15 2.25 32.41746  7.930854
21  y  a 23 1.00 23.19875 17.659411
22  y  b 23 1.75 39.54051 42.843771
23  y  c 23 1.95 43.08214 49.947940
24  y  d 23 2.25 50.75247 47.427668
25  y  a 23 1.00 23.26423 20.084177
26  y  b 23 1.75 40.68578 23.418295
27  y  c 23 1.95 41.80933 36.139193
28  y  d 23 2.25 51.09129 52.084460
29  y  a 23 1.00 23.33087 35.516986
30  y  b 23 1.75 39.86712 29.327258
31  y  c 23 1.95 46.28799 51.377011
32  y  d 23 2.25 51.50059 48.192886
33  y  a 23 1.00 21.72716 20.961337
34  y  b 23 1.75 41.31516 41.704087
35  y  c 23 1.95 43.87779 33.781705
36  y  d 23 2.25 52.74479 49.867135
37  y  a 23 1.00 23.92167 17.171498
38  y  b 23 1.75 39.87791 67.851184
39  y  c 23 1.95 45.31850 52.030496
40  y  d 23 2.25 52.86933 63.101183
rbind(df[df$f1 == "x", ] %>%
    head(4), df[df$f1 == "y", ] %>%
    head(4))
   f1 f2 m1   m2        m    value
1   x  a 15 1.00 14.57591 15.58670
2   x  b 15 1.75 24.76703 31.78465
3   x  c 15 1.95 28.50603 51.06651
4   x  d 15 2.25 34.17910 27.39868
21  y  a 23 1.00 23.19875 17.65941
22  y  b 23 1.75 39.54051 42.84377
23  y  c 23 1.95 43.08214 49.94794
24  y  d 23 2.25 50.75247 47.42767
# 各群のサンプルサイズ
paste0(df$f1, "*", df$f2) %>%
    table()
.
x*a x*b x*c x*d y*a y*b y*c y*d 
  5   5   5   5   5   5   5   5 

分散分析表の作成

library(knitr)
library(kableExtra)
options(knitr.kable.NA = "")
anova_tb <- matrix(nrow = 5, ncol = 5)  # 分散分析表
colnames(anova_tb) <- c("偏差平方和", "自由度", "分散", "分散比(F値)", "p値")
row.names(anova_tb) <- c("全体変動", "因子f1", "因子f2", "交互作用:f1×f2", "誤差変動")
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動
因子f1
因子f2
交互作用:f1×f2
誤差変動

平均値の算出

(mean_for_all <- mean(df$value))
[1] 34.39314
(mean_for_f1 <- aggregate(x = df$value, by = list(df$f1), FUN = mean))
  Group.1        x
1       x 28.76189
2       y 40.02438
(mean_for_f2 <- aggregate(x = df$value, by = list(df$f2), FUN = mean))
  Group.1        x
1       a 23.26083
2       b 34.64058
3       c 40.41248
4       d 39.25865
(mean_for_f1f2 <- aggregate(x = df$value, by = list(df$f1, df$f2), FUN = mean))
  Group.1 Group.2        x
1       x       a 24.24298
2       y       a 22.27868
3       x       b 28.25225
4       y       b 41.02892
5       x       c 36.16970
6       y       c 44.65527
7       x       d 26.38264
8       y       d 52.13467

偏差平方和の算出

size_x <- sum(df$f1 == "x")
size_y <- sum(df$f1 == "y")
size_a <- sum(df$f2 == "a")
size_b <- sum(df$f2 == "b")
size_c <- sum(df$f2 == "c")
size_d <- sum(df$f2 == "d")
anova_tb[1, 1] <- sum((df$value - mean_for_all)^2)
anova_tb[2, 1] <- sum((mean_for_f1$x - mean_for_all)^2 * c(size_x, size_y))
anova_tb[3, 1] <- sum((mean_for_f2$x - mean_for_all)^2 * c(size_a, size_b, size_c, size_d))
anova_tb[4, 1] <- sum((mean_for_f1f2$x - mean_for_all)^2 * aggregate(x = df$value, by = list(df$f1, df$f2), FUN = length)$x) - anova_tb[2, 1] - anova_tb[3, 1]
anova_tb[5, 1] <- anova_tb[1, 1] - anova_tb[2, 1] - anova_tb[3, 1] - anova_tb[4, 1]
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 7458.7980
因子f1 1268.4375
因子f2 1838.9530
交互作用:f1×f2 987.2461
誤差変動 3364.1615

自由度

anova_tb[1, 2] <- nrow(df) - 1
anova_tb[2, 2] <- df$f1 %>%
    unique() %>%
    length() - 1
anova_tb[3, 2] <- df$f2 %>%
    unique() %>%
    length() - 1
anova_tb[4, 2] <- anova_tb[2, 2] * anova_tb[3, 2]
anova_tb[5, 2] <- anova_tb[1, 2] - anova_tb[2, 2] - anova_tb[3, 2] - anova_tb[4, 2]
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 7458.7980 39
因子f1 1268.4375 1
因子f2 1838.9530 3
交互作用:f1×f2 987.2461 3
誤差変動 3364.1615 32

不偏分散

anova_tb[2, 3] <- anova_tb[2, 1]/anova_tb[2, 2]
anova_tb[3, 3] <- anova_tb[3, 1]/anova_tb[3, 2]
anova_tb[4, 3] <- anova_tb[4, 1]/anova_tb[4, 2]
anova_tb[5, 3] <- anova_tb[5, 1]/anova_tb[5, 2]
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 7458.7980 39
因子f1 1268.4375 1 1268.4375
因子f2 1838.9530 3 612.9843
交互作用:f1×f2 987.2461 3 329.0820
誤差変動 3364.1615 32 105.1300

分散比

anova_tb[2, 4] <- anova_tb[2, 3]/anova_tb[5, 3]
anova_tb[3, 4] <- anova_tb[3, 3]/anova_tb[5, 3]
anova_tb[4, 4] <- anova_tb[4, 3]/anova_tb[5, 3]
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 7458.7980 39
因子f1 1268.4375 1 1268.4375 12.065414
因子f2 1838.9530 3 612.9843 5.830725
交互作用:f1×f2 987.2461 3 329.0820 3.130238
誤差変動 3364.1615 32 105.1300

p値

anova_tb[2, 5] <- pf(q = anova_tb[2, 4], df1 = anova_tb[2, 2], df2 = anova_tb[5, 2], lower.tail = FALSE)
anova_tb[3, 5] <- pf(q = anova_tb[3, 4], df1 = anova_tb[3, 2], df2 = anova_tb[5, 2], lower.tail = FALSE)
anova_tb[4, 5] <- pf(q = anova_tb[4, 4], df1 = anova_tb[4, 2], df2 = anova_tb[5, 2], lower.tail = FALSE)
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 7458.7980 39
因子f1 1268.4375 1 1268.4375 12.065414 0.0014959
因子f2 1838.9530 3 612.9843 5.830725 0.0026929
交互作用:f1×f2 987.2461 3 329.0820 3.130238 0.0391552
誤差変動 3364.1615 32 105.1300
  • 「xとyの平均値は同一」、「a、b、c、dの平均値は同一」、「f1とf2の効果は同一」との帰無仮説はいずれも棄却される(有意水準5%の場合)。

(参考) anova {stats}による結果

anova(object = aov(formula = value ~ f1 + f2 + f1:f2, data = df))
Analysis of Variance Table

Response: value
          Df Sum Sq Mean Sq F value   Pr(>F)   
f1         1 1268.4 1268.44 12.0654 0.001496 **
f2         3 1839.0  612.98  5.8307 0.002693 **
f1:f2      3  987.2  329.08  3.1302 0.039155 * 
Residuals 32 3364.2  105.13                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(参考) Anova {car}による結果

car::Anova(lm(formula = value ~ f1 * f2, data = df), type = 2)
Anova Table (Type II tests)

Response: value
          Sum Sq Df F value   Pr(>F)   
f1        1268.4  1 12.0654 0.001496 **
f2        1839.0  3  5.8307 0.002693 **
f1:f2      987.2  3  3.1302 0.039155 * 
Residuals 3364.2 32                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(lm(formula = value ~ f2 * f1, data = df), type = 2)
Anova Table (Type II tests)

Response: value
          Sum Sq Df F value   Pr(>F)   
f2        1839.0  3  5.8307 0.002693 **
f1        1268.4  1 12.0654 0.001496 **
f2:f1      987.2  3  3.1302 0.039155 * 
Residuals 3364.2 32                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

参考引用資料

最終更新

Sys.time()
[1] "2024-04-11 04:45:15 JST"

R、Quarto、Package

R.Version()$version.string
[1] "R version 4.3.3 (2024-02-29 ucrt)"
quarto::quarto_version()
[1] '1.4.542'
packageVersion(pkg = "tidyverse")
[1] '2.0.0'

著者