【R】尺度の組み合わせ別の類似度を求める

尺度 [1] の組み合わせ別の類似度分析の手法をまとめてみました。MIC, HSIC など比較的新しい相関係数については触れていません。
また, 調査不足による抜けもあるかと思います…

| 手法 | 正規性 | 尺度の組み合わせ |
| --- | --- | --- |
| Pearson’s Correlation | | 量的変数 x 量的変数 |
| Cramer’s V | 不要 | 名義尺度 x 名義尺度 (2×2より大きい) |
| Goodman-Kruskal Gamma | 不要 | 順序尺度 x 順序尺度 (2×2より大きい) |
| Spearman’s Rank-Order Correlation | 不要 | 順序尺度 x 順序尺度 (順位の差) |
| Kendall Rank Correlation | 不要 | 順序尺度 x 順序尺度 (順序関係の一致率) |
| Polychoric Correlation | | 順序尺度 x 順序尺度 (心理尺度で有効, 背景に連続性を仮定) |
| Polyserial Correlation | | 順序尺度 x 量的変数 |
| Tetrachoric Correlation | 不要 | 順序尺度 x 順序尺度 (2値なので名義尺度?) |

これらを R で実践してみます。用いるデータは架空のデータセットを含みます。

Pearson’s Correlation

# Pearson's product-moment correlation between pairs of quantitative columns.
# The data frame is kept under the name `d` because the partial-correlation
# example that follows reuses it.
d <- read.csv("data/accident.csv")
#     city post accident population
# 1     1  160       58         85
# 2     2  175       68         91
# 3     3  158       55         79
# 4     4  165       63         88
# 5     5  177       66         95
# 6     6  166       67         89
# 7     7  170       59         87
# 8     8  171       62         91
# 9     9  173       65         93
# 10   10  168       61         90

# The method is spelled out in full instead of relying on the "p"
# abbreviation, so the call is readable without knowing the matching rules.
cor(d$accident, d$population, method = "pearson")
# [1] 0.8132599

cor(d$accident, d$post, method = "pearson")
# [1] 0.7440411

偏相関係数を求める場合は psych::partial.r() を用いる方法がある。

# library() stops with an error when psych is not installed; require() would
# only warn and return FALSE, deferring the failure to the first psych call.
library(psych)

# Partial correlation between post and population, with accident partialled
# out. Columns are selected by name rather than by position (2, 4 and 3 in
# the original call) so the result does not depend on the CSV column order.
partial.r(d, c("post", "population"), "accident")
#   partial correlations
#            post population
# post       1.00       0.76
# population 0.76       1.00

# Partial correlation between post and accident, with population partialled
# out (columns 2, 3 and 4 in the original call).
partial.r(d, c("post", "accident"), "population")
#   partial correlations
#          post accident
# post     1.00     0.04
# accident 0.04     1.00

Cramer's V

# library() stops with an error when lsr is not installed; require() would
# only warn and return FALSE, deferring the failure to the cramersV() call.
library(lsr)

d <- read.csv("data/conditions.csv")
#     choice condition1 condition2
# 1      a         30         35
# 2      b         20         30
# 3      c         50         35

# Bind the two count columns into a 3 x 2 matrix (one row per choice, one
# column per condition); cramersV() is given this table of counts.
x <- cbind(d$condition1, d$condition2)

cramersV(x)
# [1] 0.1586139

Goodman-Kruskal Gamma

# library() stops with an error when vcdExtra is not installed; require()
# would only warn and return FALSE, deferring the failure to later calls.
library(vcdExtra)

# See https://www.inside-r.org/packages/cran/vcdExtra/docs/JobSat
# contingency table of job satisfaction (1996 General Social Survey)
data(JobSat)
#          satisfaction
# income   VeryD LittleD ModerateS VeryS
# < 15k      1       3        10     6
# 15-25k     2       3        10     7
# 25-40k     1       6        14    12
# > 40k      0       1         9    11

# Gamma for the ordered income x satisfaction table, with a 95% confidence
# interval.
GKgamma(JobSat, level = 0.95)
# gamma        : 0.221
# std. error   : 0.117
# CI           : -0.009 0.451

Spearman's Rank-Order Correlation

d <- read.csv("data/satisfaction.csv")
#     id satisfaction price gender
# 1   1            5     4   male
# 2   2            2     1 female
# 3   3            3     2 female
# 4   4            1     2 female
# 5   5            1     0   male
# 6   6            2     3 female
# 7   7            3     1   male
# 8   8            3     3 female
# 9   9            4     5   male
# 10 10            0     1 female

# Spearman's rank correlation rho
# Both columns contain tied values, so an exact p-value cannot be computed.
# exact = FALSE requests the asymptotic test explicitly, which avoids the
# "Cannot compute exact p-value with ties" warning without changing the
# reported statistic or p-value. The method name is spelled out in full
# instead of the "s" abbreviation.
cor.test(d$satisfaction, d$price, method = "spearman", exact = FALSE)
# data:  d$satisfaction and d$price
# S = 48.774, p-value = 0.02295
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
#     rho
# 0.7044025

Kendall Rank Correlation

# library() stops with an error when Kendall is not installed; require()
# would only warn and return FALSE, deferring the failure to the Kendall()
# call.
library(Kendall)

d <- read.csv("data/satisfaction.csv")
#     id satisfaction price gender
# 1   1            5     4   male
# 2   2            2     1 female
# 3   3            3     2 female
# 4   4            1     2 female
# 5   5            1     0   male
# 6   6            2     3 female
# 7   7            3     1   male
# 8   8            3     3 female
# 9   9            4     5   male
# 10 10            0     1 female

# Kendall's tau statistic via the Kendall package
Kendall(d$satisfaction, d$price)
# tau = 0.575, 2-sided pvalue =0.039549

# or cor.test()
# Both columns contain tied values, so exact = FALSE requests the asymptotic
# test explicitly and avoids the "Cannot compute exact p-value with ties"
# warning; the reported z and p-value are unchanged. The method name is
# spelled out in full instead of the "k" abbreviation.
cor.test(d$satisfaction, d$price, method = "kendall", exact = FALSE)
# Kendall's rank correlation tau
#
# data:  d$satisfaction and d$price
# z = 2.152, p-value = 0.0314
# alternative hypothesis: true tau is not equal to 0
# sample estimates:
# tau
# 0.575

Polychoric Correlation

# library() stops with an error when polycor is not installed; require()
# would only warn and return FALSE, deferring the failure to the polychor()
# call.
library(polycor)

d <- read.csv("data/satisfaction.csv")
#     id satisfaction price gender
# 1   1            5     4   male
# 2   2            2     1 female
# 3   3            3     2 female
# 4   4            1     2 female
# 5   5            1     0   male
# 6   6            2     3 female
# 7   7            3     1   male
# 8   8            3     3 female
# 9   9            4     5   male
# 10 10            0     1 female

# rho the polychoric correlation.
# Default call: returns only the estimated correlation.
polychor(d$satisfaction, d$price)
# [1] 0.7377534

# Maximum-likelihood estimate with standard errors; the printed result also
# includes a test of bivariate normality and the estimated row/column
# thresholds.
polychor(d$satisfaction, d$price, ML = TRUE, std.err = TRUE)
# Polychoric Correlation, ML est. = 0.7384 (0.1673)
# Test of bivariate normality: Chisquare = 15.39, df = 24, p = 0.9089
#
#   Row Thresholds
#   Threshold Std.Err.
# 1  -1.25300   0.5145
# 2  -0.53780   0.4094
# 3  -0.02593   0.3898
# 4   0.92270   0.4775
# 5   1.43200   0.5435
#
#
#   Column Thresholds
#   Threshold Std.Err.
# 1   -1.2430   0.5176
# 2   -0.2821   0.3923
# 3    0.2130   0.4006
# 4    0.9278   0.4769
# 5    1.4350   0.5431

Polyserial Correlation

# library() stops with an error when polycor is not installed; require()
# would only warn and return FALSE, deferring the failure to the
# polyserial() call.
library(polycor)

d <- read.csv("data/quality.csv")
#     id quality price
# 1   1       1   980
# 2   2       2  2980
# 3   3       3  2980
# 4   4       4  4980
# 5   5       3  4480
# 6   6       2  1980
# 7   7       3  1480
# 8   8       3  1850
# 9   9       1  1200
# 10 10       5  9800

# Argument order matters: the quantitative variable (price) comes first and
# the ordinal variable (quality) second.
polyserial(d$price, d$quality, std.err = TRUE)
# Polyserial Correlation, 2-step est. = 0.9015 (0.06796)
# Test of bivariate normality: Chisquare = 10.83, df = 14, p = 0.6991

Tetrachoric Correlation

# library() stops with an error when psych is not installed; require() would
# only warn and return FALSE, deferring the failure to the tetrachoric()
# call.
library(psych)

d <- read.csv("data/gender.csv")
#     bought male female
# 1   true  350     50
# 2  false  210    400

# Bind the two count columns into a 2 x 2 table of counts
# (rows: bought true/false, columns: male/female).
x <- cbind(d$male, d$female)

# tetrachoric correlation
tetrachoric(x, na.rm = TRUE)
# [1] 0.76
#
#  with tau of
# [1] -0.26  0.14

Codeは GitHub に置いた。


[1] 尺度には名義尺度, 順序尺度, 間隔尺度, 比尺度 (比例尺度) がある。一般には名義尺度と順序尺度を質的 (定性的) 変数, 間隔尺度と比尺度を量的 (定量的) 変数に分類する。順序尺度は離散値しか取らないが, 比尺度は連続値だけでなく度数のような離散値も取りうる。
[2] 順序尺度の相関係数(ポリコリック相関係数)について
[3] 研究と検定