0. What we'll be doing

  1. The %in% operator
  2. Association Tests

1. The %in% operator

vec1 <- c(1,2,3,NA,5,6)
vec2 <- c(1,2,3,4,1,2)

# The == operator
vec1 == vec2
## [1]  TRUE  TRUE  TRUE    NA FALSE FALSE
# The %in% operator
vec1 %in% vec2
## [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE
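
Note the fourth element above: == propagates the missing value, while %in% returns FALSE, because an NA is treated as "not found" in the set. A minimal check:

NA == 1    # NA: any comparison with a missing value is unknown
NA %in% 1  # FALSE: %in% never returns NA
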
# This has important consequences when recoding survey data
library(tidyverse)
library(rio)
cses <- import("cses5.rdata")

# Code in lab 7
tmp1 <- cses |>
  mutate(E2003 = ifelse(E2003>96,NA,E2003),
         educ_low=ifelse(E2003 %in% c(1:3,96),1,0),
         educ_mid=ifelse(E2003 %in% 4:5,1,0),
         educ_hig=ifelse(E2003 %in% 6:9,1,0))

table(cses$E2003,tmp1$educ_low,useNA="ifany")
##     
##          0     1
##   1      0  3592
##   2      0 11334
##   3      0 13548
##   4  30983     0
##   5  11239     0
##   6   6760     0
##   7  21034     0
##   8  11314     0
##   9   1301     0
##   96     0   586
##   97   823     0
##   98   628     0
##   99  1572     0
# What you should be doing in assignment 3
tmp2 <- cses |>
  mutate(educ_low=ifelse(E2003 %in% c(1:3,96),1,ifelse(E2003 >96,NA,0)),
         educ_mid=ifelse(E2003 %in% 4:5,1,      ifelse(E2003 >96,NA,0)),
         educ_hig=ifelse(E2003 %in% 6:9,1,      ifelse(E2003 >96,NA,0)))

table(cses$E2003,tmp2$educ_low,useNA="ifany")
##     
##          0     1  <NA>
##   1      0  3592     0
##   2      0 11334     0
##   3      0 13548     0
##   4  30983     0     0
##   5  11239     0     0
##   6   6760     0     0
##   7  21034     0     0
##   8  11314     0     0
##   9   1301     0     0
##   96     0   586     0
##   97     0     0   823
##   98     0     0   628
##   99     0     0  1572
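
A pattern that some find more readable gets the same result with case_when() (used again later in this lab); a sketch for one of the dummies, assuming the same cses object:

tmp3 <- cses |>
  mutate(educ_low = case_when(E2003 %in% c(1:3, 96) ~ 1,
                              E2003 > 96 ~ NA_real_,   # 97-99 are missing codes
                              TRUE ~ 0))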

2. Association Tests

2.1 Contingency tables/cross-tabs

A contingency table (or cross-tab) shows the frequency of each combination of the values of two variables.

  • For frequencies: table()
  • For proportions: prop.table(table(), margin=2) (multiply by 100 for percentages)
    • Here margin=2 means that each column sums to 1.
    • margin=1 makes each row sum to 1.
    • Usually, put the DV in the rows and the IV in the columns (see the toy illustration after this list).
  • There is also a DAMisc function: xt
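A toy illustration of the margin argument, with made-up vectors (the names dv and iv are placeholders):

m <- table(dv = c("lo","hi","lo","hi","hi"), iv = c("a","a","b","b","b"))
prop.table(m, margin = 2)  # each column (IV category) sums to 1
prop.table(m, margin = 1)  # each row (DV category) sums to 1
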
# install.packages("wooldridge")
data(happiness, package="wooldridge")
dat <- happiness %>% 
  slice(sample(1:nrow(happiness),3000)) %>% 
  mutate(happy=fct_rev(droplevels(happy)),
         income=droplevels(income),
         attend=droplevels(attend),
         divorce=droplevels(divorce),
         workstat=droplevels(workstat),
         wrkFTime=ifelse(workstat=="working fulltime",1,0),
         workstat_cat=case_when(workstat=="working fulltime"~"Fulltime",
                                workstat=="working parttime" ~"Parttime",
                                workstat=="retired"~"Retired",
                                workstat=="keeping house"~"Keeping house",
                                # "retired" dropped from the vector below: case_when
                                # uses the first matching condition, so it is already
                                # handled by the "Retired" branch above
                                workstat %in% c("temp not working",
                                                "unempl, laid off",
                                                "school",
                                                "other") ~ "Unemployed",
                                TRUE ~ NA_character_),
         educMoreHS=ifelse(educ>=13,1,0),
         kids=ifelse(babies==0&preteen==0&teens==0,0,1))

prop.table(table(dat$happy,dat$reg16),margin = 2)
##                
##                    foreign new england middle atlantic e. nor. central
##   not too happy 0.15348837  0.09790210      0.13191489      0.12110727
##   pretty happy  0.53953488  0.57342657      0.54468085      0.57958478
##   very happy    0.30697674  0.32867133      0.32340426      0.29930796
##                
##                 w. nor. central south atlantic e. sou. central w. sou. central
##   not too happy      0.10222222     0.14252874      0.15736041      0.09602649
##   pretty happy       0.59111111     0.51494253      0.52284264      0.60927152
##   very happy         0.30666667     0.34252874      0.31979695      0.29470199
##                
##                   mountain    pacific
##   not too happy 0.10897436 0.09318996
##   pretty happy  0.60256410 0.58422939
##   very happy    0.28846154 0.32258065
tab <- xt(dat,"happy","reg16")
tab$tab
## [[1]]
##    happy/reg16    foreign new england middle atlantic e. nor. central
##  not too happy  15%  (33)   10%  (14)       13%  (62)       12%  (70)
##   pretty happy  54% (116)   57%  (82)       54% (256)       58% (335)
##     very happy  31%  (66)   33%  (47)       32% (152)       30% (173)
##          Total 100% (215)  100% (143)      100% (470)      100% (578)
##  w. nor. central south atlantic e. sou. central w. sou. central   mountain
##        10%  (23)      14%  (62)       16%  (31)       10%  (29)  11%  (17)
##        59% (133)      51% (224)       52% (103)       61% (184)  60%  (94)
##        31%  (69)      34% (149)       32%  (63)       29%  (89)  29%  (45)
##       100% (225)     100% (435)      100% (197)      100% (302) 100% (156)
##     pacific        Total
##    9%  (26)  12%   (367)
##   58% (163)  56% (1,690)
##   32%  (90)  31%   (943)
##  100% (279) 100% (3,000)
tab$stats
## [[1]]
##               statistic p-value
## Chi-squared 18.66565814   0.412
## Cramers V    0.05577583   0.412
## Lambda       0.00000000   1.000

You might ask whether those two variables are related in the population or not:

  • \(H_0:\) Variables X and Y are independent in the population.
  • \(H_A:\) Variables X and Y are not independent in the population.

You'll learn about the \(\chi^2\) test on Wednesday.
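
As a preview, base R's chisq.test() on the same cross-tab should reproduce the Pearson \(\chi^2\) statistic reported by xt() above; a minimal sketch:

chisq.test(table(dat$happy, dat$reg16))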

2.2 Which test?

Each entry below gives the test, the type of data it applies to, why we use it, and how to interpret it:

  • \(\chi^2\) (Nominal): Are the frequencies of the contingency table of the two variables randomly distributed? \(H_0\): frequencies are randomly distributed.
  • \(\phi\) (Nominal, 2x2 table): How strong is the relationship (-1 to 1)? 0 = none, >0.1 = moderate, >0.3 = strong.
  • Cramer's V (Nominal, larger than 2x2): How strong is the relationship (0 to 1)? 0 = none, >0.1 = moderate, >0.3 = strong.
  • \(\lambda\) (Nominal): What is the proportional reduction in error in predicting the DV by knowing the IV? Interpreted as a percent improvement.
  • \(\gamma\) (Ordinal): How strong is the relationship (-1 to 1)? 0 = none, >0.1 = moderate, >0.3 = strong.
  • Somers' d (Ordinal): A \(\gamma\) that accounts for ties on the IV. Same -1 to 1 interpretation.
  • Kendall's \(\tau_b\) (Ordinal): A \(\gamma\) that accounts for ties on both the IV and the DV. Same -1 to 1 interpretation.
  • Spearman's \(\rho_s\) (Ordinal): Often simply called a correlation; tests for a monotonic relationship. Same -1 to 1 interpretation.
  • Pearson r (Continuous): A correlation; tests for a linear relationship. Same -1 to 1 interpretation.
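
Two of the ordinal measures are also available in base R through cor() with the method argument; a quick sketch (coercing the ordered happy factor to numeric ranks is an assumption for illustration only):

# Kendall's tau (the tau-b variant when ties are present) and Spearman's rho
cor(as.numeric(dat$happy), dat$educ, method = "kendall",  use = "complete.obs")
cor(as.numeric(dat$happy), dat$educ, method = "spearman", use = "complete.obs")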
  • ヽ(°□°)ノ "Greek ABCs!" ヽ(°□°)ノ
Letter Name Letter Name
\(A \alpha\) Alpha \(N\nu\) Nu
\(B \beta\) Beta \(\Xi\xi\) Xi
\(\Gamma\gamma\) Gamma \(O\omicron\) Omicron
\(\Delta\delta\) Delta \(\Pi\pi\) Pi
\(E\epsilon/\varepsilon\) Epsilon \(P\rho\) Rho
\(Z\zeta\) Zeta \(\Sigma\sigma\) Sigma
\(H\eta\) Eta \(T\tau\) Tau
\(\Theta\theta\) Theta \(\Upsilon\upsilon\) Upsilon
\(I\iota\) Iota \(\Phi\phi\) Phi
\(K\kappa\) Kappa \(X\chi\) Chi
\(\Lambda\lambda\) Lambda \(\Psi\psi\) Psi
\(M\mu\) Mu \(\Omega\omega\) Omega
  • One function to rule them all: uwo4419::makeStats

2.3 Nominal Association Test Example

Using the variables income, attend, wrkFTime, educMoreHS, and kids, calculate measures of association for the relationship between each of them and happy. Then make a plot that shows the association statistics for each variable.

# Stats
library(uwo4419)
stat_income     <- makeStats(dat$happy,dat$income,    lambda=T,cramersV=T,chisq=T,gamma=T,d=T,taub=T,rho=T)
stat_attend     <- makeStats(dat$happy,dat$attend,    lambda=T,cramersV=T,chisq=T,gamma=T,d=T,taub=T,rho=T)
stat_wrkFTime   <- makeStats(dat$happy,dat$wrkFTime,  lambda=T,cramersV=T,chisq=T,gamma=T,d=T,taub=T,rho=T)
stat_educMoreHS <- makeStats(dat$happy,dat$educMoreHS,lambda=T,cramersV=T,chisq=T,gamma=T,d=T,taub=T,rho=T)
stat_kids       <- makeStats(dat$happy,dat$kids,      lambda=T,cramersV=T,chisq=T,gamma=T,d=T,taub=T,rho=T)

# Data for Graph
stats <- bind_rows(stat_income %>% as_tibble(rownames="stat") %>% mutate(var="Income"),
                   stat_attend %>% as_tibble(rownames="stat") %>% mutate(var="Rel. Attend."),
                   stat_wrkFTime %>% as_tibble(rownames="stat") %>% mutate(var="Works"),
                   stat_educMoreHS %>% as_tibble(rownames="stat") %>% mutate(var="Higher Ed."),
                   stat_kids %>% as_tibble(rownames="stat") %>% mutate(var="Kids"))

# Graph
ggplot(stats %>% filter(stat!="Chi-squared"),aes(x=reorder(as.factor(var), statistic, mean), y=statistic)) + 
  geom_col() + 
  facet_wrap(~stat,scales = "free_x") + 
  theme_minimal() + 
  labs(x="", y="Statistic")+
  coord_flip()

2.4 Variance, Covariance and Correlation

  • Sample variance of Y: \[s_Y^2=\frac{\sum_{i=1}^n(Y_i-\bar Y)^2}{n-1}\]
  • Sample covariance between X and Y: \[cov(X,Y)=\frac{\sum_{i=1}^n (X_i-\bar X)(Y_i-\bar Y)}{n-1}\]
  • Pearson correlation between X and Y: \[r_{XY}=\frac{\sum_{i=1}^n(X_i-\bar X)(Y_i-\bar Y)}{\sqrt{\sum_{i=1}^n(X_i-\bar X)^2\sum_{i=1}^n(Y_i-\bar Y)^2}}\]
  • R functions:
    • Covariance: cov
    • Correlation: cor
      • There is also DAMisc::pwCorrMat() to produce a correlation matrix; more on this in a bit.
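
A quick numeric check that cov() and cor() match the formulas above, on a tiny made-up pair of vectors:

x <- c(1, 3, 5, 7); y <- c(2, 2, 6, 8)
sum((x - mean(x)) * (y - mean(y))) / (length(x) - 1)   # covariance by hand
cov(x, y)                                              # same value
sum((x - mean(x)) * (y - mean(y))) /
  sqrt(sum((x - mean(x))^2) * sum((y - mean(y))^2))    # Pearson r by hand
cor(x, y)                                              # same value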
# A random variable X
X <- rnorm(1000,0,1)

# Another random variable Y
Ypos <- rnorm(1000,0,1)+X
Ynull <- rnorm(1000,0,1)+rnorm(1000,0,1)
Yneg <- rnorm(1000,0,1)-X

FakeData <- data.frame(X=X,Ypos=Ypos,Ynull=Ynull,Yneg=Yneg)

# Variance covariance matrix
cov(FakeData)
##                 X          Ypos         Ynull        Yneg
## X      1.00576319  1.0098977468  0.0259181614 -1.02344345
## Ypos   1.00989775  1.9904549383  0.0004238368 -1.01800984
## Ynull  0.02591816  0.0004238368  2.0387405539 -0.02177242
## Yneg  -1.02344345 -1.0180098427 -0.0217724240  2.02154498
# Correlations
cor(FakeData[,1],FakeData[,2:4])
##           Ypos      Ynull       Yneg
## [1,] 0.7137619 0.01809987 -0.7177518
# Visualisation 
FakeData %>% 
  pivot_longer(starts_with("Y"),values_to = "Y",names_to = "var",names_pattern = "Y(.*)") %>%
  mutate(var=factor(var,levels=c("pos","null","neg"))) %>% 
  ggplot(aes(x=X,y=Y))+
  geom_point(alpha=0.2,color="#4F2683",shape=16)+
  geom_smooth(method="lm",color="#4F2683")+
  facet_wrap(~var)+
  theme_minimal()

2.5 Functions to copy and paste

custom_smooth <- function(data, mapping, 
                          ..., span=.35, pt.alpha=.25, jitter=TRUE) {
  # Optionally jitter the points to reduce overplotting
  if(jitter){
    pos <- position_jitter(width=2, height=2)
  }else{
    pos <- position_identity()
  }
  # Scatterplot with two overlaid fits: a loess curve (red) and an OLS line (black)
  ggplot(data, mapping, ...) + 
    geom_point(shape=1, col="gray", 
               position=pos, alpha=pt.alpha) + 
    geom_smooth(method="loess", span=span,
                # arguments for loess() go through method.args in current ggplot2
                method.args=list(family="symmetric"),
                se=FALSE, col="red") + 
    geom_smooth(method="lm", col="black", se=FALSE) 
}
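
A standalone check of custom_smooth(), reusing the FakeData object from section 2.4 (assuming it is still in the workspace); the function returns a ggplot, so it prints directly:

custom_smooth(FakeData, aes(x = X, y = Ypos), jitter = FALSE)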

# Compare a linear fit to a loess fit for each RHS variable in `formula`;
# an approximate F-test asks whether the loess fit improves on the line.
check_lin <- function(formula, data, ...){
  requireNamespace("fANCOVA")
  D <- get_all_vars(formula, data)
  D <- na.omit(D)
  if(nrow(D) == 0)stop("No non-missing data.\n")
  # Build one bivariate formula per predictor: y ~ x
  forms <- sapply(names(D)[-1], \(x)reformulate(x, response=names(D)[1]))
  y <- c(D[,1])
  X <- as.matrix(D[,-1])
  # Choose each loess span by generalized cross-validation
  spans <- sapply(1:ncol(X), \(i)fANCOVA::loess.as(X[,i], y, criterion = "gcv")$pars$span)
  los <- lapply(seq_along(forms), \(i)do.call(loess, list(formula=forms[[i]], data=D, span=spans[i])))
  lms <- lapply(seq_along(forms), \(i)do.call(lm, list(formula=forms[[i]], data=D)))
  n <- nrow(D)
  # Residual sums of squares: linear (rss0) vs. loess (rss1)
  rss0 <- sapply(lms, \(x)sum(x$residuals^2))
  rss1 <- sapply(los, \(x)sum(x$residuals^2))
  df.linmod <- sapply(lms, \(x)x$rank)
  df.lomod <- sapply(los, \(x)x$trace.hat)
  df.res <- n-df.lomod
  # Approximate F statistic for the improvement of loess over the line
  F0 <- ((rss0-rss1)/(df.lomod-df.linmod))/(rss1 / df.res)
  pval <- pf(F0, (df.lomod-df.linmod), df.res, lower.tail=FALSE)
  # Squared correlation between observed and fitted values for each model
  nl_r2 <- sapply(los, \(m)cor(m$y, m$fitted)^2)
  lin_r2 <- sapply(lms, \(m)cor(m$fitted.values, model.response(model.frame(m)))^2)
  out <- data.frame(vbl = names(D)[-1], lin_r2 = lin_r2, nonlin_r2 = nl_r2)
  out$diff <- out$nonlin_r2 - out$lin_r2
  out$p <- pval
  class(out) <- c("checklin", class(out))
  return(out)
}

2.6 Correlation Examples

#install.packages("GGally")
library(GGally)

# Data
data(countymurders, package="wooldridge")
sub <- countymurders %>% 
  filter(year==1996) %>% 
  select(murdrate,arrestrate,percblack,percmale,rpcpersinc)

# Correlations
DAMisc::pwCorrMat(murdrate~arrestrate+percblack+percmale+rpcpersinc,data=sub)
## Pairwise Correlations
##            murdrate arrestrate percblack percmale rpcpersinc
## murdrate                                                    
## arrestrate 0.567*                                           
## percblack  0.380*   0.381*                                  
## percmale   -0.081*  -0.095*    -0.262*                      
## rpcpersinc -0.073*  -0.105*    -0.065*   -0.121*
# Visualize
ggpairs(sub,
        lower = list(continuous = wrap(custom_smooth, 
                                       span=.5, 
                                       pt.alpha=1,
                                       jitter=FALSE))) + 
  theme(legend.position = "bottom")

# Test for non-linearity
check_lin(murdrate~arrestrate+percblack+percmale+rpcpersinc,data=sub)
##          vbl      lin_r2  nonlin_r2         diff            p
## 1 arrestrate 0.321954125 0.35841350 0.0364593719 1.598702e-25
## 2  percblack 0.144494166 0.14486312 0.0003689559 8.261143e-01
## 3   percmale 0.006610729 0.06359355 0.0569828165 1.106390e-24
## 4 rpcpersinc 0.005380532 0.01991495 0.0145344160 3.271076e-06

2.6.1 X1 and X2

##   vbl    lin_r2 nonlin_r2        diff         p
## 1  X1 0.6884561 0.6896512 0.001195098 0.2169462
  • Linearity?
  • Significance?
  • Strong relationship?

2.6.2 X1 and X3

##   vbl     lin_r2 nonlin_r2      diff            p
## 1  X1 1.4076e-05 0.2230931 0.2230791 1.619498e-49
  • Linearity?
  • Significance?
  • Strong relationship?

2.6.3 X1 and X4

##   vbl    lin_r2 nonlin_r2       diff            p
## 1  X1 0.9314118 0.9477851 0.01637333 2.671061e-48
  • Linearity?
  • Significance?
  • Strong relationship?

2.6.4 X1 and X5

##   vbl     lin_r2  nonlin_r2        diff         p
## 1  X1 0.01063344 0.01509156 0.004458115 0.1568782
  • Linearity?
  • Significance?
  • Strong relationship?

2.6.5 X1 and X6

##   vbl    lin_r2 nonlin_r2       diff            p
## 1  X1 0.6864107 0.7244987 0.03808802 1.016584e-24
  • Linearity?
  • Significance?
  • Strong relationship?