vec1 <- c(1,2,3,NA,5,6)
vec2 <- c(1,2,3,4,1,2)
# The == operator
vec1 == vec2
## [1] TRUE TRUE TRUE NA FALSE FALSE
# The %in% operator
vec1 %in% vec2
## [1] TRUE TRUE TRUE FALSE FALSE FALSE
# This has important consequences
library(tidyverse)
library(rio)
cses <- import("cses5.rdata")
# Code in lab 7
tmp1 <- cses |>
mutate(E2003 = ifelse(E2003>96,NA,E2003),
educ_low=ifelse(E2003 %in% c(1:3,96),1,0),
educ_mid=ifelse(E2003 %in% 4:5,1,0),
educ_hig=ifelse(E2003 %in% 6:9,1,0))
table(cses$E2003,tmp1$educ_low,useNA="ifany")
##
## 0 1
## 1 0 3592
## 2 0 11334
## 3 0 13548
## 4 30983 0
## 5 11239 0
## 6 6760 0
## 7 21034 0
## 8 11314 0
## 9 1301 0
## 96 0 586
## 97 823 0
## 98 628 0
## 99 1572 0
# What you should be doing in assignment 3
tmp2 <- cses |>
mutate(educ_low=ifelse(E2003 %in% c(1:3,96),1,ifelse(E2003 >96,NA,0)),
educ_mid=ifelse(E2003 %in% 4:5,1, ifelse(E2003 >96,NA,0)),
educ_hig=ifelse(E2003 %in% 6:9,1, ifelse(E2003 >96,NA,0)))
table(cses$E2003,tmp2$educ_low,useNA="ifany")
##
## 0 1 <NA>
## 1 0 3592 0
## 2 0 11334 0
## 3 0 13548 0
## 4 30983 0 0
## 5 11239 0 0
## 6 6760 0 0
## 7 21034 0 0
## 8 11314 0 0
## 9 1301 0 0
## 96 0 586 0
## 97 0 0 823
## 98 0 0 628
## 99 0 0 1572
The frequency of each combination made by two variables.
table()prop.table(table(),margin=2)
margin=2 implies that the columns will sum up to
100.margin=1 makes it so that rows sum up to 100.DAMisc function: xt# install.packages("wooldridge")
data(happiness, package="wooldridge")
dat <- happiness %>%
slice(sample(1:nrow(happiness),3000)) %>%
mutate(happy=fct_rev(droplevels(happy)),
income=droplevels(income),
attend=droplevels(attend),
divorce=droplevels(divorce),
workstat=droplevels(workstat),
wrkFTime=ifelse(workstat=="working fulltime",1,0),
workstat_cat=case_when(workstat=="working fulltime"~"Fulltime",
workstat=="working parttime" ~"Parttime",
workstat=="retired"~"Retired",
workstat=="keeping house"~"Keeping house",
workstat %in% c("temp not working",
"unempl, laid off",
"retired",
"school",
"other") ~ "Unemployed",
TRUE~ NA),
educMoreHS=ifelse(educ>=13,1,0),
kids=ifelse(babies==0&preteen==0&teens==0,0,1))
prop.table(table(dat$happy,dat$reg16),margin = 2)
##
## foreign new england middle atlantic e. nor. central
## not too happy 0.15348837 0.09790210 0.13191489 0.12110727
## pretty happy 0.53953488 0.57342657 0.54468085 0.57958478
## very happy 0.30697674 0.32867133 0.32340426 0.29930796
##
## w. nor. central south atlantic e. sou. central w. sou. central
## not too happy 0.10222222 0.14252874 0.15736041 0.09602649
## pretty happy 0.59111111 0.51494253 0.52284264 0.60927152
## very happy 0.30666667 0.34252874 0.31979695 0.29470199
##
## mountain pacific
## not too happy 0.10897436 0.09318996
## pretty happy 0.60256410 0.58422939
## very happy 0.28846154 0.32258065
tab <- xt(dat,"happy","reg16")
tab$tab
## [[1]]
## happy/reg16 foreign new england middle atlantic e. nor. central
## not too happy 15% (33) 10% (14) 13% (62) 12% (70)
## pretty happy 54% (116) 57% (82) 54% (256) 58% (335)
## very happy 31% (66) 33% (47) 32% (152) 30% (173)
## Total 100% (215) 100% (143) 100% (470) 100% (578)
## w. nor. central south atlantic e. sou. central w. sou. central mountain
## 10% (23) 14% (62) 16% (31) 10% (29) 11% (17)
## 59% (133) 51% (224) 52% (103) 61% (184) 60% (94)
## 31% (69) 34% (149) 32% (63) 29% (89) 29% (45)
## 100% (225) 100% (435) 100% (197) 100% (302) 100% (156)
## pacific Total
## 9% (26) 12% (367)
## 58% (163) 56% (1,690)
## 32% (90) 31% (943)
## 100% (279) 100% (3,000)
tab$stats
## [[1]]
## statistic p-value
## Chi-squared 18.66565814 0.412
## Cramers V 0.05577583 0.412
## Lambda 0.00000000 1.000
You might ask whether those two variables are related in the population or not:
You'll learn about the \(\chi^2\) test on Wednesday.
| Test | Data type | Why do we use it? | Interpretation |
|---|---|---|---|
| \(\chi^2\) | Nominal | Are the frequencies of the contingency table of the two variables randomly distributed? | \(H_0\): Frequencies are randomly distributed. |
| \(\phi\) | Nominal (2x2 table) | How strong is the relationship (-1 to 1)? | 0=no >0.1=moderate >0.3=strong |
| Cramer's V | Nominal (>2x2 table) | How strong is the relationship (-1 to 1)? | 0=no >0.1=moderate >0.3=strong |
| \(\lambda\) | Nominal | What is the exact improvement in error in predicting DV by knowing IV? | Percent improvement |
| \(\gamma\) | Ordinal | How strong is the relationship (-1 to 1)? | 0=no >0.1=moderate >0.3=strong |
| Somer's d | Ordinal | How strong is the relationship (-1 to 1)? (A \(\gamma\) that accounts for ties on IV) | 0=no >0.1=moderate >0.3=strong |
| Kendall's \(\tau_b\) | Ordinal | How strong is the relationship (-1 to 1)? (A \(\gamma\) that accounts for ties on IV and DV) | 0=no >0.1=moderate >0.3=strong |
| Spearman's \(\rho_s\) | Ordinal | How strong is the relationship (-1 to 1)? (Talk of a correlation. Testing for monotonic relationship) | 0=no >0.1=moderate >0.3=strong |
| Pearson r | Continuous | How strong is the relationship (-1 to 1)? (Talk of a correlation. Testing for linear relationship) | 0=no >0.1=moderate >0.3=strong |
| Letter | Name | Letter | Name |
|---|---|---|---|
| \(A \alpha\) | Alpha | \(N\nu\) | Nu |
| \(B \beta\) | Beta | \(\Xi\xi\) | Xi |
| \(\Gamma\gamma\) | Gamma | \(O\omicron\) | Omicron |
| \(\Delta\delta\) | Delta | \(\Pi\pi\) | Pi |
| \(E\epsilon/\varepsilon\) | Epsilon | \(P\rho\) | Rho |
| \(Z\zeta\) | Zeta | \(\Sigma\sigma\) | Sigma |
| \(H\eta\) | Eta | \(T\tau\) | Tau |
| \(\Theta\theta\) | Theta | \(\Upsilon\upsilon\) | Upsilon |
| \(I\iota\) | Iota | \(\Phi\phi\) | Phi |
| \(K\kappa\) | Kappa | \(X\chi\) | Chi |
| \(\Lambda\lambda\) | Lambda | \(\Psi\psi\) | Psi |
| \(M\mu\) | Mu | \(\Omega\omega\) | Omega |
uwo4419::makeStatsUsing the variables
income,attend,wrkFTime,educMoreHS,kids,
calculate measures of association for the relationship between each of
these and happy. Make a plot that shows the effects of each
variable.
# Stats
library(uwo4419)
stat_income <- makeStats(dat$happy,dat$income, lambda=T,cramersV=T,chisq=T,gamma=T,d=T,taub=T,rho=T)
stat_attend <- makeStats(dat$happy,dat$attend, lambda=T,cramersV=T,chisq=T,gamma=T,d=T,taub=T,rho=T)
stat_wrkFTime <- makeStats(dat$happy,dat$wrkFTime, lambda=T,cramersV=T,chisq=T,gamma=T,d=T,taub=T,rho=T)
stat_educMoreHS <- makeStats(dat$happy,dat$educMoreHS,lambda=T,cramersV=T,chisq=T,gamma=T,d=T,taub=T,rho=T)
stat_kids <- makeStats(dat$happy,dat$kids, lambda=T,cramersV=T,chisq=T,gamma=T,d=T,taub=T,rho=T)
# Data for Graph
stats <- bind_rows(stat_income %>% as_tibble(rownames="stat") %>% mutate(var="Income"),
stat_attend %>% as_tibble(rownames="stat") %>% mutate(var="Rel. Attend."),
stat_wrkFTime %>% as_tibble(rownames="stat") %>% mutate(var="Works"),
stat_educMoreHS %>% as_tibble(rownames="stat") %>% mutate(var="Higher Ed."),
stat_kids %>% as_tibble(rownames="stat") %>% mutate(var="Kids"))
# Graph
ggplot(stats %>% filter(stat!="Chi-squared"),aes(x=reorder(as.factor(var), statistic, mean), y=statistic)) +
geom_bar(stat="identity") +
facet_wrap(~stat,scales = "free_x") +
theme_minimal() +
labs(x="", y="Statistic")+
coord_flip()
covcor
DAMisc::pwCorrMat() to produce a
correlation matrix, more on this in a bit.# A random variable X
X <- rnorm(1000,0,1)
# Another random variable Y
Ypos <- rnorm(1000,0,1)+X
Ynull <- rnorm(1000,0,1)+rnorm(1000,0,1)
Yneg <- rnorm(1000,0,1)-X
FakeData <- data.frame(X=X,Ypos=Ypos,Ynull=Ynull,Yneg=Yneg)
# Variance covariance matrix
cov(FakeData)
## X Ypos Ynull Yneg
## X 1.00576319 1.0098977468 0.0259181614 -1.02344345
## Ypos 1.00989775 1.9904549383 0.0004238368 -1.01800984
## Ynull 0.02591816 0.0004238368 2.0387405539 -0.02177242
## Yneg -1.02344345 -1.0180098427 -0.0217724240 2.02154498
# Correlations
cor(FakeData[,1],FakeData[,2:4])
## Ypos Ynull Yneg
## [1,] 0.7137619 0.01809987 -0.7177518
# Vizualisation
FakeData %>%
pivot_longer(starts_with("Y"),values_to = "Y",names_to = "var",names_pattern = "Y(.*)") %>%
mutate(var=factor(var,levels=c("pos","null","neg"))) %>%
ggplot(aes(x=X,y=Y))+
geom_point(alpha=0.2,color="#4F2683",shape=16)+
geom_smooth(method="lm",color="#4F2683")+
facet_wrap(~var)+
theme_minimal()
custom_smooth <- function(data, mapping,
..., span=.35, pt.alpha=.25, jitter=TRUE) {
if(jitter){
pos <- position_jitter(width=2, height=2)
}else{
pos <- position_identity()
}
ggplot(data, mapping, ...) +
geom_point(shape=1, col="gray",
position=pos, alpha=pt.alpha) +
geom_smooth(method="loess", span=span,
family="symmetric",
se=FALSE, col="red") +
geom_smooth(method="lm", col="black", se=FALSE)
}
check_lin <- function(formula, data, ...){
requireNamespace("fANCOVA")
D <- get_all_vars(formula, data)
D <- na.omit(D)
if(nrow(D) == 0)stop("No non-missing data.\n")
forms <- sapply(names(D)[-1], \(x)reformulate(x, response=names(D)[1]))
y <- c(D[,1])
X <- as.matrix(D[,-1])
spans <- sapply(1:ncol(X), \(i)fANCOVA::loess.as(X[,i], y, criterion = "gcv")$pars$span)
los <- lapply(seq_along(forms), \(i)do.call(loess, list(formula=forms[[i]], data=D, span=spans[i])))
lms <- lapply(seq_along(forms), \(i)do.call(lm, list(formula=forms[[i]], data=D)))
n <- nrow(D)
rss0 <- sapply(lms, \(x)sum(x$residuals^2))
rss1 <- sapply(los, \(x)sum(x$residuals^2))
df.linmod <- sapply(lms, \(x)x$rank)
df.lomod <- sapply(los, \(x)x$trace.hat)
df.res <- n-df.lomod
F0 <- ((rss0-rss1)/(df.lomod-df.linmod))/(rss1 / df.res)
pval <- pf(F0, (df.lomod-df.linmod), df.res, lower.tail=FALSE)
nl_r2 <- sapply(los, \(m)cor(m$y, m$fitted)^2)
lin_r2 <- sapply(lms, \(m)cor(m$fitted.values, model.response(model.frame(m)))^2)
out <- data.frame(vbl = names(D)[-1], lin_r2 = lin_r2, nonlin_r2 = nl_r2)
out$diff <- out$nonlin_r2 - out$lin_r2
out$p <- pval
class(out) <- c("checklin", class(out))
return(out)
}
#install.packages("GGally")
library(GGally)
# Data
data(countymurders, package="wooldridge")
sub <- countymurders %>%
filter(year==1996) %>%
select(murdrate,arrestrate,percblack,percmale,rpcpersinc)
# Correlations
DAMisc::pwCorrMat(murdrate~arrestrate+percblack+percmale+rpcpersinc,data=sub)
## Pairwise Correlations
## murdrate arrestrate percblack percmale rpcpersinc
## murdrate
## arrestrate 0.567*
## percblack 0.380* 0.381*
## percmale -0.081* -0.095* -0.262*
## rpcpersinc -0.073* -0.105* -0.065* -0.121*
# Visualize
ggpairs(sub,
lower = list(continuous = wrap(custom_smooth,
span=.5,
pt.alpha=1,
jitter=FALSE))) +
theme(legend.position = "bottom")
# Test for non-linearity
check_lin(murdrate~arrestrate+percblack+percmale+rpcpersinc,data=sub)
## vbl lin_r2 nonlin_r2 diff p
## 1 arrestrate 0.321954125 0.35841350 0.0364593719 1.598702e-25
## 2 percblack 0.144494166 0.14486312 0.0003689559 8.261143e-01
## 3 percmale 0.006610729 0.06359355 0.0569828165 1.106390e-24
## 4 rpcpersinc 0.005380532 0.01991495 0.0145344160 3.271076e-06
## vbl lin_r2 nonlin_r2 diff p
## 1 X1 0.6884561 0.6896512 0.001195098 0.2169462
## vbl lin_r2 nonlin_r2 diff p
## 1 X1 1.4076e-05 0.2230931 0.2230791 1.619498e-49
## vbl lin_r2 nonlin_r2 diff p
## 1 X1 0.9314118 0.9477851 0.01637333 2.671061e-48
## vbl lin_r2 nonlin_r2 diff p
## 1 X1 0.01063344 0.01509156 0.004458115 0.1568782
## vbl lin_r2 nonlin_r2 diff p
## 1 X1 0.6864107 0.7244987 0.03808802 1.016584e-24