t.test(x, mu = 0, alternative="less") OR
t.test(x, mu = 0, alternative="greater")t.test(x, mu = 0, alternative="two")prop.test(x,n,p="NULL",alternative=c("two.sided", "less", "greater")),
with x our vector of successes, n our total nb
of observations and p our tested probability of
success.t.test(dat$group1,dat$group2,alternative="two",paired=F/T),
but tTest("group1","group2",data=dat,var.equal=F) is more
flexible.pplFeel_Francophones for PEI folks
different from 50?pplFeel_Francophones for PEI folks
greater than 50?pplFeel_Francophones for PEI folks
smaller than 50?PEI <- CES |> filter(prov=="Prince Edward Island")
# 1.
t.test(PEI$pplFeel_Francophones,mu=50,alternative="two")
##
## One Sample t-test
##
## data: PEI$pplFeel_Francophones
## t = 0.42165, df = 11, p-value = 0.6814
## alternative hypothesis: true mean is not equal to 50
## 95 percent confidence interval:
## 36.28513 70.21487
## sample estimates:
## mean of x
## 53.25
# 2.
t.test(PEI$pplFeel_Francophones,mu=50,alternative="greater")
##
## One Sample t-test
##
## data: PEI$pplFeel_Francophones
## t = 0.42165, df = 11, p-value = 0.3407
## alternative hypothesis: true mean is greater than 50
## 95 percent confidence interval:
## 39.40759 Inf
## sample estimates:
## mean of x
## 53.25
# 3.
t.test(PEI$pplFeel_Francophones,mu=50,alternative="less")
##
## One Sample t-test
##
## data: PEI$pplFeel_Francophones
## t = 0.42165, df = 11, p-value = 0.6593
## alternative hypothesis: true mean is less than 50
## 95 percent confidence interval:
## -Inf 67.09241
## sample estimates:
## mean of x
## 53.25
pplFeel_Francophones for PEI folks
different than the one for folks from Alberta?pplFeel_Francophones for PEI folks
greater than the one for folks from Alberta?pplFeel_Francophones for PEI folks
smaller than the one for folks from Alberta?CES <- CES |>
mutate(PEIvAB =case_when(prov=="Prince Edward Island"~"PEI",
prov=="Alberta"~"AB",
TRUE~NA_character_),
PEIvAB=factor(PEIvAB,levels=c("PEI","AB")))
# 1.
DAMisc::tTest("PEIvAB","pplFeel_Francophones",data=CES,alternative="two")
## Summary:
## mean n se
## PEI 53.25 12 26.70078
## AB 56.42163 906 25.75205
## Difference -3.171634 918 7.755189
## p-value > 0.05
## ---------------------------------
##
## Welch Two Sample t-test
##
## data: pplFeel_Francophones by PEIvAB
## t = -0.40897, df = 11.273, p-value = 0.6902
## alternative hypothesis: true difference in means between group PEI and group AB is not equal to 0
## 95 percent confidence interval:
## -20.19045 13.84718
## sample estimates:
## mean in group PEI mean in group AB
## 53.25000 56.42163
# 2.
DAMisc::tTest("PEIvAB","pplFeel_Francophones",data=CES,alternative="greater")
## Summary:
## mean n se
## PEI 53.25 12 26.70078
## AB 56.42163 906 25.75205
## Difference -3.171634 918 7.755189
## p-value > 0.05
## ---------------------------------
##
## Welch Two Sample t-test
##
## data: pplFeel_Francophones by PEIvAB
## t = -0.40897, df = 11.273, p-value = 0.6549
## alternative hypothesis: true difference in means between group PEI and group AB is greater than 0
## 95 percent confidence interval:
## -17.06829 Inf
## sample estimates:
## mean in group PEI mean in group AB
## 53.25000 56.42163
# 3.
DAMisc::tTest("PEIvAB","pplFeel_Francophones",data=CES,alternative="less")
## Summary:
## mean n se
## PEI 53.25 12 26.70078
## AB 56.42163 906 25.75205
## Difference -3.171634 918 7.755189
## p-value > 0.05
## ---------------------------------
##
## Welch Two Sample t-test
##
## data: pplFeel_Francophones by PEIvAB
## t = -0.40897, df = 11.273, p-value = 0.3451
## alternative hypothesis: true difference in means between group PEI and group AB is less than 0
## 95 percent confidence interval:
## -Inf 10.72503
## sample estimates:
## mean in group PEI mean in group AB
## 53.25000 56.42163
Sub <- CES |>
filter(gender %in% c("A man","A woman")) |>
mutate(voteCPC=ifelse(vt=="CPC",1,
ifelse(is.na(vt),NA,0)))
Proportions <- Sub |>
group_by(gender) |>
drop_na(gender,voteCPC) |>
summarise(freqCPC=sum(voteCPC),
freq=n())
prop.test(Proportions$freqCPC,Proportions$freq)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: Proportions$freqCPC out of Proportions$freq
## X-squared = 57.272, df = 1, p-value = 3.795e-14
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.05884056 0.10042296
## sample estimates:
## prop 1 prop 2
## 0.3188202 0.2391885
pplFeel_LPC and pplFeel_CPC, what do you
find?t.test(CES$pplFeel_LPC,CES$pplFeel_CPC,paired=T)
##
## Paired t-test
##
## data: CES$pplFeel_LPC and CES$pplFeel_CPC
## t = 8.0486, df = 7328, p-value = 9.698e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 3.915775 6.437343
## sample estimates:
## mean difference
## 5.176559
CES <- CES |>
mutate(fifty=ifelse(age>=50,1,0))
t.test(CES$pplFeel_Americans[CES$fifty==1],CES$pplFeel_Americans[CES$fifty==0])
##
## Welch Two Sample t-test
##
## data: CES$pplFeel_Americans[CES$fifty == 1] and CES$pplFeel_Americans[CES$fifty == 0]
## t = -0.77739, df = 4956.4, p-value = 0.437
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.6583763 0.7166098
## sample estimates:
## mean of x mean of y
## 54.21738 54.68827
tTest("fifty","pplFeel_Americans",data=CES)
## Summary:
## mean n se
## 0 54.68827 2531 25.02027
## 1 54.21738 4798 23.95163
## Difference 0.4708833 7329 0.605727
## p-value > 0.05
## ---------------------------------
##
## Welch Two Sample t-test
##
## data: pplFeel_Americans by fifty
## t = 0.77739, df = 4956.4, p-value = 0.437
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.7166098 1.6583763
## sample estimates:
## mean in group 0 mean in group 1
## 54.68827 54.21738
pplFeel_BQ)
above 50 at a higher rate than people who vote for the CPC?Sub <- CES |>
filter(vt %in% c("LPC","PPC")) |>
drop_na(pplFeel_BQ) |>
mutate(goodBQ=ifelse(pplFeel_BQ>50,1,0)) |>
group_by(vt) |>
summarise(freqGoodBQ=sum(goodBQ),
n=n())
prop.test(Sub$freqGoodBQ,Sub$n,alternative = "greater")
##
## 2-sample test for equality of proportions with continuity correction
##
## data: Sub$freqGoodBQ out of Sub$n
## X-squared = 9.0159, df = 1, p-value = 0.001338
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.04172282 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.1832653 0.1048387
Sub
## # A tibble: 2 × 3
## vt freqGoodBQ n
## <chr> <dbl> <int>
## 1 LPC 449 2450
## 2 PPC 26 248
Going back to lab 1:
There are two ways to create new variables:
mutate() from dplyr;$ and assigning the new values.Here are a few functions that come in handy in a
mutate() call when you want to change the structure of a
variable:
ifelse();case_when().For example: Create a dummy variable (binary 0,1) for sex:
cses <- import("cses5.rdata")
table(cses$E2002)
##
## 1 2 3 7 9
## 55939 58329 80 169 197
# Base R
cses$female <- NA
cses$female[cses$E2002==2] <- 1
cses$female[cses$E2002 %in% c(1,3)] <- 0
# dplyr 1
cses <- cses |>
mutate(female = ifelse(E2002==2,1,ifelse(E2002 %in% c(1,3),0, NA)))
# dplyr 2
cses <- cses |>
mutate(female = case_when(E2002 == 2 ~ 1,
E2002 %in% c(1,3) ~ 0,
TRUE ~ NA))
USE table() to check
frequencies before and after cleaning, you can even do
crosstabs:
# Let's say I do this (DON'T EVER DO THIS, DON'T YOU DARE!!!!)
tmp <- cses |>
mutate(female = case_when(E2002 == 2 ~ 1,
TRUE ~ 0))
table(cses$E2002,tmp$female,useNA = "ifany")
##
## 0 1
## 1 55939 0
## 2 0 58329
## 3 80 0
## 7 169 0
## 9 197 0
# If you want to use case_when, then assign every value.
tmp <- cses |>
mutate(female = case_when(E2002 == 2 ~ 1,
E2002 %in% c(1,3) ~ 0,
TRUE ~ NA))
table(cses$E2002,tmp$female,useNA = "ifany")
##
## 0 1 <NA>
## 1 55939 0 0
## 2 0 58329 0
## 3 80 0 0
## 7 0 0 169
## 9 0 0 197
# Still, ifelse is much safer in its clunkyness
tmp <- cses |>
mutate(female = ifelse(E2002==2,1,ifelse(E2002 %in% c(1,3),0, NA)))
table(cses$E2002,tmp$female,useNA = "ifany")
##
## 0 1 <NA>
## 1 55939 0 0
## 2 0 58329 0
## 3 80 0 0
## 7 0 0 169
## 9 0 0 197
In the codebook, identify the following variables, and replicate the following table (only print a dataframe with the corresponding values): country; year; gender; education level; employement status; age; place of residence (urban/rural); interest in politics; ideology (self); ideology of the party the respondent voted for; income.
Note:
female was coded 1 for female and 0 for
non-female.educ_low was coded 1 for early childhood education,
primary, lower secondary, or none; and 0 otherwise.educ_mid was coded 1 for upper secondary or
post-secondary; and 0 otherwise.educ_hig was coded 1 for anything above post-secondary;
and 0 otherwise.income_low was coded 1 for income below or equal to the
first tercile of the variable by country and year; and 0 otherwise.income_mid was coded 1 for income above the first
tercile and below or equal to the second tercile; and 0 otherwise.income_hig was coded 1 for income above the second
tercile and 0 otherwise.urban was coded 1 for residence in a small or middle
town, suburbs, or large town or city; 0 otherwise.int_pol, ideo_lr_self,
ideo_vote were all rescaled to vary between 0 and 1, with
one corresponding to right in the ideology variables, and to very
interested for int_pol.library(tidyverse)
library(rio)
dat <- import("cses5.rdata") |>
rename(country=E1006_NAM,
year=E1008) |>
mutate(female=ifelse(E2002==2,1,
ifelse(E2002 > 3, NA, 0)),
E2003=ifelse(E2003 >96,NA,E2003),
educ_low=ifelse(E2003 %in% c(1:3,96),1,0),
educ_mid=ifelse(E2003 %in% 4:5,1,0),
educ_hig=ifelse(E2003 %in% 6:9,1,0),
employed=ifelse(E2006 %in% c(1,2,3),1,
ifelse(E2006 %in% c(97,98,99),NA,0)),
E2001_A=ifelse(E2001_A >120,NA,E2001_A),
age_25less=ifelse(E2001_A<=25,1,0),
age_26to35=ifelse(E2001_A>25 & E2001_A<=35,1,0),
age_36to45=ifelse(E2001_A>35 & E2001_A<=45,1,0),
age_46to55=ifelse(E2001_A>45 & E2001_A<=55,1,0),
age_56to65=ifelse(E2001_A>55 & E2001_A<=65,1,0),
age_66plus=ifelse(E2001_A>65,1,0),
E2011=ifelse(E2011>=99999997,NA,E2011),
urban=ifelse(E2022 %in% c(2,3,4),1,
ifelse(E2022 >=7,NA,0)),
int_pol=ifelse(E3001>=7,NA,1-((E3001-1)/3)),
ideo_lr_self=ifelse(E3020>10,NA,E3020/10),
ideo_vote=ifelse(E3013_LR_CSES==9,NA,(E3013_LR_CSES-1)/2)) |>
group_by(country,year) |>
mutate(income_low=ifelse(E2011<=quantile(E2011,c(1/3,2/3),na.rm=T)[1],1,0),
income_mid=ifelse(E2011>quantile(E2011,c(1/3,2/3),na.rm=T)[1] &
E2011<=quantile(E2011,c(1/3,2/3),na.rm=T)[2],1,0),
income_hig=ifelse(E2011>quantile(E2011,c(1/3,2/3),na.rm=T)[2],1,0)) |>
select(country,year,female,starts_with(c("educ","age","income")),urban,int_pol,ideo_lr_self,ideo_vote) |>
na.omit() |>
ungroup()
table <- DAMisc::sumStats(dat,names(dat)[c(-1,-2)])
table
## # A tibble: 17 × 11
## variable mean sd iqr min q25 q50 q75 max n nNA
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 female 0.488 0.500 1 0 0 0 1 1 52741 0
## 2 educ_low 0.211 0.408 0 0 0 0 0 1 52741 0
## 3 educ_mid 0.369 0.482 1 0 0 0 1 1 52741 0
## 4 educ_hig 0.403 0.491 1 0 0 0 1 1 52741 0
## 5 age_25less 0.0879 0.283 0 0 0 0 0 1 52741 0
## 6 age_26to35 0.160 0.367 0 0 0 0 0 1 52741 0
## 7 age_36to45 0.179 0.384 0 0 0 0 0 1 52741 0
## 8 age_46to55 0.178 0.382 0 0 0 0 0 1 52741 0
## 9 age_56to65 0.181 0.385 0 0 0 0 0 1 52741 0
## 10 age_66plus 0.213 0.410 0 0 0 0 0 1 52741 0
## 11 income_low 0.349 0.477 1 0 0 0 1 1 52741 0
## 12 income_mid 0.354 0.478 1 0 0 0 1 1 52741 0
## 13 income_hig 0.297 0.457 1 0 0 0 1 1 52741 0
## 14 urban 0.702 0.457 1 0 0 1 1 1 52741 0
## 15 int_pol 0.644 0.283 0.667 0 0.333 0.667 1 1 52741 0
## 16 ideo_lr_self 0.519 0.277 0.4 0 0.3 0.5 0.7 1 52741 0
## 17 ideo_vote 0.584 0.404 1 0 0 0.5 1 1 52741 0