Let’s have a population distribution:
library(tidyverse)
set.seed(2543)
pop <- runif(10000,0,1)
ggplot()+
geom_histogram(aes(x=pop),bins=24,color="white")+
theme_minimal()+
xlab("Population Distribution of X")
Where the true population mean \(\mu\) is:
mean(pop)
## [1] 0.49728
and the true population SD \(\sigma\) is:
sqrt(sum((pop - mean(pop))^2)/length(pop))
## [1] 0.2872507
Let’s take a sample of size 100 at random from the population:
samp <- sample(pop, 100, replace = TRUE)
# Sample mean
mean(samp)
## [1] 0.4871019
# Sample sd
sd(samp)
## [1] 0.2748858
Hence, the sampling error is \(\bar x- \mu\):
mean(samp)-mean(pop)
## [1] -0.0101781
QUESTIONS:
So, let’s take a 1000 sample of a 100:
means <- replicate(1000,
mean(
sample(pop, 100, replace=TRUE)
)
)
samp.errors <- means - mean(pop)
ggplot() +
geom_histogram(aes(x=samp.errors),bins=25,col="white") +
theme_minimal() +
geom_vline(xintercept=0, col="red") +
labs(x="Distribution of Sampling Errors")
We get a sampling distribution, the distribution of the sample means around the true, but generally unkown, population mean.
THE ISSUE IS (as the real slim shady would say) :
We would like to be able to identify how likely it is that we would observe our sample statistic (\(\bar x=.45\)) in the population :
pnorm(.45, mean(pop), true.se)
## [1] 0.04988717
And we have:
If we know \(\sigma_{\bar{x}}\) (unlikely)
\[\bar{x} \pm z_{\text{crit}}\frac{\sigma_{x}}{\sqrt{n}}\] If we have to estimate \(\hat{\sigma}_{\bar{x}} = s_{\bar{x}}\), then:
\[\bar{x} \pm z_{\text{crit}}\frac{s_{x}}{\sqrt{n}}\]
If we have small samples \((n < 120)\):
\[\bar{x} \pm t_{\text{crit}}\frac{s_{x}}{\sqrt{n}}\] Note: the formula doesn’t require us to know \(\mu\), which is nice.
From ggplot2, use mean_cl_normal():
mean_cl_normal(samp)
## y ymin ymax
## 1 0.4871019 0.4325586 0.5416452
Or, from uwo4419, use confidenceInterval()
(Allows you to choose the distribution you want):
uwo4419::confidenceInterval(samp,distr = "normal")
## Estimate CI lower CI upper Std. Error
## 0.48710189 0.43322527 0.54097851 0.02748858
uwo4419::confidenceInterval(samp,distr = "t")
## Estimate CI lower CI upper Std. Error
## 0.48710189 0.43255859 0.54164519 0.02748858
set.seed(122)
# Take 100 sample of size n and compute their means and CI
it <- 100
n <- 100
cis <- lapply(1:it,function(x)mean_cl_normal(sample(pop, n, replace=TRUE)))
cis <- do.call(rbind.data.frame, cis) |>
mutate(Type=ifelse(ymin>mean(pop) | ymax < mean(pop),"Bad","Good"),
sample=1:it)
ggplot(cis,aes(x=sample,y=y,ymin=ymin,ymax=ymax,color=Type))+
geom_hline(yintercept=mean(pop))+
geom_pointrange(size=0.1)+
scale_color_manual("Type of sample",values=c("#F23030","#4F2683"))+
labs(y="Sample Means",
x="Sample Id")+
theme_minimal()+
theme(legend.position = "top")
From the CES data:
CES |>
select(UUID,prov,starts_with("pplFeel_")) |>
select(-pplFeel_Americans,-pplFeel_Francophones) |>
pivot_longer(c(-UUID,-prov),names_pattern = "pplFeel_(.*)",names_to = "Party", values_to = "score") |>
group_by(prov,Party) |>
summarise(ci=mean_cl_normal(score)) |>
unnest_wider(ci) |>
mutate(Party=factor(Party,levels=c("LPC","CPC","NDP","BQ","GPC","PPC"))) |>
ggplot(aes(x=prov,y=y,ymin=ymin,ymax=ymax,color=Party))+
annotate('rect', xmin = seq(.5,8.5,2), xmax = seq(1.5,9.5,2),
ymin=-Inf, ymax=Inf, alpha=0.08, fill="black")+
geom_pointrange(size=0.1,position=position_dodge(0.5)) +
scale_color_manual(values=c("red","blue","orange","cyan","green","purple"))+
theme_minimal()+
theme(legend.position = "top")+
coord_flip(ylim=c(0,100))+
labs(x="",y="\nFeeling Thermometer Average Score (0-100)")+
guides(color = guide_legend(nrow = 1))
mean_cl_normal(CES$pplFeel_Americans)
## y ymin ymax
## 1 54.38 53.823 54.937
From the CES data:
uwo4419::confidenceInterval())#devtools::install_github("https://github.com/davidaarmstrong/uwo4419.git")
library(uwo4419)
norm <- CES |>
select(age,vt) |>
group_by(vt) |>
summarise(ci_normal=list(confidenceInterval(age,distr="normal"))) |>
unnest_wider(ci_normal) |>
mutate(dist="Normal")
tdist <- CES |>
select(age,vt) |>
group_by(vt) |>
summarise(ci_t=list(confidenceInterval(age,distr="t"))) |>
unnest_wider(ci_t) |>
mutate(dist="Student's t")
rbind(norm,tdist) |>
setNames(c("vt", "y", "ymin", "ymax", "se","dist")) |>
ggplot(aes(x=reorder(vt,y,mean),y=y,ymin=ymin,ymax=ymax,color=dist))+
geom_pointrange(position=position_dodge(0.5))+
scale_color_manual("",values=c("#F23030","#4F2683"))+
theme_minimal()+
theme(legend.position = "top")+
labs(x="",y="Mean age")