0. What we’ll be doing

  1. Confidence Intervals
  2. Exercises

1. Confidence Intervals

1.1 The reality of inference

Let’s have a population distribution:

library(tidyverse)

set.seed(2543)
pop <- runif(10000,0,1)

ggplot()+
  geom_histogram(aes(x=pop),bins=24,color="white")+
  theme_minimal()+
  xlab("Population Distribution of X")

Where the true population mean \(\mu\) is:

mean(pop)
## [1] 0.49728

and the true population SD \(\sigma\) is:

sqrt(sum((pop - mean(pop))^2)/length(pop))
## [1] 0.2872507

Let’s take a sample of size 100 at random from the population:

samp <- sample(pop, 100, replace = TRUE)

# Sample mean
mean(samp)
## [1] 0.4871019
# Sample sd
sd(samp)
## [1] 0.2748858

Hence, the sampling error is \(\bar x- \mu\):

mean(samp)-mean(pop)
## [1] -0.0101781

QUESTIONS:

  1. What would happen if I were to take bigger and bigger samples? Why?
  2. What would happen if I took 1000 samples of a 100 individuals? Why?

So, let’s take a 1000 sample of a 100:

means <- replicate(1000,
                   mean(
                     sample(pop, 100, replace=TRUE)
                     )
                   )
samp.errors <- means - mean(pop)
ggplot() +
  geom_histogram(aes(x=samp.errors),bins=25,col="white") +
  theme_minimal() +
  geom_vline(xintercept=0, col="red") +
  labs(x="Distribution of Sampling Errors")

We get a sampling distribution, the distribution of the sample means around the true, but generally unkown, population mean.

THE ISSUE IS (as the real slim shady would say) :

We would like to be able to identify how likely it is that we would observe our sample statistic (\(\bar x=.45\)) in the population :

pnorm(.45, mean(pop), true.se)
## [1] 0.04988717

And we have:

  1. The sample mean (\(\bar x\));
  2. The sample standard deviation (\(sd\)).

1.2 Formulas & R

If we know \(\sigma_{\bar{x}}\) (unlikely)

\[\bar{x} \pm z_{\text{crit}}\frac{\sigma_{x}}{\sqrt{n}}\] If we have to estimate \(\hat{\sigma}_{\bar{x}} = s_{\bar{x}}\), then:

\[\bar{x} \pm z_{\text{crit}}\frac{s_{x}}{\sqrt{n}}\]

If we have small samples \((n < 120)\):

\[\bar{x} \pm t_{\text{crit}}\frac{s_{x}}{\sqrt{n}}\] Note: the formula doesn’t require us to know \(\mu\), which is nice.

From ggplot2, use mean_cl_normal():

mean_cl_normal(samp)
##           y      ymin      ymax
## 1 0.4871019 0.4325586 0.5416452

Or, from uwo4419, use confidenceInterval() (Allows you to choose the distribution you want):

uwo4419::confidenceInterval(samp,distr = "normal")
##   Estimate   CI lower   CI upper Std. Error 
## 0.48710189 0.43322527 0.54097851 0.02748858
uwo4419::confidenceInterval(samp,distr = "t")
##   Estimate   CI lower   CI upper Std. Error 
## 0.48710189 0.43255859 0.54164519 0.02748858

1.3 Building the intuition

  • There can only be ONE!!! How do we know where our sample stand in the sampling distribution???
set.seed(122)
# Take 100 sample of size n and compute their means and CI
it <- 100
n <- 100
cis <- lapply(1:it,function(x)mean_cl_normal(sample(pop, n, replace=TRUE)))
cis <- do.call(rbind.data.frame, cis) |> 
  mutate(Type=ifelse(ymin>mean(pop) | ymax < mean(pop),"Bad","Good"),
         sample=1:it)

ggplot(cis,aes(x=sample,y=y,ymin=ymin,ymax=ymax,color=Type))+
  geom_hline(yintercept=mean(pop))+
  geom_pointrange(size=0.1)+
  scale_color_manual("Type of sample",values=c("#F23030","#4F2683"))+
  labs(y="Sample Means",
       x="Sample Id")+
  theme_minimal()+
  theme(legend.position = "top")

  • Confidence intervals: created such that \((1-\alpha)\%\) of the intervals we could make from different samples will cover the true, but unknown population value.
    • Hence, the only thing we can say is that, if \(alpha\) is small enough, we are willing to bet that we are in one of the good samples.

2. Exercise 1

From the CES data:

  1. Compute the average score given to each party per province and their confidence intervals.
  2. Present these in a graph where the Y axis are the provinces and the X axis the averages.
CES |> 
  select(UUID,prov,starts_with("pplFeel_")) |> 
  select(-pplFeel_Americans,-pplFeel_Francophones) |> 
  pivot_longer(c(-UUID,-prov),names_pattern = "pplFeel_(.*)",names_to = "Party", values_to = "score") |> 
  group_by(prov,Party) |> 
  summarise(ci=mean_cl_normal(score)) |> 
  unnest_wider(ci) |> 
  mutate(Party=factor(Party,levels=c("LPC","CPC","NDP","BQ","GPC","PPC"))) |> 
  ggplot(aes(x=prov,y=y,ymin=ymin,ymax=ymax,color=Party))+
  annotate('rect', xmin = seq(.5,8.5,2), xmax = seq(1.5,9.5,2), 
  ymin=-Inf, ymax=Inf, alpha=0.08, fill="black")+
  geom_pointrange(size=0.1,position=position_dodge(0.5)) +
  scale_color_manual(values=c("red","blue","orange","cyan","green","purple"))+
  theme_minimal()+
  theme(legend.position = "top")+
  coord_flip(ylim=c(0,100))+
  labs(x="",y="\nFeeling Thermometer Average Score (0-100)")+
  guides(color = guide_legend(nrow = 1))

mean_cl_normal(CES$pplFeel_Americans)
##       y   ymin   ymax
## 1 54.38 53.823 54.937

3. Exercise 2

From the CES data:

  1. Compute the average age of voters for each party and the corresponding CI.
  2. Do so using both the normal distribution and the Student’s t distribution. (Hint: use uwo4419::confidenceInterval())
  3. Present this in a graph where the Y axis is the averages and the X axis is the parties.
#devtools::install_github("https://github.com/davidaarmstrong/uwo4419.git")
library(uwo4419)
norm <- CES |> 
  select(age,vt) |> 
  group_by(vt) |> 
  summarise(ci_normal=list(confidenceInterval(age,distr="normal"))) |> 
  unnest_wider(ci_normal) |> 
  mutate(dist="Normal")
 
tdist <- CES |> 
  select(age,vt) |> 
  group_by(vt) |> 
  summarise(ci_t=list(confidenceInterval(age,distr="t"))) |> 
  unnest_wider(ci_t) |> 
  mutate(dist="Student's t")

rbind(norm,tdist) |> 
  setNames(c("vt", "y", "ymin", "ymax", "se","dist")) |> 
  ggplot(aes(x=reorder(vt,y,mean),y=y,ymin=ymin,ymax=ymax,color=dist))+
  geom_pointrange(position=position_dodge(0.5))+
  scale_color_manual("",values=c("#F23030","#4F2683"))+
  theme_minimal()+
  theme(legend.position = "top")+
  labs(x="",y="Mean age")