library(tidyverse)
load("france.rda")
france <- france %>%
mutate(lr_group = case_when(lrself < 4 ~ "Left", lrself >= 4 & lrself < 8 ~ "Centre", lrself >= 8 ~ "Right", TRUE ~ NA_character_),
lr_group = factor(lr_group, levels=c("Left", "Centre", "Right")))
france %>%
group_by(lr_group) %>%
summarise(across(starts_with("rp"), ~mean(.x, na.rm=TRUE))) %>%
ungroup %>%
pivot_longer(-lr_group, names_pattern = "rplace_(.*)", names_to = "party", values_to = "place") %>%
mutate(party = ifelse(party == "green", "Green", toupper(party))) %>%
filter(!is.na(lr_group)) %>%
ggplot(aes(x=place, y=lr_group, colour=party)) +
geom_point() +
theme_bw() +
labs(x="Party Placement", y="Respondent Placement", colour="Party")
As a reference, for random variable \(y\):
makeNormal <- function(x,mu,sigma){
out <- 1/(sqrt(2*pi*sigma^2))*exp(-(x-mu)^2/(2*sigma^2))
return(out)
}
range <- seq(-4,4,0.001)
dist <- data.frame(x=range,
dist_Normal=dnorm(range,0,1),
dist_Normal_manual=makeNormal(range,0,1),
dist_t1=dt(range,1),
dist_t5=dt(range,5)) %>%
pivot_longer(-x,names_pattern = "dist_(.*)",names_to = "distribution",values_to = "y")
ggplot(dist,aes(x=x,y=y,color=distribution))+
geom_line()+
theme_minimal()+
labs(title="",y="Density",color="")+
theme(legend.position = "top")
# Population and sample size
N <- 41288599
n <- 1000
# Forming the population
Population <- sample(1:nrow(CES),N,replace = T)
# Taking a sample
Sample <- sample(Population,n,replace=F)
# How far from the population is the sample?
## For the mean
mu <- mean(CES$pplFeel_Francophones[Population])
mean <- mean(CES$pplFeel_Francophones[Sample])
mu-mean
## [1] -0.4996597
## For the standard deviation
sigma <- ((N-1)/N)*sd(CES$pplFeel_Francophones[Population])
sd <- sd(CES$pplFeel_Francophones[Sample])
sigma-sd
## [1] 0.175219
rm(mean,sd)
## Data to be filled in the loop
WLLN <- data.frame(x=1:100,
y1=NA,
y2=NA)
## Do 100 coin flips
for (i in 1:100){
WLLN[i,2] <- sum(sample(c(0,1),i,replace=TRUE,prob = c(0.5,0.5)))/i
WLLN[i,3] <- sum(sample(c(0,1),i,replace=TRUE,prob = c(1/3,2/3)))/i
}
## Plot the realized proportions of heads (1).
WLLN %>%
pivot_longer(-1,names_pattern = "y(.*)", names_to = "Coin",values_to = "y") %>%
ggplot(aes(x=x,y=y,color=Coin))+
geom_hline(yintercept=0.5,color="#4F2683",linetype="dashed")+
geom_hline(yintercept=2/3,color="#F23030",linetype="dashed")+
geom_line()+
scale_color_manual(values=c("#4F2683","#F23030"))+
labs(x="Coin flips",y="Proportion of heads")+
theme_minimal()+
theme(legend.position = "top")
# Observe that the variable isn't normally distributed
ggplot(CES,aes(x=pplFeel_Francophones))+
geom_histogram(fill="#4F2683",alpha=0.2)+
theme_minimal()+
labs(x="Feelings towards francophones",y="Frequency")
# Take 1000 sample of size n and compute their means
n <- 10000
set.seed(122)
for(i in 1:1000){
if(i==1){
means <- c()
}
Sample <- sample(Population,n,replace=F)
means <- c(means,mean(CES$pplFeel_Francophones[Sample]))
}
# The expected standard error of the sampling distribution is:
standardError <- sigma/sqrt(n)
# We can then form the expected sampling distribution:
range <- seq(mu-standardError*4,mu+standardError*4,0.1)
normalData <- data.frame(x=range,
y=dnorm(range,mu,standardError))
# Graphing the obtained means on top of the expected values
data.frame(means) %>%
ggplot(aes(x=means))+
geom_line(data=normalData,aes(x=x,y=y),color="#F23030",inherit.aes = F,size=1)+
geom_vline(xintercept = mu,color="#F23030",size=1)+
geom_histogram(aes(y=..density..),fill="#4F2683",alpha=0.2)+
geom_density(color="#4F2683",linetype="dashed",size=1)+
geom_vline(xintercept = mean(means),linetype="dashed",color="#4F2683",size=1)+
scale_y_continuous("Density",expand = c(0,0))+
scale_x_continuous("Sample means for feelings towards francophones")+
theme_minimal()+
coord_cartesian(clip = "off")
# 0. Setting seed such that we all have the same results
set.seed(122)
n_episodes <- seq(10,1000,10)
for (n in 1:length(n_episodes)){
# 1. Simulate a bunch of episodes
mh_dat <- data.frame(winning_door=sample(1:3,replace = T,n_episodes[n]))
# 2. Participant that always keeps
keep <- sample(1:3,replace=T,size=n_episodes[n])
# 3. Participant that always switches
switch1 <- sample(1:3,replace=T,size=n_episodes[n])
switch_loss <- apply(as.matrix(switch1), 1, function(x) sample(c(1,2,3)[-x],1))
# 4. Putting things together
mh_dat <- mh_dat |>
mutate(keep=keep,
switch1=switch1,
switch=ifelse(switch1==winning_door,switch_loss,winning_door),
keep_win=ifelse(keep==winning_door,1,0),
switch_win=ifelse(switch==winning_door,1,0))
if(n==1){
# 5. Results
out <- mh_dat |>
summarise(keep_win_rate=mean(keep_win),
switch_win_rate=mean(switch_win)) |>
mutate(n=n_episodes[n])
}
else{
tmp <- mh_dat |>
summarise(keep_win_rate=mean(keep_win),
switch_win_rate=mean(switch_win)) |>
mutate(n=n_episodes[n])
out <- rbind(out,tmp)
}
}
out |> pivot_longer(ends_with("_rate"),names_to = "strat",values_to = "rate",
names_pattern = "(.*)_win_rate") |>
ggplot(aes(x=n,y=rate,color=strat))+
geom_hline(yintercept = 1/3,color="#4F2683",linetype="dashed")+
geom_hline(yintercept = 2/3,color="#F23030",linetype="dashed")+
geom_line()+
scale_color_manual(values=c("#4F2683","#F23030"))+
theme_minimal()+
theme(legend.position = "top",
legend.title = element_blank())+
xlab("\nSample Size")+
ylab("Winning Rate\n")
n_episodes <- 100
for (j in 1:1000){
# 1. Simulate a bunch of episodes
mh_dat <- data.frame(winning_door=sample(1:3,replace = T,n_episodes))
# 2. Participant that always keeps
keep <- sample(1:3,replace=T,size=n_episodes)
# 3. Participant that always switches
switch1 <- sample(1:3,replace=T,size=n_episodes)
switch_loss <- apply(as.matrix(switch1), 1, function(x) sample(c(1,2,3)[-x],1))
# 4. Putting things together
mh_dat <- mh_dat |>
mutate(keep=keep,
switch1=switch1,
switch=ifelse(switch1==winning_door,switch_loss,winning_door),
keep_win=ifelse(keep==winning_door,1,0),
switch_win=ifelse(switch==winning_door,1,0))
if(j==1){
# 5. Results
out <- mh_dat |>
summarise(keep_win_rate=mean(keep_win),
switch_win_rate=mean(switch_win)) |>
mutate(iteration=j)
}
else{
tmp <- mh_dat |>
summarise(keep_win_rate=mean(keep_win),
switch_win_rate=mean(switch_win)) |>
mutate(iteration=j)
out <- rbind(out,tmp)
}
}
gdat <- out |> pivot_longer(ends_with("_rate"),names_to = "strat",values_to = "rate",
names_pattern = "(.*)_win_rate")
mses <- out |>
summarise(mse_keep=sum((keep_win_rate-1/3)^2)/1000,
mse_switch=sum((switch_win_rate-2/3)^2)/1000)
ggplot(gdat,aes(x=rate, color=strat,fill=strat))+
geom_vline(xintercept = 1/3,color="black")+
geom_vline(xintercept = 2/3,color="black")+
geom_vline(xintercept = mean(out$keep_win_rate),color="#4F2683",linetype="dashed")+
geom_vline(xintercept = mean(out$switch_win_rate),color="#F23030",linetype="dashed")+
geom_density(alpha=.2)+
annotate("text",x = 1/3-.1,y=5,label=paste0("MSE=",round(mses$mse_keep,5)),color="#4F2683")+
annotate("text",x = 2/3+.1,y=5,label=paste0("MSE=",round(mses$mse_switch,5)),color="#F23030")+
scale_color_manual(values=c("#4F2683","#F23030"))+
scale_fill_manual(values=c("#4F2683","#F23030"))+
theme_minimal()+
theme(legend.position = "top",
legend.title = element_blank(),
axis.title.y=element_blank())+
xlab("Winning Rate\n")
Aronow, P. M., & Miller, B. T. (2019). Foundations of agnostic statistics. Cambridge University Press.↩︎