Open lab 0.
You have seen this in many forms:
rio::import()
, or other functions depending on the file
format.library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data(Arrests, package="carData")
Arrests <- as_tibble(Arrests)
|>
/%>%
# Instead of doing
## Select some columns
dat <- select(Arrests,colour,age,sex)
## Add a new variable made from an existing one
dat$male <- ifelse(dat$sex=="Male",1,0)
## Summarizing the frequencies
table(dat$colour[dat$male==1])
##
## Black White
## 1216 3567
# You can do
dat <- Arrests |>
select(colour,age,sex) |>
mutate(male=ifelse(sex=="Male",1,0)) |>
filter(male==1) |>
group_by(colour) |>
summarise(n=n())
dat
## # A tibble: 2 × 2
## colour n
## <fct> <int>
## 1 Black 1216
## 2 White 3567
dplyr
verbsselect()
: to select columns;filter()
: to filter data according to some
condition;mutate()
: create a variable or modify an existing
one;rename()
: change the label/name of an existing
variable;group_by()
: group the data according to some
variable(s) such that the following operations are made
“group-wise”;summarise()
: reduce the data to some statistics in
accordance with groups if previously specified;arrange()
: order the rows according to the values of a
particular variable;slice()
: keep a set of rows;slice_head()
: keep the first n rows;slice_tail()
: keep the last n rows;na.omit()
: removes any row for which one of the
available columns contains a NA;drop_na()
: removes any row for which NA appears in the
specified column.Black
and
White
?table(Arrests$released)
##
## No Yes
## 892 4334
head(Arrests$employed=="Yes",n=10)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
c("Yes","No","Maybe")=="Yes"
## [1] TRUE FALSE FALSE
Arrests |>
mutate(released_bin=ifelse(released=="Yes",1,0)) |>
filter(employed=="Yes") |>
group_by(colour) |>
summarise(average=mean(released_bin))
## # A tibble: 2 × 2
## colour average
## <fct> <dbl>
## 1 Black 0.797
## 2 White 0.888
Arrests |>
mutate(released_bin=ifelse(released=="Yes",1,0)) |>
filter(employed=="Yes") |>
group_by(colour,sex,citizen) |>
summarise(average=mean(released_bin))
## `summarise()` has grouped output by 'colour', 'sex'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 4
## # Groups: colour, sex [4]
## colour sex citizen average
## <fct> <fct> <fct> <dbl>
## 1 Black Female No 1
## 2 Black Female Yes 0.857
## 3 Black Male No 0.716
## 4 Black Male Yes 0.820
## 5 White Female No 1
## 6 White Female Yes 0.932
## 7 White Male No 0.818
## 8 White Male Yes 0.891
arrestFreqYear <- Arrests |>
group_by(year) |>
summarise(n=n()) |>
arrange(-n)
arrestFreqYear |> slice_head(n=2)
## # A tibble: 2 × 2
## year n
## <int> <int>
## 1 2000 1270
## 2 2001 1211
arrestFreqYear |> slice_tail(n=2)
## # A tibble: 2 × 2
## year n
## <int> <int>
## 1 1997 492
## 2 2002 277
UN
dataset found in the carData
package, extract the following variables
(country
,ppgdp
,lifeExpF
,pctUrban
,infantMortality
)
and find the top 5 countries for infantMortality
among full
information countries.data(UN, package="carData")
UN <- as_tibble(UN,rownames="country")
UN |>
select(country,ppgdp,lifeExpF,pctUrban,infantMortality) |>
na.omit() |>
arrange(-infantMortality) |>
slice_head(n=5)
## # A tibble: 5 × 5
## country ppgdp lifeExpF pctUrban infantMortality
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 499 49.5 23 125.
## 2 Chad 727. 51.6 28 124.
## 3 Guinea-Bissau 539. 50.4 30 110.
## 4 Democratic Republic of the Congo 201. 50.6 36 109.
## 5 Sierra Leone 352. 48.9 39 103.
Sometimes, datasets ship with nonsensical variable names. You have two options:
rename()
from dplyr
;names()
.data(CES11, package="carData")
# Option 1
CleanCES <- CES11 |> rename(relImportant = importance)
names(CleanCES)
## [1] "id" "province" "population" "weight" "gender"
## [6] "abortion" "relImportant" "education" "urban"
# Option 2
CleanCES <- CES11
names(CleanCES)
## [1] "id" "province" "population" "weight" "gender"
## [6] "abortion" "importance" "education" "urban"
names(CleanCES)[7] <- "relImportant"
names(CleanCES)
## [1] "id" "province" "population" "weight" "gender"
## [6] "abortion" "relImportant" "education" "urban"
This is different from renaming variables in that you create a new column in the dataset. This is done to:
There are two ways to create new variables:
mutate()
from dplyr
;$
and assigning the new values.Here are a few functions that come in handy in a
mutate()
call when you want to change the structure of a
variable:
ifelse()
;case_when()
.Let’s see some examples.
# Base R
CleanCES$female <- 0
CleanCES$female[CleanCES$gender=="Female"] <- 1
# dplyr
CleanCES <- CleanCES |>
mutate(female = ifelse(gender=="Female",1,0))
Note that you must always be carefull with NAs!
# Adding fake NAs for the example
set.seed(122)
fakeNA <- sample(c(T,F),size=nrow(CES11),replace=T,prob = c(.01,.99))
tmp <- CES11
tmp$gender[fakeNA] <- NA
# See if it worked
table(tmp$gender, useNA = "ifany")
##
## Female Male <NA>
## 1234 979 18
# Another way
sum(is.na(tmp$gender))
## [1] 18
# Let's recode using ifelse
Clean_tmp <- tmp |>
mutate(female = ifelse(gender=="Female",1,0))
table(Clean_tmp$female, useNA = "ifany")
##
## 0 1 <NA>
## 979 1234 18
# Let's recode using base R
Clean_tmp$female <- 0
Clean_tmp$female[Clean_tmp$gender=="Female"] <- 1
table(Clean_tmp$female, useNA = "ifany")
##
## 0 1
## 997 1234
table(CleanCES$education, useNA = "ifany")
##
## bachelors college higher HS lessHS somePS
## 506 491 246 467 267 254
CleanCES <- CleanCES |>
mutate(educ_3l = case_when(education %in% c("bachelors","college","higher") ~ "uni",
education %in% c("HS","somePS") ~ "hs",
education == "lessHS" ~ "no hs"))
table(CleanCES$educ_3l, useNA = "ifany")
##
## hs no hs uni
## 721 267 1243
Remember how I told you there were 4 main data types?
TRUE
(1) or
FALSE
(0) – abbreviate using T
and
F
Well there’s a fifth one that you need to know about: factors.
Factors place a numerical value on characters. Very usefull when computers use to have limited RAM, but it’s not use for this anymore. Today we use it to impose a new ordering to character variables, instead of them being oredered alphabetically.
See for example the order of the levels
of our
educ_3l
variable:
table(CleanCES$educ_3l)
##
## hs no hs uni
## 721 267 1243
This means that if I want to produce a graph out of the data, it’s going to be in that order.
ggplot(CleanCES,aes(x=educ_3l)) +
geom_bar()
To prevent this, we can use the factor
function:
CleanCES <- CleanCES |>
mutate(educ_3l_fctr = factor(educ_3l, levels=c("no hs", "hs", "uni")))
ggplot(CleanCES,aes(x=educ_3l_fctr)) +
geom_bar()
fertility
rate and ppgdp
of
the 4 countries with the highest female life expectancy
(lifeExpF
)? Make sure you are using no country where
region
is missing.UN |>
drop_na(region) |>
arrange(-lifeExpF) |>
select(country,fertility,ppgdp,lifeExpF) |>
slice_head(n=4)
## # A tibble: 4 × 4
## country fertility ppgdp lifeExpF
## <chr> <dbl> <dbl> <dbl>
## 1 Japan 1.42 43141. 87.1
## 2 Hong Kong 1.14 31824. 86.4
## 3 France 1.99 39546. 84.9
## 4 Spain 1.50 30543. 84.8
What is the fertility
rate and ppgdp
of
the 2 most urbanized country for each region
? Make sure you
are using no country where region
is missing.
Create a variable called fertility1000
, which is the
result of this operation fertility*1000-infantMortality
.
What is the highest country on fertility1000
in each
region?