5 Introduction to R and the Tidyverse
5.1 Exercise R script
The R script can be downloaded here and is also reproduced below.
# Attach the tidyverse: a suite of packages designed to work together.
# Attaching it directly like this prints a table telling you which packages
# were attached.
library(tidyverse)
# A list of conflicted functions is printed as well, i.e. function names that
# appear in more than one package. For example, there is a "filter" function
# in both the stats package and the dplyr package. See the course notes about
# namespace prefixes (and e.g. the "dplyr::filter()" usages below).

# Load the starwars dataset that ships with the dplyr package (used later on).
data("starwars", package = "dplyr")
#################################################################
#### Data importing ####
########################
# Directory holding the example files. Building full paths with file.path()
# reads the same files as before without calling setwd(), so the working
# directory of the R session is left untouched. Edit this one line to point at
# the course material on your machine.
data_dir <- "c:/EGABIcourse19-master/docs/course_material/tidyverse"
csv_path <- file.path(data_dir, "tidyverse_dummy_import.csv")
xlsx_path <- file.path(data_dir, "tidyverse_dummy_import.xlsx")

# With base R read.csv() the column classes are imported poorly and would have
# to be converted by hand later on.
X.csv <- read.csv(csv_path)
# Show the class that each imported column ended up with.
sapply(X.csv, class)

# readr can read csv files as well, but does so more intelligently and lets
# individual fields be declared explicitly through col_types.
X.csv_r <- readr::read_csv(
  csv_path,
  col_types = list(Occurrence_field = readr::col_factor())
)

# readxl reads Excel formats and WORKS WELL FOR DATETIME columns. It has no
# column type option for factors, so the first sheet is read with the types
# below and the factor conversion is done afterwards.
X.xlsx_t <- readxl::read_xlsx(
  xlsx_path,
  sheet = 1,
  col_types = c("guess", "numeric", "numeric", "text", "date")
)
# Turn the column into a factor after import, if a factor is needed.
X.xlsx_t$Occurrence_field <- as.factor(X.xlsx_t$Occurrence_field)
###################################################################
#### Data wrangling ####
########################
# Keep only the name, gender and homeworld columns.
# dplyr::select() is written with its namespace prefix, not as a bare
# select(), because several packages provide a "select" function. The prefix
# makes it clear which one is meant and is less prone to errors caused by e.g.
# packages being loaded in a different order.
starwars %>%
  dplyr::select(name, gender, homeworld)

# Base R equivalent
starwars[, c("name", "gender", "homeworld")]

# Keep only the rows for characters from Tatooine
starwars %>%
  dplyr::filter(homeworld == "Tatooine")

# Base R equivalent
starwars[which(starwars$homeworld == "Tatooine"), ]

# Stringing several verbs together with the pipe
starwars %>%
  dplyr::select(name, gender, homeworld) %>%
  dplyr::filter(homeworld == "Tatooine")

# Base R equivalent
starwars[
  which(starwars$homeworld == "Tatooine"),
  c("name", "gender", "homeworld")
]
##################################
#### grouping and summarising ####
##################################
# Group the rows by homeworld; later verbs then operate per group.
starwars %>%
  group_by(homeworld)

# Mean height of characters per homeworld, plus a count of the rows in each
# group. na.rm = TRUE is needed here: without it a single missing height makes
# the mean of the whole group NA. Note that n() counts every row in the group,
# including characters whose height is missing.
starwars %>%
  group_by(homeworld) %>%
  summarise(mean_height = mean(height, na.rm = TRUE), n = n())

# The same summary, this time grouped by homeworld AND gender, with the rows
# that have no recorded homeworld removed from the result.
# summarise() only peels off the last grouping variable, so the result would
# still be grouped by homeworld; ungroup() hands back a plain table.
starwars %>%
  group_by(homeworld, gender) %>%
  summarise(mean_height = mean(height, na.rm = TRUE), n = n()) %>%
  ungroup() %>%
  dplyr::filter(!is.na(homeworld))

### Other ways to count!
# count(): one row per homeworld/gender combination with its size in column n
starwars %>%
  count(homeworld, gender) %>%
  dplyr::filter(!is.na(homeworld))
# add_count(): keeps every original row and appends the group size as column n
starwars %>%
  add_count(homeworld, gender) %>%
  dplyr::filter(!is.na(homeworld))
################
#### TASKS ####
## Create a table with mean, median and standard deviation of mass for all
## humans with brown hair.
## The rows are filtered first, so the statistics are computed for the one
## group of interest rather than for every hair colour / species combination.
## TRUE is spelled out because T is an ordinary variable that can be
## reassigned.
starwars %>%
  dplyr::filter(species == "Human", hair_color == "brown") %>%
  group_by(hair_color, species) %>%
  summarise(
    meanmass = mean(mass, na.rm = TRUE),
    medmass = median(mass, na.rm = TRUE),
    sdmass = sd(mass, na.rm = TRUE)
  ) %>%
  ungroup()

## Get the tallest and shortest non-human character.
## species != "Human" evaluates to NA for characters with no recorded species,
## and filter() drops those rows. arrange() places missing heights last in
## both directions, so the first row of each result is the answer.
# Shortest first
starwars %>%
  dplyr::filter(species != "Human") %>%
  arrange(height)
# Tallest first
starwars %>%
  dplyr::filter(species != "Human") %>%
  arrange(desc(height))

## How many droids were in Attack of the Clones?
## films is a list-column: unnest() gives one row per character/film pair,
## which can then be counted per species and film.
starwars %>%
  unnest(films) %>%
  count(species, films) %>%
  dplyr::filter(species == "Droid", films == "Attack of the Clones")
######################################
#### Mutate ####
# Add a new column holding the ratio of height to mass
starwars %>%
  mutate(hm_ratio = height / mass)

######################################
#### join ####
# Two small tibbles that both have a "category" and an "id" column
df1 <- tibble(
  category = c("a", "a", "b", "c", "c", "a", "b"),
  id = as.numeric(1:7)
)
df2 <- tibble(
  category = c("b", "a", "b", "a", "a", "c", "c"),
  id = as.numeric(7:1)
)
# Join the two tibbles by the id value: every row of df1 is kept and paired
# with the df2 row that has the same id.
left_join(df1, df2, by = "id")
#################################################################
#### Gather and Spread ####
###########################
# A dataset of the kind you might come across: several devices taking
# measurements on the same days. Ten consecutive dates, with one column of
# random readings (rounded to 2 decimals) per device.
sst_data <- data.frame(
  time = as.Date("2009-01-01") + 0:9,
  unit_A = round(runif(n = 10, min = 6, max = 9), digits = 2),
  unit_B = round(runif(n = 10, min = 7, max = 10), digits = 2),
  unit_C = round(runif(n = 10, min = 6, max = 10), digits = 2)
)

# gather() takes the data frame from wide to LONG format (the format typically
# used by ggplot2): every column except time is stacked into a "unit" column
# holding the old column name and an "sst" column holding the value.
sst_data %>%
  gather(key = "unit", value = "sst", -time)

# spread() is the opposite of gather() and goes from LONG back to WIDE format.
sst_data %>%
  gather(key = "unit", value = "sst", -time) %>%
  spread(key = unit, value = sst)
###############################
#### Nesting ####
## Nest the unit and sst columns into a list-column called "data", giving one
## small table for every value of the time column.
## The columns to nest are passed as a named argument: the unnamed form
## nest(unit, sst) is deprecated since tidyr 1.0.0 and triggers a warning.
nested_sst <- sst_data %>%
  gather(unit, sst, -time) %>%
  nest(data = c(unit, sst))
## View the nested table that belongs to one specific row (here the first)
nested_sst$data[[1]]
###############################
#### Tasks 2 ####
#################################
## A tibble with star wars films ranked (by my preference)
film.rankings <- tibble(
  films = c(
    "The Phantom Menace", "Attack of the Clones", "Revenge of the Sith",
    "A New Hope", "The Empire Strikes Back", "Return of the Jedi",
    "The Force Awakens"
  ),
  rank = c(7, 6, 5, 3, 4, 1, 2)
)

## What films does the character Plo Koon appear in, ordered by rank? The
## final result should be a single table drawn from a list column.
## The character is picked out first so that only their rows are unnested,
## joined and sorted. The films/rank pairs are then nested into a list-column
## called "data"; the columns are given as a named argument because the
## unnamed nest(films, rank) form is deprecated since tidyr 1.0.0.
Plo.Koon <- starwars %>%
  dplyr::filter(name == "Plo Koon") %>%
  dplyr::select(name, films) %>%
  unnest(films) %>%
  left_join(film.rankings, by = "films") %>%
  arrange(rank) %>%
  nest(data = c(films, rank))

## Create a table that nests height and mass data for only humans in a column
## nested by film name: one row per film (and species), with the name, height
## and mass of the humans in that film held in the "data" list-column.
nested.mass.height <- starwars %>%
  unnest(films) %>%
  dplyr::select(name, height, mass, films, species) %>%
  dplyr::filter(species == "Human") %>%
  nest(data = c(name, height, mass))
################################################################################
#### Final task - functional programming
## Create an object that is a list of vectors
X <- list(seq(1, 50, 5), seq(40, 100, 2))
## base R: apply mean() to every element, returning a list
lapply(X, mean)
## tidyverse (purrr) equivalent
map(X, mean)

## Helper: name of the character with the largest value in `column`.
## df     - a table with a "name" column and the column named by `column`
## column - name (string) of the numeric column to maximise
## Returns a single string, or NA when every value in `column` is missing
## (which.max() returns a zero-length result in that case).
name_of_max <- function(df, column) {
  idx <- which.max(df[[column]])
  if (length(idx) == 0) {
    return(NA_character_)
  }
  df$name[[idx]]
}

### This shows how you can use map to run functions over lists to get
## interesting model results for large arrays of data. Very helpful for
## summarising data quickly.
## For every film: fit mass ~ height on the nested table, then pull out the
## slope, the R squared, the number of rows and the tallest / heaviest
## character.
nested.mass.height %>%
  mutate(
    model = map(data, ~ lm(mass ~ height, data = .x)),
    slope = map_dbl(model, ~ coef(.x)["height"]),
    # glance() lives in broom, which is installed with the tidyverse but NOT
    # attached by library(tidyverse), so it needs its namespace prefix.
    r.sq = map_dbl(model, ~ broom::glance(.x)$r.squared),
    sample_size = map_dbl(data, ~ nrow(.x)),
    # Extract the names directly as strings; this avoids unnesting two
    # one-column tables that would both be called "name".
    tallest = map_chr(data, ~ name_of_max(.x, "height")),
    heaviest = map_chr(data, ~ name_of_max(.x, "mass"))
  )
#########################################################################
#### ggplot2 link ####
# Scatter plot of height against mass for male humans, with a linear-model
# trend line on top of the points.
# From dplyr 1.0.0 onwards the starwars gender column holds
# "masculine"/"feminine" instead of "male"/"female", so both spellings are
# accepted here; matching on "male" alone leaves no rows to plot with current
# dplyr versions.
starwars %>%
  dplyr::filter(species == "Human", gender %in% c("male", "masculine")) %>%
  ggplot(aes(x = mass, y = height)) +
  geom_point() +
  geom_smooth(method = lm)
5.2 See also
- The Software Carpentry Introduction to R and RStudio.
- An Introduction to R by the Ocean Biogeographic Information System (OBIS).
- Getting Started in R — Tidyverse Edition by Saghir Bashir.