### Davis R Users Group: Teaching R Live Code
### https://tinyurl.com/teachingR     # link to dropbox version
### https://tinyurl.com/teachingRnew  # link to raw script
### FYI: This is a frequently updated R script used for "live-coding" while 
### teaching. Contents may be removed/altered/changed and saved to the 
### Davis R Users Group website: https://d-rug.github.io/ 

################################################################################
## Manipulating, analyzing, and exporting data with tidyverse
################################################################################


# Getting started ---------------------------------------------------------

# First let's download a data set called "portal_data_joined.csv"
# We'll put it in our data folder inside of our project directory.
# The data folder is created below if you don't already have one.
# Alternatively, you can download the data by hand:
# https://tinyurl.com/y36xgftg

# download.file() stops with an error if the destination folder is missing,
# so make sure "data" exists before downloading into it
if (!dir.exists("data")) {
  dir.create("data")
}

download.file(url = "https://tinyurl.com/y36xgftg",
              destfile = "data/portal_data_joined.csv")

# only install tidyverse when it isn't installed yet, so re-running the
# script doesn't reinstall the package every time
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)


# read the downloaded csv and take a first look at its structure
surveys <- read_csv("data/portal_data_joined.csv")
str(surveys)
# View() opens a data viewer window, which only makes sense in an
# interactive session (e.g. RStudio); skip it when the script is sourced
# non-interactively
if (interactive()) {
  View(surveys)
}


# selecting columns and filtering rows ------------------------------------

# select() picks columns: first argument is the data frame, the rest are
# the columns to keep
select(surveys, c(plot_id, species_id, weight))
# a minus sign drops columns instead of keeping them
select(surveys, -c(record_id, species_id))
# check the column names to make sure I didn't have a typo
names(surveys)

# filter() picks rows: keep only the rows where the condition is TRUE
filter(surveys, year == 1995)


# Pipes -------------------------------------------------------------------

# Approach 1: intermediate objects. One step per line, but every step
# leaves an extra object (surveys2) behind.
surveys2 <- filter(surveys, weight < 5)
surveys_sml <- select(surveys2, species_id, sex, weight)

# Approach 2: nested calls. No intermediate objects, but the code has to
# be read from the inside out.
surveys_sml <- select(
  filter(surveys, weight < 5),
  species_id, sex, weight
)

# Approach 3: pipes. %>% passes the result on its left as the first
# argument of the function on its right, so the steps read top to bottom.
# Without an assignment the result is only printed.
surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)

# same pipeline, this time saving the result
surveys_sml <- surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)

# preview the first few rows of the result
head(surveys_sml)

## Challenge 1:
## Using pipes, subset the surveys data to include animals collected
## before 1995 and retain only the columns year, sex, and weight

# keep the three requested columns, then the rows from before 1995;
# the final arrange() is not required by the challenge, it just sorts the
# output with the latest years first
surveys %>%
  select(year, sex, weight) %>%
  filter(year < 1995) %>%
  arrange(desc(year))


# Mutate ------------------------------------------------------------------

# mutate() adds a new column computed from existing ones:
# here weight divided by 1000, stored as weight_kg
surveys %>%
  mutate(weight_kg = weight / 1000)


# a column created in mutate() can be used right away by a later
# expression in the same call (weight_lb is built from weight_kg);
# select() then shows just the three weight columns
surveys %>%
  mutate(
    weight_kg = weight / 1000,
    weight_lb = weight_kg * 2.2
  ) %>%
  select(weight, weight_kg, weight_lb)


# drop the rows with a missing weight first, then convert ...
surveys %>%
  filter(!is.na(weight)) %>%
  mutate(weight_kg = weight / 1000)

# ... or convert first and drop the rows afterwards. The filter only looks
# at weight, which mutate() leaves untouched, so the result is the same.
surveys %>%
  mutate(weight_kg = weight / 1000) %>%
  filter(!is.na(weight))

## Challenge 2:
## Create a new data frame from the surveys data that meets the following
## criteria: contains only the species_id column and a new column called
## hindfoot_half containing values that are half the hindfoot_length values.
## In this hindfoot_half column, there are no NAs and all values are less
## than 30. 
## Hint: think about how the commands should be ordered to produce this 
## data frame!

# Solution A: one condition per filter() step.
# hindfoot_half has to be created before it can be filtered on, and
# select() comes last because it drops hindfoot_length.
surveys_hindfoot_half <- surveys %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  filter(!is.na(hindfoot_half)) %>%
  filter(hindfoot_half < 30) %>%
  select(species_id, hindfoot_half)

# Solution B: same result, with both conditions given to a single
# filter() call, separated by a comma.
surveys_hindfoot_half <- surveys %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  filter(
    !is.na(hindfoot_half),
    hindfoot_half < 30
  ) %>%
  select(species_id, hindfoot_half)

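# The comma in filter() signifies "and": a row is kept only if every
# condition is TRUE. If you list two alternative values for one single
# column (e.g. year == 1995, year == 1996), it won't work: no row can match
# both, so nothing is returned. In that case, you need to combine the
# conditions with "or" (|) instead, e.g. year == 1995 | year == 1996.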

# how to check if dplyr did what we wanted it to do
# make a histogram
hist(surveys_hindfoot_half$hindfoot_half)
# use the summary function
summary(surveys_hindfoot_half)
# fancier:
# skimr package
# only install skimr when it isn't installed yet, so re-running the script
# doesn't reinstall the package every time
if (!requireNamespace("skimr", quietly = TRUE)) {
  install.packages("skimr")
}
library(skimr)
skim(surveys_hindfoot_half)