################################################################################
## Intro to plotting with ggplot2: building plots iteratively 2020-02-06
################################################################################


# Getting started ---------------------------------------------------------

# First let's download a data set called "portal_data_joined.csv"
# We'll put it in our data folder inside of our project directory.
# If you don't already have a data directory, you can create one in 
# the files pane. Alternatively, you can download the data by hand:
# https://tinyurl.com/y36xgftg

# download.file() does not create missing folders, so make sure the data
# directory exists before writing into it.
if (!dir.exists("data")) {
  dir.create("data")
}

# Only fetch the file when there is no local copy yet (e.g. if you already
# downloaded it by hand), so re-running the script does not hit the network.
if (!file.exists("data/portal_data_joined.csv")) {
  download.file(url = "https://tinyurl.com/y36xgftg",
                destfile = "data/portal_data_joined.csv")
}

# Installing a package is a one-time step; skip it when tidyverse is
# already available so the script can be re-run quickly.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# Read the data and take a first look at its structure.
surveys <- read_csv("data/portal_data_joined.csv")
str(surveys)

# View() opens a data viewer, which only makes sense in an interactive session.
if (interactive()) {
  View(surveys)
}


# ggplot! -----------------------------------------------------------------

# With only a data set, ggplot() draws an empty canvas: no axes are mapped
# and no layers have been added yet.
ggplot(surveys)

# Map weight to x and hindfoot_length to y, then add a point layer to get
# a scatter plot.
ggplot(surveys, aes(x = weight, y = hindfoot_length)) + geom_point()

# ggplot2 automatically drops rows with missing values (NAs) before
# plotting and reports how many were removed:
# Warning message:
# Removed 4048 rows containing missing values (geom_point). 

# dplyr provides filter() and the %>% pipe. (It is also attached by
# library(tidyverse), so this call is harmless if tidyverse is loaded.)
library(dplyr)
# Let's see the NAs: keep only the rows where weight is missing
survey_weight_nas <- surveys %>%
  filter(is.na(weight))

# Let's filter the NAs: keep only the rows that have a value for weight,
# hindfoot_length and sex
surveys_complete <- surveys %>%
  filter(!is.na(weight),
         !is.na(hindfoot_length),
         !is.na(sex))

# now we no longer get a warning message because we filtered
# all of the NAs out of our data.
ggplot(data = surveys_complete, 
       mapping = aes(x = weight, y = hindfoot_length)) +
  geom_point()

# let's continue plotting with surveys

# Store the data + aesthetic mapping once; every plot below adds layers to
# this same base. Note that surveys still contains rows with NAs, so
# ggplot2 will warn about removed rows again.
surveys_plot <- ggplot(surveys, aes(x = weight, y = hindfoot_length))

# plain scatter plot
surveys_plot +
  geom_point()

# let's build on our plot iteratively to make it better

# 1. make the points transparent to reduce overplotting
surveys_plot +
  geom_point(alpha = 0.1)

# 2. give every point the same fixed color
surveys_plot +
  geom_point(color = "blue", alpha = 0.1)

# 3. color the points by a column in the data (goes inside aes())
surveys_plot +
  geom_point(aes(color = species_id), alpha = 0.1)

# 4. swap the default colors for a ColorBrewer palette
surveys_plot +
  geom_point(aes(color = species_id), alpha = 0.1) +
  scale_color_brewer(palette = "Paired")

# A palette with 20 colors is "Tableau 20". 
# It is available through scale_color_tableau() in the ggthemes package.

# to add specific colors for your data, add a layer
# scale_color_manual() takes a named vector: each name is a level of the
# column mapped to color, each value is the color to use for that level.
ggplot(iris, aes(x = Sepal.Width, y = Sepal.Length, color = Species)) +
  geom_point() +
  scale_color_manual(
    values = c(
      setosa = "red",
      virginica = "blue",
      versicolor = "green"
    )
  )

# Boxplot of weight for each species
ggplot(surveys, aes(x = species_id, y = weight)) +
  geom_boxplot()

# Add the raw measurements on top of the boxes as jittered points
ggplot(surveys, aes(x = species_id, y = weight)) +
  geom_boxplot() +
  geom_jitter(alpha = .3, color = "tomato")

# Layers are drawn in the order they are added: putting the jitter layer
# first draws the boxes on top of the points. geom_boxplot needs its
# parentheses -- without them we add the function itself instead of a
# layer, and ggplot2 stops with an error.
ggplot(surveys, aes(x = species_id, y = weight)) +
  geom_jitter(alpha = .1, color = "tomato") +
  geom_boxplot()

# A violin plot shows the shape of each distribution instead of a box
ggplot(surveys, aes(x = species_id, y = weight)) +
  geom_violin()


# Challenges --------------------------------------------------------------

## Challenge (optional)
## Scatter plots can be useful exploratory tools for small datasets. 
## For data sets with large numbers of observations, such as the 
## surveys_complete data set, overplotting of points can be a limitation 
## of scatter plots. One strategy for handling such settings is to use 
## hexagonal binning of observations. The plot space is tessellated into 
## hexagons. Each hexagon is assigned a color based on the number of 
## observations that fall within its boundaries. To use hexagonal binning with 
## ggplot2, first install the R package hexbin from CRAN:

# Install hexbin only if it is not available yet, so re-running the script
# does not reinstall the package every time.
if (!requireNamespace("hexbin", quietly = TRUE)) {
  install.packages("hexbin")
}
library(hexbin)
## Then use the geom_hex() function:
surveys_plot +
  geom_hex()

## What are the relative strengths and weaknesses of a hexagonal bin plot 
## compared to a scatter plot? Examine the above scatter plot and compare
## it with the hexagonal bin plot that you created.

## Challenge
## Use what you just learned to create a scatter plot of weight over 
## species_id with the plot types showing in different colors. Is this a good 
## way to show this type of data? 


## Challenges
# Boxplots are useful summaries, but hide the shape of the distribution.
# For example, if the distribution is bimodal, we would not see it in a boxplot.
# An alternative to the boxplot is the violin plot, where the shape 
# (of the density of points) is drawn.

# 1. a. Replace the box plot with a violin plot; see geom_violin().

# 2. In many types of data, it is important to consider the scale of the 
#    observations. For example, it may be worth changing the scale of the 
#    axis to better distribute the observations in the space of the plot. 
#    Changing the scale of the axes is done similarly to adding/modifying 
#    other components (i.e., by incrementally adding commands). Try making 
#    these modifications:
#  
#   a. Represent weight on the log10 scale; see scale_y_log10().
#
# 3. So far, we’ve looked at the distribution of weight within species. 
#    Try making a new plot to explore the distribution of another variable
#    within each species.
#   a. Create a boxplot for hindfoot_length. Overlay the boxplot layer on a 
#      jitter layer to show actual measurements.
#   b. Add color to the data points on your boxplot according to the plot from 
#      which the sample was taken (plot_id).
# 
# Hint: Check the class for plot_id. Consider changing the class of plot_id 
# from integer to factor. Why does this change how R makes the graph?