Midterm Project

#loading dock
library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(describedata)
library(ggplot2)
library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(devtools)

## Loading required package: usethis

library(skimr)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# Download the #TidyTuesday release dated 2020-03-31 (requested by date).
tuesdata <- tidytuesdayR::tt_load("2020-03-31")

## --- Compiling #TidyTuesday Information for 2020-03-31 ----

## --- There are 4 files available ---

## --- Starting Download ---

## 
##  Downloading file 1 of 4: `beer_states.csv`
##  Downloading file 2 of 4: `beer_taxed.csv`
##  Downloading file 3 of 4: `brewer_size.csv`
##  Downloading file 4 of 4: `brewing_materials.csv`

## --- Download complete ---

# Request the release again, this time by year and week number. The download
# log for this call reports 2020-03-31 and the same four file names as the
# date-based call, and the result overwrites tuesdata.
# NOTE(review): one of the two tt_load() calls looks redundant -- confirm and
# keep only one to avoid downloading twice.
tuesdata <- tidytuesdayR::tt_load(2020, week = 14)

## --- Compiling #TidyTuesday Information for 2020-03-31 ----

## --- There are 4 files available ---

## --- Starting Download ---

## 
##  Downloading file 1 of 4: `beer_states.csv`
##  Downloading file 2 of 4: `beer_taxed.csv`
##  Downloading file 3 of 4: `brewer_size.csv`
##  Downloading file 4 of 4: `brewing_materials.csv`

## --- Download complete ---

# Pull individual tables out of the downloaded bundle into their own objects.
beer_states <- tuesdata$beer_states
brewer_size <- tuesdata$brewer_size
brewing_materials <- tuesdata$brewing_materials

Define Your Research Question (10 points)

Define your research question below. What about the data interests you? What is a specific question you want to find out about the data?

The data we are looking at concern beer production. Our research question is: which months have the highest beer production, and is production higher in the summer months? We will answer this by looking at the barrels produced per month. These data interest us because we like beer!

Given your question, what is your expectation about the data?

We expect that the summer months will have a higher production.

Loading the Data (10 points)

Load the data below and use dplyr::glimpse() or skimr::skim() on the data. You should upload the data file into the data directory.

# Take the beer_taxed table from the download, then preview its columns and
# their types.
beer_taxed <- tuesdata$beer_taxed
dplyr::glimpse(beer_taxed)

## Rows: 1,580
## Columns: 10
## $ data_type        <chr> "Barrels Produced", "Barrels Produced", "Barrels Prod~
## $ tax_status       <chr> "Totals", "Taxable", "Taxable", "Taxable", "Sub Total~
## $ year             <dbl> 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008,~
## $ month            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,~
## $ type             <chr> "Production", "In bottles and cans", "In barrels and ~
## $ month_current    <dbl> 16211480, 13222104, 1371239, 7426, 14600769, 262985, ~
## $ month_prior_year <dbl> 15880125, 12824278, 1357372, 8419, 14190069, 268473, ~
## $ ytd_current      <dbl> 16211480, 13222104, 1371239, 7426, 14600769, 262985, ~
## $ ytd_prior_year   <dbl> 15880125, 12824278, 1357372, 8419, 14190069, 268473, ~
## $ tax_rate         <chr> "$7/$18 per barrel", "$7/$18 per barrel", "$7/$18 per~

# Per-column summary (missingness, unique counts, distribution) of beer_taxed.
skimr::skim(beer_taxed)

Data summary
Name	beer_taxed
Number of rows	1580
Number of columns	10
_______________________
Column type frequency:
character	4
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
data_type	1	16	16	1
tax_status	1	6	18	5
type	1	7	28	12
tax_rate	1	17	20	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.49	3.45	2008	2010.0	2013	2016	2019	▇▅▅▅▇
month	0	1.00	6.51	3.45	1	4.0	7	10	12	▇▅▅▅▇
month_current	0	1.00	6544097.78	6904094.42	0	111379.8	1420096	13745839	19131217	▇▁▂▃▂
month_prior_year	0	1.00	6597077.26	6964634.60	0	113821.0	1454573	13826875	18667697	▇▁▁▃▂
ytd_current	24	0.98	42394946.37	56287222.56	0	417666.8	8321667	79415265	199466287	▇▂▁▁▁
ytd_prior_year	24	0.98	42804142.80	56840130.66	0	381690.0	8489038	79866082	199618070	▇▂▁▁▁

If there are any quirks that you have to deal with, such as NA coded as something else or data spread across multiple tables, please make some notes here about what you need to do before you start transforming the data in the next section.

Make sure your data types are correct!

Transforming the data (15 points)

If the data needs to be transformed in any way (values recoded, pivoted, etc), do it here. Examples include transforming a continuous variable into a categorical using case_when(), etc.

# Mia did this section :)

# Express month_current in thousands of barrels so the totals are easier to
# read.
beer_transformed <- mutate(beer_taxed, month_current = month_current / 1000)

# Collapse to one row per calendar month: sum_barrel_per_month adds up every
# month_current value recorded for that month number.
# NOTE(review): no rows are filtered out first, and glimpse()/skim() show
# tax_status values such as "Totals" and "Sub Total..." plus 12 distinct
# `type` values, so this sum probably counts the same barrels more than once.
# Confirm whether only the type == "Production" rows were intended.
beer_transformed_2 <- summarize(
  group_by(beer_transformed, month),
  sum_barrel_per_month = sum(month_current)
)

# Attach the English month name. month.name is base R's January..December
# vector; match() returns NA for anything that is not exactly 1-12, so such
# rows would get NA, just as a case_when() without a default would give.
beer_transformed_3 <- beer_transformed_2 %>%
  mutate(month_wnames = month.name[match(month, 1:12)])
beer_transformed_3

## # A tibble: 12 x 3
##    month sum_barrel_per_month month_wnames
##    <dbl>                <dbl> <chr>       
##  1     1              788530. January     
##  2     2              773915. February    
##  3     3              898996. March       
##  4     4              882658. April       
##  5     5              941987. May         
##  6     6              970459. June        
##  7     7              926359. July        
##  8     8              917067. August      
##  9     9              874489. September   
## 10    10              829143. October     
## 11    11              757803. November    
## 12    12              778268. December

# Add two categorical columns to the monthly table:
#   seasons       - the season each month name belongs to
#   barrel_amount - which of three size bands the monthly total falls in
# case_when() takes the first matching condition, so the middle band only
# needs its upper bound: anything below 800000 was already caught above it.
# NOTE(review): the band labels use "e^3" and one lacks a thousands comma;
# they are kept verbatim here because other code may match on them.
beer_transformed_4 <- beer_transformed_3 %>%
  mutate(
    seasons = case_when(
      month_wnames %in% c("March", "April", "May") ~ "Spring",
      month_wnames %in% c("June", "July", "August") ~ "Summer",
      month_wnames %in% c("September", "October", "November") ~ "Fall",
      month_wnames %in% c("December", "January", "February") ~ "Winter"
    ),
    barrel_amount = case_when(
      sum_barrel_per_month < 800000 ~ "Less than 800,000e^3",
      sum_barrel_per_month < 900000 ~ "800,000e^3 - 900,000e^3",
      sum_barrel_per_month >= 900000 ~ "More than 900000e^3"
    )
  )
beer_transformed_4

## # A tibble: 12 x 5
##    month sum_barrel_per_month month_wnames seasons barrel_amount          
##    <dbl>                <dbl> <chr>        <chr>   <chr>                  
##  1     1              788530. January      Winter  Less than 800,000e^3   
##  2     2              773915. February     Winter  Less than 800,000e^3   
##  3     3              898996. March        Spring  800,000e^3 - 900,000e^3
##  4     4              882658. April        Spring  800,000e^3 - 900,000e^3
##  5     5              941987. May          Spring  More than 900000e^3    
##  6     6              970459. June         Summer  More than 900000e^3    
##  7     7              926359. July         Summer  More than 900000e^3    
##  8     8              917067. August       Summer  More than 900000e^3    
##  9     9              874489. September    Fall    800,000e^3 - 900,000e^3
## 10    10              829143. October      Fall    800,000e^3 - 900,000e^3
## 11    11              757803. November     Fall    Less than 800,000e^3   
## 12    12              778268. December     Winter  Less than 800,000e^3

# Roll the monthly table up to one row per season. Each statistic is taken
# over the monthly totals (sum_barrel_per_month) of the months in that season:
# their sum, their mean, and their standard deviation.
seasons_only <- summarize(
  group_by(beer_transformed_4, seasons),
  season_total = sum(sum_barrel_per_month),
  seasons_mean = mean(sum_barrel_per_month),
  season_sd = sd(sum_barrel_per_month)
)

seasons_only

## # A tibble: 4 x 4
##   seasons season_total seasons_mean season_sd
##   <chr>          <dbl>        <dbl>     <dbl>
## 1 Fall        2461436.      820479.    58824.
## 2 Spring      2723640.      907880.    30646.
## 3 Summer      2813885.      937962.    28525.
## 4 Winter      2340713.      780238.     7504.

Bonus points (5 points) for datasets that require merging of tables, but only if you reason through whether you should use left_join, inner_join, or right_join on these tables. No credit will be provided if you don’t.

Show your transformed table here. Use tools such as glimpse(), skim() or head() to illustrate your point.

# Inspect the columns and types of the transformed monthly table.
dplyr::glimpse(beer_transformed_4)

## Rows: 12
## Columns: 5
## $ month                <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
## $ sum_barrel_per_month <dbl> 788529.8, 773915.2, 898995.9, 882657.5, 941986.9,~
## $ month_wnames         <chr> "January", "February", "March", "April", "May", "~
## $ seasons              <chr> "Winter", "Winter", "Spring", "Spring", "Spring",~
## $ barrel_amount        <chr> "Less than 800,000e^3", "Less than 800,000e^3", "~

# Show the first rows of the per-season summary.
seasons_only %>% head()

## # A tibble: 4 x 4
##   seasons season_total seasons_mean season_sd
##   <chr>          <dbl>        <dbl>     <dbl>
## 1 Fall        2461436.      820479.    58824.
## 2 Spring      2723640.      907880.    30646.
## 3 Summer      2813885.      937962.    28525.
## 4 Winter      2340713.      780238.     7504.

Are the values what you expected for the variables? Why or Why not?

The values above are pretty close to what we expected. From the seasons data, we can see that the summer season had the highest barrel production, which aligns with our hypothesis. However, we expected the gap between summer production and the other seasons to be larger.

Visualizing and Summarizing the Data (15 points)

Use group_by() and summarize() to make a summary of the data here. The summary should be relevant to your research question

# Mean of month_current for each calendar month, ignoring missing values.
# In the result, months 6, 5 and 7 (June, May, July) have the largest means.
summarize(
  group_by(beer_taxed, month),
  average_barrel = mean(month_current, na.rm = TRUE)
)

## # A tibble: 12 x 2
##    month average_barrel
##    <dbl>          <dbl>
##  1     1       6065614.
##  2     2       5907750.
##  3     3       6810575.
##  4     4       6686799.
##  5     5       7136265.
##  6     6       7351964.
##  7     7       7071440.
##  8     8       6947479.
##  9     9       6624919.
## 10    10       6281390.
## 11    11       5740930.
## 12    12       5895973.

What are your findings about the summary? Are they what you expected?

We predicted that the highest number of barrels produced would be during the summer months (June, July & August). This summary only partly supports our hypothesis: the three highest months are June, May and July, so two summer months are at the top, but August ranks fourth, behind May.

Make at least two plots that help you answer your question on the transformed or summarized data. Use scales and/or labels to make each plot informative.

# Ashley did this :)

# Turn the month names into a factor in calendar order so the x-axis runs
# January through December instead of alphabetically. month.name is base R's
# built-in vector of English month names.
beer_transformed_4$month_wnames <- factor(
  beer_transformed_4$month_wnames,
  levels = month.name
)

# Bar plot of the monthly totals. Fill shows the barrel_amount band and the
# bar outline colour shows the season (only Summer gets a black outline).
# Both manual scales are keyed by name, so each colour stays tied to its
# category instead of depending on the alphabetical order of the values.
bar_graph <- ggplot(
  beer_transformed_4,
  aes(
    x = month_wnames,
    y = sum_barrel_per_month,
    color = seasons,
    fill = barrel_amount
  )
) +
  geom_col() +
  labs(
    title = "Number of Barrels of Beer produced Per Month",
    x = "month",
    y = "total # barrels *10^3"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(
    values = c(
      "800,000e^3 - 900,000e^3" = "brown1",
      "Less than 800,000e^3" = "deeppink3",
      "More than 900000e^3" = "darksalmon"
    )
  ) +
  scale_color_manual(
    values = c(
      Fall = "white",
      Spring = "white",
      Summer = "black",
      Winter = "white"
    )
  ) +
  # Stagger the month labels over two rows so they do not overlap.
  scale_x_discrete(guide = guide_axis(n.dodge = 2))

bar_graph

# Line plot of the monthly totals: one point per month, joined by a line so
# the rise and fall across the year is visible.
# group = 1 is required because month_wnames is discrete; without it ggplot2
# puts every month in its own group and geom_line() has nothing to connect.
line_plot <- ggplot(
  beer_transformed_4,
  aes(x = month_wnames, y = sum_barrel_per_month, group = 1)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Number of Barrels of Beer produced Per Month",
    x = "month",
    y = "total # barrels *10^3"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  # Stagger the month labels over two rows so they do not overlap.
  scale_x_discrete(guide = guide_axis(n.dodge = 2))
line_plot

# Bar plot of the per-season mean of the monthly totals (seasons_mean), one
# bar per season. Title and axis labels follow the style of the monthly plots;
# values are in thousands of barrels because month_current was divided by 1000
# before summing. Fill colours are keyed by season name so each colour stays
# tied to its season.
bar_graph3 <- ggplot(
  seasons_only,
  aes(x = seasons, y = seasons_mean, fill = seasons)
) +
  geom_col() +
  labs(
    title = "Mean Monthly Barrels of Beer produced Per Season",
    x = "season",
    y = "mean of monthly totals (# barrels *10^3)"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(
    values = c(
      Fall = "brown1",
      Spring = "deeppink3",
      Summer = "darksalmon",
      Winter = "grey"
    )
  )
bar_graph3

Final Summary (10 points)

Summarize your research question and findings below.

Our findings suggest that beer barrel production is highest during the summer months. From the plots, we can see a trend of higher production in the warmer months compared to the cooler months.

Are your findings what you expected? Why or Why not?

Yes, these findings are what we expected. We hypothesized that the summer months would have higher production, which you can see from the months plot and the seasons plot.