#loading dock
library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(describedata)
library(ggplot2)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(devtools)
## Loading required package: usethis
library(skimr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Download the TidyTuesday bundle for 2020-03-31 (week 14 of 2020) once.
# The same release was previously fetched a second time via
# tt_load(2020, week = 14); that call re-downloaded the identical four files
# and overwrote `tuesdata` with the same content, so it has been dropped.
tuesdata <- tidytuesdayR::tt_load("2020-03-31")
## --- Compiling #TidyTuesday Information for 2020-03-31 ----
## --- There are 4 files available ---
## --- Starting Download ---
##
## Downloading file 1 of 4: `beer_states.csv`
## Downloading file 2 of 4: `beer_taxed.csv`
## Downloading file 3 of 4: `brewer_size.csv`
## Downloading file 4 of 4: `brewing_materials.csv`
## --- Download complete ---

# Pull the individual tables out of the downloaded bundle.
brewing_materials <- tuesdata$brewing_materials
beer_states <- tuesdata$beer_states
brewer_size <- tuesdata$brewer_size
Define your research question below. What about the data interests you? What is a specific question you want to find out about the data?
The data we are looking at has to do with beer production. Our research question is: which months have the highest production of beer? Are there months with noticeably higher production than others, and in particular, is beer production higher in the summer months? We will answer this by looking at the barrels produced per month. This data interests us because we like beer!
Given your question, what is your expectation about the data?
We expect that the summer months will have a higher production.
Load the data below and use `dplyr::glimpse()` or `skimr::skim()` on the data. You should upload the data file into the `data` directory.
# Take the monthly taxed-beer table from the download and list its columns
# and their types.
beer_taxed <- tuesdata$beer_taxed
dplyr::glimpse(beer_taxed)
## Rows: 1,580
## Columns: 10
## $ data_type <chr> "Barrels Produced", "Barrels Produced", "Barrels Prod~
## $ tax_status <chr> "Totals", "Taxable", "Taxable", "Taxable", "Sub Total~
## $ year <dbl> 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008,~
## $ month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,~
## $ type <chr> "Production", "In bottles and cans", "In barrels and ~
## $ month_current <dbl> 16211480, 13222104, 1371239, 7426, 14600769, 262985, ~
## $ month_prior_year <dbl> 15880125, 12824278, 1357372, 8419, 14190069, 268473, ~
## $ ytd_current <dbl> 16211480, 13222104, 1371239, 7426, 14600769, 262985, ~
## $ ytd_prior_year <dbl> 15880125, 12824278, 1357372, 8419, 14190069, 268473, ~
## $ tax_rate <chr> "$7/$18 per barrel", "$7/$18 per barrel", "$7/$18 per~
# Per-column summary of the raw table: missing counts, ranges and histograms.
skimr::skim(beer_taxed)
Name | beer_taxed |
Number of rows | 1580 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
character | 4 |
numeric | 6 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
data_type | 0 | 1 | 16 | 16 | 0 | 1 | 0 |
tax_status | 0 | 1 | 6 | 18 | 0 | 5 | 0 |
type | 0 | 1 | 7 | 28 | 0 | 12 | 0 |
tax_rate | 0 | 1 | 17 | 20 | 0 | 2 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1.00 | 2013.49 | 3.45 | 2008 | 2010.0 | 2013 | 2016 | 2019 | ▇▅▅▅▇ |
month | 0 | 1.00 | 6.51 | 3.45 | 1 | 4.0 | 7 | 10 | 12 | ▇▅▅▅▇ |
month_current | 0 | 1.00 | 6544097.78 | 6904094.42 | 0 | 111379.8 | 1420096 | 13745839 | 19131217 | ▇▁▂▃▂ |
month_prior_year | 0 | 1.00 | 6597077.26 | 6964634.60 | 0 | 113821.0 | 1454573 | 13826875 | 18667697 | ▇▁▁▃▂ |
ytd_current | 24 | 0.98 | 42394946.37 | 56287222.56 | 0 | 417666.8 | 8321667 | 79415265 | 199466287 | ▇▂▁▁▁ |
ytd_prior_year | 24 | 0.98 | 42804142.80 | 56840130.66 | 0 | 381690.0 | 8489038 | 79866082 | 199618070 | ▇▂▁▁▁ |
If there are any quirks that you have to deal with (for example, `NA` coded as something else, or the data being spread across multiple tables), please make some notes here about what you need to do before you start transforming the data in the next section.
Make sure your data types are correct!
If the data needs to be transformed in any way (values recoded, pivoted, etc.), do it here. Examples include transforming a continuous variable into a categorical one using `case_when()`, etc.
#Mia did this section :)
# Express month_current in thousands of barrels so the totals are easier to
# read.
beer_transformed <- beer_taxed %>%
  mutate(month_current = month_current / 1000)

# Add up month_current (now in thousands) for each calendar month, pooling
# every year and every row of the table.
# NOTE(review): glimpse() shows tax_status values such as "Totals" and
# "Sub Total ..." next to their component rows, and `type` has 12 distinct
# categories ("Production", "In bottles and cans", ...). Summing all of them
# mixes production with the other categories and counts subtotals on top of
# their parts. If the intent is barrels *produced*, consider adding
# filter(type == "Production") first -- TODO confirm; the size-band thresholds
# used later would need to be revisited if this changes.
beer_transformed_2 <- beer_transformed %>%
  group_by(month) %>%
  summarize(sum_barrel_per_month = sum(month_current))

# Attach the English month name for each month number using base R's
# `month.name` constant. match() returns NA for anything that is not exactly
# 1..12, so unexpected values become NA just as they did with the previous
# twelve-branch case_when().
beer_transformed_3 <- beer_transformed_2 %>%
  mutate(month_wnames = month.name[match(month, seq_along(month.name))])

beer_transformed_3
## # A tibble: 12 x 3
## month sum_barrel_per_month month_wnames
## <dbl> <dbl> <chr>
## 1 1 788530. January
## 2 2 773915. February
## 3 3 898996. March
## 4 4 882658. April
## 5 5 941987. May
## 6 6 970459. June
## 7 7 926359. July
## 8 8 917067. August
## 9 9 874489. September
## 10 10 829143. October
## 11 11 757803. November
## 12 12 778268. December
# Add two categorical columns to the month-level table:
#   seasons       - season each month belongs to (Mar-May = Spring, ...)
#   barrel_amount - size band of the monthly total (values are in thousands)
# case_when() checks its conditions in order, so the middle band only needs an
# upper bound; the last band keeps an explicit test so missing totals stay NA.
beer_transformed_4 <- beer_transformed_3 %>%
  mutate(
    seasons = case_when(
      month_wnames %in% c("March", "April", "May") ~ "Spring",
      month_wnames %in% c("June", "July", "August") ~ "Summer",
      month_wnames %in% c("September", "October", "November") ~ "Fall",
      month_wnames %in% c("December", "January", "February") ~ "Winter"
    ),
    barrel_amount = case_when(
      sum_barrel_per_month < 800000 ~ "Less than 800,000e^3",
      sum_barrel_per_month < 900000 ~ "800,000e^3 - 900,000e^3",
      sum_barrel_per_month >= 900000 ~ "More than 900000e^3"
    )
  )

beer_transformed_4
## # A tibble: 12 x 5
## month sum_barrel_per_month month_wnames seasons barrel_amount
## <dbl> <dbl> <chr> <chr> <chr>
## 1 1 788530. January Winter Less than 800,000e^3
## 2 2 773915. February Winter Less than 800,000e^3
## 3 3 898996. March Spring 800,000e^3 - 900,000e^3
## 4 4 882658. April Spring 800,000e^3 - 900,000e^3
## 5 5 941987. May Spring More than 900000e^3
## 6 6 970459. June Summer More than 900000e^3
## 7 7 926359. July Summer More than 900000e^3
## 8 8 917067. August Summer More than 900000e^3
## 9 9 874489. September Fall 800,000e^3 - 900,000e^3
## 10 10 829143. October Fall 800,000e^3 - 900,000e^3
## 11 11 757803. November Fall Less than 800,000e^3
## 12 12 778268. December Winter Less than 800,000e^3
# Collapse the twelve monthly totals into one row per season, keeping the
# total, the mean and the standard deviation of the monthly totals in it.
seasons_only <- beer_transformed_4 %>%
  group_by(seasons) %>%
  summarise(
    season_total = sum(sum_barrel_per_month),
    seasons_mean = mean(sum_barrel_per_month),
    season_sd = sd(sum_barrel_per_month)
  )

seasons_only
## # A tibble: 4 x 4
## seasons season_total seasons_mean season_sd
## <chr> <dbl> <dbl> <dbl>
## 1 Fall 2461436. 820479. 58824.
## 2 Spring 2723640. 907880. 30646.
## 3 Summer 2813885. 937962. 28525.
## 4 Winter 2340713. 780238. 7504.
Bonus points (5 points) for datasets that require merging of tables, but only if you reason through whether you should use `left_join`, `inner_join`, or `right_join` on these tables. No credit will be provided if you don’t.
Show your transformed table here. Use tools such as `glimpse()`, `skim()` or `head()` to illustrate your point.
# Inspect the final month-level table: month number, monthly total, month
# name, season and size band.
dplyr::glimpse(beer_transformed_4)
## Rows: 12
## Columns: 5
## $ month <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
## $ sum_barrel_per_month <dbl> 788529.8, 773915.2, 898995.9, 882657.5, 941986.9,~
## $ month_wnames <chr> "January", "February", "March", "April", "May", "~
## $ seasons <chr> "Winter", "Winter", "Spring", "Spring", "Spring",~
## $ barrel_amount <chr> "Less than 800,000e^3", "Less than 800,000e^3", "~
# Show the season-level summary (only four rows, so head() prints all of it).
utils::head(seasons_only)
## # A tibble: 4 x 4
## seasons season_total seasons_mean season_sd
## <chr> <dbl> <dbl> <dbl>
## 1 Fall 2461436. 820479. 58824.
## 2 Spring 2723640. 907880. 30646.
## 3 Summer 2813885. 937962. 28525.
## 4 Winter 2340713. 780238. 7504.
Are the values what you expected for the variables? Why or Why not?
The values above are pretty close to what we expected. From the seasons data, we can see that the summer season had the highest barrel production, which aligns with our hypothesis. However, we expected the gap between summer production and the other seasons to be larger than it is.
Use `group_by()` and `summarize()` to make a summary of the data here. The summary should be relevant to your research question.
# Mean of month_current for each calendar month, ignoring missing values.
# In the printed result May, June and July have the three largest averages.
# NOTE(review): the mean is taken over every row for the month, i.e. across
# all `type` / `tax_status` categories rather than production rows only --
# confirm this is the intended summary.
beer_taxed %>%
  group_by(month) %>%
  summarise(average_barrel = mean(month_current, na.rm = TRUE))
## # A tibble: 12 x 2
## month average_barrel
## <dbl> <dbl>
## 1 1 6065614.
## 2 2 5907750.
## 3 3 6810575.
## 4 4 6686799.
## 5 5 7136265.
## 6 6 7351964.
## 7 7 7071440.
## 8 8 6947479.
## 9 9 6624919.
## 10 10 6281390.
## 11 11 5740930.
## 12 12 5895973.
What are your findings about the summary? Are they what you expected?
We predicted that the highest number of barrels would be produced during the summer months (June, July & August). This summary only partly supports our hypothesis: June and July are among the highest-production months, but August is not, because May ranks above it.
Make at least two plots that help you answer your question on the transformed or summarized data. Use scales and/or labels to make each plot informative.
#Ashley did this :)
# Store the month names as a factor in calendar order so the x-axis follows
# the calendar instead of being sorted alphabetically. `month.name` is base
# R's January..December vector.
beer_transformed_4$month_wnames <- factor(
  beer_transformed_4$month_wnames,
  levels = month.name
)

# Bar chart of the monthly totals (thousands of barrels).
#   fill    - size band of the month
#   outline - season; Summer is outlined in black, the other seasons in white
# The colours are given as named vectors so each category keeps its colour
# regardless of how the categories happen to sort, and `breaks` lists the size
# bands from smallest to largest so the legend reads in a logical order.
bar_graph <- ggplot(beer_transformed_4) +
  aes(
    x = month_wnames,
    y = sum_barrel_per_month,
    color = seasons,
    fill = barrel_amount
  ) +
  geom_col() +
  ggtitle("Number of Barrels of Beer produced Per Month") +
  xlab("month") +
  ylab("total # barrels *10^3") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(
    values = c(
      "Less than 800,000e^3" = "deeppink3",
      "800,000e^3 - 900,000e^3" = "brown1",
      "More than 900000e^3" = "darksalmon"
    ),
    breaks = c(
      "Less than 800,000e^3",
      "800,000e^3 - 900,000e^3",
      "More than 900000e^3"
    )
  ) +
  scale_color_manual(
    values = c(
      Fall = "white",
      Spring = "white",
      Summer = "black",
      Winter = "white"
    )
  ) +
  # Stagger the month labels over two rows so they do not overlap.
  scale_x_discrete(guide = guide_axis(n.dodge = 2))

bar_graph
# Monthly totals (thousands of barrels) drawn as points joined by a line.
# The x variable is discrete, so every month would otherwise form its own
# group and no line would be drawn; group = 1 puts all twelve months in one
# group so they are connected in axis order.
line_plot <- ggplot(beer_transformed_4) +
  aes(
    x = month_wnames,
    y = sum_barrel_per_month
  ) +
  geom_line(aes(group = 1)) +
  geom_point() +
  ggtitle("Number of Barrels of Beer produced Per Month") +
  xlab("month") +
  ylab("total # barrels *10^3") +
  theme(plot.title = element_text(hjust = 0.5)) +
  # Stagger the month labels over two rows so they do not overlap.
  scale_x_discrete(guide = guide_axis(n.dodge = 2))

line_plot
# Bar chart of the mean monthly total within each season. seasons_mean is the
# average of the monthly totals, which are in thousands of barrels.
# Title, axis labels and a centred title are added so this plot is labelled
# the same way as the two month-level plots; the fill colours are named so
# each season keeps its colour regardless of sort order.
bar_graph3 <- ggplot(seasons_only) +
  aes(
    x = seasons,
    y = seasons_mean,
    fill = seasons
  ) +
  geom_col() +
  ggtitle("Average Number of Barrels of Beer produced Per Month by Season") +
  xlab("season") +
  ylab("mean # barrels per month *10^3") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(
    values = c(
      Fall = "brown1",
      Spring = "deeppink3",
      Summer = "darksalmon",
      Winter = "grey"
    )
  )

bar_graph3
Summarize your research question and findings below.
Our findings suggest that beer barrel production is highest during the summer months. From the plots, we can see a trend of higher production in the warmer months compared to the cooler months.
Are your findings what you expected? Why or Why not?
Yes, these findings are what we expected. We hypothesized that the summer months would have higher production, which you can see from the months plot and the seasons plot.