Nobel Prize Winners
Georgios Karamanis gathered and shared fairly detailed data on Nobel prize winners over the years; the data were used in the tidytuesday
series a while back. These data are to be used for the questions that follow.
# Read the TidyTuesday (2019-05-14) Nobel winners CSV directly from GitHub.
nobel_winners <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-14/nobel_winners.csv"
)
| variable | class | description |
|:---|:---|:---|
| prize_year | double | Year that Nobel Prize was awarded |
| category | character | Field of study/category |
| prize | character | Prize Name |
| motivation | character | Motivation of the award |
| prize_share | character | Share eg 1 of 1, 1 of 2, 1 of 4, etc |
| laureate_id | double | ID assigned to each winner |
| laureate_type | character | Individual or organization |
| full_name | character | name of the winner |
| birth_date | double | birth date of winner |
| birth_city | character | birth city/state of winner |
| birth_country | character | birth country of winner |
| gender | character | binary gender of the winner |
| organization_name | character | organization name |
| organization_city | character | organization city |
| organization_country | character | organization country |
| death_date | double | death date of the winner (if dead) |
| death_city | character | death city (if dead) |
| death_country | character | death country (if dead) |
- First create nobel.df, keeping only records from the year 1960 onward and only for the “Physics” category. Now generate an appropriate chart that shows the distribution of winners by birth_country
# Packages used by the code that follows.
library(tidyverse)
library(tidylog)

# Physics prizes awarded in 1960 or later.
nobel.df <- nobel_winners %>%
  filter(category == "Physics", prize_year >= 1960)

# Horizontal bar chart: one bar per birth country, bar length = number of records.
ggplot(data = nobel.df, mapping = aes(x = birth_country)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Bar-chart of Nobel Laureates by Country of Birth",
    x = "Birth Country",
    y = "Number of Nobel Laureates"
  )
- Now break this distribution out by gender to see how winners by country differ across gender
# Counts of records in nobel.df by birth country, with each bar split by gender.
# `fill = gender` already defines the grouping, so no separate `group` aesthetic
# is needed. The title names gender so this chart is not confused with the
# plain by-country bar chart.
ggplot() +
  geom_bar(
    data = nobel.df,
    aes(x = birth_country, fill = gender)
  ) +
  labs(
    x = "Birth Country",
    y = "Number of Nobel Laureates",
    title = "Bar-chart of Nobel Laureates by Country of Birth and Gender"
  ) +
  coord_flip()
- Now go back to nobel_winners, the full data-set, and create a simple plot that shows the distribution of prize winners by death_country, gender, and category
# Keep every record with a known country of death (all years, all categories).
# Note: this replaces the Physics-only nobel.df created earlier.
nobel.df <- nobel_winners %>%
  filter(!is.na(death_country))

# One panel per prize category; bars count records per death country,
# stacked and coloured by gender.
ggplot(
  data = nobel.df,
  mapping = aes(x = death_country, group = gender, fill = gender)
) +
  geom_bar() +
  facet_wrap(~ category) +
  coord_flip() +
  labs(
    title = "Bar-chart of Nobel Laureates by Country of Death, Gender, and Prize category",
    x = "Death Country",
    y = "Number of Nobel Laureates"
  )
Water levels in the Great Lakes
Use the data-set given below. Note that water level is in meters.
library(readxl)

# Download the workbook to a local file (path is relative to the working
# directory), then read it.
url <- "https://aniruhil.github.io/avsr/teaching/dataviz/greatlakes.xlsx"
destfile <- "greatlakes.xlsx"
curl::curl_download(url = url, destfile = destfile)

# Column types: the first column is read as a date, the other five as numeric.
greatlakes <- read_excel(
  path = destfile,
  col_types = c("date", rep("numeric", times = 5))
)
Now use an appropriate chart to show the water level for Lake Superior.
# Line chart of the Superior column against monthyear.
ggplot(data = greatlakes) +
  geom_line(mapping = aes(x = monthyear, y = Superior)) +
  labs(
    title = "Lake Superior's water levels over time",
    x = "Date",
    y = "Water level, in meters"
  )
County Health Rankings
Download the 2017 County Health Rankings data SPSS format from here, Excel format from here and the accompanying codebook.
These data can be downloaded with the code provided below:
library(readxl)

# Download the 2017 County Health Rankings workbook to a local file, then read
# it with read_excel()'s default settings.
url <- "https://aniruhil.github.io/avsr/teaching/dataviz/CountyHealthRankings2017.xlsx"
destfile <- "CountyHealthRankings2017.xlsx"
curl::curl_download(url = url, destfile = destfile)
chr.df <- read_excel(path = destfile)
Construct appropriate plots that show the relationship between the following pairs of variables
- Adult obesity and High school graduation
# Scatterplot of adult obesity (x) against high-school graduation (y).
# NOTE(review): the x-axis is labelled "(%)" but only the y-axis is
# percent-formatted; if Adult_obesity is stored as a proportion, add
# scale_x_continuous(labels = scales::percent) -- confirm against the data.
ggplot(
  data = chr.df,
  aes(x = Adult_obesity, y = High_school_graduation)
) +
  geom_point() +
  labs(x = "Adult Obesity (%)", y = "High School Graduation (%)") +
  # Spell out `labels`; `label` only worked through partial argument matching.
  scale_y_continuous(labels = scales::percent)
- Children in poverty and High school graduation
# Scatterplot of children in poverty (x) against high-school graduation (y).
# NOTE(review): the x-axis is labelled "(%)" but only the y-axis is
# percent-formatted; if Children_in_poverty is stored as a proportion, add
# scale_x_continuous(labels = scales::percent) -- confirm against the data.
ggplot(
  data = chr.df,
  aes(x = Children_in_poverty, y = High_school_graduation)
) +
  geom_point() +
  labs(x = "Children in Poverty (%)", y = "High School Graduation (%)") +
  # Spell out `labels`; `label` only worked through partial argument matching.
  scale_y_continuous(labels = scales::percent)
- Preventable hospital stays and Unemployment rate
# Scatterplot of preventable hospital stays (x) against unemployment rate (y),
# with the y-axis formatted as a percentage.
ggplot(
  data = chr.df,
  aes(x = Preventable_hospital_stays, y = Unemployment_rate)
) +
  geom_point() +
  labs(x = "Preventable Hospital Stays", y = "Unemployment Rate (%)") +
  # Spell out `labels`; `label` only worked through partial argument matching.
  scale_y_continuous(labels = scales::percent)
Unemployment Rates
Use the unemployment data given to you (unemprate.RData)
and construct appropriate plots that show the distribution of unemployment rates across years for each of the four educational attainment groups.
# Load the saved objects from data/unemprate.RData (path built with here::here()).
load(file = here::here("data", "unemprate.RData"))

# Show the names of `urate` (presumably the object restored by load() above).
names(x = urate)
## [1] "yearmonth" "educ_group" "rate"
Be sure to use a unique color for each educational attainment group
# One line per educ_group, each in its own colour; the legend has no title and
# is placed below the plot.
ggplot(data = urate) +
  geom_line(mapping = aes(x = yearmonth, y = rate, color = educ_group)) +
  theme(legend.position = "bottom") +
  labs(
    title = "Unemployment Rate by Educational Attainment",
    x = "Date",
    y = "Unemployment Rate",
    color = ""
  )
---
title: "MPA 5830 - Solutions to Practice Exercises"
subtitle: "Spring 2020"
author: "INSERT YOUR NAME HERE"
date: "Updated on `r Sys.Date()`"
output: 
  html_document: 
    fig_caption: yes
    highlight: zenburn
    number_sections: yes
    theme: flatly
    toc: yes
    toc_float: true
    code_download: true
    code_folding: hide
    self_contained: yes
editor_options: 
  chunk_output_type: console
---

<style type="text/css">

body{ /* Normal  */
/*    font-family: Lato, sans-serif;  
      font-family: Mukta, sans-serif; 
      font-family: 'Nunito Sans', sans-serif;
      font-family: Karla, sans-serif;  */
      font-family: 'Merriweather Sans', sans-serif; 
      font-size: 18px;
  }

h1.title {
  font-size: 38px;
  color: DarkRed;
}

h1 { /* Header 1 */
  font-size: 28px;
  color: DarkBlue;
}

h2 { /* Header 2 */
    font-size: 22px;
  color: DarkBlue;
}

h3 { /* Header 3 */
  font-size: 18px;
  color: DarkBlue;
}

code.r{ /* Code block */
    font-family: Mukta, sans-serif; 
    font-weight: 600;  
    font-size: 18px;
}

/* Disabled rule: Code block - determines code spacing between lines.
   CSS comments do not nest, so the whole rule is kept inside one comment.
pre {
    font-size: 16px;
}
*/
</style>


```{r klippy, echo = FALSE, include = TRUE}
# Configure klippy: tooltip texts, a cornflower-blue colour, top-right position.
klippy::klippy(
  tooltip_message = "Click to copy",
  tooltip_success = "Done",
  color = "cornflowerblue",
  position = c("top", "right")
)
```

```{r setup, include=FALSE}
# Document-wide chunk defaults: show code, suppress warnings and messages,
# no caching, centred 10 x 8 figures at 300 dpi scaled to the full text width.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  dpi = 300,
  cache = FALSE,
  fig.align = "center",
  fig.width = 10,
  fig.height = 8,
  out.width = "100%",
  highlight = TRUE
)
```


# Nobel Prize Winners 
Georgios Karamanis gathered and shared fairly detailed data on Nobel prize winners over the years; the data were used in the `tidytuesday` series a while back. These data are to be used for the questions that follow. 

```{r nobel-winners}
# Read the TidyTuesday (2019-05-14) Nobel winners CSV directly from GitHub.
nobel_winners <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-14/nobel_winners.csv"
)
```

|variable             |class     |description |
|:---|:---|:-----------|
|prize_year           |double    | Year that Nobel Prize was awarded|
|category             |character | Field of study/category|
|prize                |character | Prize Name |
|motivation           |character | Motivation of the award |
|prize_share          |character | Share eg 1 of 1, 1 of 2, 1 of 4, etc |
|laureate_id          |double    | ID assigned to each winner |
|laureate_type        |character | Individual or organization  |
|full_name            |character | name of the winner|
|birth_date           |double    | birth date of winner |
|birth_city           |character | birth city/state of winner |
|birth_country        |character | birth country of winner |
|gender               |character | binary gender of the winner |
|organization_name    |character | organization name |
|organization_city    |character | organization city |
|organization_country |character | organization country |
|death_date           |double    | death date of the winner (if dead) |
|death_city           |character | death city (if dead) |
|death_country        |character | death country (if dead) |


(a) First create `nobel.df` that keeps only records starting in the year 1960, and only for the "Physics" category. Now generate an appropriate chart that shows the distribution of winners by `birth_country` 

```{r q1}
# Packages used by the chunks that follow.
library(tidyverse)
library(tidylog)

# Physics prizes awarded in 1960 or later.
nobel.df <- nobel_winners %>%
  filter(category == "Physics", prize_year >= 1960)

# Horizontal bar chart: one bar per birth country, bar length = number of records.
ggplot(data = nobel.df, mapping = aes(x = birth_country)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Bar-chart of Nobel Laureates by Country of Birth",
    x = "Birth Country",
    y = "Number of Nobel Laureates"
  )
```


(b) Now break this distribution out by `gender` to see how winners by country differ across gender 

```{r q2}
# Counts of records in nobel.df by birth country, with each bar split by gender.
# `fill = gender` already defines the grouping, so no separate `group` aesthetic
# is needed. The title names gender so this chart is not confused with the
# plain by-country bar chart.
ggplot() +
  geom_bar(
    data = nobel.df,
    aes(x = birth_country, fill = gender)
  ) +
  labs(
    x = "Birth Country",
    y = "Number of Nobel Laureates",
    title = "Bar-chart of Nobel Laureates by Country of Birth and Gender"
  ) +
  coord_flip()
```


(c) Now go back to `nobel_winners`, the full data-set, and create a simple plot that shows the distribution of prize winners by `death_country`, `gender`, and `category`  

```{r q3}
# Keep every record with a known country of death (all years, all categories).
# Note: this replaces the Physics-only nobel.df created earlier.
nobel.df <- nobel_winners %>%
  filter(!is.na(death_country))

# One panel per prize category; bars count records per death country,
# stacked and coloured by gender.
ggplot(
  data = nobel.df,
  mapping = aes(x = death_country, group = gender, fill = gender)
) +
  geom_bar() +
  facet_wrap(~ category) +
  coord_flip() +
  labs(
    title = "Bar-chart of Nobel Laureates by Country of Death, Gender, and Prize category",
    x = "Death Country",
    y = "Number of Nobel Laureates"
  )
```


# Water levels in the Great Lakes

Use the data-set given below. *Note that water level is in meters.* 

```{r greatlakes}
library(readxl)

# Download the workbook to a local file (path is relative to the working
# directory), then read it.
url <- "https://aniruhil.github.io/avsr/teaching/dataviz/greatlakes.xlsx"
destfile <- "greatlakes.xlsx"
curl::curl_download(url = url, destfile = destfile)

# Column types: the first column is read as a date, the other five as numeric.
greatlakes <- read_excel(
  path = destfile,
  col_types = c("date", rep("numeric", times = 5))
)
```

Now use an appropriate chart to show the water level for Lake Superior. 

```{r lakes}
# Line chart of the Superior column against monthyear.
ggplot(data = greatlakes) +
  geom_line(mapping = aes(x = monthyear, y = Superior)) +
  labs(
    title = "Lake Superior's water levels over time",
    x = "Date",
    y = "Water level, in meters"
  )
```


# County Health Rankings
Download the 2017 County Health Rankings data [SPSS format from here](https://aniruhil.github.io/avsr/teaching/dataviz/CountyHealthRankings2017.sav), [Excel format from here](https://aniruhil.github.io/avsr/teaching/dataviz/CountyHealthRankings2017.xlsx) and the [accompanying codebook](http://www.countyhealthrankings.org/sites/default/files/2017TrendsDocumentation.pdf). 

These data can be downloaded with the code provided below: 

```{r chr-data}
library(readxl)

# Download the 2017 County Health Rankings workbook to a local file, then read
# it with read_excel()'s default settings.
url <- "https://aniruhil.github.io/avsr/teaching/dataviz/CountyHealthRankings2017.xlsx"
destfile <- "CountyHealthRankings2017.xlsx"
curl::curl_download(url = url, destfile = destfile)
chr.df <- read_excel(path = destfile)
```

Construct appropriate plots that show the relationship between the following pairs of variables 

(a) Adult obesity and High school graduation 

```{r chr1}
# Scatterplot of adult obesity (x) against high-school graduation (y).
# NOTE(review): the x-axis is labelled "(%)" but only the y-axis is
# percent-formatted; if Adult_obesity is stored as a proportion, add
# scale_x_continuous(labels = scales::percent) -- confirm against the data.
ggplot(
  data = chr.df,
  aes(x = Adult_obesity, y = High_school_graduation)
  ) +
  geom_point() +
  labs(x = "Adult Obesity (%)", y = "High School Graduation (%)") +
  # Spell out `labels`; `label` only worked through partial argument matching.
  scale_y_continuous(labels = scales::percent)
```

(b) Children in poverty and High school graduation 

```{r chr2}
# Scatterplot of children in poverty (x) against high-school graduation (y).
# NOTE(review): the x-axis is labelled "(%)" but only the y-axis is
# percent-formatted; if Children_in_poverty is stored as a proportion, add
# scale_x_continuous(labels = scales::percent) -- confirm against the data.
ggplot(
  data = chr.df,
  aes(x = Children_in_poverty, y = High_school_graduation)
  ) +
  geom_point() +
  labs(x = "Children in Poverty (%)", y = "High School Graduation (%)") +
  # Spell out `labels`; `label` only worked through partial argument matching.
  scale_y_continuous(labels = scales::percent)
```

(c) Preventable hospital stays and Unemployment rate 

```{r chr3}
# Scatterplot of preventable hospital stays (x) against unemployment rate (y),
# with the y-axis formatted as a percentage.
ggplot(
  data = chr.df,
  aes(x = Preventable_hospital_stays, y = Unemployment_rate)
  ) +
  geom_point() +
  labs(x = "Preventable Hospital Stays", y = "Unemployment Rate (%)") +
  # Spell out `labels`; `label` only worked through partial argument matching.
  scale_y_continuous(labels = scales::percent)
```

# Unemployment Rates
Use the unemployment data given to you `(unemprate.RData)` and construct appropriate plots that show the distribution of unemployment rates across years for each of the four educational attainment groups. 

```{r urate}
# Load the saved objects from data/unemprate.RData (path built with here::here()).
load(file = here::here("data", "unemprate.RData"))

# Show the names of `urate` (presumably the object restored by load() above).
names(x = urate)
```

Be sure to use a unique color for each educational attainment group

```{r urate2}
# One line per educ_group, each in its own colour; the legend has no title and
# is placed below the plot.
ggplot(data = urate) +
  geom_line(mapping = aes(x = yearmonth, y = rate, color = educ_group)) +
  theme(legend.position = "bottom") +
  labs(
    title = "Unemployment Rate by Educational Attainment",
    x = "Date",
    y = "Unemployment Rate",
    color = ""
  )
```

