Nobel Prize Winners
Georgios Karamanis gathered and shared data on Nobel prize winners over the years, with a fair amount of detail, and these data were used in the tidytuesday
series a while back. These data are to be used for the questions that follow.
readr:: read_csv ("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-14/nobel_winners.csv" ) -> nobel_winners
prize_year
double
Year that Nobel Prize was awarded
category
character
Field of study/category
prize
character
Prize Name
motivation
character
Motivation of the award
prize_share
character
Share eg 1 of 1, 1 of 2, 1 of 4, etc
laureate_id
double
ID assigned to each winner
laureate_type
character
Individual or organization
full_name
character
name of the winner
birth_date
double
birth date of winner
birth_city
character
birth city/state of winner
birth_country
character
birth country of winner
gender
character
binary gender of the winner
organization_name
character
organization name
organization_city
character
organization city
organization_country
character
organization country
death_date
double
death date of the winner (if dead)
death_city
character
death city (if dead)
death_country
character
death country (if dead)
First create nobel.df that keeps only records starting in the year 1960, and only for the “Physics” category. Now generate an appropriate chart that shows the distribution of winners by birth_country
library (tidyverse)
library (tidylog)
nobel_winners %>%
filter (category == "Physics" , prize_year >= 1960 ) -> nobel.df
ggplot () +
geom_bar (
data = nobel.df,
aes (x = birth_country)
) +
labs (x = "Birth Country" , y = "Number of Nobel Laureates" ,
title = "Bar-chart of Nobel Laureates by Country of Birth" ) +
coord_flip ()
Now break this distribution out by gender to see how winners by country differ across gender
ggplot () +
geom_bar (
data = nobel.df,
aes (x = birth_country, group = gender, fill = gender)
) +
labs (x = "Birth Country" , y = "Number of Nobel Laureates" ,
title = "Bar-chart of Nobel Laureates by Country of Birth" ) +
coord_flip ()
Now go back to nobel_winners, the full data-set, and create a simple plot that shows the distribution of prize winners by death_country, gender, and category
nobel_winners %>%
filter (! is.na (death_country)) -> nobel.df
ggplot () +
geom_bar (
data = nobel.df,
aes (x = death_country, group = gender, fill = gender)
) +
labs (x = "Death Country" , y = "Number of Nobel Laureates" ,
title = "Bar-chart of Nobel Laureates by Country of Death, Gender, and Prize category" ) +
coord_flip () +
facet_wrap (~ category)
Water levels in the Great Lakes
Use the data-set given below. Note that water level is in meters.
library (readxl)
url <- "https://aniruhil.github.io/avsr/teaching/dataviz/greatlakes.xlsx"
destfile <- "greatlakes.xlsx"
curl:: curl_download (url, destfile)
read_excel (destfile, col_types = c ("date" ,
"numeric" , "numeric" , "numeric" , "numeric" ,
"numeric" )) -> greatlakes
Now use an appropriate chart to show the water level for Lake Superior.
ggplot (
data = greatlakes,
aes (
x = monthyear,
y = Superior
)
) +
geom_line () +
labs (
x = "Date" ,
y = "Water level, in meters" ,
title = "Lake Superior's water levels over time"
)
County Health Rankings
Download the 2017 County Health Rankings data in SPSS format from here, in Excel format from here, and the accompanying codebook.
These data can be downloaded with the code provided below:
library (readxl)
url <- "https://aniruhil.github.io/avsr/teaching/dataviz/CountyHealthRankings2017.xlsx"
destfile <- "CountyHealthRankings2017.xlsx"
curl:: curl_download (url, destfile)
read_excel (destfile) -> chr.df
Construct appropriate plots that show the relationship between the following pairs of variables
Adult obesity and High school graduation
ggplot (
data = chr.df,
aes (x = Adult_obesity, y = High_school_graduation)
) +
geom_point () +
labs (x = "Adult Obesity (%)" , y = "High School Graduation (%)" ) +
scale_y_continuous (label = scales:: percent)
Children in poverty and High school graduation
ggplot (
data = chr.df,
aes (x = Children_in_poverty, y = High_school_graduation)
) +
geom_point () +
labs (x = "Children in Poverty (%)" , y = "High School Graduation (%)" ) +
scale_y_continuous (label = scales:: percent)
Preventable hospital stays and Unemployment rate
ggplot (
data = chr.df,
aes (x = Preventable_hospital_stays, y = Unemployment_rate)
) +
geom_point () +
labs (x = "Preventable Hospital Stays" , y = "Unemployment Rate (%)" ) +
scale_y_continuous (label = scales:: percent)
Unemployment Rates
Use the unemployment data given to you (unemprate.RData) and construct appropriate plots that show the distribution of unemployment rates across years for each of the four educational attainment groups.
load (here:: here ("data" , "unemprate.RData" ))
names (urate)
## [1] "yearmonth" "educ_group" "rate"
Be sure to use a unique color for each educational attainment group
ggplot (
data = urate,
aes (x = yearmonth, y = rate, color = educ_group)
) +
geom_line () +
labs (
x = "Date" ,
y = "Unemployment Rate" ,
color = "" ,
title = "Unemployment Rate by Educational Attainment"
) +
theme (legend.position = "bottom" )
---
title: "MPA 5830 - Solutions to Practice Exercises"
subtitle: "Spring 2020"
author: "INSERT YOUR NAME HERE"
date: "Updated on `r Sys.Date()`"
output: 
  html_document: 
    fig_caption: yes
    highlight: zenburn
    number_sections: yes
    theme: flatly
    toc: yes
    toc_float: true
    code_download: true
    code_folding: hide
    self_contained: yes
editor_options: 
  chunk_output_type: console
---

<style type="text/css">

body{ /* Normal  */
/*    font-family: Lato, sans-serif;  
      font-family: Mukta, sans-serif; 
      font-family: 'Nunito Sans', sans-serif;
      font-family: Karla, sans-serif;  */
      font-family: 'Merriweather Sans', sans-serif; 
      font-size: 18px;
  }

h1.title {
  font-size: 38px;
  color: DarkRed;
}

h1 { /* Header 1 */
  font-size: 28px;
  color: DarkBlue;
}

h2 { /* Header 2 */
    font-size: 22px;
  color: DarkBlue;
}

h3 { /* Header 3 */
  font-size: 18px;
  color: DarkBlue;
}

code.r{ /* Code block */
    font-family: Mukta, sans-serif; 
    font-weight: 600;  
    font-size: 18px;
}

/* pre {  -- Code block: determines code spacing between lines
    font-size: 16px;
} */
</style>


```{r klippy, echo = FALSE, include = TRUE}
# Enable klippy's copy button for code chunks, placed at the top/right,
# with custom tooltip texts and button color.
klippy::klippy(
  tooltip_message = 'Click to copy',
  tooltip_success = 'Done',
  color = 'cornflowerblue',
  position = c('top', 'right')
)
```

```{r setup, include=FALSE}
# Document-wide knitr chunk defaults: show code, hide warnings and messages,
# no caching, and centered 10 x 8 figures at 300 dpi scaled to full width.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  dpi = 300,
  cache = FALSE,
  fig.align = "center",
  fig.width = 10,
  fig.height = 8,
  out.width = "100%",
  highlight = TRUE
)
```


# Nobel Prize Winners 
Georgios Karamanis gathered and shared data on Nobel prize winners over the years, with a fair amount of detail, and these data were used in the `tidytuesday` series a while back. These data are to be used for the questions that follow. 

```{r nobel-winners}
# Read the Nobel laureates CSV directly from the tidytuesday GitHub repository.
nobel_winners <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-14/nobel_winners.csv"
)
```

|variable             |class     |description |
|:---|:---|:-----------|
|prize_year           |double    | Year that Nobel Prize was awarded|
|category             |character | Field of study/category|
|prize                |character | Prize Name |
|motivation           |character | Motivation of the award |
|prize_share          |character | Share eg 1 of 1, 1 of 2, 1 of 4, etc |
|laureate_id          |double    | ID assigned to each winner |
|laureate_type        |character | Individual or organization  |
|full_name            |character | name of the winner|
|birth_date           |double    | birth date of winner |
|birth_city           |character | birth city/state of winner |
|birth_country        |character | birth country of winner |
|gender               |character | binary gender of the winner |
|organization_name    |character | organization name |
|organization_city    |character | organization city |
|organization_country |character | organization country |
|death_date           |double    | death date of the winner (if dead) |
|death_city           |character | death city (if dead) |
|death_country        |character | death country (if dead) |


(a) First create `nobel.df` that keeps only records starting in the year 1960, and only for the "Physics" category. Now generate an appropriate chart that shows the distribution of winners by `birth_country` 

```{r q1}
library(tidyverse)
library(tidylog)

# Keep only Physics prizes awarded in 1960 or later.
nobel.df <- nobel_winners %>%
  filter(category == "Physics", prize_year >= 1960)

# Horizontal bar chart: one bar per birth country, bar length = row count.
ggplot(data = nobel.df, mapping = aes(x = birth_country)) +
  geom_bar() +
  coord_flip() +
  labs(
    x = "Birth Country",
    y = "Number of Nobel Laureates",
    title = "Bar-chart of Nobel Laureates by Country of Birth"
  )
```


(b) Now break this distribution out by `gender` to see how winners by country differ across gender 

```{r q2}
# Same bar chart as before, with each country's bar split (stacked) by gender.
ggplot(
  data = nobel.df,
  mapping = aes(x = birth_country, group = gender, fill = gender)
) +
  geom_bar() +
  coord_flip() +
  labs(
    x = "Birth Country",
    y = "Number of Nobel Laureates",
    title = "Bar-chart of Nobel Laureates by Country of Birth"
  )
```


(c) Now go back to `nobel_winners`, the full data-set, and create a simple plot that shows the distribution of prize winners by `death_country`, `gender`, and `category`  

```{r q3}
# Restart from the full data and drop rows with no recorded death country.
# Note: this overwrites the Physics-only `nobel.df` built in the earlier chunk.
nobel.df <- nobel_winners %>%
  filter(!is.na(death_country))

# One panel per prize category; bars count laureates per death country,
# split by gender.
ggplot(
  data = nobel.df,
  mapping = aes(x = death_country, group = gender, fill = gender)
) +
  geom_bar() +
  coord_flip() +
  facet_wrap(~ category) +
  labs(
    x = "Death Country",
    y = "Number of Nobel Laureates",
    title = "Bar-chart of Nobel Laureates by Country of Death, Gender, and Prize category"
  )
```


# Water levels in the Great Lakes

Use the data-set given below. *Note that water level is in meters.* 

```{r greatlakes}
library(readxl)

# Download the workbook to a local file (relative path), then read it from disk.
url <- "https://aniruhil.github.io/avsr/teaching/dataviz/greatlakes.xlsx"
destfile <- "greatlakes.xlsx"
curl::curl_download(url, destfile)

# Column types: the first column as a date, the following five as numeric.
greatlakes <- read_excel(
  destfile,
  col_types = c("date", rep("numeric", 5))
)
```

Now use an appropriate chart to show the water level for Lake Superior. 

```{r lakes}
# Line chart of the `Superior` column against `monthyear`.
ggplot(data = greatlakes, mapping = aes(x = monthyear, y = Superior)) +
  geom_line() +
  labs(
    title = "Lake Superior's water levels over time",
    x = "Date",
    y = "Water level, in meters"
  )
```


# County Health Rankings
Download the 2017 County Health Rankings data [SPSS format from here](https://aniruhil.github.io/avsr/teaching/dataviz/CountyHealthRankings2017.sav), [Excel format from here](https://aniruhil.github.io/avsr/teaching/dataviz/CountyHealthRankings2017.xlsx) and the [accompanying codebook](http://www.countyhealthrankings.org/sites/default/files/2017TrendsDocumentation.pdf). 

These data can be downloaded with the code provided below: 

```{r chr-data}
library(readxl)

# Download the 2017 County Health Rankings workbook to a local file, then
# read it with readxl's default column-type guessing.
url <- "https://aniruhil.github.io/avsr/teaching/dataviz/CountyHealthRankings2017.xlsx"
destfile <- "CountyHealthRankings2017.xlsx"
curl::curl_download(url, destfile)

chr.df <- read_excel(destfile)
```

Construct appropriate plots that show the relationship between the following pairs of variables 

(a) Adult obesity and High school graduation 

```{r chr1}
# Scatterplot of adult obesity (x) against high school graduation (y).
ggplot(
  data = chr.df,
  aes(x = Adult_obesity, y = High_school_graduation)
  ) +
  geom_point() +
  labs(x = "Adult Obesity (%)", y = "High School Graduation (%)") +
  # Use the full argument name `labels`; `label` only worked through R's
  # partial argument matching.
  scale_y_continuous(labels = scales::percent)
```

(b) Children in poverty and High school graduation 

```{r chr2}
# Scatterplot of children in poverty (x) against high school graduation (y).
ggplot(
  data = chr.df,
  aes(x = Children_in_poverty, y = High_school_graduation)
  ) +
  geom_point() +
  labs(x = "Children in Poverty (%)", y = "High School Graduation (%)") +
  # Use the full argument name `labels`; `label` only worked through R's
  # partial argument matching.
  scale_y_continuous(labels = scales::percent)
```

(c) Preventable hospital stays and Unemployment rate 

```{r chr3}
# Scatterplot of preventable hospital stays (x) against unemployment rate (y).
ggplot(
  data = chr.df,
  aes(x = Preventable_hospital_stays, y = Unemployment_rate)
  ) +
  geom_point() +
  labs(x = "Preventable Hospital Stays", y = "Unemployment Rate (%)") +
  # Use the full argument name `labels`; `label` only worked through R's
  # partial argument matching.
  scale_y_continuous(labels = scales::percent)
```

# Unemployment Rates
Use the unemployment data given to you (`unemprate.RData`) and construct appropriate plots that show the distribution of unemployment rates across years for each of the four educational attainment groups. 

```{r urate}
# Restore the saved objects from data/unemprate.RData (path resolved by here).
load(file = here::here("data", "unemprate.RData"))

# Print the column names of `urate`.
names(urate)
```

Be sure to use a unique color for each educational attainment group

```{r urate2}
# One line per educational attainment group, each in its own color.
# The legend title is blanked and the legend sits below the plot.
ggplot(
  data = urate,
  mapping = aes(x = yearmonth, y = rate, color = educ_group)
) +
  geom_line() +
  theme(legend.position = "bottom") +
  labs(
    title = "Unemployment Rate by Educational Attainment",
    x = "Date",
    y = "Unemployment Rate",
    color = ""
  )
```

