library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   2.0.1     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


# Import the comma-separated file; the first row supplies the column names.
df.csv <- read.csv(
  file = "data/ImportDataCSV.csv",
  header = TRUE,
  sep = ","
)

# Import the tab-delimited file the same way, changing only the separator.
df.tab <- read.csv(
  file = "data/ImportDataTAB.txt",
  header = TRUE,
  sep = "\t"
)

# List the column names, then show a compact overview of the CSV import.
names(df.csv)

glimpse(df.csv)

Rows: 7
Columns: 3
$ x <int> 1, 4, 7, 10, 13, 16, 19
$ y <int> 2, 5, 8, 11, 14, 17, 20
$ z <int> 3, 6, 9, 12, 15, 18, 21


# Repeat the same inspection for the tab-delimited import (df.tab):
# column names first, then a compact overview of every column.

colnames(df.tab)

dplyr::glimpse(df.tab)

Rows: 7
Columns: 3
$ x <int> 1, 4, 7, 10, 13, 16, 19
$ y <int> 2, 5, 8, 11, 14, 17, 20
$ z <int> 3, 6, 9, 12, 15, 18, 21


library(readxl)

# Import the legacy-format Excel workbook (.xls).
df.xls <- read_excel(path = "data/ImportDataXLS.xls")

# Import the current-format Excel workbook (.xlsx).
df.xlsx <- read_excel(path = "data/ImportDataXLSX.xlsx")


# Inspect both Excel imports: column names first, then a compact overview.
names(df.xls)

names(df.xlsx)

# Glimpse each data frame once. Previously df.xlsx was glimpsed twice and
# df.xls was never inspected.
glimpse(df.xls)

glimpse(df.xlsx)

Rows: 7
Columns: 3
$ x <dbl> 1, 4, 7, 10, 13, 16, 19
$ y <dbl> 2, 5, 8, 11, 14, 17, 20
$ z <dbl> 3, 6, 9, 12, 15, 18, 21
Rows: 7
Columns: 3
$ x <dbl> 1, 4, 7, 10, 13, 16, 19
$ y <dbl> 2, 5, 8, 11, 14, 17, 20
$ z <dbl> 3, 6, 9, 12, 15, 18, 21


library(haven)

# Import the Stata, SAS, and SPSS versions of the data with haven.
df.stata <- read_stata(file = "data/ImportDataStata.dta")

df.sas <- read_sas(data_file = "data/ImportDataSAS.sas7bdat")

df.spss <- read_sav(file = "data/ImportDataSPSS.sav")


# Spot-check the imports: column names of the Stata data and a compact
# overview of the SAS data.

names(df.stata)

glimpse(df.sas)

Rows: 7
Columns: 3
$ x <dbl> 1, 4, 7, 10, 13, 16, 19
$ y <dbl> 2, 5, 8, 11, 14, 17, 20
$ z <dbl> 3, 6, 9, 12, 15, 18, 21


# Import the fixed-width file. Fields are 4, 9, 2, and 4 characters wide and
# the file has no header row, so the column names are supplied here.
df.fw <- read.fwf(
  file = "data/fwfdata.txt",
  widths = c(4, 9, 2, 4),
  header = FALSE,
  col.names = c("Name", "Month", "Day", "Year")
)


# Compact overview of the parsed fixed-width data.
glimpse(df.fw)


# Read a whitespace-delimited data file directly from a URL.
fpe <- read.table(
  file = "http://data.princeton.edu/wws509/datasets/effort.dat"
)

# Typing the object's name asks R to display what fpe contains.
fpe


# More files read straight from the web: a text file with a header row,
# a CSV file, and an SPSS file.

test.txt <- read.table(
  file = "https://stats.idre.ucla.edu/stat/data/test.txt",
  header = TRUE
)

test.csv <- read_csv(file = "https://stats.idre.ucla.edu/stat/data/test.csv")

hsb2.spss <- read_sav(file = "https://stats.idre.ucla.edu/stat/data/hsb2.sav")

Rows: 8 Columns: 6

── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): prgtype
dbl (5): gender, id, ses, schtyp, level


ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


# Reserve a temporary path to hold the zipped SAS data set.
temp <- tempfile()

# Download the zip archive into the temporary file.
download.file(
  url = "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/bridgepop/2016/pcen_v2016_y1016.sas7bdat.zip",
  destfile = temp
)

# Read the SAS file out of the archive through a zip connection.
oursasdata <- haven::read_sas(
  data_file = unz(description = temp, filename = "pcen_v2016_y1016.sas7bdat")
)

# Remove the downloaded archive now that the data have been read.
unlink(temp)


# Dimensions (rows, columns) of the imported data.
dim(oursasdata)


# First rows.
head(oursasdata)


# Last rows.
tail(oursasdata)


# Read the comma-separated hsb2 data from the web; the first row holds the
# column names.
hsb2 <- read.table(
  file = "https://stats.idre.ucla.edu/stat/data/hsb2.csv",
  header = TRUE,
  sep = ","
)


# Column-by-column summary of the raw (unlabelled) data.
summary(hsb2)

       id             female           race           ses            schtyp    
 Min.   :  1.00   Min.   :0.000   Min.   :1.00   Min.   :1.000   Min.   :1.00  
 1st Qu.: 50.75   1st Qu.:0.000   1st Qu.:3.00   1st Qu.:2.000   1st Qu.:1.00  
 Median :100.50   Median :1.000   Median :4.00   Median :2.000   Median :1.00  
 Mean   :100.50   Mean   :0.545   Mean   :3.43   Mean   :2.055   Mean   :1.16  
 3rd Qu.:150.25   3rd Qu.:1.000   3rd Qu.:4.00   3rd Qu.:3.000   3rd Qu.:1.00  
 Max.   :200.00   Max.   :1.000   Max.   :4.00   Max.   :3.000   Max.   :2.00  
      prog            read           write            math      
 Min.   :1.000   Min.   :28.00   Min.   :31.00   Min.   :33.00  
 1st Qu.:2.000   1st Qu.:44.00   1st Qu.:45.75   1st Qu.:45.00  
 Median :2.000   Median :50.00   Median :54.00   Median :52.00  
 Mean   :2.025   Mean   :52.23   Mean   :52.77   Mean   :52.65  
 3rd Qu.:2.250   3rd Qu.:60.00   3rd Qu.:60.00   3rd Qu.:59.00  
 Max.   :3.000   Max.   :76.00   Max.   :67.00   Max.   :75.00  
    science          socst      
 Min.   :26.00   Min.   :26.00  
 1st Qu.:44.00   1st Qu.:46.00  
 Median :53.00   Median :52.00  
 Mean   :51.85   Mean   :52.41  
 3rd Qu.:58.00   3rd Qu.:61.00  
 Max.   :74.00   Max.   :71.00


# Summary before labelling: the categorical variables are still numeric codes.
summary(hsb2)


# Add a labelled factor copy (suffix .f) of each coded variable. The original
# numeric columns are kept unchanged.
hsb2$female.f <- factor(
  hsb2$female,
  levels = c(0, 1),
  labels = c("Male", "Female")
)

hsb2$race.f <- factor(
  hsb2$race,
  levels = 1:4,
  labels = c("Hispanic", "Asian", "African American", "White")
)

hsb2$ses.f <- factor(
  hsb2$ses,
  levels = 1:3,
  labels = c("Low", "Middle", "High")
)

hsb2$schtyp.f <- factor(
  hsb2$schtyp,
  levels = 1:2,
  labels = c("Public", "Private")
)

hsb2$prog.f <- factor(
  hsb2$prog,
  levels = 1:3,
  labels = c("General", "Academic", "Vocational")
)


# Summary after labelling: the new factor columns show counts per label.
summary(hsb2)

       id             female           race           ses            schtyp    
 Min.   :  1.00   Min.   :0.000   Min.   :1.00   Min.   :1.000   Min.   :1.00  
 1st Qu.: 50.75   1st Qu.:0.000   1st Qu.:3.00   1st Qu.:2.000   1st Qu.:1.00  
 Median :100.50   Median :1.000   Median :4.00   Median :2.000   Median :1.00  
 Mean   :100.50   Mean   :0.545   Mean   :3.43   Mean   :2.055   Mean   :1.16  
 3rd Qu.:150.25   3rd Qu.:1.000   3rd Qu.:4.00   3rd Qu.:3.000   3rd Qu.:1.00  
 Max.   :200.00   Max.   :1.000   Max.   :4.00   Max.   :3.000   Max.   :2.00  
      prog            read           write            math      
 Min.   :1.000   Min.   :28.00   Min.   :31.00   Min.   :33.00  
 1st Qu.:2.000   1st Qu.:44.00   1st Qu.:45.75   1st Qu.:45.00  
 Median :2.000   Median :50.00   Median :54.00   Median :52.00  
 Mean   :2.025   Mean   :52.23   Mean   :52.77   Mean   :52.65  
 3rd Qu.:2.250   3rd Qu.:60.00   3rd Qu.:60.00   3rd Qu.:59.00  
 Max.   :3.000   Max.   :76.00   Max.   :67.00   Max.   :75.00  
    science          socst         female.f                race.f   
 Min.   :26.00   Min.   :26.00   Male  : 91   Hispanic        : 24  
 1st Qu.:44.00   1st Qu.:46.00   Female:109   Asian           : 11  
 Median :53.00   Median :52.00                African American: 20  
 Mean   :51.85   Mean   :52.41                White           :145  
 3rd Qu.:58.00   3rd Qu.:61.00                                      
 Max.   :74.00   Max.   :71.00                                      
    ses.f       schtyp.f          prog.f   
 Low   :47   Public :168   General   : 45  
 Middle:95   Private: 32   Academic  :105  
 High  :58                 Vocational: 50


# Write the labelled data frame to an RData file.
save(hsb2, file = "data/hsb2.RData")


# Load the RData file; this restores the object under its saved name, hsb2.
load(file = "data/hsb2.RData")


# Summarise the reloaded data frame.
summary(hsb2)

       id             female           race           ses            schtyp    
 Min.   :  1.00   Min.   :0.000   Min.   :1.00   Min.   :1.000   Min.   :1.00  
 1st Qu.: 50.75   1st Qu.:0.000   1st Qu.:3.00   1st Qu.:2.000   1st Qu.:1.00  
 Median :100.50   Median :1.000   Median :4.00   Median :2.000   Median :1.00  
 Mean   :100.50   Mean   :0.545   Mean   :3.43   Mean   :2.055   Mean   :1.16  
 3rd Qu.:150.25   3rd Qu.:1.000   3rd Qu.:4.00   3rd Qu.:3.000   3rd Qu.:1.00  
 Max.   :200.00   Max.   :1.000   Max.   :4.00   Max.   :3.000   Max.   :2.00  
      prog            read           write            math      
 Min.   :1.000   Min.   :28.00   Min.   :31.00   Min.   :33.00  
 1st Qu.:2.000   1st Qu.:44.00   1st Qu.:45.75   1st Qu.:45.00  
 Median :2.000   Median :50.00   Median :54.00   Median :52.00  
 Mean   :2.025   Mean   :52.23   Mean   :52.77   Mean   :52.65  
 3rd Qu.:2.250   3rd Qu.:60.00   3rd Qu.:60.00   3rd Qu.:59.00  
 Max.   :3.000   Max.   :76.00   Max.   :67.00   Max.   :75.00  
    science          socst         female.f                race.f   
 Min.   :26.00   Min.   :26.00   Male  : 91   Hispanic        : 24  
 1st Qu.:44.00   1st Qu.:46.00   Female:109   Asian           : 11  
 Median :53.00   Median :52.00                African American: 20  
 Mean   :51.85   Mean   :52.41                White           :145  
 3rd Qu.:58.00   3rd Qu.:61.00                                      
 Max.   :74.00   Max.   :71.00                                      
    ses.f       schtyp.f          prog.f   
 Low   :47   Public :168   General   : 45  
 Middle:95   Private: 32   Academic  :105  
 High  :58                 Vocational: 50


# Data sets that ship with packages: attach the package, load the data set
# by name, and preview the first six rows.
library(palmerpenguins)

data("penguins", package = "palmerpenguins")

head(penguins, n = 6L)


library(ggplot2)

data("diamonds", package = "ggplot2")

head(diamonds, n = 6L)

	setting	effort	change
	<int>	<int>	<int>
Bolivia	46	0	1
Brazil	74	0	10
Chile	89	16	29
Colombia	77	16	25
CostaRica	84	21	29
Cuba	89	15	40
DominicanRep	68	14	21
Ecuador	70	6	0
ElSalvador	60	13	13
Guatemala	55	9	4
Haiti	35	3	0
Honduras	51	7	7
Jamaica	87	23	21
Mexico	83	4	9
Nicaragua	68	0	7
Panama	84	19	22
Paraguay	74	3	6
Peru	73	0	2
TrinidadTobago	84	15	29
Venezuela	91	7	11

age	hisp	RACESEX	VINTAGE	POP2010_apr	POP2010_jul	POP2011	POP2012	POP2013	POP2014	POP2015	POP2016	ST_FIPS	CO_FIPS
<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
0	1	1	2016	236	242	237	229	224	240	236	225	1	1
1	1	1	2016	299	284	241	254	231	249	238	230	1	1
2	1	1	2016	287	292	294	233	240	237	234	255	1	1
3	1	1	2016	286	286	310	283	241	244	229	240	1	1
4	1	1	2016	270	273	280	319	278	247	252	239	1	1
5	1	1	2016	279	277	285	279	305	287	265	254	1	1

Column Name	Values and Labels/Meanings
female	(0=male 1=female)
race	(1=hispanic 2=asian 3=african-amer 4=white)
ses	socioeconomic status (1=low 2=middle 3=high)
schtyp	type of school (1=public 2=private)
prog	type of program (1=general 2=academic 3=vocational)
read	standardized reading score
write	standardized writing score
math	standardized math score
science	standardized science score
socst	standardized social studies score

species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex	year
<fct>	<fct>	<dbl>	<dbl>	<int>	<int>	<fct>	<int>
Adelie	Torgersen	39.1	18.7	181	3750	male	2007
Adelie	Torgersen	39.5	17.4	186	3800	female	2007
Adelie	Torgersen	40.3	18.0	195	3250	female	2007
Adelie	Torgersen	NA	NA	NA	NA	NA	2007
Adelie	Torgersen	36.7	19.3	193	3450	female	2007
Adelie	Torgersen	39.3	20.6	190	3650	male	2007

carat	cut	color	clarity	depth	table	price	x	y	z
<dbl>	<ord>	<ord>	<ord>	<dbl>	<dbl>	<int>	<dbl>	<dbl>	<dbl>
0.23	Ideal	E	SI2	61.5	55	326	3.95	3.98	2.43
0.21	Premium	E	SI1	59.8	61	326	3.89	3.84	2.31
0.23	Good	E	VS1	56.9	65	327	4.05	4.07	2.31
0.29	Premium	I	VS2	62.4	58	334	4.20	4.23	2.63
0.31	Good	J	SI2	63.3	58	335	4.34	4.35	2.75
0.24	Very Good	J	VVS2	62.8	57	336	3.94	3.96	2.48

MPA 5830 - Module 01 (Fall 2021)

What will you learn?

Reading data

CSV data files

MS Excel files

SPSS, Stata, SAS files

Fixed-width files

Reading Files from the Web

Labeling data values

Saving R data files

Minimal example of data processing

Loading RData files

Data in packages

Exercises for practice

Exercise 01: Reading in some data files

Exercise 02: Reading in local data and labeling some values

Exercise 03: Welcome to Kaggle & Mass Shootings

Exercise 04: Animal Shelters