2 Tabular Data

Aliases:
- Tabular files
- Flat
- Delimited
Includes:
- Comma-Separated Value (.csv)
- Tab-Separated Value (.tsv)

2.1 Basics

library(readr)

Here’s some example data, modified from http://www.gapminder.org/data/

country,continent,year,lifeExp,pop,gdpPercap       # header/column names, separated by commas
Afghanistan,Asia,1952,28.801,8425333,779.4453145
Afghanistan,Asia,1957,30.332,9240934,820.8530296   # comma-separated values
Afghanistan,Asia,1962,31.997,10267083,853.10071
Afghanistan,Asia,1967,34.02,11537966,836.1971382
Afghanistan,Asia,1972,36.088,13079460,739.9811058
Afghanistan,Asia,1977,38.438,14880372,786.11336
Afghanistan,Asia,1982,39.854,12881816,978.0114388
Afghanistan,Asia,1987,40.822,13867957,852.3959448

csv_text <- 
'country,continent,year,lifeExp,pop,gdpPercap     
Afghanistan,Asia,1952,28.801,8425333,779.4453145
Afghanistan,Asia,1957,30.332,9240934,820.8530296
Afghanistan,Asia,1962,31.997,10267083,853.10071
Afghanistan,Asia,1967,34.02,11537966,836.1971382
Afghanistan,Asia,1972,36.088,13079460,739.9811058
Afghanistan,Asia,1977,38.438,14880372,786.11336
Afghanistan,Asia,1982,39.854,12881816,978.0114388
Afghanistan,Asia,1987,40.822,13867957,852.3959448'

csv_file <- tempfile(fileext = ".csv")      
csv_file # a temporary file path

## [1] "/tmp/RtmpvuZSAF/file7cf8340d8eef.csv"

writeLines(text = csv_text, con = csv_file) # write `csv_text` to `csv_file`

read_csv(file = csv_file)

## Parsed with column specification:
## cols(
##   country = col_character(),
##   continent = col_character(),
##   year = col_double(),
##   lifeExp = col_double(),
##   pop = col_double(),
##   gdpPercap = col_double()
## )

## # A tibble: 8 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <chr>       <chr>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
## 7 Afghanistan Asia       1982    39.9 12881816      978.
## 8 Afghanistan Asia       1987    40.8 13867957      852.

You may encounter Tab-Delimited data where values are separated by \t instead of ,. Instead of readr::read_csv(), we can use readr::read_tsv().

tsv_text <- 
'country\tcontinent\tyear\tlifeExp\tpop\tgdpPercap     
Afghanistan\tAsia\t1952\t28.801\t8425333\t779.4453145
Afghanistan\tAsia\t1957\t30.332\t9240934\t820.8530296
Afghanistan\tAsia\t1962\t31.997\t10267083\t853.10071
Afghanistan\tAsia\t1967\t34.02\t11537966\t836.1971382
Afghanistan\tAsia\t1972\t36.088\t13079460\t739.9811058
Afghanistan\tAsia\t1977\t38.438\t14880372\t786.11336
Afghanistan\tAsia\t1982\t39.854\t12881816\t978.0114388
Afghanistan\tAsia\t1987\t40.822\t13867957\t852.3959448'

tsv_file <- tempfile(fileext = ".tsv")
writeLines(text = tsv_text, con = tsv_file)

read_tsv(file = tsv_file)

## Parsed with column specification:
## cols(
##   country = col_character(),
##   continent = col_character(),
##   year = col_double(),
##   lifeExp = col_double(),
##   pop = col_double(),
##   gdpPercap = col_double()
## )

## # A tibble: 8 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <chr>       <chr>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
## 7 Afghanistan Asia       1982    39.9 12881816      978.
## 8 Afghanistan Asia       1987    40.8 13867957      852.

If we find ourselves reading delmited data that uses something other than \t or , to separate values, we can use readr::read_delim().

pipe_separated_values_text <- 
'country|continent|year|lifeExp|pop|gdpPercap     
Afghanistan|Asia|1952|28.801|8425333|779.4453145
Afghanistan|Asia|1957|30.332|9240934|820.8530296
Afghanistan|Asia|1962|31.997|10267083|853.10071
Afghanistan|Asia|1967|34.02|11537966|836.1971382
Afghanistan|Asia|1972|36.088|13079460|739.9811058
Afghanistan|Asia|1977|38.438|14880372|786.11336
Afghanistan|Asia|1982|39.854|12881816|978.0114388
Afghanistan|Asia|1987|40.822|13867957|852.3959448'

psv_file <- tempfile(fileext = ".tsv")
writeLines(text = pipe_separated_values_text, con = psv_file)

read_delim(file = psv_file, delim = "|")

## Parsed with column specification:
## cols(
##   country = col_character(),
##   continent = col_character(),
##   year = col_double(),
##   lifeExp = col_double(),
##   pop = col_double(),
##   `gdpPercap     ` = col_double()
## )

## # A tibble: 8 x 6
##   country     continent  year lifeExp      pop `gdpPercap     `
##   <chr>       <chr>     <dbl>   <dbl>    <dbl>            <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333             779.
## 2 Afghanistan Asia       1957    30.3  9240934             821.
## 3 Afghanistan Asia       1962    32.0 10267083             853.
## 4 Afghanistan Asia       1967    34.0 11537966             836.
## 5 Afghanistan Asia       1972    36.1 13079460             740.
## 6 Afghanistan Asia       1977    38.4 14880372             786.
## 7 Afghanistan Asia       1982    39.9 12881816             978.
## 8 Afghanistan Asia       1987    40.8 13867957             852.

country,continent,year,lifeExp,pop,gdpPercap       # header/column names
Afghanistan,Asia,1952,28.801,8425333,779.4453145
Afghanistan,Asia,1957,30.332,9240934,820.8530296
Afghanistan,Asia,1962,31.997,10267083,853.10071
Afghanistan,Asia,1967,34.02,11537966,836.1971382
Afghanistan,Asia,1972,36.088,13079460,739.9811058
Afghanistan,Asia,1977,38.438,14880372,786.11336
Afghanistan,Asia,1982,39.854,12881816,978.0114388
Afghanistan,Asia,1987,40.822,13867957,852.3959448
Afghanistan,,,N/A,,                                # notice that we're missing values

csv_text <- 
'country,continent,year,lifeExp,pop,gdpPercap
Afghanistan,Asia,1952,28.801,8425333,779.4453145
Afghanistan,Asia,1957,30.332,9240934,820.8530296
Afghanistan,Asia,1962,31.997,10267083,853.10071
Afghanistan,Asia,1967,34.02,11537966,836.1971382
Afghanistan,Asia,1972,36.088,13079460,739.9811058
Afghanistan,Asia,1977,38.438,14880372,786.11336
Afghanistan,Asia,1982,39.854,12881816,978.0114388
Afghanistan,Asia,1987,40.822,13867957,852.3959448
Afghanistan,,,N/A,,'

csv_file <- tempfile(fileext = ".csv")
writeLines(text = csv_text, con = csv_file)

2.2 Common Pitfalls

2.2.1 Incorrect Column Types

data_frame_from_csv <- read_csv(file = csv_file)

## Parsed with column specification:
## cols(
##   country = col_character(),
##   continent = col_character(),
##   year = col_double(),
##   lifeExp = col_character(),
##   pop = col_double(),
##   gdpPercap = col_double()
## )

data_frame_from_csv

## # A tibble: 9 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <chr>       <chr>     <dbl> <chr>      <dbl>     <dbl>
## 1 Afghanistan Asia       1952 28.801   8425333      779.
## 2 Afghanistan Asia       1957 30.332   9240934      821.
## 3 Afghanistan Asia       1962 31.997  10267083      853.
## 4 Afghanistan Asia       1967 34.02   11537966      836.
## 5 Afghanistan Asia       1972 36.088  13079460      740.
## 6 Afghanistan Asia       1977 38.438  14880372      786.
## 7 Afghanistan Asia       1982 39.854  12881816      978.
## 8 Afghanistan Asia       1987 40.822  13867957      852.
## 9 Afghanistan <NA>         NA N/A           NA       NA

Notice that our year column says <dbl>, referring to it being of type double, yet all of our year values are whole numbers.

typeof(data_frame_from_csv$year)

## [1] "double"

data_frame_from_csv$year

## [1] 1952 1957 1962 1967 1972 1977 1982 1987   NA

We also have "N/A" in our lifeExp column, forcing R to interpret all lifeExp values as characters (<chr>).

typeof(data_frame_from_csv$lifeExp)

## [1] "character"

data_frame_from_csv$lifeExp

## [1] "28.801" "30.332" "31.997" "34.02"  "36.088" "38.438" "39.854" "40.822" "N/A"

2.2.1.1 Solution

read_csv(
  file = csv_file,
  col_types = cols(
    country = col_character(),
    continent = col_character(),
    year = col_integer(),        # read `year` as `integer`
    lifeExp = col_double(),      # read `lifeExp` as `double`
    pop = col_double(),
    gdpPercap = col_double()
  ),
  na = c("", "N/A")              # be explicit about how `csv_file` represents missing values
)

## # A tibble: 9 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <chr>       <chr>     <int>   <dbl>    <dbl>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
## 7 Afghanistan Asia       1982    39.9 12881816      978.
## 8 Afghanistan Asia       1987    40.8 13867957      852.
## 9 Afghanistan <NA>         NA    NA         NA       NA