3 Manipulating Data Frames
## ── Attaching packages ───────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
df <- tibble(
group = c("a", "a", "b", "b", "b"),
a = c(1, 4, NA, 3, 5),
b = c(9, NA, 8, 10, 7),
c = c(TRUE, FALSE, NA, FALSE, TRUE),
d = c(LETTERS[1:3], NA, LETTERS[[5]]),
e = factor(1:5, labels = c("tiny", "small", "medium", "big", "huge")),
f_col = c(as.Date(NA), as.Date("2020-09-23") + c(3, 2, 1, 4)),
g_col = c(as.POSIXct("2020-09-23 00:00:00") + 1:4 * 60 * 60 * 24 * 1.1, NA),
col_h = list(c(1, 10), c(2, NA), c(3, 8), c(4, 7), c(5, 6)),
col_i = list(NULL, pi, month.abb[6:10], iris, as.matrix(mtcars))
)
df
## # A tibble: 5 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 1 9 TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 a 4 NA FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 b NA 8 NA C medium 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 4 b 3 10 FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 5 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## Rows: 5
## Columns: 10
## $ group <chr> "a", "a", "b", "b", "b"
## $ a <dbl> 1, 4, NA, 3, 5
## $ b <dbl> 9, NA, 8, 10, 7
## $ c <lgl> TRUE, FALSE, NA, FALSE, TRUE
## $ d <chr> "A", "B", "C", NA, "E"
## $ e <fct> tiny, small, medium, big, huge
## $ f_col <date> NA, 2020-09-26, 2020-09-25, 2020-09-24, 2020-09-27
## $ g_col <dttm> 2020-09-24 02:24:00, 2020-09-25 04:48:00, 2020-09-26 07:12:00, 2020-09-27 09:36:00, NA
## $ col_h <list> [<1, 10>, <2, NA>, <3, 8>, <4, 7>, <5, 6>]
## $ col_i <list> [NULL, 3.141593, <"Jun", "Jul", "Aug", "Sep", "Oct">, <data.frame[150 x 5]>, <matrix[32 x 11]>]
3.1 select()
Columns
3.1.1 by Name
## # A tibble: 5 x 1
## a
## <dbl>
## 1 1
## 2 4
## 3 NA
## 4 3
## 5 5
## # A tibble: 5 x 3
## a c e
## <dbl> <lgl> <fct>
## 1 1 TRUE tiny
## 2 4 FALSE small
## 3 NA NA medium
## 4 3 FALSE big
## 5 5 TRUE huge
## # A tibble: 5 x 3
## b d f_col
## <dbl> <chr> <date>
## 1 9 A NA
## 2 NA B 2020-09-26
## 3 8 C 2020-09-25
## 4 10 <NA> 2020-09-24
## 5 7 E 2020-09-27
## # A tibble: 5 x 10
## b c group a d e f_col g_col col_h col_i
## <dbl> <lgl> <chr> <dbl> <chr> <fct> <date> <dttm> <list> <list>
## 1 9 TRUE a 1 A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 NA FALSE a 4 B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 8 NA b NA C medium 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 4 10 FALSE b 3 <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 5 7 TRUE b 5 E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 5 x 9
## b c group d e f_col g_col col_h col_i
## <dbl> <lgl> <chr> <chr> <fct> <date> <dttm> <list> <list>
## 1 9 TRUE a A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 NA FALSE a B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 8 NA b C medium 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 4 10 FALSE b <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 5 7 TRUE b E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 5 x 3
## a c e
## <dbl> <lgl> <fct>
## 1 1 TRUE tiny
## 2 4 FALSE small
## 3 NA NA medium
## 4 3 FALSE big
## 5 5 TRUE huge
3.1.2 by Index
## # A tibble: 5 x 1
## group
## <chr>
## 1 a
## 2 a
## 3 b
## 4 b
## 5 b
## # A tibble: 5 x 3
## group b d
## <chr> <dbl> <chr>
## 1 a 9 A
## 2 a NA B
## 3 b 8 C
## 4 b 10 <NA>
## 5 b 7 E
## # A tibble: 5 x 3
## a c e
## <dbl> <lgl> <fct>
## 1 1 TRUE tiny
## 2 4 FALSE small
## 3 NA NA medium
## 4 3 FALSE big
## 5 5 TRUE huge
## # A tibble: 5 x 10
## a b group c d e f_col g_col col_h col_i
## <dbl> <dbl> <chr> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 1 9 a TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 4 NA a FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 NA 8 b NA C medium 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 4 3 10 b FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 5 5 7 b TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 5 x 9
## a b c d e f_col g_col col_h col_i
## <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 1 9 TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 4 NA FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 NA 8 NA C medium 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 4 3 10 FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 5 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 5 x 3
## group b d
## <chr> <dbl> <chr>
## 1 a 9 A
## 2 a NA B
## 3 b 8 C
## 4 b 10 <NA>
## 5 b 7 E
## # A tibble: 5 x 3
## group b d
## <chr> <dbl> <chr>
## 1 a 9 A
## 2 a NA B
## 3 b 8 C
## 4 b 10 <NA>
## 5 b 7 E
3.1.3 by Name Pattern
contains()
selects a column if any part of its name contains match=
.
## # A tibble: 5 x 4
## f_col g_col col_h col_i
## <date> <dttm> <list> <list>
## 1 NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 4 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 5 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
starts_with()
selects a column if its name starts with match=
.
## # A tibble: 5 x 2
## col_h col_i
## <list> <list>
## 1 <dbl [2]> <NULL>
## 2 <dbl [2]> <dbl [1]>
## 3 <dbl [2]> <chr [5]>
## 4 <dbl [2]> <df[,5] [150 × 5]>
## 5 <dbl [2]> <dbl[,11] [32 × 11]>
starts_with()
selects a column if its name ends with match=
.
## # A tibble: 5 x 2
## f_col g_col
## <date> <dttm>
## 1 NA 2020-09-24 02:24:00
## 2 2020-09-26 2020-09-25 04:48:00
## 3 2020-09-25 2020-09-26 07:12:00
## 4 2020-09-24 2020-09-27 09:36:00
## 5 2020-09-27 NA
matches()
s Selects a column if its name matches a regular expression pattern.
## # A tibble: 5 x 4
## f_col g_col col_h col_i
## <date> <dttm> <list> <list>
## 1 NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 4 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 5 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
3.1.4 by Data Type
## # A tibble: 5 x 1
## e
## <fct>
## 1 tiny
## 2 small
## 3 medium
## 4 big
## 5 huge
## # A tibble: 5 x 1
## e
## <fct>
## 1 tiny
## 2 small
## 3 medium
## 4 big
## 5 huge
## # A tibble: 5 x 2
## e f_col
## <fct> <date>
## 1 tiny NA
## 2 small 2020-09-26
## 3 medium 2020-09-25
## 4 big 2020-09-24
## 5 huge 2020-09-27
## # A tibble: 5 x 10
## a group b c d e f_col g_col col_h col_i
## <dbl> <chr> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 1 a 9 TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 4 a NA FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 NA b 8 NA C medium 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 4 3 b 10 FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 5 5 b 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 5 x 3
## group d e
## <chr> <chr> <fct>
## 1 a A tiny
## 2 a B small
## 3 b C medium
## 4 b <NA> big
## 5 b E huge
## # A tibble: 5 x 6
## a b f_col g_col col_h col_i
## <dbl> <dbl> <date> <dttm> <list> <list>
## 1 1 9 NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 4 NA 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 NA 8 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 4 3 10 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 5 5 7 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 5 x 3
## group d e
## <chr> <chr> <fct>
## 1 a A tiny
## 2 a B small
## 3 b C medium
## 4 b <NA> big
## 5 b E huge
3.2 filter()
Rows
3.2.1 by row_number()
## # A tibble: 1 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 1 9 TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## # A tibble: 4 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 4 NA FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 2 b NA 8 NA C medium 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 3 b 3 10 FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 4 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
3.2.2 by Name
## # A tibble: 0 x 10
## # … with 10 variables: group <chr>, a <dbl>, b <dbl>, c <lgl>, d <chr>, e <fct>, f_col <date>, g_col <dttm>, col_h <list>, col_i <list>
## # A tibble: 4 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 1 9 TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 a 4 NA FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 b 3 10 FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 4 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 2 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 1 9 TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 2 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 4 NA FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 2 b 3 10 FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## # A tibble: 1 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 3 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 4 NA FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 2 b 3 10 FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 3 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 2 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 1 9 TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 2 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 1 9 TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
3.2.3 by Type
## # A tibble: 1 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 1 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## # A tibble: 4 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 4 NA FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 2 b NA 8 NA C medium 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## 3 b 3 10 FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 4 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
3.3 arrange()
Rows
## # A tibble: 5 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 a 1 9 TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 2 b 3 10 FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 3 a 4 NA FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 4 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## 5 b NA 8 NA C medium 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>
## # A tibble: 5 x 10
## group a b c d e f_col g_col col_h col_i
## <chr> <dbl> <dbl> <lgl> <chr> <fct> <date> <dttm> <list> <list>
## 1 b 5 7 TRUE E huge 2020-09-27 NA <dbl [2]> <dbl[,11] [32 × 11]>
## 2 a 4 NA FALSE B small 2020-09-26 2020-09-25 04:48:00 <dbl [2]> <dbl [1]>
## 3 b 3 10 FALSE <NA> big 2020-09-24 2020-09-27 09:36:00 <dbl [2]> <df[,5] [150 × 5]>
## 4 a 1 9 TRUE A tiny NA 2020-09-24 02:24:00 <dbl [2]> <NULL>
## 5 b NA 8 NA C medium 2020-09-25 2020-09-26 07:12:00 <dbl [2]> <chr [5]>