13  tidyselect 열 선택

Tidyverse 메타패키지에 속하는 tidyr, dplyr, purrr과 같은 패키지의 함수들은 모두 tidyselect라는 열 선택 방법을 사용한다. 이것은 그 자체가 아주 작은 언어이기도 한데, 잘 사용하면 작업을 쉽게 할 수 있다. 특히 수십, 수백 개의 열 이름을 가진 데이터를 처리할 때 이 방법이 아주 효과적이다.

penguins 데이터셋을 사용해서 열 선택 방법을 살펴보자.

head(penguins)
  species    island bill_len bill_dep flipper_len body_mass    sex year
1  Adelie Torgersen     39.1     18.7         181      3750   male 2007
2  Adelie Torgersen     39.5     17.4         186      3800 female 2007
3  Adelie Torgersen     40.3     18.0         195      3250 female 2007
4  Adelie Torgersen       NA       NA          NA        NA   <NA> 2007
5  Adelie Torgersen     36.7     19.3         193      3450 female 2007
6  Adelie Torgersen     39.3     20.6         190      3650   male 2007
penguins |>
    select(bill_len:body_mass) |>
    slice(1:5)
  bill_len bill_dep flipper_len body_mass
1     39.1     18.7         181      3750
2     39.5     17.4         186      3800
3     40.3     18.0         195      3250
4       NA       NA          NA        NA
5     36.7     19.3         193      3450
penguins |>
    select(starts_with("bill")) |>
    slice(1:5)
  bill_len bill_dep
1     39.1     18.7
2     39.5     17.4
3     40.3     18.0
4       NA       NA
5     36.7     19.3
penguins |>
    select(ends_with("len")) |>
    slice(1:5)
  bill_len flipper_len
1     39.1         181
2     39.5         186
3     40.3         195
4       NA          NA
5     36.7         193
penguins |>
    select(contains("pper")) |>
    slice(1:5)
  flipper_len
1         181
2         186
3         195
4          NA
5         193
penguins |>
    select(matches("a[a-z]{2}")) |>
    slice(1:5)
     island body_mass
1 Torgersen      3750
2 Torgersen      3800
3 Torgersen      3250
4 Torgersen        NA
5 Torgersen      3450
head(billboard)
# A tibble: 6 × 79
  artist      track date.entered   wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8
  <chr>       <chr> <date>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 Pac       Baby… 2000-02-26      87    82    72    77    87    94    99    NA
2 2Ge+her     The … 2000-09-02      91    87    92    NA    NA    NA    NA    NA
3 3 Doors Do… Kryp… 2000-04-08      81    70    68    67    66    57    54    53
4 3 Doors Do… Loser 2000-10-21      76    76    72    69    67    65    55    59
5 504 Boyz    Wobb… 2000-04-15      57    34    25    17    17    31    36    49
6 98^0        Give… 2000-08-19      51    39    34    26    26    19     2     2
# ℹ 68 more variables: wk9 <dbl>, wk10 <dbl>, wk11 <dbl>, wk12 <dbl>,
#   wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>,
#   wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>,
#   wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>,
#   wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>,
#   wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>,
#   wk43 <dbl>, wk44 <dbl>, wk45 <dbl>, wk46 <dbl>, wk47 <dbl>, wk48 <dbl>, …
billboard |>
    select(num_range("wk", 1:8)) |>
    slice(1:5)
# A tibble: 5 × 8
    wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1    87    82    72    77    87    94    99    NA
2    91    87    92    NA    NA    NA    NA    NA
3    81    70    68    67    66    57    54    53
4    76    76    72    69    67    65    55    59
5    57    34    25    17    17    31    36    49
sel_vars <- c("bill_len", "bill_dep", "flipper_len")
penguins |>
    select(all_of(sel_vars)) |>
    slice(1:5)
  bill_len bill_dep flipper_len
1     39.1     18.7         181
2     39.5     17.4         186
3     40.3     18.0         195
4       NA       NA          NA
5     36.7     19.3         193
sel_vars <- c("bill_len", "bill_depth", "flipper_len", "body_mass")
penguins |>
    select(any_of(sel_vars)) |>
    slice(1:5)
  bill_len flipper_len body_mass
1     39.1         181      3750
2     39.5         186      3800
3     40.3         195      3250
4       NA          NA        NA
5     36.7         193      3450
penguins |>
    select(everything()) |>
    slice(1:5)
  species    island bill_len bill_dep flipper_len body_mass    sex year
1  Adelie Torgersen     39.1     18.7         181      3750   male 2007
2  Adelie Torgersen     39.5     17.4         186      3800 female 2007
3  Adelie Torgersen     40.3     18.0         195      3250 female 2007
4  Adelie Torgersen       NA       NA          NA        NA   <NA> 2007
5  Adelie Torgersen     36.7     19.3         193      3450 female 2007
penguins |>
    select(last_col()) |>
    slice(1:5)
  year
1 2007
2 2007
3 2007
4 2007
5 2007
penguins |>
    select(where(is.factor)) |>
    slice(1:5)
  species    island    sex
1  Adelie Torgersen   male
2  Adelie Torgersen female
3  Adelie Torgersen female
4  Adelie Torgersen   <NA>
5  Adelie Torgersen female
penguins |>
    select(where(is.numeric)) |>
    slice(1:5)
  bill_len bill_dep flipper_len body_mass year
1     39.1     18.7         181      3750 2007
2     39.5     17.4         186      3800 2007
3     40.3     18.0         195      3250 2007
4       NA       NA          NA        NA 2007
5     36.7     19.3         193      3450 2007

이런 선택 방법에 !, -, |, & 와 같은 연산자를 결합하여 사용할 수 있다. ! 와 - 는 해당 열을 제외하고, | 는 두 선택의 합집합을, & 는 교집합을 만든다.

penguins |>
    select(!starts_with("bill")) |>
    slice(1:5)
  species    island flipper_len body_mass    sex year
1  Adelie Torgersen         181      3750   male 2007
2  Adelie Torgersen         186      3800 female 2007
3  Adelie Torgersen         195      3250 female 2007
4  Adelie Torgersen          NA        NA   <NA> 2007
5  Adelie Torgersen         193      3450 female 2007
penguins |>
    select(-starts_with("bill")) |>
    slice(1:5)
  species    island flipper_len body_mass    sex year
1  Adelie Torgersen         181      3750   male 2007
2  Adelie Torgersen         186      3800 female 2007
3  Adelie Torgersen         195      3250 female 2007
4  Adelie Torgersen          NA        NA   <NA> 2007
5  Adelie Torgersen         193      3450 female 2007
penguins |>
    select(starts_with("bill") | ends_with("len")) |>
    slice(1:5)
  bill_len bill_dep flipper_len
1     39.1     18.7         181
2     39.5     17.4         186
3     40.3     18.0         195
4       NA       NA          NA
5     36.7     19.3         193
penguins |>
    select(starts_with("bill") & ends_with("len")) |>
    slice(1:5)
  bill_len
1     39.1
2     39.5
3     40.3
4       NA
5     36.7