Создайте новую группировку существующих категорий и суммируйте новые группы

#r

#r

Вопрос:

У меня есть фрейм данных, в котором в настоящее время есть один столбец с 8 типами энергии. Я хочу объединить этот столбец только в два типа, подобных этому.

  1. Типы энергии Гидро, геотермальная, ветровая, солнечная, перекачиваемая гидроэнергия относятся к типу «Возобновляемая энергия»

  2. Остальные 3 типа — «Невозобновляемая энергия»

Создав этот новый набор категорий, я хочу сгруппировать по странам, а затем суммировать для каждого из столбцов «year» в dataframe, которые имеют такие имена, как energyProd_2016

Текущий вид набора данных приведен ниже

 top10ProducersMod <- 
    structure(list(country = c("DE", "DE", "DE", "DE", "DE", "DE", 
    "DE", "DE", "ES", "ES", "ES", "ES", "ES", "ES", "ES", "ES", "FR", 
    "FR", "FR", "FR", "FR", "FR", "FR", "FR", "IT", "IT", "IT", "IT", 
    "IT", "IT", "IT", "IT", "PL", "PL", "PL", "PL", "PL", "PL", "PL", 
    "PL", "SE", "SE", "SE", "SE", "SE", "SE", "SE", "SE", "UK", "UK", 
    "UK", "UK", "UK", "UK", "UK", "UK", "NO", "NO", "NO", "NO", "NO", 
    "NO", "NO", "NO", "TR", "TR", "TR", "TR", "TR", "TR", "TR", "TR", 
    "UA", "UA", "UA", "UA", "UA", "UA", "UA", "UA"), country_name = c("Germany", 
    "Germany", "Germany", "Germany", "Germany", "Germany", "Germany", 
    "Germany", "Spain", "Spain", "Spain", "Spain", "Spain", "Spain", 
    "Spain", "Spain", "France", "France", "France", "France", "France", 
    "France", "France", "France", "Italy", "Italy", "Italy", "Italy", 
    "Italy", "Italy", "Italy", "Italy", "Poland", "Poland", "Poland", 
    "Poland", "Poland", "Poland", "Poland", "Poland", "Sweden", "Sweden", 
    "Sweden", "Sweden", "Sweden", "Sweden", "Sweden", "Sweden", "United Kingdom", 
    "United Kingdom", "United Kingdom", "United Kingdom", "United Kingdom", 
    "United Kingdom", "United Kingdom", "United Kingdom", "Norway", 
    "Norway", "Norway", "Norway", "Norway", "Norway", "Norway", "Norway", 
    "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", 
    "Turkey", "Ukraine", "Ukraine", "Ukraine", "Ukraine", "Ukraine", 
    "Ukraine", "Ukraine", "Ukraine"), type = c("Conventional thermal", 
    "Nuclear", "Hydro", "Pumped hydro power", "Wind", "Solar", "Geothermal", 
    "Other", "Conventional thermal", "Nuclear", "Hydro", "Pumped hydro power", 
    "Wind", "Solar", "Geothermal", "Other", "Conventional thermal", 
    "Nuclear", "Hydro", "Pumped hydro power", "Wind", "Solar", "Geothermal", 
    "Other", "Conventional thermal", "Nuclear", "Hydro", "Pumped hydro power", 
    "Wind", "Solar", "Geothermal", "Other", "Conventional thermal", 
    "Nuclear", "Hydro", "Pumped hydro power", "Wind", "Solar", "Geothermal", 
    "Other", "Conventional thermal", "Nuclear", "Hydro", "Pumped hydro power", 
    "Wind", "Solar", "Geothermal", "Other", "Conventional thermal", 
    "Nuclear", "Hydro", "Pumped hydro power", "Wind", "Solar", "Geothermal", 
    "Other", "Conventional thermal", "Nuclear", "Hydro", "Pumped hydro power", 
    "Wind", "Solar", "Geothermal", "Other", "Conventional thermal", 
    "Nuclear", "Hydro", "Pumped hydro power", "Wind", "Solar", "Geothermal", 
    "Other", "Conventional thermal", "Nuclear", "Hydro", "Pumped hydro power", 
    "Wind", "Solar", "Geothermal", "Other"), energyProd_2016 = c(390141, 
    80038, 25690, 5451, 78218, 38098, 165, 1805, 108210, 56100, 39180, 
    3410, 47712, 13051, 0, 91, 60607.854, 384008.125, 64879.956, 
    4786.979, 21472.916, 8657.27, 87.933, 559.515, 190121, 0, 43784.622, 
    1825.204, 17523, 21757, 5867, 650, 136945, 0, 2591, 474, 12279, 
    124, 0, 64, 14621, 60524, 61764, 119, 15479, 143, 0, 0, 203165.188, 
    65149.08, 8286.51, 2948.89, 37262.699, 10410.948, 0, 0, 3212, 
    0, 143442, 999, 2116, 0, 0, 297, 174166, 0, 66686, 0, 15381, 
    1013, 4010, 681, 65415, 75931, 9025, 1633, 949, 490, 0, 79), 
        energyProd_2017 = c(376128, 72155, 25888, 5910, 103707, 39401, 
        157, 1623, 126885, 55540, 20708, 2249, 47929, 13778, 0, 78, 
        68922.949, 379093.955, 54432.864, 5070.791, 24710.749, 9572.843, 
        119.937, 673.71, 199722.444, 0, 37556.721, 1825.96, 17565.332, 
        24016.821, 5821.46, 582.871, 137068.099, 0, 2999.86, 448.193, 
        14574.192, 165.463, 0, 61.73, 15003, 63008, 64676, 25, 17609, 
        230, 0, 0, 189296.528, 63886.86, 8723.43, 2862.01, 50003.654, 
        11524.87, 0, 0, 3233, 0, 142276, 1157, 2852, 0, 0, 281, 199910.043, 
        0, 57823.851, 0, 17793.167, 2817.837, 5101.487, 811.134, 
        54041.9, 80295.3, 8812, 1559, 974, 737.6, 0, 45), energyProd_2018 = c(320437.701, 
        72274.133, 20041.878, 6661.083, 112174, 46184, 165.25, 522.751, 
        110084.238, 53270.8, 36083.636, 2009.408, 49495.207, 12137.315, 
        0, 0, 53956.44, 393153.254, 68706.552, 9082.043, 26134.02, 
        9651.478, 84.084, 0, 185046, 0, 49275, 1632, 17318, 22887, 
        5708, 0, 138957.387, 0, 2353.336, 402.36, 12487.345, 298.432, 
        0, 0, 14869.986, 65801.155, 60977.602, 0, 16638.345, 0, 0, 
        0, 180837.905, 59097.753, 7678.659, 2515.998, 56903.961, 
        12857.349, 0, 0, 3457.28, 0, 139512.026, 3490.918, 3875.881, 
        0, 0, 0, 195153.546, 0, 59865.351, 0, 19825.697, 7459.485, 
        6275.373, 703.126, 60830.7, 84397.9, 11920.2, 0, 1185.9, 
        737.8, 0, 0)), row.names = c(NA, -80L), class = c("tbl_df", 
    "tbl", "data.frame"))
  

Я хочу, чтобы это было так

 DE Germany Renewable Energy      amount1  amount2 amount3
DE Germany Non-Renewable Energy  amount4  amount5 amount6
  

Ответ №1:

Одним из вариантов может быть:

 library(tidyverse)
top10ProducersMod %>% 
  mutate(type2 = if_else(type %in% c('Hydro', 'Geothermal', 'Wind', 'Solar', 'Pumped hydro power'), 'Renewable_Energy', 'Non-Renewable_Energy')) %>%
  pivot_longer(-c(country:type, type2), names_to = 'energy_year', values_to = 'prod') %>%
  group_by(country, country_name, type2, energy_year) %>%
  summarise(prod = sum(prod)) %>%
  pivot_wider(values_from = prod, names_from = energy_year)

# country country_name type2      energyProd_2016 energyProd_2017 energyProd_2018
# <chr>   <chr>        <chr>                <dbl>           <dbl>           <dbl>
# 1 DE      Germany      Non-Renew~         471984          449906          393235.
# 2 DE      Germany      Renewable~         147622          175063          185226.
# 3 ES      Spain        Non-Renew~         164401          182503          163355.
# 4 ES      Spain        Renewable~         103353           84664           99726.
# 5 FR      France       Non-Renew~         445175.         448691.         447110.
# 6 FR      France       Renewable~          99885.          93907.         113658.
  

Ответ №2:

Используя data.table, создайте столбец группировки для типа энергии, затем сгруппируйте по сумме нескольких столбцов:

 library(data.table)

setDT(top10ProducersMod)

# make a grouping column
top10ProducersMod[, Energy := ifelse(type %in% c("Hydro", "Geothermal", "Wind", "Solar", "Pumped hydro power"), 
                                     "Renewable Energy", "Non-Renewable Energy") ]

# then group by columns, and sum on selected columns
top10ProducersMod[, lapply(.SD, sum), by = .(country, country_name, Energy), .SDcols = 4:6 ]
#     country   country_name               Energy energyProd_2016 energyProd_2017 energyProd_2018
# 1:      DE        Germany Non-Renewable Energy       471984.00       449906.00       393234.59
# 2:      DE        Germany     Renewable Energy       147622.00       175063.00       185226.21
# 3:      ES          Spain Non-Renewable Energy       164401.00       182503.00       163355.04
# 4:      ES          Spain     Renewable Energy       103353.00        84664.00        99725.57
...
  

Ответ №3:

Также dplyr , но без сводки…

 library(dplyr)

df %>% 
   mutate(renewable = case_when(
      type == "Hydro" ~ "Renewable Energy",
      type == "Geothermal" ~ "Renewable Energy",
      type == "Wind" ~ "Renewable Energy",
      type == "Solar" ~ "Renewable Energy",
      type == "Pumped hydro power" ~ "Renewable Energy",
      TRUE ~ "Non-Renewable Energy")
      ) %>%
   group_by(country, country_name, renewable) %>%
   summarise(across(starts_with("energyProd_"), sum, .names = "{.col}"))

#> `summarise()` regrouping output by 'country', 'country_name' (override with `.groups` argument)
#> # A tibble: 20 x 6
#> # Groups:   country, country_name [10]
#>    country country_name renewable energyProd_2016 energyProd_2017
#>    <chr>   <chr>        <chr>               <dbl>           <dbl>
#>  1 DE      Germany      Non-Rene…         471984          449906 
#>  2 DE      Germany      Renewabl…         147622          175063 
#>  3 ES      Spain        Non-Rene…         164401          182503 
#>  4 ES      Spain        Renewabl…         103353           84664 
#>  5 FR      France       Non-Rene…         445175.         448691.
#>  6 FR      France       Renewabl…          99885.          93907.
#>  7 IT      Italy        Non-Rene…         190771          200305.
#>  8 IT      Italy        Renewabl…          90757.          86786.
#>  9 NO      Norway       Non-Rene…           3509            3514 
#> 10 NO      Norway       Renewabl…         146557          146285 
#> 11 PL      Poland       Non-Rene…         137009          137130.
#> 12 PL      Poland       Renewabl…          15468           18188.
#> 13 SE      Sweden       Non-Rene…          75145           78011 
#> 14 SE      Sweden       Renewabl…          77505           82540 
#> 15 TR      Turkey       Non-Rene…         174847          200721.
#> 16 TR      Turkey       Renewabl…          87090           83536.
#> 17 UA      Ukraine      Non-Rene…         141425          134382.
#> 18 UA      Ukraine      Renewabl…          12097           12083.
#> 19 UK      United King… Non-Rene…         268314.         253183.
#> 20 UK      United King… Renewabl…          58909.          73114.
#> # … with 1 more variable: energyProd_2018 <dbl>
  

Комментарии:

1. if_else(... %in% ...) здесь было бы намного меньше ввода.

2. Верно, просто компромисс между вводом текста и восприятием ясности.