#r
#r
Вопрос:
У меня есть фрейм данных, в котором в настоящее время есть один столбец с 8 типами энергии. Я хочу объединить этот столбец только в два типа, подобных этому.
-
Типы энергии Гидро, геотермальная, ветровая, солнечная, перекачиваемая гидроэнергия относятся к типу «Возобновляемая энергия»
-
Остальные 3 типа — «Невозобновляемая энергия»
Создав этот новый набор категорий, я хочу сгруппировать по странам, а затем суммировать для каждого из столбцов «year» в dataframe, которые имеют такие имена, как energyProd_2016
Текущий вид набора данных приведен ниже
top10ProducersMod <-
structure(list(country = c("DE", "DE", "DE", "DE", "DE", "DE",
"DE", "DE", "ES", "ES", "ES", "ES", "ES", "ES", "ES", "ES", "FR",
"FR", "FR", "FR", "FR", "FR", "FR", "FR", "IT", "IT", "IT", "IT",
"IT", "IT", "IT", "IT", "PL", "PL", "PL", "PL", "PL", "PL", "PL",
"PL", "SE", "SE", "SE", "SE", "SE", "SE", "SE", "SE", "UK", "UK",
"UK", "UK", "UK", "UK", "UK", "UK", "NO", "NO", "NO", "NO", "NO",
"NO", "NO", "NO", "TR", "TR", "TR", "TR", "TR", "TR", "TR", "TR",
"UA", "UA", "UA", "UA", "UA", "UA", "UA", "UA"), country_name = c("Germany",
"Germany", "Germany", "Germany", "Germany", "Germany", "Germany",
"Germany", "Spain", "Spain", "Spain", "Spain", "Spain", "Spain",
"Spain", "Spain", "France", "France", "France", "France", "France",
"France", "France", "France", "Italy", "Italy", "Italy", "Italy",
"Italy", "Italy", "Italy", "Italy", "Poland", "Poland", "Poland",
"Poland", "Poland", "Poland", "Poland", "Poland", "Sweden", "Sweden",
"Sweden", "Sweden", "Sweden", "Sweden", "Sweden", "Sweden", "United Kingdom",
"United Kingdom", "United Kingdom", "United Kingdom", "United Kingdom",
"United Kingdom", "United Kingdom", "United Kingdom", "Norway",
"Norway", "Norway", "Norway", "Norway", "Norway", "Norway", "Norway",
"Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey", "Turkey",
"Turkey", "Ukraine", "Ukraine", "Ukraine", "Ukraine", "Ukraine",
"Ukraine", "Ukraine", "Ukraine"), type = c("Conventional thermal",
"Nuclear", "Hydro", "Pumped hydro power", "Wind", "Solar", "Geothermal",
"Other", "Conventional thermal", "Nuclear", "Hydro", "Pumped hydro power",
"Wind", "Solar", "Geothermal", "Other", "Conventional thermal",
"Nuclear", "Hydro", "Pumped hydro power", "Wind", "Solar", "Geothermal",
"Other", "Conventional thermal", "Nuclear", "Hydro", "Pumped hydro power",
"Wind", "Solar", "Geothermal", "Other", "Conventional thermal",
"Nuclear", "Hydro", "Pumped hydro power", "Wind", "Solar", "Geothermal",
"Other", "Conventional thermal", "Nuclear", "Hydro", "Pumped hydro power",
"Wind", "Solar", "Geothermal", "Other", "Conventional thermal",
"Nuclear", "Hydro", "Pumped hydro power", "Wind", "Solar", "Geothermal",
"Other", "Conventional thermal", "Nuclear", "Hydro", "Pumped hydro power",
"Wind", "Solar", "Geothermal", "Other", "Conventional thermal",
"Nuclear", "Hydro", "Pumped hydro power", "Wind", "Solar", "Geothermal",
"Other", "Conventional thermal", "Nuclear", "Hydro", "Pumped hydro power",
"Wind", "Solar", "Geothermal", "Other"), energyProd_2016 = c(390141,
80038, 25690, 5451, 78218, 38098, 165, 1805, 108210, 56100, 39180,
3410, 47712, 13051, 0, 91, 60607.854, 384008.125, 64879.956,
4786.979, 21472.916, 8657.27, 87.933, 559.515, 190121, 0, 43784.622,
1825.204, 17523, 21757, 5867, 650, 136945, 0, 2591, 474, 12279,
124, 0, 64, 14621, 60524, 61764, 119, 15479, 143, 0, 0, 203165.188,
65149.08, 8286.51, 2948.89, 37262.699, 10410.948, 0, 0, 3212,
0, 143442, 999, 2116, 0, 0, 297, 174166, 0, 66686, 0, 15381,
1013, 4010, 681, 65415, 75931, 9025, 1633, 949, 490, 0, 79),
energyProd_2017 = c(376128, 72155, 25888, 5910, 103707, 39401,
157, 1623, 126885, 55540, 20708, 2249, 47929, 13778, 0, 78,
68922.949, 379093.955, 54432.864, 5070.791, 24710.749, 9572.843,
119.937, 673.71, 199722.444, 0, 37556.721, 1825.96, 17565.332,
24016.821, 5821.46, 582.871, 137068.099, 0, 2999.86, 448.193,
14574.192, 165.463, 0, 61.73, 15003, 63008, 64676, 25, 17609,
230, 0, 0, 189296.528, 63886.86, 8723.43, 2862.01, 50003.654,
11524.87, 0, 0, 3233, 0, 142276, 1157, 2852, 0, 0, 281, 199910.043,
0, 57823.851, 0, 17793.167, 2817.837, 5101.487, 811.134,
54041.9, 80295.3, 8812, 1559, 974, 737.6, 0, 45), energyProd_2018 = c(320437.701,
72274.133, 20041.878, 6661.083, 112174, 46184, 165.25, 522.751,
110084.238, 53270.8, 36083.636, 2009.408, 49495.207, 12137.315,
0, 0, 53956.44, 393153.254, 68706.552, 9082.043, 26134.02,
9651.478, 84.084, 0, 185046, 0, 49275, 1632, 17318, 22887,
5708, 0, 138957.387, 0, 2353.336, 402.36, 12487.345, 298.432,
0, 0, 14869.986, 65801.155, 60977.602, 0, 16638.345, 0, 0,
0, 180837.905, 59097.753, 7678.659, 2515.998, 56903.961,
12857.349, 0, 0, 3457.28, 0, 139512.026, 3490.918, 3875.881,
0, 0, 0, 195153.546, 0, 59865.351, 0, 19825.697, 7459.485,
6275.373, 703.126, 60830.7, 84397.9, 11920.2, 0, 1185.9,
737.8, 0, 0)), row.names = c(NA, -80L), class = c("tbl_df",
"tbl", "data.frame"))
Я хочу, чтобы это было так
DE Germany Renewable Energy amount1 amount2 amount3
DE Germany Non-Renewable Energy amount4 amount5 amount6
Ответ №1:
Одним из вариантов может быть:
library(tidyverse)
top10ProducersMod %>%
mutate(type2 = if_else(type %in% c('Hydro', 'Geothermal', 'Wind', 'Solar', 'Pumped hydro power'), 'Renewable_Energy', 'Non-Renewable_Energy')) %>%
pivot_longer(-c(country:type, type2), names_to = 'energy_year', values_to = 'prod') %>%
group_by(country, country_name, type2, energy_year) %>%
summarise(prod = sum(prod)) %>%
pivot_wider(values_from = prod, names_from = energy_year)
# country country_name type2 energyProd_2016 energyProd_2017 energyProd_2018
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
# 1 DE Germany Non-Renew~ 471984 449906 393235.
# 2 DE Germany Renewable~ 147622 175063 185226.
# 3 ES Spain Non-Renew~ 164401 182503 163355.
# 4 ES Spain Renewable~ 103353 84664 99726.
# 5 FR France Non-Renew~ 445175. 448691. 447110.
# 6 FR France Renewable~ 99885. 93907. 113658.
Ответ №2:
Используя data.table, создайте столбец группировки для типа энергии, затем сгруппируйте по сумме нескольких столбцов:
library(data.table)
setDT(top10ProducersMod)
# make a grouping column
top10ProducersMod[, Energy := ifelse(type %in% c("Hydro", "Geothermal", "Wind", "Solar", "Pumped hydro power"),
"Renewable Energy", "Non-Renewable Energy") ]
# then group by columns, and sum on selected columns
top10ProducersMod[, lapply(.SD, sum), by = .(country, country_name, Energy), .SDcols = 4:6 ]
# country country_name Energy energyProd_2016 energyProd_2017 energyProd_2018
# 1: DE Germany Non-Renewable Energy 471984.00 449906.00 393234.59
# 2: DE Germany Renewable Energy 147622.00 175063.00 185226.21
# 3: ES Spain Non-Renewable Energy 164401.00 182503.00 163355.04
# 4: ES Spain Renewable Energy 103353.00 84664.00 99725.57
...
Ответ №3:
Также dplyr
, но без сводки…
library(dplyr)
df %>%
mutate(renewable = case_when(
type == "Hydro" ~ "Renewable Energy",
type == "Geothermal" ~ "Renewable Energy",
type == "Wind" ~ "Renewable Energy",
type == "Solar" ~ "Renewable Energy",
type == "Pumped hydro power" ~ "Renewable Energy",
TRUE ~ "Non-Renewable Energy")
) %>%
group_by(country, country_name, renewable) %>%
summarise(across(starts_with("energyProd_"), sum, .names = "{.col}"))
#> `summarise()` regrouping output by 'country', 'country_name' (override with `.groups` argument)
#> # A tibble: 20 x 6
#> # Groups: country, country_name [10]
#> country country_name renewable energyProd_2016 energyProd_2017
#> <chr> <chr> <chr> <dbl> <dbl>
#> 1 DE Germany Non-Rene… 471984 449906
#> 2 DE Germany Renewabl… 147622 175063
#> 3 ES Spain Non-Rene… 164401 182503
#> 4 ES Spain Renewabl… 103353 84664
#> 5 FR France Non-Rene… 445175. 448691.
#> 6 FR France Renewabl… 99885. 93907.
#> 7 IT Italy Non-Rene… 190771 200305.
#> 8 IT Italy Renewabl… 90757. 86786.
#> 9 NO Norway Non-Rene… 3509 3514
#> 10 NO Norway Renewabl… 146557 146285
#> 11 PL Poland Non-Rene… 137009 137130.
#> 12 PL Poland Renewabl… 15468 18188.
#> 13 SE Sweden Non-Rene… 75145 78011
#> 14 SE Sweden Renewabl… 77505 82540
#> 15 TR Turkey Non-Rene… 174847 200721.
#> 16 TR Turkey Renewabl… 87090 83536.
#> 17 UA Ukraine Non-Rene… 141425 134382.
#> 18 UA Ukraine Renewabl… 12097 12083.
#> 19 UK United King… Non-Rene… 268314. 253183.
#> 20 UK United King… Renewabl… 58909. 73114.
#> # … with 1 more variable: energyProd_2018 <dbl>
Комментарии:
1.
if_else(... %in% ...)
здесь было бы намного меньше ввода.2. Верно, просто компромисс между вводом текста и восприятием ясности.