#r #dataframe #data-cleaning #rowsum
Вопрос:
Мой фрейм данных содержит 2695 наблюдений 195 переменных, и его первый квадрант 100×10 выглядит следующим образом:
structure(list(name = c("Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Albania", "Albania",
"Albania", "Albania", "Albania", "Albania", "Albania", "Albania",
"Albania", "Albania", "Albania", "Albania", "Albania", "Albania",
"Albania", "Albania", "Albania", "Albania", "Albania", "Albania",
"Albania", "Algeria", "Algeria", "Algeria", "Algeria", "Algeria",
"Algeria", "Algeria", "Algeria", "Algeria", "Algeria", "Algeria",
"Algeria", "Algeria", "Algeria", "Algeria", "Algeria", "Algeria",
"Algeria", "Algeria", "Algeria", "Algeria", "Andorra", "Andorra",
"Andorra", "Andorra", "Andorra", "Andorra", "Andorra", "Andorra",
"Andorra", "Andorra", "Andorra", "Andorra", "Andorra", "Andorra",
"Andorra", "Andorra", "Andorra", "Andorra", "Andorra", "Andorra",
"Andorra", "Angola", "Angola", "Angola", "Angola", "Angola",
"Angola", "Angola", "Angola", "Angola", "Angola", "Angola", "Angola",
"Angola", "Angola", "Angola", "Angola"), code = c("AFG", "AFG",
"AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG",
"AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG",
"AFG", "ALB", "ALB", "ALB", "ALB", "ALB", "ALB", "ALB", "ALB",
"ALB", "ALB", "ALB", "ALB", "ALB", "ALB", "ALB", "ALB", "ALB",
"ALB", "ALB", "ALB", "ALB", "DZA", "DZA", "DZA", "DZA", "DZA",
"DZA", "DZA", "DZA", "DZA", "DZA", "DZA", "DZA", "DZA", "DZA",
"DZA", "DZA", "DZA", "DZA", "DZA", "DZA", "DZA", "AND", "AND",
"AND", "AND", "AND", "AND", "AND", "AND", "AND", "AND", "AND",
"AND", "AND", "AND", "AND", "AND", "AND", "AND", "AND", "AND",
"AND", "AGO", "AGO", "AGO", "AGO", "AGO", "AGO", "AGO", "AGO",
"AGO", "AGO", "AGO", "AGO", "AGO", "AGO", "AGO", "AGO"), cluster = c("Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries", "Industries",
"Industries", "Industries", "Industries", "Industries"), sector = c("Agriculture",
"Fishing", "Mining and Quarrying", "Textiles and Wearing Apparel",
"Wood and Paper", "Petroleum, Chemical and Non-Metallic Mineral Products",
"Metal Products", "Transport Equipment", "Other Manufacturing",
"Electricity, Gas and Water", "Construction", "Maintenance and Repair",
"Wholesale Trade", "Retail Trade", "Transport", "Post and Telecommunications",
"Public Administration", "Education, Health and Other Services",
"Private Households", "Others", "Re-export amp; Re-import", "Agriculture",
"Fishing", "Mining and Quarrying", "Textiles and Wearing Apparel",
"Wood and Paper", "Petroleum, Chemical and Non-Metallic Mineral Products",
"Metal Products", "Transport Equipment", "Other Manufacturing",
"Electricity, Gas and Water", "Construction", "Maintenance and Repair",
"Wholesale Trade", "Retail Trade", "Transport", "Post and Telecommunications",
"Public Administration", "Education, Health and Other Services",
"Private Households", "Others", "Re-export amp; Re-import", "Agriculture",
"Fishing", "Mining and Quarrying", "Textiles and Wearing Apparel",
"Wood and Paper", "Petroleum, Chemical and Non-Metallic Mineral Products",
"Metal Products", "Transport Equipment", "Other Manufacturing",
"Electricity, Gas and Water", "Construction", "Maintenance and Repair",
"Wholesale Trade", "Retail Trade", "Transport", "Post and Telecommunications",
"Public Administration", "Education, Health and Other Services",
"Private Households", "Others", "Re-export amp; Re-import", "Agriculture",
"Fishing", "Mining and Quarrying", "Textiles and Wearing Apparel",
"Wood and Paper", "Petroleum, Chemical and Non-Metallic Mineral Products",
"Metal Products", "Transport Equipment", "Other Manufacturing",
"Electricity, Gas and Water", "Construction", "Maintenance and Repair",
"Wholesale Trade", "Retail Trade", "Transport", "Post and Telecommunications",
"Public Administration", "Education, Health and Other Services",
"Private Households", "Others", "Re-export amp; Re-import", "Agriculture",
"Fishing", "Mining and Quarrying", "Textiles and Wearing Apparel",
"Wood and Paper", "Petroleum, Chemical and Non-Metallic Mineral Products",
"Metal Products", "Transport Equipment", "Other Manufacturing",
"Electricity, Gas and Water", "Construction", "Maintenance and Repair",
"Wholesale Trade", "Retail Trade", "Transport", "Post and Telecommunications"
), Total = c(313056.134909551, 1269.32391427775, 6702.10091092139,
20822.9824174335, 3277.35295343427, 7075.57751406419, 5809.54856404444,
6374.97667034069, 6573.16606009056, 1520.47502801121, 10658.977916637,
7414.61509425962, 12290.7967126299, 11398.2992559452, 40272.8802077045,
11195.0970347162, 9608.34006394526, 15892.9447789989, 7520.16190432492,
1309.97105807522, 5863.24413932713, 84798.6701470358, 5287.89543614681,
7019.68464118458, 207555.54211305, 42610.6785141417, 57455.2766420949,
50880.0662736183, 8562.70774978471, 35615.0812816442, 3209.61081796767,
13425.7891844031, 10172.0898252207, 25135.1384491464, 10171.5875982993,
24546.3331711539, 20219.2191239651, 12492.0819211004, 19829.1935665882,
10538.3674396562, 860.212193244676, 10441.8877791697, 109944.342344373,
10130.4848607536, 38399430.0758622, 54346.1819763583, 60588.7451272658,
10333339.8435353, 166914.291830525, 15106.291762901, 6992.51143191634,
2096.00364208704, 178873.707038741, 32430.7567673094, 828119.280778115,
172465.345148761, 2608478.34637721, 975944.341821359, 129890.525779898,
781459.183975379, 13967.9236719995, 1460.95459419176, 1239.25223987981,
2006.16751711888, 579.408605356926, 567.392719967148, 10646.2491986802,
15849.8055842179, 15533.2886586566, 9405.82927410387, 30004.8039316661,
5970.2054602464, 959.45168395392, 10588.6969072962, 7797.41788929008,
11103.6195368155, 11422.3263920103, 11030.0357536182, 11005.3874434708,
10256.8119809788, 11665.7731598811, 6780.08819348487, 939.441815889167,
10049.3382667714, 17194.9845438908, 2105.35872638104, 15962587.4508949,
2681.40283207019, 8290.02901841903, 199873.949286869, 14846.7790432671,
10800.3647727473, 7104.21565574377, 1878.9292703595, 370849.323130881,
22336.8687091073, 189194.987574215, 62353.0494427796, 2465448.35656496,
443041.875133795), Afghanistan = c(304848.2, 1244.815, 5992.698,
18010.06, 2975.027, 6331.944, 5160.831, 5384.333, 5747.278, 1392.815,
10046.36, 6304.916, 11137.87, 10923.54, 36934.41, 10822.21, 8730.08,
15105.91, 5966.117, 1117.582, 26.16316, 1.83796, 0.4187321, 0.7053242,
16.82224, 2.368265, 2.229667, 2.969031, 0.4339618, 3.330972,
0.1409332, 0.3907028, 2.256146, 0.4993595, 0.2239593, 0.8452328,
0.4606935, 0.2311659, 0.4483911, 3.229056, 0.1869243, 14.26183,
0.6368992, 0.1774615, 855.0543, 2.059841, 0.9784637, 326.125,
2.480306, 0.6295102, 0.2723101, 0.009939627, 2.58328, 0.7858758,
4.796921, 2.209655, 29.30604, 7.516025, 2.180713, 15.07286, 1.155397,
0.03922217, 0.6618161, 0.3569212, 0.5384865, 0.156222, 3.320798,
2.33509, 1.42815, 1.389516, 3.446843, 1.356888, 0.1334601, 0.510716,
2.723687, 0.3888807, 0.4487993, 0.8531269, 0.6789875, 0.4435601,
0.4159068, 3.614623, 0.3341556, 7.505752, 0.1033085, 0.08569755,
972.7443, 0.1132736, 0.1158019, 2.354941, 0.2283046, 0.4196883,
0.2569963, 0.01133495, 5.883291, 1.139181, 1.037388, 0.6386013,
38.7435, 3.419991), Albania = c(3.535969, 0.03840783, 0.3863751,
0.9954086, 0.08775854, 0.2030882, 1.391618, 0.5245008, 0.6190842,
0.05073775, 0.3891475, 0.6727891, 0.1539272, 0.1706042, 0.5426898,
0.1037151, 0.1823552, 0.2115925, 1.069002, 0.1062371, 3.949615,
79527.31, 5061.744, 5606.964, 140981.7, 27107.64, 33462.2, 31183.07,
4460.67, 21130.07, 2729.427, 10630.79, 8044.345, 22253.97, 9347.96,
18225.96, 18097.29, 10439.65, 17669.2, 7737.212, 658.327, 2512.097,
0.04379642, 0.007626944, 120.1914, 0.8615488, 0.4047151, 38.60637,
1.123844, 0.1297725, 0.09295459, 0.002695269, 0.5615575, 0.7208675,
1.350622, 0.3834361, 6.671941, 2.181208, 0.3419616, 1.298375,
0.6230328, 0.02060596, 0.7954482, 0.1158716, 0.1792077, 0.05296581,
0.9807616, 0.7176638, 0.4338842, 0.4336918, 1.129224, 0.4319445,
0.04283256, 0.1534184, 0.7950113, 0.1193977, 0.1371983, 0.2555027,
0.1975643, 0.1304942, 0.1132505, 1.009906, 0.1040789, 1.761093,
0.02053985, 0.01377083, 54.03197, 0.05894175, 0.06595316, 0.9770419,
0.1309219, 0.09237173, 0.1061737, 0.004385261, 1.031491, 0.6977874,
0.4370498, 0.1476074, 8.667269, 1.531756), Algeria = c(5.240079,
0.02104366, 0.5161704, 1.742304, 0.1884929, 0.638416, 0.3971941,
0.5894792, 0.5243543, 0.09268831, 0.3778295, 0.8218906, 0.3903957,
0.2782294, 1.946016, 0.2005679, 0.4673061, 0.46685, 1.315667,
0.1372425, 5.728609, 14.15439, 0.6593845, 4.430993, 153.2115,
35.62573, 92.96048, 52.28874, 8.492265, 32.08374, 1.774228, 7.498232,
4.302572, 5.717841, 1.817346, 18.18068, 4.45664, 5.554349, 5.279886,
5.870353, 0.4191698, 17.09926, 107132.8, 9887.484, 33714360,
37499.33, 44093.85, 7940498, 117407.7, 9193.506, 4680.662, 2018.159,
152113.8, 25992.47, 691562.8, 151294.7, 2163505, 848482.5, 102530.9,
690073.7, 11346.21, 1308.455, 98.49957, 1.596688, 0.8939674,
0.5979496, 12.63072, 17.78408, 26.27299, 10.60669, 36.0613, 7.210763,
0.8374489, 4.829583, 4.684588, 2.309686, 2.157866, 8.470586,
3.90022, 4.690984, 3.752759, 5.94401, 0.658466, 9.524497, 0.4095283,
0.09161661, 2198.32, 0.6536236, 1.393105, 58.27179, 3.16544,
3.330465, 1.636288, 0.06272295, 31.69208, 1.537006, 6.454557,
2.202606, 287.937, 26.93717), Andorra = c(2.046408, 0.03199265,
0.1913253, 0.6384646, 0.04210553, 0.05989684, 0.08395811, 0.08547009,
0.1640007, 0.02629837, 0.06544481, 0.5198038, 0.0704934, 0.07354977,
0.2318274, 0.05006094, 0.05805839, 0.07343344, 0.8471854, 0.08000277,
3.281725, 1.166039, 0.1937104, 0.5849786, 14.64168, 2.014269,
1.634169, 2.62503, 0.4598703, 3.004348, 0.1156719, 0.3324458,
2.258714, 0.4214368, 0.1890398, 0.6775366, 0.4166179, 0.1967594,
0.2213797, 3.279177, 0.1841389, 18.42281, 0.06572487, 0.0151666,
241.5491, 0.734705, 0.5635472, 68.69543, 3.025981, 0.328492,
0.1079413, 0.004644561, 1.220245, 0.776872, 4.995991, 0.5595545,
22.07305, 5.319428, 0.9709488, 2.607332, 0.4908999, 0.02019178,
0.5850328, 1497.731, 217.3481, 378.3719, 6415.105, 10549.02,
10058.02, 6282.365, 16703.89, 3597.702, 744.6967, 9176.902, 5960.622,
10163.73, 10653.15, 8799.109, 9645.094, 8923.656, 10560.27, 4362.44,
687.0441, 5951.223, 0.01764571, 0.01243527, 41.01675, 0.04743286,
0.05422601, 0.7896524, 0.1068679, 0.07563721, 0.08683252, 0.003704872,
0.8285007, 0.5872343, 0.3658586, 0.1194013, 7.110707, 1.272627
), Angola = c(7.126427, 0.02126276, 0.7013776, 2.584582, 0.2207993,
0.6455885, 0.5102651, 0.673658, 0.7199301, 0.1172209, 0.4537595,
1.497391, 0.4881724, 0.3670951, 2.803086, 0.2650796, 0.6734943,
0.5744068, 2.549689, 0.2411082, 13.97644, 3.896676, 0.5745994,
1.517034, 45.23805, 6.866533, 8.479498, 9.423369, 1.715862, 9.02622,
0.3268822, 1.231526, 4.494482, 1.333423, 0.5345557, 2.800145,
1.197053, 0.8376675, 1.132491, 6.272173, 0.3792077, 28.0768,
1.509137, 0.1199938, 2864.661, 11.52246, 6.664335, 1731.045,
22.18162, 2.835002, 1.308098, 0.05257571, 13.34905, 4.500684,
29.18065, 8.044516, 211.2769, 45.41144, 12.96821, 51.85862, 4.30742,
0.1704663, 3.274468, 0.9248837, 1.163733, 0.4030822, 7.497041,
6.648192, 7.311064, 4.187048, 11.8977, 3.649344, 0.4054804, 1.667877,
5.738257, 1.079308, 1.176356, 3.087699, 1.857479, 1.600233, 1.335246,
7.249851, 0.7536257, 12.16895, 16902.28, 2053.868, 14629320,
2176.549, 7247.814, 167657.8, 12729.29, 8228.971, 5893.646, 1834.933,
348980.5, 21047.25, 182549, 60673.71, 2279856, 421157.8)), class = "data.frame", row.names = c("V2",
"V3", "V4", "V6", "V7", "V8", "V9", "V11", "V12", "V14", "V15",
"V16", "V17", "V18", "V20", "V21", "V23", "V24", "V25", "V26",
"V27", "V28", "V29", "V30", "V32", "V33", "V34", "V35", "V37",
"V38", "V40", "V41", "V42", "V43", "V44", "V46", "V47", "V49",
"V50", "V51", "V52", "V53", "V54", "V55", "V56", "V58", "V59",
"V60", "V61", "V63", "V64", "V66", "V67", "V68", "V69", "V70",
"V72", "V73", "V75", "V76", "V77", "V78", "V79", "V80", "V81",
"V82", "V84", "V85", "V86", "V87", "V89", "V90", "V92", "V93",
"V94", "V95", "V96", "V98", "V99", "V101", "V102", "V103", "V104",
"V105", "V106", "V107", "V108", "V110", "V111", "V112", "V113",
"V115", "V116", "V118", "V119", "V120", "V121", "V122", "V124",
"V125"))
Как вы можете видеть, в одной стране несколько строк, но только один столбец. Для каждой строки «Общее» значение вычисляется путем суммирования всех значений из столбца 5 в столбец 194.
Моя цель состоит в том, чтобы рассчитать чистую сумму, вычитая соответствующий столбец страны из общей суммы. Например, первые 26 строк относятся к Афганистану. Я хочу вычесть из их «Общего числа» значение 5-й колонки (что соответствует той же стране). Вторые 26 присвоены Албании, поэтому вычитание должно происходить между «итогом» и 6-й колонкой. и т. Д. и т. Д.
Ожидаемый результат должен выглядеть следующим образом (пример является предварительным, так как исходные данные.фрейм огромен):
Имя | код | скопление | сектор | Весь | Чистый итог | Афганистан | Албания |
---|---|---|---|---|---|---|---|
Афганистан | AFG | ОТРАСЛИ | Сельское хозяйство | 100 | 92 | 8 | 1 |
Афганистан | AFG | ОТРАСЛИ | Рыбная ловля | 105 | 99 | 6 | 2 |
Афганистан | AFG | ОТРАСЛИ | Добыча полезных ископаемых | 98 | 96 | 2 | 3 |
Афганистан | AFG | ОТРАСЛИ | Текстиль | 101 | 80 | 21 | 4 |
Афганистан | AFG | ОТРАСЛИ | Дерево | 90 | 79 | 11 | 5 |
Афганистан | AFG | ОТРАСЛИ | Нефть | 101 | 100 | 1 | 1 |
Афганистан | AFG | ОТРАСЛИ | Металлические | 50 | 30 | 20 | 3 |
Важно то, что «Чистый итог» рассчитывается как «Итого» — «Афганистан», но только для строк «Афганистан». Фактически, «Чистый итог» для строк другой страны должен быть рассчитан как «Итого» — столбцы «другая страна».
Как я могу закодировать такого рода операции? Я бы предпочел код, который связывает вычитание с названием страны, а не с номером столбца, так как я заметил, что в столбце «название» 190 столбцов стран, но только 171 страна.
Большое вам спасибо за потраченное время.
Комментарии:
1. Обновленный. К сожалению, data.frame большой, поэтому я могу просто делать небольшие примеры с таблицами.
Ответ №1:
Вы можете попробовать с помощью приведенного ниже кода —
df$net_total <- df$Total - as.numeric(df[cbind(1:nrow(df), match(df$name, names(df)))])
Комментарии:
1. Я буквально только что попробовал тот же код, и он сработал. В любом случае, большое вам спасибо.