Слияние широкого фрейма данных с длинным фреймом данных в R

#r #merge

#r #слияние

Вопрос:

Я пытаюсь добавить данные ВВП в имеющийся у меня фрейм данных. Я включил первые 5 строк фреймов данных для справки.

Я хотел бы добавить ВВП каждого государства по годам. В моем фрейме данных у меня есть столбцы «цикл» и «состояние», которые я хотел бы использовать для «сопоставления» соответствующего значения ВВП, но, похоже, я не могу понять, как это сделать, потому что информация о ВВП представлена в широком формате.

Любая помощь будет оценена!

 
candidates <- structure(list(Cycle = c("1990", "1990", "1990", "1990", "1990"
    ), CandidateName = c("Robert Palmer", "David Elliott Smith", 
    "Gary A Condit", "Cliff Burris", "Jerry M Reiss"), State = c("CA", 
    "CA", "CA", "CA", "CA"), VoteShare = c(0.336755826502931, 0.356542953604619, 
    0.661849966957576, 0.338150033042424, 0.220356681141645), stateGDP = c("NA", 
    "NA", "NA", "NA", "NA")), row.names = c(NA, 5L), class = "data.frame")

stateGDP <- structure(list(X1 = c(1998, 2000, 2002, 2004, 2006), USA = c(5.7, 
    6.5, 3.4, 6.6, 6), AK = c(5.1, 3.7, 4.4, 9.6, 5.1), AL = c(-6.1, 
    8.3, 4.2, 10, 11.7), AZ = c(8.3, 6, 4.6, 6.6, 8.8), AR = c(4.3, 
    2.9, 4.7, 8.2, 5.3), CA = c(7.2, 9.6, 3.8, 6.9, 6.9), CO = c(10, 
    10, 1.8, 3.8, 5.1), CT = c(4.9, 8.9, 1.8, 9.6, 6.8), DE = c(11.6, 
    7.5, -1.5, 7.7, 5.6), DC = c(4.6, 3.8, 6.8, 8.6, 3.8), FL = c(7.1, 
    6.8, 6.2, 8.9, 6.8), GA = c(9.2, 6.4, 2.6, 7, 4), HI = c(-0.1, 
    5.7, 6, 9.9, 6.3), ID = c(5.2, 11.3, 3.6, 8.4, 6.3), IL = c(4.9, 
    6.1, 2.4, 5.5, 6), IN = c(8.2, 5.2, 3.8, 6, 4.6), IA = c(2.4, 
    6.1, 4.4, 11, 3.8), KS = c(5.1, 4.9, 2.9, 3.2, 8.4), KY = c(4.8, 
    -0.8, 4.1, 5.7, 5.9), LA = c(3.4, 5.4, 1.2, 10, 4.2), ME = c(5, 
    7.2, 4.4, 6.4, 4.5), MD = c(6.7, 6.6, 6.3, 7.9, 5.1), MA = c(5, 
    9.5, 2.4, 4.9, 4.5), MI = c(4.3, 4.2, 3.9, 2, 0.3), MN = c(6.7, 
    8.8, 4.2, 7, 2.8), MS = c(4.5, 2.8, 2.7, 5.3, 6), MO = c(3.8, 
    5.6, 2.8, 5, 3.7), MT = c(5.9, 5.1, 4.5, 9, 7.9), NE = c(3.9, 
    5.2, 3.6, 5.8, 5.7), NV = c(7.8, 7.1, 5.4, 15, 8.2), NH = c(6.1, 
    8, 4.4, 5.3, 5.4), NJ = c(3.9, 7.5, 3.9, 4.4, 5.4), NM = c(-3.5, 
    4, 3.7, 10.9, 5.1), NY = c(3.7, 5.7, 1.1, 5.5, 5.6), NC = c(5.1, 
    4.4, 3.3, 6.1, 8.8), ND = c(6.8, 5.8, 6.8, 4.9, 7), OH = c(5.3, 
    4, 4.1, 4.9, 2.4), OK = c(2.7, 8, 2.5, 7.5, 10.1), OR = c(4.2, 
    9, 3.4, 7.1, 8), PA = c(4.7, 4.6, 2.8, 5.9, 4.1), RI = c(6.2, 
    6.9, 5.9, 7.4, 5.6), SC = c(6.4, 4.5, 4.1, 3.7, 5.6), SD = c(6.2, 
    7.8, 12, 8, 4), TN = c(8.4, 3.3, 5.1, 7, 5.3), TX = c(5.8, 8.4, 
    1.8, 9.2, 10), UT = c(8, 6.5, 3.8, 8.3, 12.7), VT = c(4.3, 6.7, 
    4.6, 7.3, 3.4), VA = c(7.3, 6.8, 3.3, 7.2, 5.2), WA = c(8.4, 
    3.2, 3.7, 4.6, 7.1), WV = c(3.2, 1.9, 3.3, 5.6, 6.1), WI = c(6.1, 
    4.8, 3.7, 5.9, 4.4), WY = c(1.4, 9.5, 2.4, 10.3, 19.2)), row.names = c(NA, 
    -5L), class = c("tbl_df", "tbl", "data.frame"))

  

Комментарии:

1. Измените формат с широкого на длинный, затем объедините как обычно

Ответ №1:

Измените формат с широкого на длинный, затем объедините, как обычно.

С вашими примерами данных совпадений нет, но приведенное ниже должно работать для ваших полных данных:

 merge(candidates,
      pivot_longer(stateGDP, cols = -1, names_to = "State", values_to = "GDP"),
      by.x = c("Cycle", "State"), by.y = c("X1", "State"))
  

Ответ №2:

Вот tidyverse версия, в которой я немного изменил ваши входные данные, чтобы убедиться, что совпадения были…

 library(dplyr)
library(tidyr)

candidates$stateGDP <- NULL

stateGDP_long <- 
   stateGDP %>% 
   pivot_longer(cols = c(AK:WY), 
                names_to = "State", 
                values_to = "stateGDP") %>% 
   mutate(Cycle = as.character(X1)) %>% 
   select(-USA, -X1)


left_join(candidates, stateGDP_long)

#> Joining, by = c("Cycle", "State")
#>   Cycle       CandidateName State VoteShare stateGDP
#> 1  1990       Robert Palmer    CA 0.3367558      7.2
#> 2  1990 David Elliott Smith    CA 0.3565430      7.2
#> 3  1990       Gary A Condit    CA 0.6618500      7.2
#> 4  1990        Cliff Burris    CA 0.3381500      7.2
#> 5  1990       Jerry M Reiss    CA 0.2203567      7.2
  

Ваши данные слегка изменены

 candidates <- structure(list(Cycle = c("1990", "1990", "1990", "1990", "1990"
), CandidateName = c("Robert Palmer", "David Elliott Smith", 
                     "Gary A Condit", "Cliff Burris", "Jerry M Reiss"), State = c("CA", 
                                                                                  "CA", "CA", "CA", "CA"), VoteShare = c(0.336755826502931, 0.356542953604619, 
                                                                                                                         0.661849966957576, 0.338150033042424, 0.220356681141645), stateGDP = c("NA", 
                                                                                                                                                                                                "NA", "NA", "NA", "NA")), row.names = c(NA, 5L), class = "data.frame")


stateGDP <- structure(list(X1 = c(1990, 1991, 1992, 1993, 1994), USA = c(5.7, 
                                                                         6.5, 3.4, 6.6, 6), AK = c(5.1, 3.7, 4.4, 9.6, 5.1), AL = c(-6.1, 
                                                                                                                                    8.3, 4.2, 10, 11.7), AZ = c(8.3, 6, 4.6, 6.6, 8.8), AR = c(4.3, 
                                                                                                                                                                                               2.9, 4.7, 8.2, 5.3), CA = c(7.2, 9.6, 3.8, 6.9, 6.9), CO = c(10, 
                                                                                                                                                                                                                                                            10, 1.8, 3.8, 5.1), CT = c(4.9, 8.9, 1.8, 9.6, 6.8), DE = c(11.6, 
                                                                                                                                                                                                                                                                                                                        7.5, -1.5, 7.7, 5.6), DC = c(4.6, 3.8, 6.8, 8.6, 3.8), FL = c(7.1, 
                                                                                                                                                                                                                                                                                                                                                                                      6.8, 6.2, 8.9, 6.8), GA = c(9.2, 6.4, 2.6, 7, 4), HI = c(-0.1, 
                                                                                                                                                                                                                                                                                                                                                                                                                                               5.7, 6, 9.9, 6.3), ID = c(5.2, 11.3, 3.6, 8.4, 6.3), IL = c(4.9, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           6.1, 2.4, 5.5, 6), IN = c(8.2, 5.2, 3.8, 6, 4.6), IA = c(2.4, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    6.1, 4.4, 11, 3.8), KS = c(5.1, 4.9, 2.9, 3.2, 8.4), KY = c(4.8, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                -0.8, 4.1, 5.7, 5.9), LA = c(3.4, 5.4, 1.2, 10, 4.2), ME = c(5, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             7.2, 4.4, 6.4, 4.5), MD = c(6.7, 6.6, 6.3, 7.9, 5.1), MA = c(5, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          9.5, 2.4, 4.9, 4.5), MI = c(4.3, 4.2, 3.9, 2, 0.3), MN = c(6.7, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     8.8, 4.2, 7, 2.8), MS = c(4.5, 2.8, 2.7, 5.3, 6), MO = c(3.8, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              5.6, 2.8, 5, 3.7), MT = c(5.9, 5.1, 4.5, 9, 7.9), NE = c(3.9, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       5.2, 3.6, 5.8, 5.7), NV = c(7.8, 7.1, 5.4, 15, 8.2), NH = c(6.1, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   8, 4.4, 5.3, 5.4), NJ = c(3.9, 7.5, 3.9, 4.4, 5.4), NM = c(-3.5, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              4, 3.7, 10.9, 5.1), NY = c(3.7, 5.7, 1.1, 5.5, 5.6), NC = c(5.1, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          4.4, 3.3, 6.1, 8.8), ND = c(6.8, 5.8, 6.8, 4.9, 7), OH = c(5.3, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     4, 4.1, 4.9, 2.4), OK = c(2.7, 8, 2.5, 7.5, 10.1), OR = c(4.2, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               9, 3.4, 7.1, 8), PA = c(4.7, 4.6, 2.8, 5.9, 4.1), RI = c(6.2, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        6.9, 5.9, 7.4, 5.6), SC = c(6.4, 4.5, 4.1, 3.7, 5.6), SD = c(6.2, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     7.8, 12, 8, 4), TN = c(8.4, 3.3, 5.1, 7, 5.3), TX = c(5.8, 8.4, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           1.8, 9.2, 10), UT = c(8, 6.5, 3.8, 8.3, 12.7), VT = c(4.3, 6.7, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 4.6, 7.3, 3.4), VA = c(7.3, 6.8, 3.3, 7.2, 5.2), WA = c(8.4, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         3.2, 3.7, 4.6, 7.1), WV = c(3.2, 1.9, 3.3, 5.6, 6.1), WI = c(6.1, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      4.8, 3.7, 5.9, 4.4), WY = c(1.4, 9.5, 2.4, 10.3, 19.2)), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             -5L), class = c("tbl_df", "tbl", "data.frame"))