Широкий и длинный фрейм данных со множеством переменных внутри и между объектами

#r #dataframe

Вопрос:

Я пытаюсь создать набор данных с длинным типом структуры, с 2 переменными между объектами и 2 переменными внутри объектов из таблицы Excel. Текущая структура набора данных выглядит следующим образом:

 gt; str(Subset_0)  'data.frame': 54 obs. of 11 variables:  $ Subject : num 1 2 3 4 5 6 7 8 9 10 ...  $ BETWEEN1: num 1 1 1 2 2 2 2 1 1 2 ...  $ BETWEEN2: num 1 1 2 2 2 2 1 1 1 1 ...  $ A_x1 : num 5 1 3 1 0 6 1 2 7 1 ...  $ B_x2 : num 5 1 3 0 3 0 0 2 6 1 ...  $ C_y1 : num 6 9 9 2 2 4 2 2 6 0 ...  $ D_y2 : num 6 15 4 1 2 4 3 1 3 0 ...  $ K_x1 : num 5 1 3 1 0 6 1 2 7 1 ...  $ L_x2 : num 5 1 3 0 3 0 0 2 6 1 ...  $ M_y1 : num 6 9 9 2 2 4 2 2 6 14 ...  $ N_y2 : num 3 1 0 4 0 5 6 5 17 21 ...  

файл данных из dput :

 structure(list(Subject = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,  28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,  44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55), BETWEEN1 = c(1,  1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1,  2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), BETWEEN2 = c(1,  1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,  1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), A_x1 = c(5,  1, 3, 1, 0, 6, 1, 2, 7, 1, 1, 0, 0, 2, 0, 8, NA, NA, NA, NA,  14, 23, 19, 10, 9, 10, 11, 14, 16, 8, 24, 17, 8, 22, 14, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA), B_x2 = c(5, 1, 3, 0, 3, 0, 0, 2, 6, 1, 0, 0, 0, 0, 1,  7, 14, 23, 19, 10, 14, 29, 15, 7, 13, 16, 7, 9, 17, 6, 7, 16,  6, 11, 13, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA), C_y1 = c(6, 9, 9, 2, 2, 4, 2, 2, 6,  0, 6, 0, 1, 10, 3, 8, 14, 29, 15, 7, 17, 21, 24, 7, 32, 31, 31,  21, 27, 29, 18, 27, 33, 23, 28, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), D_y2 = c(6, 15,  4, 1, 2, 4, 3, 1, 3, 0, 0, 0, 2, 2, 2, 5, 17, 21, 24, 7, 24,  16, 28, 7, 28, 23, 25, 25, 24, 28, 33, 27, 31, 33, 21, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  NA), K_x1 = c(5, 1, 3, 1, 0, 6, 1, 2, 7, 1, 1, 0, 0, 2, 0, 8,  24, 16, 28, 7, 24, 31, 31, 13, 32, 35, 32, 22, 29, 32, 32, 29,  34, 32, 34, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA), L_x2 = c(5, 1, 3, 0, 3, 0, 0, 2, 6,  1, 0, 0, 0, 0, 1, 7, 24, 31, 31, 13, 30, 30, 34, 12, 31, 27,  23, 25, 33, 28, 31, 29, 30, 36, 24, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), M_y1 = c(6,  9, 9, 2, 2, 4, 2, 2, 6, 14, 23, 19, 10, 9, 10, 11, 14, 16, 8,  24, 17, 8, 22, 14, 33, 28, 31, 14, 23, 19, 10, 9, 10, 11, 14,  16, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA), N_y2 = c(3, 1, 0, 4, 0, 5, 6, 5, 17, 21, 24, 7,  32, 31, 31, 21, NA, NA, NA, NA, 27, 29, 18, 27, NA, NA, 17, 21,  24, 7, 32, 31, 31, 21, 27, 17, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,  -54L))  

Мне нужно разделить его по теме и условиям: по одному на столбец со значениями A, B, C и D в одном столбце и назвать его «Первым»; и K, L, M, N в другом и назвать его «Вторым». Более того, x, y, _1 и _2, присутствующие в этих переменных, представляют факторы внутри субъекта, которые мне также необходимо учитывать в других двух столбцах- «В пределах 1» для x и y; и «В пределах 2» для 1 и 2. И, наконец, с двумя столбцами «Между 1» и «Между 2», которые являются факторами между субъектами.

Мне нужно, чтобы это выглядело так:

 Subject First SecondI Within2 Within2 Between1 Between2 1 Ai Ki 1 x 1 1  1 Bi Li 2 x 1 1  1 Ci Mi 1 y 1 1  1 Di Ni 2 y 1 1  2 Ai Ki 1 x 1 1  2 Bi Li 2 x 1 1  2 Ci Mi 1 y 1 1  2 Di Ni 2 y 1 1  ...  

Я использовал эту reshape функцию дважды, один раз для группировки в один столбец A, B,C,D и отделения от нее переменных внутри темы,и мне это удалось:

   Subset_1 lt;-reshape(Subset_0,   varying = c("A_x1", " B_x2", "C_y1", "D_y2"),  v.names = "First",  timevar = "Within1",  times = c("A_x1", " B_x2", "C_y1", "D_y2"),  direction = "long")  # Next_Trial_Choice column Subset_1$Within1[Subset_1$Within1== "A_x1"] lt;- "x" Subset_1$Within1[Subset_1$Within1== "B_x2"] lt;- "x" Subset_1$Within1[Subset_1$Within1== "C_y1"] lt;- "y" Subset_1$Within1[Subset_1$Within1== "D_y2"] lt;- "y"  #cleaning the names - opponent column Subset_1$Within2[Subset_1$Within2== "A_x1"] lt;- "1" Subset_1$Within2[Subset_1$Within2== "B_x2"] lt;- "2" Subset_1$Within2[Subset_1$Within2== "C_y1"] lt;- "1" Subset_1$Within2[Subset_1$Within2== "D_y2"] lt;- "2"   

Проблема в том, что мне нужно сделать то же самое для другого столбца («Второй»), и я попытался снова использовать reshape, как делал раньше, на этот раз применительно к подмножеству 1. Но это не делает того, что мне нужно.

Есть ли способ сделать это?

Комментарии:

1. Добро пожаловать в Stack Overflow. Это один очень хорошо написанный первый вопрос!

Ответ №1:

Это выглядит так, как будто он получает ваш приведенный пример результата:

 # pipe library(magrittr)   # input data dxyz lt;- structure(list(Subject = c(  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,  28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,  44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55 ), BETWEEN1 = c(  1,  1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1,  2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA ), BETWEEN2 = c(  1,  1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,  1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA ), A_x1 = c(  5,  1, 3, 1, 0, 6, 1, 2, 7, 1, 1, 0, 0, 2, 0, 8, NA, NA, NA, NA,  14, 23, 19, 10, 9, 10, 11, 14, 16, 8, 24, 17, 8, 22, 14, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA ), B_x2 = c(  5, 1, 3, 0, 3, 0, 0, 2, 6, 1, 0, 0, 0, 0, 1,  7, 14, 23, 19, 10, 14, 29, 15, 7, 13, 16, 7, 9, 17, 6, 7, 16,  6, 11, 13, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA ), C_y1 = c(  6, 9, 9, 2, 2, 4, 2, 2, 6,  0, 6, 0, 1, 10, 3, 8, 14, 29, 15, 7, 17, 21, 24, 7, 32, 31, 31,  21, 27, 29, 18, 27, 33, 23, 28, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA ), D_y2 = c(  6, 15,  4, 1, 2, 4, 3, 1, 3, 0, 0, 0, 2, 2, 2, 5, 17, 21, 24, 7, 24,  16, 28, 7, 28, 23, 25, 25, 24, 28, 33, 27, 31, 33, 21, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  NA ), K_x1 = c(  5, 1, 3, 1, 0, 6, 1, 2, 7, 1, 1, 0, 0, 2, 0, 8,  24, 16, 28, 7, 24, 31, 31, 13, 32, 35, 32, 22, 29, 32, 32, 29,  34, 32, 34, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA ), L_x2 = c(  5, 1, 3, 0, 3, 0, 0, 2, 6,  1, 0, 0, 0, 0, 1, 7, 24, 31, 31, 13, 30, 30, 34, 12, 31, 27,  23, 25, 33, 28, 31, 29, 30, 36, 24, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA ), M_y1 = c(  6,  9, 9, 2, 2, 4, 2, 2, 6, 14, 23, 19, 10, 9, 10, 11, 14, 16, 8,  24, 17, 8, 22, 14, 33, 28, 31, 14, 23, 19, 10, 9, 10, 11, 14,  16, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA ), N_y2 = c(  3, 1, 0, 4, 0, 5, 6, 5, 17, 21, 24, 7,  32, 31, 31, 21, NA, NA, NA, NA, 27, 29, 18, 27, NA, NA, 17, 21,  24, 7, 32, 31, 31, 21, 27, 17, NA, NA, NA, NA, NA, NA, NA, NA,  NA, NA, NA, NA, NA, NA, NA, NA, NA, NA )), class = "data.frame", row.names = c(  NA,  -54L ))  # extract all abcd in long format with Within seperated abcd lt;- dxyz %gt;%   tidyr::pivot_longer(-c(Subject, BETWEEN1,BETWEEN2)) %gt;%   tidyr::separate(col = name, sep = "_", into = c("First", "Within")) %gt;%   dplyr::filter(First %in% c("A", "B", "C", "D")) %gt;%   dplyr::mutate(  Within21 = stringr::str_extract_all(Within, "[:digit:]") %gt;% unlist(),  Within22 = stringr::str_extract_all(Within, "[:alpha:]") %gt;% unlist()  ) %gt;%   dplyr::select(-Within)  # extract all klmn in long format with Within seperated klmn lt;- dxyz %gt;%   tidyr::pivot_longer(-c(Subject, BETWEEN1,BETWEEN2)) %gt;%   tidyr::separate(col = name, sep = "_", into = c("Second", "Within")) %gt;%   dplyr::filter(Second %in% c("K", "L", "M", "N"))%gt;%   dplyr::mutate(  Within21 = stringr::str_extract_all(Within, "[:digit:]") %gt;% unlist(),  Within22 = stringr::str_extract_all(Within, "[:alpha:]") %gt;% unlist()  ) %gt;%   dplyr::select(-Within)  # join both data sets together abcd %gt;%   dplyr::left_join(  klmn,  by = c("Subject", "BETWEEN1", "BETWEEN2", "Within21", "Within22")  ) %gt;%   dplyr::select(  Subject, First, Second, Within21, Within22, BETWEEN1, BETWEEN2, value.x, value.y  )      

Я разделил преобразование на две части для A, B, C, D и K, L, M, N, а затем объединил данные вместе.

Ответ №2:

вот один из вариантов с pivot_longer . Я знаю, что разделил немного слишком много, но это просто для того, чтобы устранить путаницу с именами. Вы можете настроить их в соответствии с вашими выходными данными.

 library(tidyr) df %gt;% pivot_longer(cols=c("A_x1", "B_x2", "C_y1", "D_y2"), names_to="first") %gt;%  pivot_longer(cols=c("K_x1", "L_x2", "M_y1", "N_y2"), names_to="second",values_to = "value2") %gt;%   separate(first, into = c("first", "Within1"), sep = "_") %gt;%   separate(Within1,into = c("Within1", "Within1_2"), sep = "(?lt;=[A-Za-z])(?=[0-9])") %gt;%   separate(second, into = c("second", "Within2"), sep = "_") %gt;%   separate(Within2,into = c("Within2", "Within2_2"), sep = "(?lt;=[A-Za-z])(?=[0-9])") %gt;%   select(-c(value, value2)) %gt;% distinct()