#r #dataframe #compare
Вопрос:
У меня есть два похожих метаданных, содержащих около 1000 записей и более 500 столбцов . и я хочу проверить согласованность между двумя кадрами данных. теперь я хочу создать новый фрейм данных, для которого он будет отображать все имена столбцов df1 в первой строке и совпадать с именами столбцов df2 во второй строке, а также их опции в столбце 3 и 4 соответственно. а затем измените новые столбцы, чтобы показать TRUE
FALSE
, совпадают ли имена столбцов и их параметры.
в принципе, я должен проверить, совпадают ли имена столбцов в df1 с df 2 и совпадают ли параметры во всех столбцах df1 с df2
df1 <- data.frame(ID =c("DEV2962","KTN2252","ANA2719","ITI2624","DEV2698","HRT2921",NA,"KTN2624","ANA2548","ITI2535","DEV2732","HRT2837","ERV2951","KTN2542","ANA2813","ITI2210"),
city=c("del","mum","nav","pun","bang","chen","triv","vish","del","mum","bang","vish","bhop","kol","noi","gurg"),
Name= c("dev,akash","singh,rahul","abbas,salman","lal,ram","singh,nkunj","garg,prabal","ali,sanu","singh,kunal","tomar,lakhan","thakur,praveen","ali,sarman","khan,zuber","singh,giriraj","sharma,lokesh","pawar,pooja","sharma,nikita"),
gender =c("m","f","m","f","m","m","m","m","m","m","m","f","m","f","m","m"))
df2 <- data.frame(ID =c("DEV2962","KTN2152","ANA2719","ITs2624","DEV2698","HRT2921",NA,"KTN2624","ANA2548","ITI2535","DEV2732","HRT2837","ERV2951","KTN2542","ANA2813","ITI2210"),
city=c("del","MUm"," nav","pun","bang","chen"," ddgy ","vish","del","mum","bang","vish","bhol","nhus","huay","gurg"),
Name= c("dev","singh,rahul","abbas,salman","lal,ram","singh,nkunj","huna,ghalak","khan,fhalt","singh,kunal","tomar,lakhan","thakur,praveen","ali,sarman","khan,zuber","singh,giriraj","sharma,lokesh","pawar,pooja","sharma,nikita"),
gender =c("m","f","m","f","m","m","m","male","m","male","m","f","m","f","m","m"))
df <- select(df1,matches("^[A-Z]"))
comp <- do.call(rbind, lapply(df[, 4:ncol(df)], function(option) as.data.frame(table(option))))
comp$variable <- gsub("[.](.*)","", rownames(comp))
rownames(comp) <- NULL
comp <- comp[, c(3,1,2)]
comp <- comp[order(-comp$Freq), ]
Я не могу прикрепить файл, но я показал рисунок ниже, как должен выглядеть требуемый вывод.
Ответ №1:
- Приведите оба кадра данных в длинном формате с
pivot_longer
cbind
их, чтобыdf_result
mutate
сifelse
инструкцией, если столбцы совпадают
library(dplyr)
library(tidyr)
df1_long <- df1 %>%
pivot_longer(
cols = everything(),
names_to = "Names_df1",
values_to = "options_df1"
) %>%
arrange(Names_df1)
df2_long <- df2 %>%
pivot_longer(
cols = everything(),
names_to = "Names_df2",
values_to = "options_df2"
) %>%
arrange(Names_df2)
df_result <- cbind(df1_long, df2_long) %>%
mutate(Names_matching = ifelse(Names_df1==Names_df2, TRUE, FALSE),
Options_matching = ifelse(options_df1==options_df2, TRUE, FALSE)) %>%
arrange(factor(Names_df1, levels = c("ID", "city", "Name", "gender")))
выход:
Names_df1 options_df1 Names_df2 options_df2 Names_matching Options_matching
1 ID DEV2962 ID DEV2962 TRUE TRUE
2 ID KTN2252 ID KTN2152 TRUE FALSE
3 ID ANA2719 ID ANA2719 TRUE TRUE
4 ID ITI2624 ID ITs2624 TRUE FALSE
5 ID DEV2698 ID DEV2698 TRUE TRUE
6 ID HRT2921 ID HRT2921 TRUE TRUE
7 ID <NA> ID <NA> TRUE NA
8 ID KTN2624 ID KTN2624 TRUE TRUE
9 ID ANA2548 ID ANA2548 TRUE TRUE
10 ID ITI2535 ID ITI2535 TRUE TRUE
11 ID DEV2732 ID DEV2732 TRUE TRUE
12 ID HRT2837 ID HRT2837 TRUE TRUE
13 ID ERV2951 ID ERV2951 TRUE TRUE
14 ID KTN2542 ID KTN2542 TRUE TRUE
15 ID ANA2813 ID ANA2813 TRUE TRUE
16 ID ITI2210 ID ITI2210 TRUE TRUE
17 city del city del TRUE TRUE
18 city mum city MUm TRUE FALSE
19 city nav city nav TRUE FALSE
20 city pun city pun TRUE TRUE
21 city bang city bang TRUE TRUE
22 city chen city chen TRUE TRUE
23 city triv city ddgy TRUE FALSE
24 city vish city vish TRUE TRUE
25 city del city del TRUE TRUE
26 city mum city mum TRUE TRUE
27 city bang city bang TRUE TRUE
28 city vish city vish TRUE TRUE
29 city bhop city bhol TRUE FALSE
30 city kol city nhus TRUE FALSE
31 city noi city huay TRUE FALSE
32 city gurg city gurg TRUE TRUE
33 Name dev,akash Name dev TRUE FALSE
34 Name singh,rahul Name singh,rahul TRUE TRUE
35 Name abbas,salman Name abbas,salman TRUE TRUE
36 Name lal,ram Name lal,ram TRUE TRUE
37 Name singh,nkunj Name singh,nkunj TRUE TRUE
38 Name garg,prabal Name huna,ghalak TRUE FALSE
39 Name ali,sanu Name khan,fhalt TRUE FALSE
40 Name singh,kunal Name singh,kunal TRUE TRUE
41 Name tomar,lakhan Name tomar,lakhan TRUE TRUE
42 Name thakur,praveen Name thakur,praveen TRUE TRUE
43 Name ali,sarman Name ali,sarman TRUE TRUE
44 Name khan,zuber Name khan,zuber TRUE TRUE
45 Name singh,giriraj Name singh,giriraj TRUE TRUE
46 Name sharma,lokesh Name sharma,lokesh TRUE TRUE
47 Name pawar,pooja Name pawar,pooja TRUE TRUE
48 Name sharma,nikita Name sharma,nikita TRUE TRUE
49 gender m gender m TRUE TRUE
50 gender f gender f TRUE TRUE
51 gender m gender m TRUE TRUE
52 gender f gender f TRUE TRUE
53 gender m gender m TRUE TRUE
54 gender m gender m TRUE TRUE
55 gender m gender m TRUE TRUE
56 gender m gender male TRUE FALSE
57 gender m gender m TRUE TRUE
58 gender m gender male TRUE FALSE
59 gender m gender m TRUE TRUE
60 gender f gender f TRUE TRUE
61 gender m gender m TRUE TRUE
62 gender f gender f TRUE TRUE
63 gender m gender m TRUE TRUE
64 gender m gender m TRUE TRUE
Комментарии:
1. при применении к моим исходным данным я получаю эту ошибку Ошибка в data.frame(…, check.names = FALSE) : аргументы подразумевают различное количество строк: 62376, 73283
2. Попробуйте
merge(df1_long, df2_long, by.x=c('Names_df1'), by.y=c('Names_df2'), all = TRUE)
вместоcbind
3. в исходных данных содержится более 200 различных переменных. так что это займет очень много времени
4. > df_result >
Ответ №2:
Вариант tidyverse, аналогичный варианту Tarjae.
library(dplyr)
library(tidyr)
one <- pivot_longer(df1, everything()) %>%
rename(names_df1 = name, options_df1 = value)
two <- pivot_longer(df2, everything()) %>%
rename(names_df2 = name, options_df2 = value)
one %>%
bind_cols(two) %>%
mutate(names_matching = names_df1 == names_df2,
options_matching = options_df1 == options_df2) %>%
arrange(names_df1, names_df2)
# # A tibble: 64 x 6
# names_df1 options_df1 names_df2 options_df2 names_matching options_matching
# <chr> <chr> <chr> <chr> <lgl> <lgl>
# 1 city del city "del" TRUE TRUE
# 2 city mum city "MUm" TRUE FALSE
# 3 city nav city " nav" TRUE FALSE
# 4 city pun city "pun" TRUE TRUE
# 5 city bang city "bang" TRUE TRUE
# 6 city chen city "chen" TRUE TRUE
# 7 city triv city " ddgy " TRUE FALSE
# 8 city vish city "vish" TRUE TRUE
# 9 city del city "del" TRUE TRUE