#r
#r
Вопрос:
У меня есть данные следующего вида:
# Example input, written as a structure() literal: a 6-row data.frame with
# seven columns. Each UniID value holds one or more identifiers joined by ":".
df <- structure(list(Division = structure(c(1L, 1L, 1L, 2L, 2L, 3L), .Label = c("Main data",
"Second data", "Third data"), class = "factor"), Gene = structure(1:6, .Label = c("ABI3BP",
"ADIPOQ", "AEBP1", "AGRN", "AMBN", "AMELX"), class = "factor"),
IDs = c(17265L, 13633L, 303L, 329L, 452L, 461L), IDs.Links = c(17265L,
13633L, 303L, 329L, 452L, 461L), UniID = structure(c(1L,
4L, 2L, 3L, 6L, 5L), .Label = c("B4DSV9:D3YTG3:E9PPR9:E9PRB5:H0Y897",
"C9JLQ8:H7C0W8:H7C1J5", "H0Y5U1:O00468", "Q15848", "Q99217",
"Q9NP70"), class = "factor"), Refseq_IDs = structure(c(4L,
3L, 1L, 6L, 5L, 2L), .Label = c("NP_001120.3", "NP_001133.1:NP_872621.1:NP_872622.1",
"NP_001171271.1:NP_004788.1", "NP_056244.2:XP_005247340.1",
"NP_057603.1", "NP_940978.2:XP_005244806.1:XP_006710696.1"
), class = "factor"), Orthology = structure(1:6, .Label = c("Mouse:Abi3bp|",
"Mouse:Adipoq|", "Mouse:Aebp1|", "Mouse:Agrn|", "Mouse:Ambn|",
"Mouse:Amelx|"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L))
В столбце UniID многие значения содержат несколько идентификаторов, разделённых символом «:». Я хочу поместить каждый идентификатор в отдельную строку, повторив при этом значения остальных столбцов.
Желаемый результат выглядит следующим образом:
# Desired result, written as a structure() literal: a 13-row data.frame in
# which every UniID level is a single identifier, with Division, IDs,
# IDs.Links and Orthology repeated for each identifier.
# NOTE(review): Gene and Refseq_IDs contain levels that do not occur in df
# ("AEBP2", "AEBP3", and a Refseq_IDs level ending in "XP_006710696.2");
# these look unintended -- confirm with the author.
df2 <-structure(list(Division = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 3L), .Label = c("Main data", "Second data",
"Third data"), class = "factor"), Gene = structure(c(1L, 1L,
1L, 1L, 1L, 2L, 3L, 4L, 5L, 6L, 6L, 7L, 8L), .Label = c("ABI3BP",
"ADIPOQ", "AEBP1", "AEBP2", "AEBP3", "AGRN", "AMBN", "AMELX"), class = "factor"),
IDs = c(17265L, 17265L, 17265L, 17265L, 17265L, 13633L, 303L,
303L, 303L, 329L, 329L, 452L, 461L), IDs.Links = c(17265L,
17265L, 17265L, 17265L, 17265L, 13633L, 303L, 303L, 303L,
329L, 329L, 452L, 461L), UniID = structure(c(1L, 3L, 4L,
5L, 7L, 11L, 2L, 8L, 9L, 6L, 10L, 13L, 12L), .Label = c("B4DSV9",
"C9JLQ8", "D3YTG3", "E9PPR9", "E9PRB5", "H0Y5U1", "H0Y897",
"H7C0W8", "H7C1J5", "O00468", "Q15848", "Q99217", "Q9NP70"
), class = "factor"), Refseq_IDs = structure(c(4L, 4L, 4L,
4L, 4L, 3L, 1L, 1L, 1L, 6L, 7L, 5L, 2L), .Label = c("NP_001120.3",
"NP_001133.1:NP_872621.1:NP_872622.1", "NP_001171271.1:NP_004788.1",
"NP_056244.2:XP_005247340.1", "NP_057603.1", "NP_940978.2:XP_005244806.1:XP_006710696.1",
"NP_940978.2:XP_005244806.1:XP_006710696.2"), class = "factor"),
Orthology = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L,
4L, 4L, 5L, 6L), .Label = c("Mouse:Abi3bp|", "Mouse:Adipoq|",
"Mouse:Aebp1|", "Mouse:Agrn|", "Mouse:Ambn|", "Mouse:Amelx|"
), class = "factor")), class = "data.frame", row.names = c(NA,
-13L))
Я нашёл похожий вопрос и попробовал предложенные там решения, но безуспешно.
# Split every UniID on ":" and pair each piece with its row's IDs value,
# repeating IDs once per piece. Only these two columns are carried over.
s <- strsplit(as.character(df$UniID), ":")
mydf <- data.frame(
  director = unlist(s),
  IDs = rep(df$IDs, lengths(s))
)
Этот вариант выдаёт только два столбца: разделённые значения UniID и IDs.
# Attempt using data.table-style `[` arguments (.SD, by =). df is a plain
# data.frame here, so `[.data.frame` rejects `by` (see the error below).
mydf<- df[, lapply(.SD, function(x) unlist(tstrsplit(x, ":", fixed=TRUE))), by = IDs][!is.na(UniID)]
Error in `[.data.frame`(df, , lapply(.SD, function(x) unlist(tstrsplit(x, :
unused argument (by = IDs)
Этот вариант тоже не работает:
# Second attempt with data.table-style `[` arguments (by = .(...)). It fails
# the same way because df is a data.frame (see the error below).
mydf<- df[, strsplit(as.character(UniID), ":", fixed=TRUE),
by = .(IDs, UniID)][,.(UniID = V1, IDs)]
Error in `[.data.frame`(df, , strsplit(as.character(UniID), ":", fixed = TRUE), :
unused argument (by = .(IDs, UniID))
Ответ №1:
Один из вариантов с dplyr и tidyr:
# Split UniID on ":" into a list-column, then expand that list-column so that
# each identifier gets its own row; the other columns are repeated per row.
# `cols` is named explicitly because tidyr >= 1.0.0 warns when unnest() is
# called without it.
# NOTE(review): the position of UniID in the printed result may depend on the
# installed tidyr version -- confirm against the output shown below.
library(dplyr)
library(tidyr)

df %>%
  mutate(UniID = strsplit(as.character(UniID), ":", fixed = TRUE)) %>%
  unnest(cols = UniID)
Division Gene IDs IDs.Links Refseq_IDs Orthology UniID
1 Main data ABI3BP 17265 17265 NP_056244.2:XP_005247340.1 Mouse:Abi3bp| B4DSV9
2 Main data ABI3BP 17265 17265 NP_056244.2:XP_005247340.1 Mouse:Abi3bp| D3YTG3
3 Main data ABI3BP 17265 17265 NP_056244.2:XP_005247340.1 Mouse:Abi3bp| E9PPR9
4 Main data ABI3BP 17265 17265 NP_056244.2:XP_005247340.1 Mouse:Abi3bp| E9PRB5
5 Main data ABI3BP 17265 17265 NP_056244.2:XP_005247340.1 Mouse:Abi3bp| H0Y897
6 Main data ADIPOQ 13633 13633 NP_001171271.1:NP_004788.1 Mouse:Adipoq| Q15848
7 Main data AEBP1 303 303 NP_001120.3 Mouse:Aebp1| C9JLQ8
8 Main data AEBP1 303 303 NP_001120.3 Mouse:Aebp1| H7C0W8
9 Main data AEBP1 303 303 NP_001120.3 Mouse:Aebp1| H7C1J5
10 Second data AGRN 329 329 NP_940978.2:XP_005244806.1:XP_006710696.1 Mouse:Agrn| H0Y5U1
11 Second data AGRN 329 329 NP_940978.2:XP_005244806.1:XP_006710696.1 Mouse:Agrn| O00468
12 Second data AMBN 452 452 NP_057603.1 Mouse:Ambn| Q9NP70
13 Third data AMELX 461 461 NP_001133.1:NP_872621.1:NP_872622.1 Mouse:Amelx| Q99217
Здесь столбец «UniID» сначала разбивается по символу «:» в список-столбец, а затем функция unnest() разворачивает его в отдельные строки.
Ответ №2:
Простой вариант с пакетом splitstackshape:
> library(splitstackshape)
> cSplit(df, splitCols = "UniID", sep = ":", direction = "long")
Division Gene IDs IDs.Links UniID Refseq_IDs Orthology
1: Main data ABI3BP 17265 17265 B4DSV9 NP_056244.2:XP_005247340.1 Mouse:Abi3bp|
2: Main data ABI3BP 17265 17265 D3YTG3 NP_056244.2:XP_005247340.1 Mouse:Abi3bp|
3: Main data ABI3BP 17265 17265 E9PPR9 NP_056244.2:XP_005247340.1 Mouse:Abi3bp|
4: Main data ABI3BP 17265 17265 E9PRB5 NP_056244.2:XP_005247340.1 Mouse:Abi3bp|
5: Main data ABI3BP 17265 17265 H0Y897 NP_056244.2:XP_005247340.1 Mouse:Abi3bp|
6: Main data ADIPOQ 13633 13633 Q15848 NP_001171271.1:NP_004788.1 Mouse:Adipoq|
7: Main data AEBP1 303 303 C9JLQ8 NP_001120.3 Mouse:Aebp1|
8: Main data AEBP1 303 303 H7C0W8 NP_001120.3 Mouse:Aebp1|
9: Main data AEBP1 303 303 H7C1J5 NP_001120.3 Mouse:Aebp1|
10: Second data AGRN 329 329 H0Y5U1 NP_940978.2:XP_005244806.1:XP_006710696.1 Mouse:Agrn|
11: Second data AGRN 329 329 O00468 NP_940978.2:XP_005244806.1:XP_006710696.1 Mouse:Agrn|
12: Second data AMBN 452 452 Q9NP70 NP_057603.1 Mouse:Ambn|
13: Third data AMELX 461 461 Q99217 NP_001133.1:NP_872621.1:NP_872622.1 Mouse:Amelx|