#r
#r
Вопрос:
У меня есть некоторые данные, которые разделены n
.
# A tibble: 3 x 7
name...15 title...17 company...18 date_range...19 location...20 description...21 li_company_url
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 "jobPosition1" NA NA NA NA NA NA
2 "jobPosition1" NA NA NA NA NA NA
3 "jobPosition1njobPo~ "CEOnDirectornGer~ "European Outsorcing~ "jan 1993 ? nov 1999njan 2010~ "GermanynUSAnMissi~ "Director of systemsnAdmin o~ "nhttps://www.google.com/n~
Каждая строка представляет собой наблюдение, и внутри каждой строки, разделенной n
символом, находится новая строка информации для этого наблюдения. Я хочу расширить строки так, чтобы ожидаемый результат выглядел так:
jobposition1_title jobposition2_title jobposition3_title jobposition4_title jobposition5_title jobposition1_company jobposition2_company jobposition3_company
1 NA NA NA NA NA NA NA NA
2 NA NA NA NA NA NA NA NA
3 CEO Director Member of the directive Partner NA European Outsorcing Google Pepsi
jobposition4_company jobposition5_company jobposition1_date jobposition2_date jobposition3_date jobposition4_date jobposition5_date jobposition1_location
1 NA NA NA NA NA NA NA NA
2 NA NA NA NA NA NA NA NA
3 Yahoo Microsoft jan 1993 ? nov 1999 jan 2010 ? now jan 2010 ? nov 2016 nov 1999 ? oct 2009 sept 2012 ? dic 2014 Germany
Таким jobPosition
образом, ‘s — это имена столбцов вместе с title
, company
, dates
и т.д. (которые разделены n
.
Данные:
data <- structure(list(name...15 = c("jobPosition1", "jobPosition1",
"jobPosition1njobPosition2njobPosition3njobPosition4njobPosition5"
), title...17 = c(NA, NA, "CEOnDirectornGerente nMember of the directivenPartner"
), company...18 = c(NA, NA, "European OutsorcingnGooglenPepsinYahoonMicrosoft"
), date_range...19 = c(NA, NA, "jan 1993 ? nov 1999njan 2010 ? nownjan 2010 ? nov 2016nnov 1999 ? oct 2009nsept 2012 ? dic 2014"
), location...20 = c(NA, NA, "GermanynUSAnMissing DatanMissing DatanMissing Data"
), description...21 = c(NA, NA, "Director of systemsnAdmin of systemsnParnet in the area of accountancyntecnical director"
), li_company_url = c(NA, NA, "nhttps://www.google.com/nhttps://www.yahoo.com/nhttps://www.abcd.com/nhttps://www.xzy.com/"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))
Ожидаемые выходные данные:
data.frame(
# Job titles
jobposition1_title = c("NA", "NA", "CEO"),
jobposition2_title = c("NA", "NA", "Director"),
jobposition3_title = c("NA", "NA", "Member of the directive"),
jobposition4_title = c("NA", "NA", "Partner"),
jobposition5_title = c("NA", "NA", "NA"),
# Job companies
jobposition1_company = c("NA", "NA", "European Outsorcing"),
jobposition2_company = c("NA", "NA", "Google"),
jobposition3_company = c("NA", "NA", "Pepsi"),
jobposition4_company = c("NA", "NA", "Yahoo"),
jobposition5_company = c("NA", "NA", "Microsoft"),
# Date ranges
jobposition1_date = c("NA", "NA", "jan 1993 ? nov 1999"),
jobposition2_date = c("NA", "NA", "jan 2010 ? now"),
jobposition3_date = c("NA", "NA", "jan 2010 ? nov 2016"),
jobposition4_date = c("NA", "NA", "nov 1999 ? oct 2009"),
jobposition5_date = c("NA", "NA", "sept 2012 ? dic 2014"),
# Locations
jobposition1_location = c("NA", "NA", "Germany"),
jobposition2_location = c("NA", "NA", "USA"),
jobposition3_location = c("NA", "NA", "Missing Data"),
jobposition4_location = c("NA", "NA", "Missing Data"),
jobposition5_location = c("NA", "NA", "Missing Data"),
# Description
jobposition1_description = c("NA", "NA", "Director of systmes"),
jobposition2_description = c("NA", "NA", "Admin of systems"),
jobposition3_description = c("NA", "NA", "Parnet in the area of accountancy"),
jobposition4_description = c("NA", "NA", "tecnical director")
jobposition5_description = c("NA", "NA", "NA") # NOTE: Here in data$description...21 has 4 inputs separated by n but there are 5 job descriptions...
# Anything after the last n with empty space would be an NA.
# etc.
)
Ответ №1:
Используется str_split
для получения списка векторов разной длины.
library(tidyverse)
unnested <- map(data, ~unlist(str_split(., "n")))
unnested
#> $name...15
#> [1] "jobPosition1" "jobPosition1" "jobPosition1" "jobPosition2"
#> [5] "jobPosition3" "jobPosition4" "jobPosition5"
#>
#> $title...17
#> [1] NA NA
#> [3] "CEO" "Director"
#> [5] "Gerente " "Member of the directive"
#> [7] "Partner"
#>
#> $company...18
#> [1] NA NA "European Outsorcing"
#> [4] "Google" "Pepsi" "Yahoo"
#> [7] "Microsoft"
#>
#> $date_range...19
#> [1] NA NA "jan 1993 ? nov 1999"
#> [4] "jan 2010 ? now" "jan 2010 ? nov 2016" "nov 1999 ? oct 2009"
#> [7] "sept 2012 ? dic 2014"
#>
#> $location...20
#> [1] NA NA "Germany" "USA"
#> [5] "Missing Data" "Missing Data" "Missing Data"
#>
#> $description...21
#> [1] NA NA
#> [3] "Director of systems" "Admin of systems"
#> [5] "Parnet in the area of accountancy" "tecnical director"
#>
#> $li_company_url
#> [1] NA NA
#> [3] "" "https://www.google.com/"
#> [5] "https://www.yahoo.com/" "https://www.abcd.com/"
#> [7] "https://www.xzy.com/"
Чтобы объединить их в фрейм данных, они должны быть одинаковой длины. Итак, выясните, какой длины они должны быть.
max_len <- unnested %>% lengths() %>% max()
max_len
#> [1] 7
Затем установите длины и свяжите их вместе.
unnested %>%
map(`length<-`, max_len) %>%
bind_cols()
#> # A tibble: 7 x 7
#> name title company date_range location description li_company_url
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 jobPo… NA NA NA NA NA NA
#> 2 jobPo… NA NA NA NA NA NA
#> 3 jobPo… "CEO" European… jan 1993 … Germany Director of … ""
#> 4 jobPo… "Direc… Google jan 2010 … USA Admin of sys… "https://www.g…
#> 5 jobPo… "Geren… Pepsi jan 2010 … Missing… Parnet in th… "https://www.y…
#> 6 jobPo… "Membe… Yahoo nov 1999 … Missing… tecnical dir… "https://www.a…
#> 7 jobPo… "Partn… Microsoft sept 2012… Missing… NA "https://www.x…