#performance #r-markdown
#Производительность #r-markdown
Вопрос:
Я создаю отчет в R, который оценивает и отображает уровень передачи COVID19, и нахожу, что в R Markdown он выполняется НАМНОГО медленнее, чем в консоли. Я использую RStudio для Mac версии v1.3.1093 и R версии 4.0.3.
На этой диаграмме, в которой используется методология Монте-Карло с цепью Маркова (MCMC) для оценки заражений и уровня распространения, сначала получены данные, показывающие новые случаи ежедневно в американском штате Оклахома. Это происходит очень быстро как в консоли, так и в R Markdown. Следующая часть — моделирование MCMC, в котором возникают задержки.
Я запустил его сегодня утром, и это заняло 2,5 часа в R Markdown, по сравнению с 30 минутами при запуске запроса как R Script в консоли.
Я не могу понять, почему это происходит намного медленнее в версии R Markdown. Это становится более важным, потому что, если я выполняю запрос на уровне округа (скажем, для 10 округов), он моделирует это в 10 раз и, следовательно, займет в 10 раз больше времени.
В качестве ярлыка я могу просто запустить 2 запроса, при этом выходные данные запроса моделирования добавляются в больший запрос R Markdown (например, путем сохранения / чтения в файле RDS). Тем не менее, я просто не могу понять, почему это происходит, и поэтому хотел лучше понять, прежде чем идти по этому пути.
Ниже приведен код для версии консоли R:
library(tidyverse)
library(EpiNow2)
library(tools)
library(zoo)
# Paths for output files (used as reference for Daily Dashboard and other reports)
# ok_rt_output <- '/Users/tony/Dropbox/Chickasaw R Queries/ok_rt_output.rds'
# cn_rt_output <- '/Users/tony/Dropbox/Chickasaw R Queries/cn_rt_output.rds'
# cn_counties_rt_output <- '/Users/tony/Dropbox/Chickasaw R Queries/cn_counties_output'
# Define defaults for generation time, incubation period and reporting delay for COVID19.
generation_time <- get_generation_time(disease = "SARS-CoV-2", source = "ganyani")
incubation_period <- get_incubation_period(disease = "SARS-CoV-2", source = "lauer")
#set number of cores to use
options(mc.cores = ifelse(interactive(), 4, 1))
# construct example distributions
generation_time <- get_generation_time(disease = "SARS-CoV-2", source = "ganyani")
incubation_period <- get_incubation_period(disease = "SARS-CoV-2", source = "lauer")
# Reporting Delay
reporting_delay <- list(mean = convert_to_logmean(3, 1),
mean_sd = 0.1,
sd = convert_to_logsd(3, 1),
sd_sd = 0.1,
max = 10)
# Obtain data from JHU Repository
jhu_county_COVID_confirmed <- read_csv("https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv", na="")
# Put dates into a single column
jhu_county_COVID_confirmed_data <- jhu_county_COVID_confirmed %>% select(c(-UID, -iso2, -iso3, -code3, -FIPS, -Lat, -Long_, -Combined_Key, -Country_Region, County=Admin2))
jhu_county_COVID_confirmed_data <- jhu_county_COVID_confirmed_data %>% pivot_longer(c(-County, -Province_State), names_to = "Date", values_to = "Cases")
jhu_county_COVID_confirmed_data$Date <- strptime(trimws(jhu_county_COVID_confirmed_data$Date), format='%m/%d/%y')
jhu_county_COVID_confirmed_data$Date <- as.Date(jhu_county_COVID_confirmed_data$Date)
# Limit to Oklahoma
ok_jhu_county_COVID_confirmed <- jhu_county_COVID_confirmed_data %>% filter(Province_State=='Oklahoma')
ok_jhu_county_COVID_confirmed <- ok_jhu_county_COVID_confirmed %>% rename(county=County, state=Province_State, date=Date, cases_total=Cases)
ok_jhu_county_COVID_confirmed <- ok_jhu_county_COVID_confirmed %>% select(state, county, date, cases_total)
# Add in VERY latest OK data from OK DoH.
# Download latest day of *Oklahoma* COVID data. Defines fields as needed.
ok_covid_data_latest_for_daily_dashboard <- read_csv("https://storage.googleapis.com/ok-covid-gcs-public-download/oklahoma_cases_county.csv", na="")
ok_covid_data_latest_for_daily_dashboard$ReportDate <- as.Date(ok_covid_data_latest_for_daily_dashboard$ReportDate)
ok_covid_data_latest_for_daily_dashboard$County <- as.factor((ok_covid_data_latest_for_daily_dashboard$County))
# Adjust format from state data (counties are in all caps at first; so make them lower case and then adjust the "Mc" counties)
ok_covid_data_latest_for_daily_dashboard <- mutate(ok_covid_data_latest_for_daily_dashboard, County = toTitleCase(tolower(`County`)))
ok_covid_data_latest_for_daily_dashboard$County[ok_covid_data_latest_for_daily_dashboard$County == "Mccurtain"] <- "McCurtain"
ok_covid_data_latest_for_daily_dashboard$County[ok_covid_data_latest_for_daily_dashboard$County == "Mcclain"] <- "McClain"
ok_covid_data_latest_for_daily_dashboard$County[ok_covid_data_latest_for_daily_dashboard$County == "Mcintosh"] <- "McIntosh"
ok_covid_data_latest_for_daily_dashboard <- ok_covid_data_latest_for_daily_dashboard %>% rename(`Date`=`ReportDate`)
ok_covid_data_latest_for_daily_dashboard <- ok_covid_data_latest_for_daily_dashboard %>% rename(`Recoveries`=`Recovered`)
ok_covid_data_latest_for_daily_dashboard$Province_State <- "Oklahoma"
ok_covid_data_latest_for_daily_dashboard <- ok_covid_data_latest_for_daily_dashboard[,c("County", "Province_State", "Date", "Cases", "Deaths")]
ok_latest_day_for_daily_dashboard <- ok_covid_data_latest_for_daily_dashboard %>% group_by(Province_State, County, Date) %>% summarise(OK_Cases=sum(Cases))
ok_latest_day_for_daily_dashboard <- ok_latest_day_for_daily_dashboard %>% rename(cases_total=OK_Cases, date=Date, county=County, state=Province_State)
# Append latest data file for Oklahoma to existing COVID (after first testing it is in fact new)
if(min(ok_latest_day_for_daily_dashboard$date) > max(ok_jhu_county_COVID_confirmed$date)) {ok_cases_by_day <- rbind(ok_jhu_county_COVID_confirmed, ok_latest_day_for_daily_dashboard) }
if(min(ok_latest_day_for_daily_dashboard$date) <= max(ok_jhu_county_COVID_confirmed$date)) {ok_cases_by_day <- ok_jhu_county_COVID_confirmed}
# Oklahoma Cases by Day
ok_cases_by_day <- ok_cases_by_day %>% arrange(state, county, date)
# New Cases in Oklahoma for Rt calculation
ok_cases_by_day_for_Rt <- ok_cases_by_day %>% group_by(date) %>% summarise(OK_Cases = sum(cases_total, na.rm = TRUE))
ok_cases_by_day_for_Rt <- ok_cases_by_day_for_Rt %>% mutate(OK_New_Cases=OK_Cases - lag(OK_Cases,1))
ok_cases_by_day_for_Rt <- ok_cases_by_day_for_Rt %>% select(-OK_Cases) %>% rename(confirm=OK_New_Cases)
# Limit cases to dates from 1 April
ok_cases_by_day_for_Rt <- ok_cases_by_day_for_Rt %>% filter(date >= '2020-04-01')
# estimate Rt and nowcast/forecast cases by date of infection for Oklahoma
out_ok <- epinow(reported_cases = ok_cases_by_day_for_Rt, generation_time = generation_time,
rt = rt_opts(prior = list(mean = 1, sd = 0.2)), # replace earlier mean of 2 and sd of 0.1 used in initial approach by Abbott et al.
delays = delay_opts(incubation_period, reporting_delay), return_output = TRUE,
verbose = TRUE, horizon=14)
# summary of the latest estimates
summary(out_ok)
# plot estimates
plot(out_ok)
Ниже приведен код для R Markdown версии этого запроса
```
---
title: "Rt Sample"
output:
word_document: default
html_document:
df_print: paged
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, collapse = TRUE, warning=FALSE, message=FALSE)
library(tidyverse)
library(zoo)
library(readxl)
library(tools)
library(EpiNow2)
```
## R Markdown
This is a query that graphs the transmission rate of COVID19 in the American state of Oklahoma. It uses the EpiNow2 package to do this.
More info can be found on epiforecasts.io
``` {r ok_rt_chart}
# Note this is a sample report so the commentary (e.g., progress) will appear in the report.
generation_time <- get_generation_time(disease = "SARS-CoV-2", source = "ganyani")
incubation_period <- get_incubation_period(disease = "SARS-CoV-2", source = "lauer")
#set number of cores to use
options(mc.cores = ifelse(interactive(), 4, 1))
# construct example distributions
generation_time <- get_generation_time(disease = "SARS-CoV-2", source = "ganyani")
incubation_period <- get_incubation_period(disease = "SARS-CoV-2", source = "lauer")
# Reporting Delay
reporting_delay <- list(mean = convert_to_logmean(3, 1),
mean_sd = 0.1,
sd = convert_to_logsd(3, 1),
sd_sd = 0.1,
max = 10)
# Obtain data from JHU Repository
jhu_county_COVID_confirmed <- read_csv("https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv", na="")
# Put dates into a single column
jhu_county_COVID_confirmed_data <- jhu_county_COVID_confirmed %>% select(c(-UID, -iso2, -iso3, -code3, -FIPS, -Lat, -Long_, -Combined_Key, -Country_Region, County=Admin2))
jhu_county_COVID_confirmed_data <- jhu_county_COVID_confirmed_data %>% pivot_longer(c(-County, -Province_State), names_to = "Date", values_to = "Cases")
jhu_county_COVID_confirmed_data$Date <- strptime(trimws(jhu_county_COVID_confirmed_data$Date), format='%m/%d/%y')
jhu_county_COVID_confirmed_data$Date <- as.Date(jhu_county_COVID_confirmed_data$Date)
# Limit to Oklahoma
ok_jhu_county_COVID_confirmed <- jhu_county_COVID_confirmed_data %>% filter(Province_State=='Oklahoma')
ok_jhu_county_COVID_confirmed <- ok_jhu_county_COVID_confirmed %>% rename(county=County, state=Province_State, date=Date, cases_total=Cases)
ok_jhu_county_COVID_confirmed <- ok_jhu_county_COVID_confirmed %>% select(state, county, date, cases_total)
# Add in VERY latest OK data from OK DoH.
# Download latest day of *Oklahoma* COVID data. Defines fields as needed.
ok_covid_data_latest_for_daily_dashboard <- read_csv("https://storage.googleapis.com/ok-covid-gcs-public-download/oklahoma_cases_county.csv", na="")
ok_covid_data_latest_for_daily_dashboard$ReportDate <- as.Date(ok_covid_data_latest_for_daily_dashboard$ReportDate)
ok_covid_data_latest_for_daily_dashboard$County <- as.factor((ok_covid_data_latest_for_daily_dashboard$County))
# Adjust format from state data (counties are in all caps at first; so make them lower case and then adjust the "Mc" counties)
ok_covid_data_latest_for_daily_dashboard <- mutate(ok_covid_data_latest_for_daily_dashboard, County = toTitleCase(tolower(`County`)))
ok_covid_data_latest_for_daily_dashboard$County[ok_covid_data_latest_for_daily_dashboard$County == "Mccurtain"] <- "McCurtain"
ok_covid_data_latest_for_daily_dashboard$County[ok_covid_data_latest_for_daily_dashboard$County == "Mcclain"] <- "McClain"
ok_covid_data_latest_for_daily_dashboard$County[ok_covid_data_latest_for_daily_dashboard$County == "Mcintosh"] <- "McIntosh"
ok_covid_data_latest_for_daily_dashboard <- ok_covid_data_latest_for_daily_dashboard %>% rename(`Date`=`ReportDate`)
ok_covid_data_latest_for_daily_dashboard <- ok_covid_data_latest_for_daily_dashboard %>% rename(`Recoveries`=`Recovered`)
ok_covid_data_latest_for_daily_dashboard$Province_State <- "Oklahoma"
ok_covid_data_latest_for_daily_dashboard <- ok_covid_data_latest_for_daily_dashboard[,c("County", "Province_State", "Date", "Cases", "Deaths")]
ok_latest_day_for_daily_dashboard <- ok_covid_data_latest_for_daily_dashboard %>% group_by(Province_State, County, Date) %>% summarise(OK_Cases=sum(Cases))
ok_latest_day_for_daily_dashboard <- ok_latest_day_for_daily_dashboard %>% rename(cases_total=OK_Cases, date=Date, county=County, state=Province_State)
# Append latest data file for Oklahoma to existing COVID (after first testing it is in fact new)
if(min(ok_latest_day_for_daily_dashboard$date) > max(ok_jhu_county_COVID_confirmed$date)) {ok_cases_by_day <- rbind(ok_jhu_county_COVID_confirmed, ok_latest_day_for_daily_dashboard) }
if(min(ok_latest_day_for_daily_dashboard$date) <= max(ok_jhu_county_COVID_confirmed$date)) {ok_cases_by_day <- ok_jhu_county_COVID_confirmed}
# Oklahoma Cases by Day
ok_cases_by_day <- ok_cases_by_day %>% arrange(state, county, date)
# New Cases in Oklahoma for Rt calculation
ok_cases_by_day_for_Rt <- ok_cases_by_day %>% group_by(date) %>% summarise(OK_Cases = sum(cases_total, na.rm = TRUE))
ok_cases_by_day_for_Rt <- ok_cases_by_day_for_Rt %>% mutate(OK_New_Cases=OK_Cases - lag(OK_Cases,1))
ok_cases_by_day_for_Rt <- ok_cases_by_day_for_Rt %>% select(-OK_Cases) %>% rename(confirm=OK_New_Cases)
# Limit cases to dates from 1 April
ok_cases_by_day_for_Rt <- ok_cases_by_day_for_Rt %>% filter(date >= '2020-04-01')
# estimate Rt and nowcast/forecast cases by date of infection for Oklahoma
out_ok <- epinow(reported_cases = ok_cases_by_day_for_Rt, generation_time = generation_time,
rt = rt_opts(prior = list(mean = 1, sd = 0.2)), # replace earlier mean of 2 and sd of 0.1 used in initial approach by Abbott et al.
delays = delay_opts(incubation_period, reporting_delay), return_output = TRUE,
verbose = TRUE, horizon=14)
summary(out_ok)
plot(out_ok)
Комментарии:
1. @T ПЕРРИ, это решаемая проблема? Я столкнулся с тем же вопросом . Спасибо!
2. Привет, к сожалению, это не так! В итоге мне пришлось написать два запроса: один для выполнения необходимого мне анализа, а другой для создания отчета R Markdown (импорт данных из первого запроса). Это не идеально, но это работает. Я хотел бы знать, почему RMarkdown заставляет вещи работать намного медленнее. Возможно, это можно исправить в более новой версии?
3. Понял, возможно, ваш способ — лучший метод на данный момент. Спасибо!