### R code from vignette source 'e:/DATA/SCHOLAR/GUVhistory/PublicHealthReportsManuscript.Rnw' ################################################### ### code chunk number 1: options ################################################### library(foreign) library(stargazer) library(janitor) library(lubridate) library(stringr) library(forcats) library(lattice) library(latticeExtra) library(gridExtra) ## to arrange multiple graphs on a page with grid.arrange() library(zoo) ## for yearmon function library(Hmisc) library(nlme) library(car) library(MASS) library(effects) library(multcomp) library(tidyr) library(dplyr) ################################################### ### code chunk number 2: setup ################################################### ## for reproducability of gls-related and multcomp-related results set.seed(8164) ## chosen randomly via sample(size = 4, 0:9) my.ilogit.f <- function(x){exp(x)/(1 + exp(x))} ## will need these for both non-simultaneous and simultaneous CIs ordered.seasons <- factor(c("summer", "fall", "winter", "spring"), levels = c("summer", "fall", "winter", "spring")) ## summer will be baseline ordered.months <- factor(c("january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"), levels = c("january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december")) ## january will be baseline ## a useful plotting function for ORs and their CIs my.segplot.OR.f <- function(x){ segplot(fct_rev(label) ~ exp(lwr) + exp(upr), data = x, main = "Effect of GUV on pre-school absenteeism", xlab = "estimated odds ratio and simultaneous one-sided 95% confidence intervals", draw.bands = FALSE, centers = exp(estimate), pch = 19, cex = 1.5, segments.fun = panel.arrows, ends = "first", angle = 45, length = .15, lwd = 2, # par.settings = simpleTheme(pch = 19, col = 1), # xlab = expression("Number " %+-% " SE"), panel = function(x, y, z, ...) { panel.abline(v = 1, col = "darkgrey", lty = 2) panel.segplot(x, y, z, ...)}) } my.pairwise.anova.f <- function(x){ for (i in 2:length(x)){ pw.output <- anova(x[[1]], x[[i]]) row.names(pw.output) <- c(names(x)[1], names(x)[i]) print(pw.output) } } ################################################### ### code chunk number 3: getdata ################################################### dd <- read.dta("NCRCguv.dta") ### load temperature data. These are the average temperature in DC for each month, from ### January 1941 to December 1949, inclusive, from https://www.weather.gov/wrh/Climate?wfo=lwx. temps.wide <- read.table("DCmonthlyAverageTemperatures.pip", header = TRUE, sep = "|") ## I found three data entry errors, two keyboard slipups and one mistake in the Epidata .chk file ## 1. There are two entries for June 1945 3-year old group. One of them is actually for June 1946 ## 2. For 4-year olds in June 1946, total days was actually 260, not the 250 that was entered. ## 3. and the misspelling of "octobeer" in the .chk file ## I fixed 1 and 2 in the Epidata .rec file and re-exported to Stata .dta forma ## I will fix 3 below ################################################### ### code chunk number 4: wrangledata ################################################### dd.2 <- dd %>% mutate(month.c = factor(str_replace(month, "octobeer", "october"), levels = ordered.months), month = month.c, md = as.numeric(absentdays), ## Missed Days ad = as.numeric(totaldays), ## Available Days ad.temp = na_if(ad, 0), ## to allow calculation of absentee rate md.prop = md/ad.temp, temp.date = "01", temp.month = as.numeric(month), false.date = as.Date(paste(year, temp.month, temp.date, sep = "-")), yearmonth = as.yearmon(false.date), season = case_when( str_detect(month, paste(c("december", "january", "february"), collapse="|")) ~ "winter", str_detect(month, paste(c("march", "april", "may"), collapse="|")) ~ "spring", str_detect(month, paste(c("june", "july", "august"), collapse="|")) ~ "summer", str_detect(month, paste(c("september", "october", "octobeer", "november"), collapse="|")) ~ "fall" ), ## fix misspelling season.fac = factor(season, levels = c("summer", "fall", "winter", "spring")), ## summer will be baseline in models guv.fac = factor(guv, levels = 0:1, labels = c("no", "yes")) ) ### convert temp dataframe to long format temps.long <- pivot_longer(temps.wide, cols = 2:13, names_to = c("month")) names(temps.long)[3] <- "temperature" ## a hack; should probably do in the line above ### omit December 1949 temperature since NCRC data only goes through November 1949 temps.long.2 <- temps.long %>% filter(!(year == 1949 & month == "december")) ### label each monthly temperature with its season. Conventional 3 x 4 taxonomy temps.long.2 <- temps.long.2 %>% mutate ( temp.season = case_when( str_detect(month, paste(c("december", "january", "february"), collapse="|")) ~ "winter", str_detect(month, paste(c("march", "april", "may"), collapse="|")) ~ "spring", str_detect(month, paste(c("june", "july", "august"), collapse="|")) ~ "summer", str_detect(month, paste(c("september", "october", "november"), collapse="|")) ~ "fall" ), temp.season.fac = factor(temp.season, levels = c("summer", "fall", "winter", "spring")) ) ##------------------- END DATA WRANGLING -------------- ## ## ------------ BEGIN VALIDITY CHECKS -------------- ## describe(dd.2) ## from Hmisc. Note that with character variables, 94 is "higher than" 106, for example dd.2 %>% tabyl(age, month, year) ## seemED to be no record of 2-year olds in July 1945, but there is in the source document. I entered it in Epidata on 2022-06-10 dd.2 %>% filter(ad == 0) ## there are no missing values of ad. but there are 21 missing values of ad.temp. These would be the records with ad == 0 ## these are from July and Aug for 5-year olds ("kindergarten", so they had no available days--summer vacation--and from ## 4-year olds in July 1941 and July 1949--I verified in the source document there was no July session for them those years ## --------------- END VALIDITY CHECKS ------------------ ## ## ------------- BEGIN BUILD THE ANALYTICAL DATASET ------------ ## ## the variables I will/may need dd.3 <- dd.2 %>% dplyr::select(age, year, month, guv, guv.fac, md, ad, md.prop, yearmonth, season, season.fac, false.date) ## I plan to do the analysis by the whole school, not by age group, since I doubt the age groups were segregated ## from each other much. They probably all played together dd.pooled <- dd.3 %>% group_by(false.date) %>% summarise(md = sum(md), ad = sum(ad), guv = mean(guv), season = season.fac[1], month = month[1], year = year[1]) %>% mutate(md.prop = md/ad, time.index = row_number()) %>% mutate(guv.int = as.integer(guv), guv.fac = factor(guv)) dd.pooled <- dd.pooled %>% mutate(trend.temp = time.index, ## fancy.trend = scale(trend.temp), ## sin.term = sin(2*pi*fancy.trend/12), ## cos.term = cos(2*pi*fancy.trend/12), my.logit = log(md.prop/(1 - md.prop)), guv.fac = factor(guv) ) ### join temperatures to dd.pooled dd.pooled <- left_join(dd.pooled, temps.long.2, by = c("year", "month")) glimpse(dd.pooled) dd.analytical <- dd.pooled ## will repeatedly need new data for predictions from the various models I fit newdata.noguv <- data.frame(guv = 0, guv.int = 0, guv.fac = factor(0), time.index = dd.analytical$time.index, ad = dd.analytical$ad, season = dd.analytical$season, month = dd.analytical$month, temperature = temps.long.2$temperature) ## ----------- END BUILD ANALYTICAL DATASET --------------- ## ################################################### ### code chunk number 5: descriptives ################################################### ## --------- BEGIN DESCRIPTIVE STATISTICS ------------- ## n.obs <- nrow(dd.analytical) last.month.without.guv <- max(which(dd.analytical$guv.fac == 0)) first.month.with.guv <- min(which(dd.analytical$guv.fac == 1)) n.months.with.guv <- n.obs - last.month.without.guv n.months.without.guv <- n.obs - n.months.with.guv my.descsummary.min.f <- function(y){ifelse(y < 1, round(min(y) * 100, 1), min(y))} my.descsummary.med.f <- function(y){ifelse(y < 1, round(median(y) * 100, 1), median(y))} my.descsummary.max.f <- function(y){ifelse(y < 1, round(max(y) * 100, 1), max(y))} desc.summary <- dd.analytical %>% select(ad, md, md.prop) %>% gather(stat, value, c(ad, md, md.prop)) %>% group_by(stat) %>% summarise(minimum = min(value), median = median(value), maximum = max(value) ) %>% mutate(stat = c("available days", "missed days", "absenteeism rate (%)") ) desc.summary[str_detect(desc.summary$stat, "rate"), 2:4] <- round(desc.summary[str_detect(desc.summary$stat, "rate"), 2:4] * 100, 1) desc.temp.summary <- dd.analytical %>% select(temperature) %>% gather(stat, value, temperature) %>% summarise(minimum = min(value), median = median(value), maximum = max(value) ) summ.absenteeism.rate <- desc.summary %>% filter(stat == "absenteeism rate (%)") ## ----------- END DESCRIPTIVE STATISTICS ------------ ## ## ----------- BEGIN DESCRIPTIVE PLOTS ---------------- ## pdf(file = "TempCycles.pdf") bh <- 2 seg.y <- 82 seg.start <- 60 seg.end <- 107 xyplot(temperature ~ time.index, data = dd.analytical, type = "l", ylab = "average monthly temperature", xlab = "sequential month since January 1941", sub = "blue bar indicates GUV period") + as.layer(segplot(seg.y ~ seg.start + seg.end, draw.bands = TRUE, band.height = bh, col = "blue")) dev.off() pdf(file = "descdist.pdf", height = 9) plot.dist.ad <- dd.analytical %>% select(ad) %>% gather(stat, value, c(ad)) %>% densityplot(~ value | stat, strip = strip.custom(var.name = "available child-days of attendance", strip.names = c(TRUE, TRUE)), data = .) plot.dist.md <- dd.analytical %>% select(md) %>% gather(stat, value, c(md)) %>% densityplot(~ value | stat, strip = strip.custom(var.name = "child-days of attendance missed due to respiratory illness", strip.names = c(TRUE, TRUE)), data = .) plot.dist.md.prop <- dd.analytical %>% select(md.prop) %>% gather(stat, value, c(md.prop)) %>% densityplot(~ value | stat, strip = strip.custom(var.name = "absenteeism rate due to respiratory illness", strip.names = c(TRUE, TRUE)), data = .) grid.arrange(plot.dist.ad, plot.dist.md, plot.dist.md.prop, heights = c(33/100, 33/100, 34/100), ncol=1) dev.off() bwplot(md.prop ~ guv.fac | season.fac, data = dd.3, par.settings = list(plot.symbol = list(col = "blue")), ylab = "proportion of days missed", xlab = "GUV installed", main = "Proportion of possible monthly student-days of attendance \n that were missed due to respiratory illness") pdf(file = "seasonalstripplot.pdf") stripplot(md.prop ~ guv.fac | season, jitter.data = TRUE, data = dd.analytical, ylab = "proportion of days missed", xlab = "GUV installed", main = "Proportion of possible monthly student-days of attendance \n that were missed", layout = c(4,1), pch = 16, scales = list(x = list(labels = c("no", "yes")))) dev.off() ## not run ### dd.3 %>% xyplot(md.prop ~ factor(age) | season, groups = guv.fac, data = ., key = simpleKey(text = c("no GUV", "GUV installed"), lines = FALSE, points = TRUE), pch = c(1,4), cex = 1.2, type = c("p", "smooth"), as.table = TRUE, xlab = "age group", ylab = "proportion of days missed") ## not used dd.3 %>% xyplot(md.prop ~ false.date, groups = guv.fac , subset = (age == 2), type = c("p", "smooth"), pch = 16, data = ., ylab = "proportion of days missed", xlab = "time, by month", key = simpleKey(text = c("no GUV", "GUV installed"), lines = TRUE, points = TRUE)) ## was a downward trend already at play when GUV was installed? bh <- 0.01 seg.y <- 0.25 seg.start <- 6 seg.end <- 9.5 pdf(file = "distabsenteeismannualstripplotGUVbar.pdf") dd.analytical %>% stripplot(md.prop ~ factor(year(false.date)), data = ., col = "black", ylab = "monthly proportion of days missed", sub = "GUV installed December 1945 (blue bar is GUV period)", main = "Proportion of available student-days missed \n due to respiratory illnesses") + as.layer(segplot(seg.y ~ seg.start + seg.end, draw.bands = TRUE, band.height = bh, col = "blue")) dev.off() ## I like this one! bh <- 0.01 seg.y <- 0.23 seg.start <- 1946 seg.end <- 1950 pdf(file = "timeseriesmonthlyabsencerateGUVbar.pdf") dd.3 %>% group_by(yearmonth) %>% summarise(totmiss = sum(md, na.rm = TRUE), totalavail = sum(ad, na.rm = TRUE), totpropmissed = totmiss/totalavail) %>% xyplot(totpropmissed ~ yearmonth, ylab = "proportion of days missed", xlab = "time, by month", sub = "GUV installed December 1945 (blue bar is GUV period)", main = "Proportion of available student-days missed \n due to respiratory illnesses", data = ., panel = function(x, y){ panel.xyplot(x, y, type = "b", lwd = 1.5, col = "black",) }) + as.layer(segplot(seg.y ~ seg.start + seg.end, draw.bands = TRUE, band.height = bh, col = "blue")) dev.off() ## not used dd.3 %>% xyplot(md.prop ~ year, data = ., type = c("p", "smooth")) ## ----------- END DESCRIPTIVE PLOTS --------------- ## ################################################### ### code chunk number 6: nlme ################################################### ## ============ BEGIN MODEL BUILDING ============= ## ## keep model names short for stargazer() ## my substantive interest is in fixed effects: the effect of guv, month, season. ## I don't *think* I have any variance components that I am interested in. ## So I will fit all generalized least squares models with ML, not REML, so can compare fixed effects with anova ## I don't think I have any random effects ## Later, when assessing fit, and for presenting predictions, I can elect to use REML ## -------------- BEGIN TEMPERATURE-BASED MODELING ---------------- ## ### first create a variance structure with two levels, one for GUV period and one for non-GUV period vfi.temp <- varIdent(form = ~ 1 | guv.fac) vfii.temp <- Initialize(vfi.temp, data = dd.analytical) ### I intend to use a time.index:guv interaction term. ### This will have the result of breaking any time.index effect into ### two pieces: one prior to GUV and one after. Hopefully it will keep ### time.index from subsuming some of the guv effect. ### And will allow me to test hypotheses about the two separate time-trend segments, ### to see if either can be dispensed with ### Also add guv as a grouping factor to the AR correlation structure ### the "full" model m.full.temp <- gls(my.logit ~ guv.fac + temperature + time.index + temperature:guv.fac + time.index:guv.fac, weights = vfii.temp, correlation = corAR1(form = ~ time.index | guv.fac), data = dd.analytical, method = "ML") ### full, with a quadratic term for temperature, since graph of resid vs temperature ### from a full model without it suggests perhaps it is needed ### also therefore include an interaction between guv.fac and the newly added quad temp term m.quadtemp.temp <- update(m.full.temp, . ~ . + I(temperature^2) + I(temperature^2):guv.fac) ### some restricted models m.noguv.temp <- update(m.quadtemp.temp, . ~ . - guv.fac - temperature:guv.fac - I(temperature^2):guv.fac - time.index:guv.fac) m.notemperature.temp <- update(m.quadtemp.temp, . ~ . - temperature - temperature:guv.fac) m.notrend.temp <- update(m.quadtemp.temp, . ~ . - time.index - time.index:guv.fac) m.nointeraction.temp <- update(m.quadtemp.temp, . ~ . - temperature:guv.fac - I(temperature^2):guv.fac) ### an AR(2) full model m.full.temp.ar2 <- update(m.full.temp, correlation = corARMA(form = ~ time.index, p = 2, q = 0)) ### an AR(3) full model m.full.temp.ar3 <- update(m.full.temp, correlation = corARMA(form = ~ time.index, p = 3, q = 0)) ### an AR(4) full model m.full.temp.ar4 <- update(m.full.temp, correlation = corARMA(form = ~ time.index, p = 4, q = 0)) ## ---------- BEGIN SEASONAL LOGIT MODELING -------------------- ## ### allow variance to vary from guv to no-guv period, and from season-to-season ## because: dd.analytical %>% group_by(season, guv.fac) %>% summarise(groupvar = var(md)) dd.analytical %>% group_by(season, guv.fac) %>% summarise(groupvar.logit = var(my.logit)) vfi.sea <- varIdent(form = ~ 1|season*guv.fac) vfii.sea <- Initialize(vfi.sea, data = dd.analytical) m.base.sea <- gls(my.logit ~ time.index, weights = vfii.sea, correlation = corAR1(form = ~ time.index), data = dd.analytical, method = "ML") ## summary(m.base.sea) m.full.sea <- gls(my.logit ~ season + guv.fac + season:guv.fac + time.index, weights = vfii.sea, correlation = corAR1(form = ~ time.index), data = dd.analytical, method = "ML") ## summary(m.full.sea) m.no.trend.sea <- update(m.full.sea, . ~ . - time.index) ## summary(m.no.trend.sea) m.no.guv.sea <- update(m.full.sea, . ~ . - guv.fac - season:guv.fac) ## summary(m.no.guv.sea) m.no.both.sea <- update(m.full.sea, . ~ . - guv.fac - season:guv.fac - time.index) ## summary(m.no.both.sea) ## --------------- END SEASONAL LOGIT MODELING ------------------- ## ## -------------- BEGIN MONTHLY LOGIT MODELING ----------------- ## vfi.monthly <- varIdent( ~ 1|month*guv.fac) ## variance allowed to vary from guv to non-guv periods, and month-to-month vfii.monthly <- Initialize(vfi.monthly, data = dd.analytical) ## fit the model, choosing the corr structure m.base.mo <- gls(my.logit ~ time.index, weights = vfii.monthly, correlation = corAR1(form = ~ time.index), data = dd.analytical, method = "ML") m.full.mo <- gls(my.logit ~ month + guv.fac + month:guv.fac + time.index, weights = vfii.monthly, correlation = corAR1(form = ~ time.index), data = dd.analytical, method = "ML") m.no.guv.mo <- update(m.full.mo, . ~ . - guv.fac - month:guv.fac) m.no.trend.mo <- update(m.full.mo, . ~ . - time.index) m.no.both.mo <- update(m.full.mo, . ~ . - time.index - guv.fac - month:guv.fac) ## -------------- END MONTHLY LOGIT MODELING ----------------- ## ## ------------ BEGIN COMPARING MODELS ---------------- ## ### for convenience, make lists of the two sets of models ### then remember lapply() list.of.models.sea <- list(m.full.sea, m.no.guv.sea, m.no.trend.sea, m.no.both.sea, m.base.sea) names(list.of.models.sea) <- c("m.full.sea", "m.no.guv.sea", "m.no.trend.sea", "m.no.both.sea", "m.base.sea") list.of.models.mo <- list(m.full.mo, m.no.guv.mo, m.no.trend.mo, m.no.both.mo, m.base.mo) names(list.of.models.mo) <- c("m.full.mo", "m.no.guv.mo", "m.no.trend.mo", "m.no.both.mo", "m.base.mo") ### compare models #### seasonal models my.pairwise.anova.f(list.of.models.sea) seasonal.model.aic.table <- AIC(m.full.sea, m.no.trend.sea, m.no.guv.sea, m.no.both.sea, m.base.sea) #### monthly models my.pairwise.anova.f(list.of.models.mo) monthly.model.aic.table <- AIC(m.full.mo, m.no.trend.mo, m.no.guv.mo, m.no.both.mo, m.base.mo) ### INTERPRETATION ### monthly models seem to fit better than seasonal models. No surprise there. ### For both the seasonal set of models and the monthly set, one can remove either ### guv or trend without losing much in terms of fit and significance, ### but cannot remove both without suffering in AIC and in LL. lapply(list.of.models.sea, coef) lapply(list.of.models.mo, coef) ### INTERPRETATION ### In either set of models, if you remove either guv or trend individually from the full model, ### the coefficient on the remaining predictor changes a lot--- ### by factors of 2 to 8. This collinearity between time.trend ### and guv is expected, from the nature of an interventional time ### series in a single "subject", ie. single preschool. No room had ### guv before installation, and all rooms had it after. ### INTERPRETATION ### So will keep both guv and trend (time.index) in the working model(s) ### assess and compare temperature-based models AIC(m.quadtemp.temp, m.full.temp, m.noguv.temp, m.notrend.temp, m.notemperature.temp, m.nointeraction.temp) anova(m.quadtemp.temp, m.full.temp, m.noguv.temp, m.notrend.temp, m.notemperature.temp, m.nointeraction.temp) ## quadratic for temp seems significant ### compare to what I consider my best month-based model AIC(m.quadtemp.temp, m.full.temp, m.notrend.temp, m.full.mo) ### assess terms in the full temperature-based model anova(m.full.temp) m.quadtemp.temp.anova.table <- anova(m.quadtemp.temp) anova(m.notrend.temp) ## quadratic in temp but no time terms ### regarding piecewise time trend: #### are the two segments, tested simultaneously, signficant? hyp.test.time.trend <- glht(m.quadtemp.temp, linfct = c("time.index = 0", ## effect of time.index pre-guv "time.index + guv.fac1:time.index = 0") ) ## effect of time.index post-guv summary(hyp.test.time.trend, adjusted(type = "hommel")) guv.trend.ps <- summary(hyp.test.time.trend, adjusted(type = "hommel"))$test$pvalues pre.guv.trend.pvalue <- guv.trend.ps[1] post.guv.trend.pvalue <- guv.trend.ps[2] #### the above fails to reject the null that both effects are zero. anova.table.quadtemp.with.without.trend <- anova(m.quadtemp.temp, m.notrend.temp) #### the above anova yields same conclusion: tested together, time.trend and its interaction with guv are not significantly different from zero #### INTERPRETATION #### I will omit all time.trend terms from consideration in the working model ## ------------- END COMPARING MODELS --------------- ## ## vvvvvvvv SET A WORKING MODEL. CHANGE AS DESIRED vvvvvvvvvvv ## # working.model <- m.notrend.temp ## this is (m4) in the stargazer table. Adjust as desired. # wm.anova.table <- anova(working.model) ## ^^^^^^^^^^^ THAT'S THE CURRENT WORKING MODEL ^^^^^^^^^^^^ ## ## ---------- BEGIN DIAGNOSTIC PLOTS ON WORKING MODEL ------------ ## ### serial autocorrelation pdf(file = "residdxacf.pdf", 8) plot(ACF(working.model), alpha = 0.05) ## still some disheartening autocorrelation, cyclical dev.off() ### residuals vs fitted plot.resid.fitted <- plot(working.model, resid(.) ~ fitted(.)) ## OK ### residuals vs predictors plot.resid.guv <- plot(working.model, guv.fac ~ resid(.)) ## not too bad ### notional predictors not in the model plot.resid.time <- plot(working.model, resid(.) ~ time.index) ## OK plot.resid.season <- plot(working.model, season ~ resid(.)) ## not too bad. Season is not in the temperature-based models #### do I need a quadratic term in temperature? resid.noquad.temp <- residuals(m.full.temp) ## m.full.temp has no quadratic temp term plot.need.quad.temp <- xyplot(resid.noquad.temp ~ dd.analytical$temperature, type = c("p", "smooth"), xlab = "temperature", ylab = "residuals") ## I think I do need the temp quad term wm.resid <- residuals(working.model) ## this has quad temp term, and interactions between guv.fac and both temp terms plot.have.quad.temp <- xyplot(wm.resid ~ dd.analytical$temperature, type = c("p", "smooth"), xlab = "temperature", ylab = "residuals") plot.resid.temp <- plot(working.model, resid(.) ~ temperature) #### make display graphs pdf(file = "residdxplot1.pdf", height = 7) plot.resid.fitted dev.off() pdf(file = "justifyquadtemp.pdf", height = 8) grid.arrange(plot.need.quad.temp, plot.have.quad.temp, heights = c(1/2, 1/2)) dev.off() pdf(file = "residdxplot2.pdf", height = 9) grid.arrange(plot.resid.guv, plot.resid.temp, plot.resid.time, heights = rep(1/3, 3)) dev.off() ### normality of residuals plot.resid.qqnorm <- qqnorm(working.model, abline = c(0,1)) ## quite nice with ML, (a little off with REML) working.model.resid <- residuals(working.model) plot.resid.density <- densityplot(~working.model.resid) ## fairly Guassian-looking pdf(file = "residdxplot3.pdf", height = 9) grid.arrange(plot.resid.qqnorm, plot.resid.density, heights = rep(1/2, 2)) dev.off() ## overall model fit to the observed data ### on linear predictor (logit) scale pdf(file = "overallfit.pdf") par(mfrow=c(2,1)) lims = c(-4, -1) fitted.model <-fitted(working.model) plot(fitted.model ~ dd.analytical$my.logit, xlim = lims, ylim = lims, ylab = "fitted values logit scale", xlab = "logit (log-odds) of observed monthly absenteeism", main = "Overall model fit") ## OK abline(a = 0, b = 1) ### on absenteeism rate (probability) scale pred.model <- working.model preds.prob <- my.ilogit.f(predict(working.model)) ## use existing values of predictors with(dd.analytical, plot(md.prop ~ time.index, ylab = "absenteeism rate", xlab = "time: months since January 1941")) ## observed data legend(legend = c("model predictions", "observed measurements"), col = c("purple", "black"), lty = c(1, NA), pch = c(NA, 1), x = "topright") with(dd.analytical, lines(preds.prob ~ time.index, col = "purple")) ## predicted by model dev.off() ## --------------- END DIAGNOSTIC PLOTS ON WORKING MODEL ------------ ## ## ---------- BEGIN HYPOTHESIS TESTS AND CIs FROM WORKING MODEL ---------- ## ### assess terms in the working model AIC(working.model) ### GUV effects from the temperature-based model ### on absenteeism rate, various scales wm.effect.lp.scale <- Effect(c("guv.fac", "temperature"), mod = working.model, se = list(type = "pointwise")) ## on scale of absenteeism rate pdf(file = "GUVtemperatureEffectInteraction.pdf") plot(wm.effect.lp.scale, main = "Estimated effect of upper-room GUV on preschool absenteeism \n due to respiratory illness", lines = list(multiline = TRUE, col = c("red", "blue")), lattice = list(key.args = simpleKey(c("without GUV", "with GUV"), points = FALSE, title = "")), confint = list(style = "bands"), axes = list(y = list(lab = "absenteeism rate", transform = my.ilogit.f)), sub = "monthly average temperature, representing seasonal cycles \n in respiratory epidemiology and outdoor activity") dev.off() ## on scale of odds ratio of absenteeism plot(wm.effect.lp.scale, lines = list(multiline = TRUE), confint = list(style = "bands"), axes = list(y = list(lab = "odds ratio of absenteeism", transform = exp))) ## on scale of linear predictor (logit) plot(wm.effect.lp.scale, lines = list(multiline = TRUE), confint = list(style = "bands"), axes = list(y = list(lab = "linear predictor", transform = NULL))) ### plot ORs at 4 illustrative temps, one for each season ### get these by using glht() of the multcomp package, followed by some machinations ### mean temp of each of fours seasons mst <- temps.long.2 %>% group_by(temp.season.fac) %>% summarise(mst = mean(temperature)) illus.temps <- mst ## a 2-column tibble; specify in command below what column to use for illustrative temps vector.mst <- pull(illus.temps[,2]) ## in order, top to bottom: summer, fall, winter, spring. Same as rows of illus.temps ## >= for one-sided tests simul.four.seasons <- glht(working.model, linfct = c("guv.fac1 + vector.mst[1]*guv.fac1:temperature + vector.mst[1]^2*guv.fac1:I(temperature^2) >= 0", "guv.fac1 + vector.mst[2]*guv.fac1:temperature + vector.mst[2]^2*guv.fac1:I(temperature^2) >= 0", "guv.fac1 + vector.mst[3]*guv.fac1:temperature + vector.mst[3]^2*guv.fac1:I(temperature^2) >= 0", "guv.fac1 + vector.mst[4]*guv.fac1:temperature + vector.mst[4]^2*guv.fac1:I(temperature^2) >= 0")) simul.four.seasons.ci.df <- data.frame(confint(simul.four.seasons, adjusted(type = "hommel"))$confint) ## segplot won't work with values of _Inf; choose -2 arbitrarily for display simul.four.seasons.ci.df <- simul.four.seasons.ci.df %>% mutate(season = illus.temps$temp.season.fac, estimate = Estimate, label = season, lwr = -2) pdf(file = "seasonalORsegplot.pdf") my.segplot.OR.f (simul.four.seasons.ci.df) dev.off() ### plot counterfactual trajectory to illustrate effect of GUV #### on scale of absenteeism rate pred.model <- working.model counter.preds.prob <- my.ilogit.f(predict(working.model , newdata.noguv)) ## counterfactual: use newdata with guv always = 0 pdf(file = "BigPicture.pdf") with(dd.analytical, plot(md.prop ~ time.index, pch = as.numeric(season), main = "monthly absenteeism rate due to respiratory illness \n among children in a preschool, \n January 1941 to November 1949", sub = "GUV installed in month 59", ylab = "absenteeism rate", xlab = "time: months since January 1941")) ## observed data legend(legend = c("model predictions before GUV", "model predictions with GUV", "model predictions without GUV", "observed measurements:", "summer", "fall", "winter", "spring"), col = c("purple", "blue", "red", rep("black", 5)), lty = c(1, 1, 2, rep(NA, 5)), pch = c(NA, NA, NA, NA, 1:4), cex = 0.8, x = "topright") with(dd.analytical, lines(my.ilogit.f(predict(working.model)) ~ time.index, col = "blue")) ## omit newdata to use observed values predicted by model lines(counter.preds.prob ~ dd.analytical$time.index, lty = 2, col = "red") ## if GUV had not been installed dev.off() #### on scale of number of missed days pred.model <- working.model counter.preds.prob <- my.ilogit.f(predict(working.model , newdata.noguv)) ## counterfactual: use newdata with guv always = 0 with(dd.analytical, plot(md ~ time.index, pch = as.numeric(season), main = "monthly missed days due to respiratory illness \n among children in a preschool, \n January 1941 to November 1949", sub = "GUV installed in month 59", ylab = "missed student-days of attendance", xlab = "time: months since January 1941")) ## observed data legend(legend = c("model predictions before GUV", "model predictions with GUV", "model predictions without GUV", "observed measurements:", "summer", "fall", "winter", "spring"), col = c("purple", "blue", "red", rep("black", 5)), lty = c(1, 1, 2, rep(NA, 5)), pch = c(NA, NA, NA, NA, 1:4), cex = 0.8, x = "topright") with(dd.analytical, lines((my.ilogit.f(predict(working.model)) * dd.analytical$ad) ~ time.index, col = "blue")) ## omit newdata to use observed values predicted by model lines((counter.preds.prob * dd.analytical$ad) ~ dd.analytical$time.index, lty = 2, col = "red") ## if GUV had not been installed ################################################### ### code chunk number 7: displaystargazermodeltable ################################################### ## all based on the quadratic temperature full model m1 <- m.quadtemp.temp m2 <- m.full.temp m3 <- m.noguv.temp m4 <- m.notrend.temp m5 <- m.notemperature.temp m6 <- m.nointeraction.temp ### save the latex code for the big stargaze table as a .tex file stargazer(type = "latex", out = "stargaze.tex", title = "Model comparison", m1, m2, m3, m4, m5, m6, label = "stargaze", dep.var.labels = "log-odds of monthly absenteeism rate", intercept.bottom = FALSE, intercept.top = TRUE, notes.append = FALSE, notes = "", report = "vcs")