---
title: 'Embracing the positive: An examination of how well resilience factors at age 14 can predict distress at age 17'
author: "JFritz"
date: "2019/Nov/30"
output: word_document
---

##00: Table of contents
```{r}
#0A: Opening data and loading packages
#0B: Identifying and removing outliers
#01: Sample descriptives
#02: Making the data frames for the whole sample
#03: Run a regression analysis (all including CA and gender): Compare models with RFs only, with GD only, and with GD and RFs
#04: Run a relative importance analysis with CA, gender and RFs as predictors
#05: Run a relative importance analysis with CA, gender and GD as predictors
#06: Run a relative importance analysis with CA, gender and RFs and GD as predictors
#07: Prepare the 3-class solution variable
#08: Ordinal model: Run the machine learning model with CA, gender and RFs as predictors
#09: Ordinal model: Run the machine learning model with CA, gender and GD as predictors
#10: Ordinal model: Run the machine learning model with CA, gender and RFs and GD as predictors
#11: Continuous model: Run the machine learning model with CA, gender and RFs as predictors
#12: Continuous model: Run the machine learning model with CA, gender and GD as predictors
#13: Continuous model: Run the machine learning model with CA, gender and RFs and GD as predictors
#14: Ordinal model: Run the machine learning model with gender, brooding and self-esteem RFs as predictors
#15: Ordinal model: Run the machine learning model with gender, brooding, self-esteem RFs and GD as predictors
#16: Continuous model: Run the machine learning model with gender, brooding and self-esteem RFs as predictors
#17: Continuous model: Run the machine learning model with gender, brooding, self-esteem RFs and GD as predictors
#18: Prepare the 4-class solution variable
#19: Ordinal model (4 classes): Run the machine learning model with CA, gender and RFs as predictors
#20: Ordinal model (4 classes): Run the machine learning model with CA, gender
and GD as predictors #21: Ordinal model (4 classes): Run the machine learning model with CA, gender and RFs and GD as predictors
```
##0A: Opening data and loading packages
```{r}
#open data
#choose.files()
# NOTE(review): the path is a placeholder ("*Enter_trajectory_name*") and the file is read
# with readRDS() although its name ends in ".csv" - presumably a serialized R object was
# saved under a .csv name; confirm the path and extension before running.
Pred_data <- readRDS("*Enter_trajectory_name*\\Prediction_data_imp_2019.11Nov.04_final.csv")
#install.packages("x")
library("beanplot")
library("brant")
library("car")
library("caret")
library("coin")
library("Hmisc")
library("MASS")
library("MLmetrics")
library("pastecs")
library("pROC")
library("qgraph")
library("relaimpo")
library("VGAM")
```
##0B: Identifying and removing outliers
```{r}
## compute the main regression model: with both RFs & GD
# z-standardize the two binary-coded RFs; the table() calls print the resulting two values.
table(Pred_data$Aggression_14_bin)
Pred_data$Aggression_sc <- scale(Pred_data$Aggression_14_bin); table(Pred_data$Aggression_sc)
# high aggression RF = -2.13860040771842; low aggression RF= 0.467201935224639
table(Pred_data$ExpressiveSuppression14_bin)
Pred_data$ExpressiveSuppression_sc <- scale(Pred_data$ExpressiveSuppression14_bin); table(Pred_data$ExpressiveSuppression_sc)
# high low expressive suppression RF = -1.49504989077618; low expressive suppression RF = 0.668310974317733
# Complete cases only; note data.frame() auto-prefixes the column names with "Pred_data.",
# which is why the lm() formula below uses the dotted names.
out_data <- na.omit(data.frame(Pred_data$FriendSupport_14, Pred_data$FamilySupport_14, Pred_data$FamilyCohesion_14, Pred_data$PosSes_14, Pred_data$NegSes_14, Pred_data$Brd_14, Pred_data$Reflection_14, Pred_data$DisTol_14, Pred_data$Aggression_sc, Pred_data$ExpressiveSuppression_sc, Pred_data$GeneralDistress_14_lti, Pred_data$GeneralDistress_17_lti, Pred_data$CA_dic, Pred_data$gender)); nrow(out_data)
# Full linear model (CA + gender + all RFs + baseline distress) used for outlier diagnostics.
fit <- lm(Pred_data.GeneralDistress_17_lti ~ Pred_data.CA_dic + Pred_data.gender + Pred_data.FriendSupport_14 + Pred_data.FamilySupport_14 + Pred_data.FamilyCohesion_14 + Pred_data.PosSes_14 + Pred_data.NegSes_14 + Pred_data.Brd_14 + Pred_data.Reflection_14 + Pred_data.DisTol_14 + Pred_data.Aggression_sc + Pred_data.ExpressiveSuppression_sc + Pred_data.GeneralDistress_14_lti, data = out_data)
summary(fit)
#outlier
## Outlier screening on the standardized residuals of the full diagnostic model.
out_data$standardized.residuals<- rstandard(fit)
# >3 or 2.75 should be considered for exclusion
out_data$toolarge.residuals <- out_data$standardized.residuals > 3 | out_data$standardized.residuals < -3
# FIX: use the full column name "toolarge.residuals"; the original "toolarge.residual"
# only worked via `$` partial matching, which is fragile and would silently return NULL
# if another column with that prefix were added.
sum(out_data$toolarge.residuals) # 0/1130 --> 0%
out_data[out_data$toolarge.residuals, c("standardized.residuals", "Pred_data.CA_dic")]# 0
outlierTest(fit) #868 2.957175 0.0031701
#pdf(file = "outlier.pdf")
qqPlot(fit, main="QQ Plot") # outlier: 1061, 868
leveragePlots(fit)
leveragePlots(fit, col = "white")
#dev.off()
#remove the outliers from the data set --> not necessary
nrow(Pred_data)
```
##01: Sample descriptives
```{r}
# Descriptives sample: complete cases on the analysis variables plus SES and age-14 diagnosis.
Desc_data_I <- na.omit(data.frame(Pred_data$id, Pred_data$FriendSupport_14, Pred_data$FamilySupport_14, Pred_data$FamilyCohesion_14, Pred_data$PosSes_14, Pred_data$NegSes_14, Pred_data$Brd_14, Pred_data$Reflection_14, Pred_data$DisTol_14, Pred_data$Aggression_sc, Pred_data$ExpressiveSuppression_sc, Pred_data$GeneralDistress_14_lti, Pred_data$GeneralDistress_17_lti, Pred_data$CA_dic, Pred_data$gender, Pred_data$acorn_ses, Pred_data$psychdiag_14)); nrow(Desc_data_I)
Desc_data_II <- data.frame(Pred_data$id, Pred_data$age_14, Pred_data$age_17, Pred_data$psychdiag_17); nrow(Desc_data_II)
library("dplyr")
Desc_data <- left_join(Desc_data_I, Desc_data_II, by = "Pred_data.id"); nrow(Desc_data)
detach("package:dplyr", unload=TRUE)
#a: CA+ vs CA-
table(Desc_data$Pred_data.CA_dic)
#b: gender
table(Desc_data$Pred_data.gender)
chi_sex <- table(Desc_data$Pred_data.gender, Desc_data$Pred_data.CA_dic); chi_sex
chisq.test(chi_sex)
#c: Socio-economic status (SES)
##CA
chi_SES <- table(Desc_data$Pred_data.acorn_ses, Desc_data$Pred_data.CA_dic); chi_SES
#As SES was split in 5 ordered categories, we applied the Asymptotic Cochran-Armitage test
# Counts transcribed from the chi_SES table above (rows = SES levels, cols = CA 0/1).
chi_SES2 <- matrix(
  c(30, 73, 11, 36, 104, 168, 41, 37, 313, 317),
  byrow = TRUE, ncol = 2,
  dimnames = list(
    "SES" = c("hardpressed", "moderatemeans", "comfoff", "urbanprosperity", "wealthyachievers"),
    "CA" = c("0", "1")
  )
)
chi_SES2 <- as.table(chi_SES2);chi_SES2
# Asymptotic Cochran-Armitage trend test: coin::chisq_test with ordered SES scores 1..5.
chi_SES2 <- chisq_test(chi_SES2, scores = list("SES" = c(1,2,3,4,5)));chi_SES2
##gender
chi_SES <- table(Desc_data$Pred_data.acorn_ses, Desc_data$Pred_data.gender); chi_SES
#As SES was split in 5 ordered categories, we applied the Asymptotic Cochran-Armitage test
chi_SES2 <- matrix( c(51, 52, 23, 24, 154, 118, 34, 44, 358, 272), byrow = TRUE, ncol = 2, dimnames = list( "SES" = c("hardpressed", "moderatemeans", "comfoff", "urbanprosperity", "wealthyachievers"), "gender" = c("female", "male") ) )
chi_SES2 <- as.table(chi_SES2);chi_SES2
chi_SES2 <- chisq_test(chi_SES2, scores = list("SES" = c(1,2,3,4,5)));chi_SES2
#d: age 14
##CA
age <- na.omit(data.frame(Desc_data$Pred_data.age_14, Desc_data$Pred_data.CA_dic))
stat.desc(age$Desc_data.Pred_data.age_14, norm = TRUE)
age_CA <- subset(age, age$Desc_data.Pred_data.CA_dic==1)
stat.desc(age_CA$Desc_data.Pred_data.age_14, norm = TRUE)
age_noCA <- subset(age, age$Desc_data.Pred_data.CA_dic==0)
stat.desc(age_noCA$Desc_data.Pred_data.age_14, norm = TRUE)
# Welch two-sample t-test (var.equal = FALSE)
t.test(age$Desc_data.Pred_data.age_14 ~ age$Desc_data.Pred_data.CA_dic, var.equal = FALSE)
##gender
age <- na.omit(data.frame(Desc_data$Pred_data.age_14, Desc_data$Pred_data.gender))
stat.desc(age$Desc_data.Pred_data.age_14, norm = TRUE)
age_fem <- subset(age, age$Desc_data.Pred_data.gender==0)
stat.desc(age_fem$Desc_data.Pred_data.age_14, norm = TRUE)
age_male <- subset(age, age$Desc_data.Pred_data.gender==1)
stat.desc(age_male$Desc_data.Pred_data.age_14, norm = TRUE)
t.test(age$Desc_data.Pred_data.age_14 ~ age$Desc_data.Pred_data.gender, var.equal = FALSE)
#e: age 17
##CA
age <- na.omit(data.frame(Desc_data$Pred_data.age_17, Desc_data$Pred_data.CA_dic))
stat.desc(age$Desc_data.Pred_data.age_17, norm = TRUE)
age_CA <- subset(age, age$Desc_data.Pred_data.CA_dic==1)
stat.desc(age_CA$Desc_data.Pred_data.age_17, norm = TRUE)
age_noCA <- subset(age, age$Desc_data.Pred_data.CA_dic==0)
stat.desc(age_noCA$Desc_data.Pred_data.age_17, norm = TRUE)
t.test(age$Desc_data.Pred_data.age_17 ~ age$Desc_data.Pred_data.CA_dic, var.equal = FALSE)
##gender
age <- na.omit(data.frame(Desc_data$Pred_data.age_17, Desc_data$Pred_data.gender))
stat.desc(age$Desc_data.Pred_data.age_17, norm = TRUE)
age_fem <- subset(age, age$Desc_data.Pred_data.gender==0)
stat.desc(age_fem$Desc_data.Pred_data.age_17, norm = TRUE)
age_male <- subset(age, age$Desc_data.Pred_data.gender==1)
stat.desc(age_male$Desc_data.Pred_data.age_17, norm = TRUE)
t.test(age$Desc_data.Pred_data.age_17 ~ age$Desc_data.Pred_data.gender, var.equal = FALSE)
#f: prior psychiatric history at age 14
##CA
chi_diag <- table(Desc_data$Pred_data.psychdiag_14, Desc_data$Pred_data.CA_dic); chi_diag
chisq.test(chi_diag)
##gender
chi_diag <- table(Desc_data$Pred_data.psychdiag_14, Desc_data$Pred_data.gender); chi_diag
chisq.test(chi_diag)
#g: prior psychiatric history at age 17
##CA
chi_diag <- table(Desc_data$Pred_data.psychdiag_17, Desc_data$Pred_data.CA_dic); chi_diag
chisq.test(chi_diag)
##gender
chi_diag <- table(Desc_data$Pred_data.psychdiag_17, Desc_data$Pred_data.gender); chi_diag
chisq.test(chi_diag)
```
##02: Making the data frames for the whole sample
```{r}
#Make a data frame for the whole sample # 1130
RF_data <- na.omit(data.frame(Pred_data$FriendSupport_14, Pred_data$FamilySupport_14, Pred_data$FamilyCohesion_14, Pred_data$PosSes_14, Pred_data$NegSes_14, Pred_data$Brd_14, Pred_data$Reflection_14, Pred_data$DisTol_14, Pred_data$Aggression_sc, Pred_data$ExpressiveSuppression_sc, Pred_data$GeneralDistress_14_lti, Pred_data$GeneralDistress_14_lti_SE, Pred_data$GeneralDistress_17_lti, Pred_data$GeneralDistress_17_lti_SE, Pred_data$CA_dic, Pred_data$gender)); nrow(RF_data)
#correlation matrix
cormatrix <- na.omit(data.frame(Pred_data$CA_dic, Pred_data$gender, Pred_data$FriendSupport_14, Pred_data$FamilySupport_14, Pred_data$FamilyCohesion_14, Pred_data$PosSes_14, Pred_data$NegSes_14, Pred_data$Brd_14, Pred_data$Reflection_14, Pred_data$DisTol_14,
Pred_data$Aggression_sc, Pred_data$ExpressiveSuppression_sc, Pred_data$GeneralDistress_14_lti, Pred_data$GeneralDistress_17_lti, Pred_data$GeneralDistress_17_FLCA3, Pred_data$GeneralDistress_17_FLCA4)); nrow(cormatrix)
# qgraph::cor_auto chooses appropriate (polychoric/polyserial/Pearson) correlations per variable pair.
cor <- cor_auto(cormatrix); cor
```
##03: Run a regression analysis (all including CA and gender): Compare models with RFs only, with GD only, and with GD and RFs
```{r}
#0a: compute the regression model : with CA
linmod_final_0a <- lm(Pred_data.GeneralDistress_17_lti ~ Pred_data.CA_dic, data = RF_data)
summary(linmod_final_0a)
#0b: compute the regression model : with CA and gender
linmod_final_0b <- lm(Pred_data.GeneralDistress_17_lti ~ Pred_data.CA_dic + Pred_data.gender, data = RF_data)
summary(linmod_final_0b)
vif(linmod_final_0b)
#1: compute the regression model : with RFs
linmod_final_1 <- lm(Pred_data.GeneralDistress_17_lti ~ Pred_data.CA_dic + Pred_data.gender + Pred_data.FriendSupport_14 + Pred_data.FamilySupport_14 + Pred_data.FamilyCohesion_14 + Pred_data.PosSes_14 + Pred_data.NegSes_14 + Pred_data.Brd_14 + Pred_data.Reflection_14 + Pred_data.DisTol_14 + Pred_data.Aggression_sc + Pred_data.ExpressiveSuppression_sc, data = RF_data)
summary(linmod_final_1)
vif(linmod_final_1)
#2: compute the regression model : with GD
linmod_final_2 <- lm(Pred_data.GeneralDistress_17_lti ~ Pred_data.CA_dic + Pred_data.gender + Pred_data.GeneralDistress_14_lti, data = RF_data)
summary(linmod_final_2)
vif(linmod_final_2)
#3: compute the regression model : with RFs and GD
linmod_final_3 <- lm(Pred_data.GeneralDistress_17_lti ~ Pred_data.CA_dic + Pred_data.gender + Pred_data.FriendSupport_14 + Pred_data.FamilySupport_14 + Pred_data.FamilyCohesion_14 + Pred_data.PosSes_14 + Pred_data.NegSes_14 + Pred_data.Brd_14 + Pred_data.Reflection_14 + Pred_data.DisTol_14 + Pred_data.Aggression_sc + Pred_data.ExpressiveSuppression_sc + Pred_data.GeneralDistress_14_lti, data = RF_data)
summary(linmod_final_3)
vif(linmod_final_3)
# rule of thumb: sqrt(VIF) > 2 flags problematic multicollinearity
sqrt(vif(linmod_final_3)) > 2
#4: comparing the
#above models
# Nested-model likelihood-ratio comparisons of the chunk-03 regressions.
anova(linmod_final_0a, linmod_final_0b,test = "LRT")
anova(linmod_final_0b, linmod_final_1, test = "LRT")
anova(linmod_final_0b, linmod_final_2, test = "LRT")
anova(linmod_final_0b, linmod_final_3, test = "LRT")
anova(linmod_final_1, linmod_final_3, test = "LRT")
anova(linmod_final_2, linmod_final_3, test = "LRT")
```
##04: Run a relative importance analysis with CA, gender and RFs as predictors
```{r}
#1: compute the relative importance (RI) model
linmod_final <- lm(Pred_data.GeneralDistress_17_lti ~ Pred_data.CA_dic + Pred_data.gender + Pred_data.FriendSupport_14 + Pred_data.FamilySupport_14 + Pred_data.FamilyCohesion_14 + Pred_data.PosSes_14 + Pred_data.NegSes_14 + Pred_data.Brd_14 + Pred_data.Reflection_14 + Pred_data.DisTol_14 + Pred_data.Aggression_sc + Pred_data.ExpressiveSuppression_sc, data = RF_data)
summary(linmod_final)
#2: compute the RI values
metrics <- calc.relimp(linmod_final, type = c("first", "lmg")); metrics
plot(metrics)
#3: compute the absolute RI values
metrics_abs <- calc.relimp(linmod_final, type = c("first", "lmg"), rela = TRUE); metrics_abs
plot(metrics_abs)
# average coefficient per predictor across the 12 submodel sizes
out_I <- data.frame(rowSums(metrics_abs$ave.coeffs)/12)
output <- data.frame(metrics_abs$lmg, metrics_abs$first, out_I); output
#4: bootstrapping CIs for the absolute RI value
set.seed(4)
bootobject <- boot.relimp(linmod_final, b = 1000, type = c("first", "lmg"), rela = TRUE)
randomlmg <- booteval.relimp(bootobject, bty = "perc", level = 0.95)
output <- rbind(randomlmg$lmg.lower, randomlmg$lmg.upper)
output <- as.matrix(t(output))
colnames(output) <- c("random.lower", "random.upper")
# namen[1] is the response; [2:13] are the 12 predictors
matrixrownames <- c(randomlmg$namen[2:13])
rownames(output) <- matrixrownames; output
plot(booteval.relimp(bootobject, bty = "perc", level = 0.95))
```
##05: Run a relative importance analysis with CA, gender and GD as predictors
```{r}
#1: compute the relative importance (RI) model
linmod_final <- lm(Pred_data.GeneralDistress_17_lti ~ Pred_data.CA_dic + Pred_data.gender +
Pred_data.GeneralDistress_14_lti, data = RF_data)
summary(linmod_final)
#2: compute the RI values
metrics <- calc.relimp(linmod_final, type = c("first", "lmg")); metrics
plot(metrics)
#3: compute the absolute RI values
metrics_abs <- calc.relimp(linmod_final, type = c("first", "lmg"), rela = TRUE); metrics_abs
plot(metrics_abs)
# average coefficient per predictor across the 3 submodel sizes
out_I <- data.frame(rowSums(metrics_abs$ave.coeffs)/3)
output <- data.frame(metrics_abs$lmg, metrics_abs$first, out_I); output
#4: bootstrapping CIs for the absolute RI value
set.seed(4)
bootobject <- boot.relimp(linmod_final, b = 1000, type = c("first", "lmg"), rela = TRUE)
randomlmg <- booteval.relimp(bootobject, bty = "perc", level = 0.95)
output <- rbind(randomlmg$lmg.lower, randomlmg$lmg.upper)
output <- as.matrix(t(output))
colnames(output) <- c("random.lower", "random.upper")
# namen[1] is the response; [2:4] are the 3 predictors
matrixrownames <- c(randomlmg$namen[2:4])
rownames(output) <- matrixrownames; output
plot(booteval.relimp(bootobject, bty = "perc", level = 0.95))
```
##06: Run a relative importance analysis with CA, gender and RFs and GD as predictors
```{r}
#1: compute the relative importance (RI) model
linmod_final <- lm(Pred_data.GeneralDistress_17_lti ~ Pred_data.CA_dic + Pred_data.gender + Pred_data.FriendSupport_14 + Pred_data.FamilySupport_14 + Pred_data.FamilyCohesion_14 + Pred_data.PosSes_14 + Pred_data.NegSes_14 + Pred_data.Brd_14 + Pred_data.Reflection_14 + Pred_data.DisTol_14 + Pred_data.Aggression_sc + Pred_data.ExpressiveSuppression_sc + Pred_data.GeneralDistress_14_lti, data = RF_data)
summary(linmod_final)
#2: compute the RI values
metrics <- calc.relimp(linmod_final, type = c("first", "lmg")); metrics
plot(metrics)
#3: compute the absolute RI values
metrics_abs <- calc.relimp(linmod_final, type = c("first", "lmg"), rela = TRUE); metrics_abs
plot(metrics_abs)
# average coefficient per predictor across the 13 submodel sizes
out_I <- data.frame(rowSums(metrics_abs$ave.coeffs)/13)
output <- data.frame(metrics_abs$lmg, metrics_abs$first, out_I); output
#4: bootstrapping CIs for the absolute RI value
set.seed(4)
bootobject <-
boot.relimp(linmod_final, b = 1000, type = c("first", "lmg"), rela = TRUE)
randomlmg <- booteval.relimp(bootobject, bty = "perc", level = 0.95)
output <- rbind(randomlmg$lmg.lower, randomlmg$lmg.upper)
output <- as.matrix(t(output))
colnames(output) <- c("random.lower", "random.upper")
# namen[1] is the response; [2:14] are the 13 predictors
matrixrownames <- c(randomlmg$namen[2:14])
rownames(output) <- matrixrownames; output
plot(booteval.relimp(bootobject, bty = "perc", level = 0.95))
```
##07: Prepare the 3-class solution variable
```{r}
##recode class variables
#recode FLCA 3 variable
table(Pred_data$GeneralDistress_17_FLCA3)
Pred_data$GeneralDistress_17_FLCA3 <- recode(Pred_data$GeneralDistress_17_FLCA3, as.numeric=TRUE, "1=2; 2=1; 3=3"); table(Pred_data$GeneralDistress_17_FLCA3)
#recode FLCA 4 variable
table(Pred_data$GeneralDistress_17_FLCA4)
Pred_data$GeneralDistress_17_FLCA4 <- recode(Pred_data$GeneralDistress_17_FLCA4, as.numeric=TRUE, "1=2; 2=1; 3=4; 4=3"); table(Pred_data$GeneralDistress_17_FLCA4)
##interpretation for 3 classes: set off against the invariant continuous general distress score
plot_three <- na.omit(data.frame(Pred_data$GeneralDistress_17_lti, Pred_data$GeneralDistress_17_FLCA3, Pred_data$CA_dic)); nrow(plot_three)
# FIX: derive the label count from the data (was the hard-coded sample size 1130),
# so the plot data frame stays correct if the complete-case sample changes.
names=c(rep("GD", nrow(plot_three)))
value=c(plot_three$Pred_data.GeneralDistress_17_lti)
group=as.factor(c(plot_three$Pred_data.GeneralDistress_17_FLCA3))
data=na.omit(data.frame(names,value,group))
data$group <- as.factor(data$group)
#pdf(file = "three_classes_vs_distress_level.pdf")
# Jittered points + transparent boxplots of continuous distress per latent class.
plot <- ggplot(data, aes(x = group, y = value, group = group)) +
  geom_point(aes(colour = group), alpha=0.6, position = "jitter", size = 0.5) +
  scale_colour_manual(values=c("darkblue","darkred","darkgreen"),labels = c("C1", "C2", "C3"))+
  geom_boxplot(alpha=0, outlier.colour="black", outlier.shape=16, outlier.size=0.5, notch=FALSE) +
  facet_wrap( ~ names, ncol = 5) +
  theme_minimal() +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), legend.position = "right", panel.grid = element_blank(),
legend.title = element_blank(), axis.text.x = element_blank())+
  guides(colour = guide_legend(override.aes = list(size=2.5)))
plot
#dev.off()
table(data$group)
##make RF data frame with the ordinal class variable
RF_data <- na.omit(data.frame(Pred_data$CA_dic, Pred_data$gender, Pred_data$FriendSupport_14, Pred_data$FamilySupport_14, Pred_data$FamilyCohesion_14, Pred_data$PosSes_14, Pred_data$NegSes_14, Pred_data$Brd_14, Pred_data$Reflection_14, Pred_data$DisTol_14, Pred_data$Aggression_sc, Pred_data$ExpressiveSuppression_sc, Pred_data$GeneralDistress_14_lti, Pred_data$GeneralDistress_14_lti_SE, Pred_data$GeneralDistress_17_lti, Pred_data$GeneralDistress_17_lti_SE, Pred_data$GeneralDistress_17_FLCA3, Pred_data$GeneralDistress_17_FLCA4)); nrow(RF_data)
#code CA and gender as numeric
RF_data$gender_num <- as.numeric(RF_data$Pred_data.gender);table(RF_data$gender_num)
RF_data$CA_dic_num <- as.numeric(RF_data$Pred_data.CA_dic);table(RF_data$CA_dic_num)
library("car")
table(RF_data$Pred_data.GeneralDistress_17_FLCA3)
# 1 2 3
#623 390 117
# Relabel classes with a/b/c prefixes so alphabetical factor order equals low < moderate < high,
# then convert to an ordered factor for the cumulative (ordinal) models below.
RF_data$Cluster_data.three_clusters <- gsub("1","alow", RF_data$Pred_data.GeneralDistress_17_FLCA3)
RF_data$Cluster_data.three_clusters <- gsub("2","bmoderate", RF_data$Cluster_data.three_clusters)
RF_data$Cluster_data.three_clusters <- gsub("3","chigh", RF_data$Cluster_data.three_clusters)
RF_data$Cluster_data.three_clusters <- ordered(RF_data$Cluster_data.three_clusters); class(RF_data$Cluster_data.three_clusters)
table(RF_data$Cluster_data.three_clusters); class(RF_data$Cluster_data.three_clusters)
detach("package:car", unload=TRUE)
```
##08: Ordinal model: Run the machine learning model with CA, gender and RFs as predictors
```{r}
##make the data frame for the whole sample
RF_data_ML_II <- data.frame(RF_data$CA_dic_num, RF_data$gender_num, RF_data$Pred_data.FriendSupport_14, RF_data$Pred_data.FamilySupport_14, RF_data$Pred_data.FamilyCohesion_14, RF_data$Pred_data.PosSes_14, RF_data$Pred_data.NegSes_14, RF_data$Pred_data.Brd_14, RF_data$Pred_data.Reflection_14, RF_data$Pred_data.DisTol_14, RF_data$Pred_data.Aggression_sc, RF_data$Pred_data.ExpressiveSuppression_sc, RF_data$Cluster_data.three_clusters)
head(RF_data_ML_II); nrow(RF_data_ML_II)
class(RF_data_ML_II$RF_data.Cluster_data.three_clusters)
##splitting the data in training and prediction data set
set.seed(100)
# Stratified split on the class variable so the class proportions are preserved.
trainDataIndex <- createDataPartition(RF_data_ML_II$RF_data.Cluster_data.three_clusters, p=0.75, list = F) # 75% training data
trainData_ML_II <- RF_data_ML_II[trainDataIndex,]; head(trainData_ML_II)
testData_ML_II <- RF_data_ML_II[-trainDataIndex,]; head(testData_ML_II)
table(trainData_ML_II$RF_data.Cluster_data.three_clusters)
# alow bmoderate chigh
# 468 293 88
table(testData_ML_II$RF_data.Cluster_data.three_clusters)
# alow bmoderate chigh
# 155 97 29
##running the ordinal model on the training dataset: polr
formula <- RF_data.Cluster_data.three_clusters ~ RF_data.CA_dic_num+ RF_data.gender_num+ RF_data.Pred_data.FriendSupport_14+ RF_data.Pred_data.FamilySupport_14+ RF_data.Pred_data.FamilyCohesion_14+ RF_data.Pred_data.PosSes_14+ RF_data.Pred_data.NegSes_14+ RF_data.Pred_data.Brd_14+ RF_data.Pred_data.Reflection_14+ RF_data.Pred_data.DisTol_14+ RF_data.Pred_data.Aggression_sc+ RF_data.Pred_data.ExpressiveSuppression_sc
ordinalMod <- polr(formula, data = trainData_ML_II, Hess=TRUE, method ="probit"); summary(ordinalMod)
#probit: Residual Deviance: 1452.56; AIC: 1480.56
ordinalMod <- polr(formula, data = trainData_ML_II, Hess=TRUE, method ="logistic"); summary(ordinalMod)
#logistic: Residual Deviance: 1445.99; AIC: 1473.99
# two-sided p-values derived from the polr t statistics
ctable <- coef(summary(ordinalMod)); p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
ctable <- cbind(ctable, "p value" = p); ctable
# Brant test of the proportional-odds assumption per predictor.
brant(ordinalMod, by.var= TRUE)
#prop-odds-unmet: RF_data.gender_num=0; RF_data.Pred_data.DisTol_14=0.02
##running the ordinal model on the training dataset: VGAM
# Partial proportional-odds models: the `parallel = FALSE ~ ...` formula frees the
# coefficients of the named predictors across the two cumulative logits.
ordinalModpar0 <- vgam(formula, data = trainData_ML_II, cumulative(link = "logitlink", reverse = TRUE, parallel = TRUE)); summary(ordinalModpar0)
#RD =1445.99; LL =-722.9948; reverse=TRUE
ctablepar <- coef(summary(ordinalModpar0)); ctablepar
ordinalModpar_f1 <- vgam(formula, data = trainData_ML_II, cumulative(link = "logitlink", reverse = TRUE, parallel = FALSE ~ 1 + RF_data.gender_num)); summary(ordinalModpar_f1)
#RD =1422.378; LL =-711.189; reverse=TRUE
ctablepar <- coef(summary(ordinalModpar_f1)); ctablepar
lrtest_vglm(ordinalModpar0, ordinalModpar_f1) # p = 1.179e-06 ***
ordinalModpar_f <- vgam(formula, data = trainData_ML_II, cumulative(link = "logitlink", reverse = TRUE, parallel = FALSE ~ 1 + RF_data.gender_num + RF_data.Pred_data.DisTol_14)); summary(ordinalModpar_f)
#RD =1420.354; LL =-710.1768; reverse=TRUE
ctablepar <- coef(summary(ordinalModpar_f)); ctablepar
lrtest_vglm(ordinalModpar_f1, ordinalModpar_f) # p = 0.1548
##predict
# Hard class prediction = column with the largest predicted class probability per test case.
pred_ML_II <- apply(VGAM::predictvglm(ordinalModpar_f, newdata = testData_ML_II, type = "response"), 1, which.max); head(pred_ML_II)
pred_ML_II <- gsub("1","alow", pred_ML_II)
pred_ML_II <- gsub("2","bmoderate", pred_ML_II)
pred_ML_II <- gsub("3","chigh", pred_ML_II)
pred_ML_II <- as.factor(pred_ML_II); table(pred_ML_II)
# alow bmoderate chigh
# 189 91 1
class_ML_II <- as.factor(testData_ML_II[,13]); table(class_ML_II)
# alow bmoderate chigh
# 155 97 29
##assess accuracy
caret::confusionMatrix(pred_ML_II, class_ML_II)
#Prediction
#Prediction alow bmoderate chigh
# alow 129 44 16
# bmoderate 26 52 13
# chigh 0 1 0
# Accuracy : 0.6441
# Class: alow Class: bmoderate Class: chigh
#Sensitivity 0.8323 0.5361 0.000000
#Specificity 0.5238 0.7880 0.996032
#Balanced Accuracy 0.6780 0.6621 0.498016
#ROC curve
predProbs_ML_II <- as.data.frame(VGAM::predictvglm(ordinalModpar_f, newdata = testData_ML_II, type = "response"))
predROC_ML_II <- multiclass.roc(testData_ML_II[,13], predProbs_ML_II[,"chigh"]); auc(predROC_ML_II) # 0.7515
predROC_ML_II <- multiclass.roc(testData_ML_II[,13], predProbs_ML_II[,"bmoderate"]); auc(predROC_ML_II) #
# 0.6485
predROC_ML_II <- multiclass.roc(testData_ML_II[,13], predProbs_ML_II[,"alow"]); auc(predROC_ML_II) # 0.7036
```
##09: Ordinal model: Run the machine learning model with CA, gender and GD as predictors
```{r}
##make the data frame for the whole sample
RF_data_ML_II <- data.frame(RF_data$CA_dic_num, RF_data$gender_num, RF_data$Pred_data.GeneralDistress_14_lti, RF_data$Cluster_data.three_clusters)
head(RF_data_ML_II); nrow(RF_data_ML_II)
class(RF_data_ML_II$RF_data.Cluster_data.three_clusters)
##splitting the data in training and prediction data set
set.seed(100)
trainDataIndex <- createDataPartition(RF_data_ML_II$RF_data.Cluster_data.three_clusters, p=0.75, list = F) # 75% training data
trainData_ML_II <- RF_data_ML_II[trainDataIndex,]; head(trainData_ML_II)
testData_ML_II <- RF_data_ML_II[-trainDataIndex,]; head(testData_ML_II)
table(trainData_ML_II$RF_data.Cluster_data.three_clusters)
# alow bmoderate chigh
# 468 293 88
table(testData_ML_II$RF_data.Cluster_data.three_clusters)
# alow bmoderate chigh
# 155 97 29
##run the ordinal model on the training dataset: polr
formula <- RF_data.Cluster_data.three_clusters ~ RF_data.CA_dic_num + RF_data.gender_num + RF_data.Pred_data.GeneralDistress_14_lti
ordinalMod <- polr(formula, data = trainData_ML_II, Hess=TRUE, method ="probit"); summary(ordinalMod)
#probit: Residual Deviance: 1433.319; AIC: 1443.319
ordinalMod <- polr(formula, data = trainData_ML_II, Hess=TRUE, method ="logistic"); summary(ordinalMod)
#logistic: Residual Deviance: 1426.084; AIC: 1436.084
# two-sided p-values derived from the polr t statistics
ctable <- coef(summary(ordinalMod)); p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
ctable <- cbind(ctable, "p value" = p); ctable
brant(ordinalMod, by.var= TRUE)
#prop-odds-unmet: RF_data.gender_num=0; RF_data.Pred_data.GeneralDistress_14_lti=0
##run the ordinal model on the training dataset: VGAM
ordinalModpar0 <- vgam(formula, data = trainData_ML_II, cumulative(link = "logitlink", reverse = TRUE, parallel = TRUE)); summary(ordinalModpar0)
#RD = 1426.084; LL = -713.0418; reverse=TRUE
ctablepar <- coef(summary(ordinalModpar0)); ctablepar
ordinalModpar_f1 <- vgam(formula, data = trainData_ML_II, cumulative(link = "logitlink", reverse = TRUE, parallel = FALSE ~ 1 + RF_data.gender_num)); summary(ordinalModpar_f1)
#RD = 1402.029; LL = -701.0146; reverse=TRUE
ctablepar <- coef(summary(ordinalModpar_f1)); ctablepar
lrtest_vglm(ordinalModpar0, ordinalModpar_f1) # p = 9.366e-07 ***
ordinalModpar_f <- vgam(formula, data = trainData_ML_II, cumulative(link = "logitlink", reverse = TRUE, parallel = FALSE ~ 1 + RF_data.gender_num + RF_data.Pred_data.GeneralDistress_14_lti)); summary(ordinalModpar_f)
#RD = 1390.408; LL = -695.2038; reverse=TRUE
ctablepar <- coef(summary(ordinalModpar_f)); ctablepar
lrtest_vglm(ordinalModpar_f1, ordinalModpar_f) # p = 0.0006519 ***
##predict
pred_ML_II <- apply(VGAM::predictvglm(ordinalModpar_f, newdata = testData_ML_II, type = "response"), 1, which.max); head(pred_ML_II)
pred_ML_II <- gsub("1","alow", pred_ML_II)
pred_ML_II <- gsub("2","bmoderate", pred_ML_II)
pred_ML_II <- gsub("3","chigh", pred_ML_II)
# FIX: no test case is predicted "chigh" here (see the table below), so as.factor() would
# yield a 2-level factor and caret::confusionMatrix() errors when the prediction has fewer
# levels than the reference. Pin the full level set explicitly.
pred_ML_II <- factor(pred_ML_II, levels = c("alow", "bmoderate", "chigh")); table(pred_ML_II)
# alow bmoderate
# 186 95
class_ML_II <- as.factor(testData_ML_II[,4]); table(class_ML_II)
# alow bmoderate chigh
# 155 97 29
##assess accuracy
caret::confusionMatrix(pred_ML_II, class_ML_II)
#Prediction alow bmoderate chigh
# alow 123 46 17
# bmoderate 32 51 12
# chigh 0 0 0
# Accuracy : 0.6192
# Class: alow Class: bmoderate Class: chigh
#Sensitivity 0.7935 0.5258 0.0000
#Specificity 0.5000 0.7609 1.0000
#Balanced Accuracy 0.6468 0.6433 0.5000
#ROC curve
predProbs_ML_II <- as.data.frame(VGAM::predictvglm(ordinalModpar_f, newdata = testData_ML_II, type = "response"))
predROC_ML_II <- multiclass.roc(testData_ML_II[,4], predProbs_ML_II[,"chigh"]); auc(predROC_ML_II) # 0.7096
predROC_ML_II <- multiclass.roc(testData_ML_II[,4], predProbs_ML_II[,"bmoderate"]); auc(predROC_ML_II) # 0.6829
predROC_ML_II <- multiclass.roc(testData_ML_II[,4],
predProbs_ML_II[,"alow"]); auc(predROC_ML_II) # 0.6913
```
##10: Ordinal model: Run the machine learning model with CA, gender and RFs and GD as predictors
```{r}
##make the data frame for the whole sample
RF_data_ML_II <- data.frame(RF_data$CA_dic_num, RF_data$gender_num, RF_data$Pred_data.FriendSupport_14, RF_data$Pred_data.FamilySupport_14, RF_data$Pred_data.FamilyCohesion_14, RF_data$Pred_data.PosSes_14, RF_data$Pred_data.NegSes_14, RF_data$Pred_data.Brd_14, RF_data$Pred_data.Reflection_14, RF_data$Pred_data.DisTol_14, RF_data$Pred_data.Aggression_sc, RF_data$Pred_data.ExpressiveSuppression_sc, RF_data$Pred_data.GeneralDistress_14_lti, RF_data$Cluster_data.three_clusters)
head(RF_data_ML_II); nrow(RF_data_ML_II)
class(RF_data_ML_II$RF_data.Cluster_data.three_clusters)
##splitting the data in training and prediction data set
set.seed(100)
# Same seed as chunks 08/09, so the train/test split is identical across predictor sets.
trainDataIndex <- createDataPartition(RF_data_ML_II$RF_data.Cluster_data.three_clusters, p=0.75, list = F) # 75% training data
trainData_ML_II <- RF_data_ML_II[trainDataIndex,]; head(trainData_ML_II)
testData_ML_II <- RF_data_ML_II[-trainDataIndex,]; head(testData_ML_II)
table(trainData_ML_II$RF_data.Cluster_data.three_clusters)
# alow bmoderate chigh
# 468 293 88
table(testData_ML_II$RF_data.Cluster_data.three_clusters)
# alow bmoderate chigh
# 155 97 29
##run the ordinal model on the training dataset: polr
formula <- RF_data.Cluster_data.three_clusters ~ RF_data.CA_dic_num+ RF_data.gender_num+ RF_data.Pred_data.FriendSupport_14+ RF_data.Pred_data.FamilySupport_14+ RF_data.Pred_data.FamilyCohesion_14+ RF_data.Pred_data.PosSes_14+ RF_data.Pred_data.NegSes_14+ RF_data.Pred_data.Brd_14+ RF_data.Pred_data.Reflection_14+ RF_data.Pred_data.DisTol_14+ RF_data.Pred_data.Aggression_sc+ RF_data.Pred_data.ExpressiveSuppression_sc + RF_data.Pred_data.GeneralDistress_14_lti
ordinalMod <- polr(formula, data = trainData_ML_II, Hess=TRUE, method ="probit"); summary(ordinalMod)
#probit: Residual Deviance: 1425.151; AIC: 1455.151
ordinalMod <- polr(formula, data = trainData_ML_II, Hess=TRUE, method ="logistic"); summary(ordinalMod)
#logistic: Residual Deviance: 1419.274; AIC: 1449.274
# two-sided p-values derived from the polr t statistics
ctable <- coef(summary(ordinalMod)); p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
ctable <- cbind(ctable, "p value" = p); ctable
brant(ordinalMod, by.var= TRUE)
#prop-odds-unmet: RF_data.gender_num=0; RF_data.Pred_data.DisTol_14=0.01;
##run the ordinal model on the training dataset: VGAM
ordinalModpar0 <- vgam(formula, data = trainData_ML_II, cumulative(link = "logitlink", reverse = TRUE, parallel = TRUE)); summary(ordinalModpar0)
#RD =1419.274; LL =-709.637; reverse=TRUE
ctablepar <- coef(summary(ordinalModpar0)); ctablepar
ordinalModpar_f1 <- vgam(formula, data = trainData_ML_II, cumulative(link = "logitlink", reverse = TRUE, parallel = FALSE ~ 1 + RF_data.gender_num)); summary(ordinalModpar_f1)
#RD =1395.018; LL =-697.5092; reverse=TRUE
ctablepar <- coef(summary(ordinalModpar_f1)); ctablepar
lrtest_vglm(ordinalModpar0, ordinalModpar_f1) # p = 8.436e-07 ***
ordinalModpar_f <- vgam(formula, data = trainData_ML_II, cumulative(link = "logitlink", reverse = TRUE, parallel = FALSE ~ 1 + RF_data.gender_num + RF_data.Pred_data.DisTol_14)); summary(ordinalModpar_f)
#RD =1392.831; LL =-696.4155; reverse=TRUE
ctablepar <- coef(summary(ordinalModpar_f)); ctablepar
lrtest_vglm(ordinalModpar_f1, ordinalModpar_f) # p = 0.1392
##predict
pred_ML_II <- apply(VGAM::predictvglm(ordinalModpar_f, newdata = testData_ML_II, type = "response"), 1, which.max); head(pred_ML_II)
pred_ML_II <- gsub("1","alow", pred_ML_II)
pred_ML_II <- gsub("2","bmoderate", pred_ML_II)
pred_ML_II <- gsub("3","chigh", pred_ML_II)
pred_ML_II <- as.factor(pred_ML_II); table(pred_ML_II)
# alow bmoderate chigh
# 187 93 1
class_ML_II <- as.factor(testData_ML_II[,14]); table(class_ML_II)
# alow bmoderate chigh
# 155 97 29
##assess accuracy
caret::confusionMatrix(pred_ML_II, class_ML_II)
#Prediction alow bmoderate chigh
# alow 127 45 15
# bmoderate 28 51 14
# chigh 0 1 0
# Accuracy : 0.6335
# Class: alow Class: bmoderate Class: chigh
#Sensitivity 0.8194 0.5258 0.000000
#Specificity 0.5238 0.7717 0.996032
#Balanced Accuracy 0.6716 0.6488 0.498016
#ROC curve
# One-vs-rest multiclass AUCs using each class's predicted probability column.
predProbs_ML_II <- as.data.frame(VGAM::predictvglm(ordinalModpar_f, newdata = testData_ML_II, type = "response"))
predROC_ML_II <- multiclass.roc(testData_ML_II[,14], predProbs_ML_II[,"chigh"]); auc(predROC_ML_II) # 0.7404
predROC_ML_II <- multiclass.roc(testData_ML_II[,14], predProbs_ML_II[,"bmoderate"]); auc(predROC_ML_II) # 0.6784
predROC_ML_II <- multiclass.roc(testData_ML_II[,14], predProbs_ML_II[,"alow"]); auc(predROC_ML_II) # 0.6891
```
##11: Continuous model: Run the machine learning model with CA, gender and RFs as predictors
```{r}
##make the data frame for the whole sample
RF_data_ML_ws <- data.frame(RF_data$Pred_data.CA_dic, RF_data$Pred_data.gender, RF_data$Pred_data.FriendSupport_14, RF_data$Pred_data.FamilySupport_14, RF_data$Pred_data.FamilyCohesion_14, RF_data$Pred_data.PosSes_14, RF_data$Pred_data.NegSes_14, RF_data$Pred_data.Brd_14, RF_data$Pred_data.Reflection_14, RF_data$Pred_data.DisTol_14, RF_data$Pred_data.Aggression_sc, RF_data$Pred_data.ExpressiveSuppression_sc, RF_data$Pred_data.GeneralDistress_17_lti, RF_data$Pred_data.GeneralDistress_17_lti_SE)
head(RF_data_ML_ws); nrow(RF_data_ML_ws)
##splitting the data in training and prediction data set
set.seed(100)
trainDataIndex <- createDataPartition(RF_data_ML_ws$RF_data.Pred_data.GeneralDistress_17_lti, p=0.75, list = F) # 75% training data
trainData_ML_ws <- RF_data_ML_ws[trainDataIndex,]; head(trainData_ML_ws)
testData_ML_ws <- RF_data_ML_ws[-trainDataIndex,]; head(testData_ML_ws)
stat.desc(trainData_ML_ws$RF_data.Pred_data.GeneralDistress_17_lti) # 850
stat.desc(testData_ML_ws$RF_data.Pred_data.GeneralDistress_17_lti) # 280
##run the linear model on the training dataset
#lm
formula <- RF_data.Pred_data.GeneralDistress_17_lti ~ RF_data.Pred_data.CA_dic + RF_data.Pred_data.gender +
## Remaining right-hand side of the lm formula begun on the previous line:
## the ten age-14 resilience factors.
RF_data.Pred_data.FriendSupport_14 + RF_data.Pred_data.FamilySupport_14 +
  RF_data.Pred_data.FamilyCohesion_14 + RF_data.Pred_data.PosSes_14 +
  RF_data.Pred_data.NegSes_14 + RF_data.Pred_data.Brd_14 +
  RF_data.Pred_data.Reflection_14 + RF_data.Pred_data.DisTol_14 +
  RF_data.Pred_data.Aggression_sc + RF_data.Pred_data.ExpressiveSuppression_sc

## Plain linear model on the training data.
lmMod <- lm(formula, data = trainData_ML_ws); summary(lmMod)

#caret: same linear model, with 1000 bootstrap resamples for the fit statistics.
set.seed(100)
lm_ML_ws <- train(x = trainData_ML_ws[, 1:12], y = trainData_ML_ws[, 13],
                  method = "lm",
                  trControl = trainControl(method = "boot", number = 1000))
lm_ML_ws$finalModel
lm_ML_ws # interpret R-squared

#predict on test dataset
pred_ML_ws <- predict(lm_ML_ws$finalModel, testData_ML_ws[, 1:12]); summary(pred_ML_ws)

##assess continuous outcomes
#original - continuous
stat.desc(testData_ML_ws$RF_data.Pred_data.GeneralDistress_17_lti)
#predicted - continuous
pred_ML_ws <- data.frame(pred_ML_ws); stat.desc(pred_ML_ws$pred_ML_ws)

#model accuracy - continuous (RMSE / R-squared / MAE on the held-out data)
modelvalues <- data.frame(obs = testData_ML_ws$RF_data.Pred_data.GeneralDistress_17_lti,
                          pred = pred_ML_ws$pred_ML_ws)
defaultSummary(modelvalues)
plot(testData_ML_ws$RF_data.Pred_data.GeneralDistress_17_lti, pred_ML_ws$pred_ML_ws)

#prediction accuracy - continuous
# 95% interval around each observed score: obs +/- 1.96 * SE.
modelvalue_acc <- data.frame(obs = testData_ML_ws$RF_data.Pred_data.GeneralDistress_17_lti,
                             obs_SE = testData_ML_ws$RF_data.Pred_data.GeneralDistress_17_lti_SE,
                             pred = pred_ML_ws$pred_ML_ws)
head(modelvalue_acc)
modelvalue_acc$obs_lowSE95 <- modelvalue_acc$obs - (modelvalue_acc$obs_SE * 1.96)
modelvalue_acc$obs_highSE95 <- modelvalue_acc$obs + (modelvalue_acc$obs_SE * 1.96)
# obs_SE95 keeps a prediction only when it falls inside the observed score's
# 95% interval, and is set to 0 otherwise (so non-zero entries count as
# "accurate" predictions).
# NOTE(review): the original source was corrupted at this point -- every
# "< ..." comparison (and the following "<- ..." assignment) up to the next
# ">" was stripped as if it were an HTML tag, leaving repeated
# "ifelse(modelvalue_acc$predmodelvalue_acc$obs_highSE95, ...)" fragments and
# a truncated final statement. The two statements below reconstruct the
# evident intent: zero out predictions above the upper bound, then zero out
# those below the lower bound. Confirm against the authors' original script.
modelvalue_acc$obs_SE95 <- as.numeric(ifelse(modelvalue_acc$pred > modelvalue_acc$obs_highSE95, 0, modelvalue_acc$pred)); head(modelvalue_acc)
modelvalue_acc$obs_SE95 <- as.numeric(ifelse(modelvalue_acc$pred < modelvalue_acc$obs_lowSE95, 0, modelvalue_acc$obs_SE95)); head(modelvalue_acc)