#SPSP Introduction to R #Video 6 Script: Int. Associative Tests #Bivariate and Multiple Regression #Set working directory if needed---- setwd("C:/Users/.../Desktop/LearningR/IntroR") # #Installing and attaching packages---- PacMan <- function(pkg){new.pkg <- pkg[!(pkg %in% installed.packages() [,"Package"])] if (length(new.pkg)) install.packages(new.pkg, dependencies = TRUE) sapply(pkg, library, character.only = TRUE)} pkgs <- c("ggplot2", "apaTables", "nortest", "MVN", "gvlma", "semTools", "DescTools", "dplyr", "tidyr", "broom", "lm.beta", "ppcor", "olsrr") PacMan(pkgs) # #Import data---- grade_DF <- read.csv( "gradeData_2020.csv", header=TRUE, na.strings=c("NA", " ", "")) View(grade_DF) # #Bivariate Regression---- # ##Check Assumptions---- #Visually assess scatterplot ggplot(grade_DF, aes(x=SAT, y=FinalExam)) + geom_point(size=1, alpha=.4) + theme_classic() + geom_smooth(method=lm, color="black", fill="blue") # #Univariate normality library(nortest) lillie.test(grade_DF$SAT) lillie.test(grade_DF$FinalExam) # shapiro.test(grade_DF$SAT) shapiro.test(grade_DF$FinalExam) # library(semTools) skew(grade_DF$SAT) kurtosis(grade_DF$SAT) skew(grade_DF$FinalExam) kurtosis(grade_DF$FinalExam) #Create summary table #with multiple variables library(dplyr) library(tidyr) library(DescTools) SumTab <- grade_DF %>% summarize_at(vars(SAT, FinalExam), c(M=mean, SD=sd, Mdn=median, skew=Skew, kurt=Kurt))%>% gather(stat, value) %>% separate(stat, into = c("Var", "stat")) %>% spread(stat, value) %>% mutate_if(is.numeric, round, 3) %>% mutate_if(is.numeric, format, nsmall=3) %>% select_at(vars(Var, M, SD, Mdn, skew, kurt)) View(SumTab) # #Bivariate Normality library(MVN) NormTest <- mvn(grade_DF[,c(3,6)], mvnTest = "royston", univariateTest = "Lillie", multivariatePlot = "qq", multivariateOutlierMethod = "quan") NormTest$multivariateNormality NormTest$univariateNormality View(NormTest$Descriptives) # #Evaluate Residuals #Run Model BivarReg <- lm( formula = FinalExam ~ SAT, data = grade_DF) #Histogram of Residuals hist(resid(BivarReg)) hist(BivarReg$residuals) #ggplot2 - Residuals Histogram #Graph directly from Regression Model Output ggplot(data=as.data.frame(BivarReg$residuals), aes(x=BivarReg$residuals))+ geom_histogram(position="identity", alpha=0.8, binwidth=5, col="white")+ labs(title="Residuals Histogram", x="Residuals")+theme_classic() #Add fit statistics to data frame grade_DF$ExamPred <- predict(BivarReg) grade_DF$ExamResid <- residuals(BivarReg) View(grade_DF) #Residual histogram from DF ggplot(data=grade_DF, aes(x=ExamResid))+ geom_histogram(position="identity", alpha=0.8, binwidth=5, col="white")+ labs(title="Residuals Histogram")+ theme_classic() # #Check the mean of the residuals is 0 round(mean(BivarReg$residuals),3) summary(BivarReg$residuals) #Using residuals stored in data frame round(mean(grade_DF$ExamResid), 3) # #Check assumptions with plots plot(BivarReg) # #Check plots from gvlma library(gvlma) plot(gvlma(BivarReg), onepage = FALSE) #Plot Pred by Resid values ggplot(grade_DF, aes(x=ExamPred, y=ExamResid)) + geom_point(alpha=.4) + geom_smooth(method="lm", se=FALSE, color="blue")+ theme_classic() #Create Standardized Residuals Mresid <- mean(grade_DF$ExamResid) SDresid <- sd(grade_DF$ExamResid) grade_DF$ExamResid_z <- (( grade_DF$ExamResid-Mresid)/SDresid) # #Plot Pred by Stand Resid values ggplot(grade_DF, aes(x=ExamPred, y=ExamResid_z)) + geom_point(alpha=.4) + geom_smooth(method="lm", se=FALSE, color="blue")+ theme_classic()+ scale_x_continuous(breaks=seq(65, 90, 5)) # #Augment model to a Tidy Tibble library(broom) RegMetrics <- round(augment(BivarReg), 3) # #View summary of RegMetrics summary(RegMetrics) View(RegMetrics) #Add measure of influence to DF INF_est <- influence(BivarReg) grade_DF$sigmaInf <- INF_est$sigma View(grade_DF) # ##Complete Regression---- BivarReg <- lm( formula = FinalExam ~ SAT, data = grade_DF) #Output regression model summary.lm(BivarReg) anova(BivarReg) #Coefficients for model coefficients(BivarReg) coef(BivarReg) #Confidence interval for estimates confint.lm(BivarReg, level=.95) # ##Present Regression---- #Correlation table with M and SD library(apaTables) apa.cor.table(grade_DF[,c(3,6)], filename="CorOut.doc") #Regression model table apa.reg.table(BivarReg, filename="RegOut.doc") # ##Consider Alternatives---- set.seed(42)#For reproducibility #Bootstrap Confidence Intervals apa.reg.boot.table(BivarReg, filename="BootReg5000.doc", table.number=1001, number.samples = 5000) # #Multiple Regression---- # ##Check Assumptions---- #Visually assess scatterplots ggplot(grade_DF, aes(x=SAT, y=FinalExam)) + geom_point(size=1, alpha=.4) + theme_classic() + geom_smooth(method=lm, color="black", fill="blue") # ggplot(grade_DF, aes(x=minStudy, y=FinalExam)) + geom_point(size=1, alpha=.4) + theme_classic() + geom_smooth(method=lm, color="black", fill="blue") # #Univariate and Multivariate Normality #Create data frame with variables of interest MultReg_DF <- subset.data.frame(grade_DF, select=c(SAT, minStudy, FinalExam)) View(MultReg) #Check normality MultNorm <- mvn(MultReg_DF, mvnTest = "royston", univariateTest = "Lillie", multivariatePlot = "qq", multivariateOutlierMethod = "quan") View(MultNorm$multivariateNormality) View(MultNorm$univariateNormality) View(MultNorm$Descriptives) # #Evaluate Residuals #Run Model MultReg <- lm( formula = FinalExam ~ SAT + minStudy, data = grade_DF) #Create Residuals Histogram #Add predicted and residual #values to data frame grade_DF$ExamPred <- predict(MultReg) grade_DF$ExamResid <- residuals(MultReg) View(grade_DF) #Create histogram of residuals ggplot(data=grade_DF, aes(x=ExamResid))+ geom_histogram(position="identity", alpha=0.8, binwidth=5, col="white")+ labs(title="Residuals Histogram")+ theme_classic() # #Check the mean of the residuals is 0 summary(grade_DF$ExamResid) # #Check plots plot(gvlma(MultReg), onepage = FALSE) # #Augment model to a Tidy Tibble MultRegMetrics <- round(augment(MultReg), 3) #View summary of RegMetrics summary(MultRegMetrics) #Add influence estimate to DF INF_estMult <- influence(MultReg) View(INF_estMult) grade_DF$sigmaInfMult <- INF_estMult$sigma View(grade_DF) # #Check Collinearity #Check zero-order correlation #between IVs apa.cor.table(MultReg_DF, filename="MultCorOut.doc") #Formally test collinearity install.packages("olsrr", dependencies=T) library(olsrr) CollTest <- ols_coll_diag(MultReg) View(CollTest$vif_t %>% mutate_if(is.numeric, round, 3)) View(CollTest$eig_cindex %>% mutate_if(is.numeric, round, 3)) # ##Complete Regression---- MultReg <- lm( FinalExam ~ SAT + minStudy, grade_DF) #Summary output for model summary.lm(MultReg) #Summary for Bivariate model anova(BivarReg) #Summary for comparison between #Bivariate and Multiple Regression #Does adding minStudy variable #improve estimates for FinalExam? anova(BivarReg, MultReg) # ##Present Regression---- #Descriptives and zero-order correlations library(apaTables) apa.cor.table(MultReg_DF, filename="MultCorOut.doc") #Regression table apa.reg.table(MultReg, filename="MultRegOut.doc") # ##Consider Alternatives---- set.seed(42)#For reproducibility #Bootstrap confidence intervals apa.reg.boot.table(MultReg, filename="BootMultReg5000.doc", table.number=1002, number.samples = 5000) #BONUS: 3D Visualizations---- #Stationary 3D plot library(scatterplot3d) ScatEXAM_DF <- scatterplot3d(grade_DF$FinalExam ~ grade_DF$SAT + grade_DF$minStudy, main="Final Exam \n by SAT and Minutes Studied", xlab="SAT", ylab="Min. Studied", zlab="Final Exam") ScatPlane <- lm(FinalExam~SAT+minStudy, data=grade_DF) ScatEXAM_DF$plane3d(ScatPlane) # #Interactive 3D plot library(rgl) plot3d(FinalExam~SAT+minStudy, data=grade_DF, xlab="SAT", ylab="Min. Studied", zlab="Final Exam") # #Scatterplot with Grouping Var. library(car) scatter3d(FinalExam~SAT+minStudy| schoolType, data=grade_DF, xlab="SAT", ylab="Final Exam", zlab="minStudy") #