#SPSP Introduction to R
#Video 5 Script: Basic Associative Tests
#Correlation and Chi Square
#Set working directory if needed----
setwd("C:/Users/.../Desktop/LearningR/IntroR")
#
#Installing and attaching packages----
PacMan <- function(pkg){new.pkg <- 
  pkg[!(pkg %in% installed.packages()
        [,"Package"])] 
if (length(new.pkg))         
  install.packages(new.pkg, 
                   dependencies = TRUE)     
sapply(pkg, library, character.only = TRUE)}
pkgs <- c("ggplot2", "dplyr", "apa", 
          "apaTables", "nortest", "MVN")
PacMan(pkgs)
#
#Import data----
grade_DF <- read.csv(
  "gradeData_2020.csv", 
  header=TRUE, 
  na.strings=c("NA", " ", ""))
#
#Converting Final Exam Variable to Categorical----
grade_DF$Letter <- 0
grade_DF$Letter[grade_DF$FinalExam< 59.5] <- "F"
grade_DF$Letter[grade_DF$FinalExam>=59.5 &
          grade_DF$FinalExam<69.5] <- "D"
grade_DF$Letter[grade_DF$FinalExam>=69.5 &
          grade_DF$FinalExam<79.5] <- "C"
grade_DF$Letter[grade_DF$FinalExam>=79.5 &
          grade_DF$FinalExam<89.5] <- "B"
grade_DF$Letter[grade_DF$FinalExam>=89.5] <- "A"
grade_DF$Letter <- factor(grade_DF$Letter)
levels(grade_DF$Letter)
View(grade_DF)
#
#Chi Square Test----
#
#Data Screening and Assumption Testing----
GrdSchoolType <- chisq.test(
  grade_DF$Letter, 
  grade_DF$schoolType, correct=FALSE)
round(GrdSchoolType$expected, 2)
LowExp <- sum(ifelse(
  GrdSchoolType$expected<=5,1,0))
Cells <- nlevels(grade_DF$Letter)*
          nlevels(grade_DF$schoolType)
LowExp
Cells
#Percent of expectancies at or below 5
LowExp/Cells*100
#
#Create combined letter grade for D and F
levels(grade_DF$Letter) <- 
  c(levels(grade_DF$Letter),"D or F")
grade_DF$Letter[grade_DF$FinalExam<69.5] <- 
        "D or F"
grade_DF$Letter <- droplevels(grade_DF$Letter)
View(grade_DF)
levels(grade_DF$Letter)
#
#Screen new levels
GrdSchoolType2 <- chisq.test(
  grade_DF$Letter, 
  grade_DF$schoolType, correct=FALSE)
round(GrdSchoolType2$expected, 2)
LowExp <- sum(ifelse(
  GrdSchoolType2$expected<=5,1,0))
Cells <- nlevels(grade_DF$Letter)*
  nlevels(grade_DF$schoolType)
LowExp
Cells
#Percent of expectancies at or below 5
LowExp/Cells*100
#
#Performing Chi Square Test----
GrdSchoolType2 <- chisq.test(
  grade_DF$Letter, 
  grade_DF$schoolType, correct=FALSE)
GrdSchoolType2
#
#Effect Size: Cramer's V
CramerV <- function(x, y){
  chiOut <- chisq.test(x, y, 
                       correct=FALSE)
  kx <- nlevels(x)
  ky <- nlevels(y)
  N <- sum(chiOut$observed
           [1:(kx*ky)])
  V <- sqrt(chiOut$statistic/
              (N*min((kx-1), (ky-1))))
  names(V) <- "Cramer's V"
  return(c(chiOut$statistic, V))
}
#
CramerV(grade_DF$Letter, 
        grade_DF$schoolType)
#
#Presenting Results----
#Letter & schoolType Tables
GrdSchoolType$observed
round(GrdSchoolType$expected, 2)
#
chisq_apa(GrdSchoolType2, 
          format="docx")
#
#Considering Alternatives----
#Chi Square Output with Alt. tests
X2_Out <- function(x, y){
  chiOut <- chisq.test(x, y, 
                    correct=FALSE)
  df1 <- length(unique(x))-1
  df2 <- length(unique(y))-1
  N <- sum(chiOut$observed[
          1:((df1+1)*(df2+1))])
  V <- round(sqrt(chiOut$statistic/
            (N*min(df1, df2))), 3)
  ChiVal <- round(chiOut$statistic, 3)
  Chidf <- round(chiOut$parameter, 0)
  pVal <- format.pval(chiOut$p.value, 
            eps=.001, digits=3)
  ChiResult <- data.frame(ChiVal, 
                          Chidf, 
                          pVal, V)
  #
  #Yates' Correction
  Yates <- chisq.test(x, y, 
                      correct=TRUE)
  ChiVal <- round(Yates$statistic, 3)
  Chidf <- round(Yates$parameter, 0)
  pVal <- format.pval(Yates$p.value, 
                      eps=.001, digits=3)
  V <- ""
  YatesCorrect <- data.frame(ChiVal, Chidf, 
                             pVal, V)
  #
  #Combine outputs
  ChiTab <- rbind.data.frame(ChiResult, 
                    YatesCorrect)
  names(ChiTab) <- c("Value", "df", 
             "p Value", "Cramer's V")
  row.names(ChiTab) <- c(
              "Pearson's Chi Sq",
              "Yates' Correction")
  ChiTab
}
X2Result <- X2_Out(grade_DF$Letter, 
                   grade_DF$schoolType)
View(X2Result)
#
#Use Monte Carlo simulation for p value
chisq.test(grade_DF$Letter, 
           grade_DF$schoolType, 
           simulate.p.value = TRUE, 
           B=5000)
#
#Bivariate Correlation----
#
#Checking assumptions----
#Visually assess scatterplot
ggplot(grade_DF, aes(x=SAT, y=FinalExam)) +
  geom_point(size=1, alpha=.2) +
  theme_classic() +
  geom_smooth(method=lm, color="black", 
              fill="blue")
#Univariate normality
library(nortest)
lillie.test(grade_DF$SAT)
lillie.test(grade_DF$FinalExam)
#Bivariate Normality
library(MVN)
names(grade_DF)
NormTest <- mvn(grade_DF[,c(3,6)], 
    mvnTest = "royston", 
    univariateTest = "Lillie", 
    multivariatePlot = "qq",
    univariatePlot = "qq")
NormTest$multivariateNormality
NormTest$univariateNormality
#
#Perform correlation----
cor.test(x=grade_DF$SAT, 
     y=grade_DF$FinalExam, 
     method="pearson")
#APA correlation output
CorrSATexam <- cor.test(x=grade_DF$SAT, 
    y=grade_DF$FinalExam, method="pearson")
cor_apa(CorrSATexam, format="docx")
#Add confidence interval
CorrSATexam
CorrSATexam$conf.int
#Table with confidence interval
names(grade_DF)
apa.cor.table(grade_DF[,c(3,6)], 
              filename="CorOut.doc")
#
#Considering alternatives----
cor.test(x=grade_DF$SAT, 
         y=grade_DF$FinalExam, 
         method="kendall")