# SPSP Introduction to R
# Video 3 Script
#
# Walk-through of importing, exploring, summarizing, recoding, and
# exporting the grade data set. Written to be stepped through
# interactively: it opens data viewers with View() and draws base plots.

# Set working directory if needed ----
# The path below is a placeholder; edit it to the folder that holds the
# data files. setwd() stops the script when the folder does not exist, so
# it is only called when the folder is actually present on this machine.
project_dir <- "C:/Users/.../Desktop/LearningR/IntroR"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}

# Import data ----
# na.strings lists every value that should be read as missing (NA).
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads text
# columns as character by default, while the levels()/summary() calls
# below expect schoolType to be a factor.
grade_DF <- read.csv(
  "gradeData_2020.csv",
  header = TRUE,
  na.strings = c("NA", " ", "", "-99"),
  stringsAsFactors = TRUE
)

# Import SPSS file
library(foreign)
grade_SPSS <- read.spss(
  "gradeData_2020.sav",
  to.data.frame = TRUE
)

# Check data imported correctly from .csv file ----
# See names for columns in data frame
names(grade_DF)

# See first 10 rows from data frame
head(grade_DF, n = 10)

# See last 6 rows from data frame
# Note: Default is n = 6, which could be changed to see a different
# number of rows
tail(grade_DF)

# View data frame
View(grade_DF)

# Display internal structure of object
str(grade_DF)

# Show levels from column schoolType in grade_DF data frame
levels(grade_DF$schoolType)

# Show type/mode of schoolType from grade_DF
class(grade_DF$schoolType)

# Visually explore data ----
# Create bar chart - frequency of school type
# Check frequency of schoolType
table(grade_DF$schoolType)

# See levels of schoolType to know the order of the level labels;
# names.arg below must be given in that same order
levels(grade_DF$schoolType)

# Create plot from frequency table
barplot(
  table(grade_DF$schoolType),
  names.arg = c("Home", "Priv", "Pub", "Rel")
)

# Create plot with titles
# The expression function allows you to insert special symbols
barplot(
  table(grade_DF$schoolType),
  main = "Bar Chart: Type of School",
  xlab = "Type of School",
  ylab = expression(paste("Frequency (", italic("n"), ")")),
  names.arg = c("Home", "Priv", "Pub", "Rel"),
  border = "purple"
)

# Histogram SAT
hist(grade_DF$SAT, main = "SAT Histogram", xlab = "SAT")

# Boxplot - SAT by school type
boxplot(
  SAT ~ schoolType,
  data = grade_DF,
  main = "SAT by School Type",
  xlab = "School Type",
  ylab = "SAT(Mdn)"
)

# Scatterplot of SAT and FinalExam
plot(
  x = grade_DF$SAT,
  y = grade_DF$FinalExam,
  xlab = "SAT",
  ylab = "FinalExam"
)

# Add fit line (FinalExam regressed on SAT)
abline(lm(FinalExam ~ SAT, data = grade_DF), col = "blue")

# Quantitatively explore data ----
# Summarize data frame without first column
summary(grade_DF[, -1])

# Summarize categorical variable
summary(grade_DF$schoolType)

# Summarize continuous variable
summary(grade_DF$SAT)

# Frequency table with categorical var
table(grade_DF$schoolType)
SchoolTab <- as.data.frame(
  table(grade_DF$schoolType, dnn = "Type of School")
)
View(SchoolTab)

# Proportions
SchoolProp <- prop.table(SchoolTab$Freq)
SchoolProp
SchoolPerc <- SchoolProp * 100
SchoolFreqTab <- data.frame(
  SchoolTab,
  Proportion = SchoolProp,
  Percentage = SchoolPerc
)
View(SchoolFreqTab)
names(SchoolFreqTab)

# Add names to object
names(SchoolFreqTab) <- c("School Type", "Freq (n)", "Proportion", "%")
View(SchoolFreqTab)

# Save table for later use
write.csv(SchoolFreqTab, file = "SchoolFreqTab.csv", row.names = FALSE)

# Summarizing continuous var
# na.rm = TRUE: the import turns "-99" and blanks into NA, and a single NA
# would otherwise make each of these statistics (and the category
# cut-offs built from them further down) NA. Results are unchanged when
# FinalExam has no missing values.
gradeMean <- mean(grade_DF$FinalExam, na.rm = TRUE)
gradeMean
gradeMedian <- median(grade_DF$FinalExam, na.rm = TRUE)
gradeMedian
gradeMin <- min(grade_DF$FinalExam, na.rm = TRUE)
gradeMin
gradeMax <- max(grade_DF$FinalExam, na.rm = TRUE)
gradeMax
summary(grade_DF$FinalExam)

# Use min and max objects to calculate range
gradeMax - gradeMin

# Summarize variability
gradeVar <- var(grade_DF$FinalExam, na.rm = TRUE)
gradeVar
round(gradeVar, 2)
gradeSD <- sd(grade_DF$FinalExam, na.rm = TRUE)
gradeSD
# The square root of the variance reproduces the SD
sqrt(gradeVar)

# Creating summary table with dplyr
# Install dplyr package (only when it is not installed yet, so re-running
# the script does not re-download it every time)
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", dependencies = TRUE)
}
# Attach/activate package to use it
library(dplyr)

# Install (if needed) and attach semTools
if (!requireNamespace("semTools", quietly = TRUE)) {
  install.packages("semTools", dependencies = TRUE)
}
library(semTools)

# Per-school-type SAT summary: group size, mean, SD, standard error,
# lower/upper limits of the 95% confidence interval (t distribution with
# Grp_n - 1 degrees of freedom), and the first element returned by
# skew() and kurtosis().
# NOTE(review): mean()/sd() here return NA for any group that contains a
# missing SAT value - confirm SAT is complete before relying on this table.
SAT_summary <- grade_DF %>%
  group_by(schoolType) %>%
  summarize(
    Grp_n = n(),
    SATmean = mean(SAT),
    SATsd = sd(SAT),
    SATse = SATsd / sqrt(Grp_n),
    LL95 = SATmean - (SATse * qt(0.975, Grp_n - 1)),
    UL95 = SATmean + (SATse * qt(0.975, Grp_n - 1)),
    SATskew = skew(SAT)[1],
    SATkurt = kurtosis(SAT)[1]
  )
View(SAT_summary)

# Converting data ----
# Continuous to categorical
# Create new column in data frame, filled with missing values so that any
# row that matches none of the rules below (i.e. FinalExam is missing)
# stays NA instead of turning into a spurious category level
grade_DF$GradeCat <- NA_character_
View(grade_DF)

# Cut-offs: one standard deviation (SD) below and above the mean
lowCut <- gradeMean - gradeSD
highCut <- gradeMean + gradeSD

# Create low category for individuals with FinalExam
# less than 1 SD below mean
# (which() drops rows where the comparison is NA)
grade_DF$GradeCat[which(grade_DF$FinalExam < lowCut)] <- "low"

# Create avg category for individuals with FinalExam between two points:
# greater than or equal to mean minus SD
# and less than or equal to mean plus SD
grade_DF$GradeCat[which(
  grade_DF$FinalExam >= lowCut & grade_DF$FinalExam <= highCut
)] <- "avg"

# Create high category for individuals with FinalExam
# greater than 1 SD above mean
grade_DF$GradeCat[which(grade_DF$FinalExam > highCut)] <- "high"
View(grade_DF)

# See type of data for GradeCat column
class(grade_DF$GradeCat)

# Convert GradeCat to factor
grade_DF$GradeCat <- factor(grade_DF$GradeCat)

# Check levels for GradeCat
levels(grade_DF$GradeCat)

# Table with two grouping variables
table(grade_DF$schoolType, grade_DF$GradeCat)

# Managing and exporting data ----
# Select adjacent columns
names(grade_DF)
View(select(grade_DF, gender:GradeCat))

# Select separate columns
View(select(grade_DF, c(Part_ID, SAT, FinalExam)))

# Create subset of data using filter
# with only individuals from private schools
# (library() is a no-op when dplyr is already attached; it is kept so this
# section can be run on its own)
library(dplyr)
Private_DF <- filter(grade_DF, schoolType == "private")
table(grade_DF$schoolType)
View(Private_DF)
levels(grade_DF$schoolType)

# Order data by SAT scores
View(arrange(Private_DF, SAT))

# Add randomly assigned conditions
# Set seed to allow for reproducible quasi-random numbers
set.seed(42)

# Use sample function to get a set of numbers 1, 2, or 3 that is equal in
# length to the number of participants (one draw per row)
Private_DF$Cond <- sample(1:3, size = nrow(Private_DF), replace = TRUE)
View(Private_DF)
table(Private_DF$Cond)

# Save data frame with students
# who attended private school to new .csv file
write.csv(Private_DF, file = "Private_DF.csv", row.names = FALSE)