setwd("D:\\Analytics Educator\\Analytics Educator\\logo\\letter head\\ppt template\\you tube\\10. T test ANOVA using R") library(openxlsx) ## T Test rock <- read.xlsx("Arindam Inc. HR Data.xlsx",sheet="data") head(rock) t.test(Salary ~ Gender, data=rock) # Different way of T Test sm <- rock[rock$Gender == "Male","Salary"] sf <- rock[rock$Gender == "Female","Salary"] t.test(sm,sf) ## Paired T Test kane <- read.xlsx("Arindam Inc. HR Data.xlsx",sheet="data1") kane t.test(weight ~ group, paired = TRUE, data=kane) kane aggregate(weight ~ group, data=kane, FUN= "mean") # if you want to test whether the average weight before #treatment is greater than the average weight after #treatment, type this: t.test(weight ~ group, data = kane, paired = TRUE, alternative = "greater") # if you want to test whether the average weight before #treatment is less than the average weight after #treatment, type this: t.test(weight ~ group, data = kane, paired = TRUE, alternative = "less") ## ANOVA head(rock) boogieman <- aov(Salary ~ Department, data=rock) summary(boogieman) TukeyHSD(boogieman) #### Check the homogeneity of variance assumption library(car) leveneTest(Salary ~ Department, data = rock) #From the output above we can see that the p-value #is not less than the significance level of 0.05. #This means that there is no evidence to suggest that #the variance across groups is statistically significantly #different. Therefore, we can assume the homogeneity of #variances in the different treatment groups. # Extract the residuals goldberg <- residuals(object = boogieman ) # Run Shapiro-Wilk test shapiro.test(x = goldberg ) library(nortest) ad.test(goldberg) # A non-parametric alternative to one-way ANOVA is #Kruskal-Wallis rank sum test, which can be used #when ANNOVA assumptions are not met. kruskal.test(Salary ~ Department, data = rock) # If it shows error then check if the group variable #is not a factor str(rock) rock$Department <- as.factor(rock$Department) #Chi-squared Test of Independence #Two random variables x and y are called independent if #the probability distribution of one variable is not #affected by the presence of another. #H0: the row and the column variables #of the contingency table are independent. #H1: row and column variables are dependent library(MASS) head(survey) table(survey$Smoke) table(survey$Exer) kane = table(survey$Smoke, survey$Exer) bookert <- chisq.test(kane) bookert undertaker = cbind(kane[,"Freq"], kane[,"None"] + kane[,"Some"]) bookert <- chisq.test(undertaker) bookert kane <- read.table("housetasks.txt") #The data is a contingency table containing 13 housetasks #and their distribution in the couple: #rows are the different tasks #values are the frequencies of the tasks done : #by the wife only #alternatively #by the husband only #or jointly bookert <- chisq.test(housetasks) bookert #H0: the row and the column variables #of the contingency table are independent. # Observed counts bookert$observed round(bookert$expected,2) #Expected counts are the projected frequencies in each cell #if the null hypothesis is true #(aka, no association between the variables.) round(bookert$residuals, 3) library(corrplot) corrplot(bookert$residuals, is.cor = FALSE) #Positive residuals are in blue. Positive values in cells #specify an attraction (positive association) between the #corresponding row and column variables. #In the image above, it’s evident that there are an #association between the column Wife and the rows Laundry, #Main_meal. 
# Contribution (in %) of each cell to the total chi-square score
john_cena <- 100 * bookert$residuals^2 / bookert$statistic
round(john_cena, 3)
corrplot(john_cena, is.cor = FALSE)
# It can be seen that:
# Wife is strongly associated with the rows Laundry, Main_meal
# and Dinner;
# Husband is strongly associated with the row Repairs;
# Jointly is frequently associated with the row Holidays.

## Correlation
# Ranges from -1 to +1
# 0 means no correlation
joyita <- mtcars
head(joyita)

# Check the correlation between wt & mpg
# Normality test for mpg
shapiro.test(joyita$mpg)
ad.test(joyita$mpg)
# Normality test for wt
shapiro.test(joyita$wt)
ad.test(joyita$wt)

## Pearson correlation test
cor(joyita$wt, joyita$mpg, method = "pearson")
cor.test(joyita$wt, joyita$mpg, method = "pearson")

## If the variables do not follow a normal distribution:
# Kendall rank correlation test
cor.test(joyita$wt, joyita$mpg, method = "kendall")
# tau is the Kendall correlation coefficient.

# Spearman rank correlation test
cor.test(joyita$wt, joyita$mpg, method = "spearman")
# rho is the Spearman correlation coefficient.
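# A quick visual companion to the correlation tests above (a
# minimal sketch, not part of the original script; it reuses the
# joyita copy of mtcars and the corrplot package loaded earlier):
# a scatter plot of wt vs mpg with a fitted regression line,
# followed by a plot of the full mtcars correlation matrix.
plot(joyita$wt, joyita$mpg,
     xlab = "Weight (1000 lbs)", ylab = "Miles per gallon",
     main = "wt vs mpg")
abline(lm(mpg ~ wt, data = joyita), col = "red")  # fitted line
round(cor(joyita), 2)  # Pearson correlation matrix of all variables
corrplot(cor(joyita))  # visualize the correlation matrix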