setwd("D:\\Analytics Educator\\Analytics Educator\\logo\\letter head\\ppt template\\you tube\\10. T test ANOVA using R")

library(openxlsx)

## T Test

rock <- read.xlsx("Arindam Inc. HR Data.xlsx",sheet="data")

head(rock)

t.test(Salary ~ Gender, data=rock)

# Different way of T Test
sm <- rock[rock$Gender == "Male","Salary"]

sf <- rock[rock$Gender == "Female","Salary"]

t.test(sm,sf)





## Paired T Test



kane <- read.xlsx("Arindam Inc. HR Data.xlsx",sheet="data1")
kane

t.test(weight ~ group, paired = TRUE, data=kane)


kane

aggregate(weight ~ group, data=kane, FUN= "mean")


# if you want to test whether the average weight before 
#treatment is greater than the average weight after 
#treatment, type this:

t.test(weight ~ group, data = kane, paired = TRUE,
alternative = "greater")


# if you want to test whether the average weight before 
#treatment is less than the average weight after 
#treatment, type this:

t.test(weight ~ group, data = kane, paired = TRUE,
alternative = "less")



## ANOVA


head(rock)

boogieman <-  aov(Salary ~ Department, data=rock)
summary(boogieman)

TukeyHSD(boogieman)


#### Check the homogeneity of variance assumption

library(car)
leveneTest(Salary ~ Department, data = rock)

#From the output above we can see that the p-value 
#is not less than the significance level of 0.05. 
#This means that there is no evidence to suggest that 
#the variance across groups is statistically significantly
#different. Therefore, we can assume the homogeneity of 
#variances in the different treatment groups.


# Extract the residuals
goldberg <- residuals(object = boogieman )

# Run Shapiro-Wilk test
shapiro.test(x = goldberg )



library(nortest)
ad.test(goldberg)




# A non-parametric alternative to one-way ANOVA is
#Kruskal-Wallis rank sum test, which can be used 
#when ANNOVA assumptions are not met.

kruskal.test(Salary ~ Department, data = rock)

# If it shows error then check if the group variable
#is not a factor

str(rock)
rock$Department <- as.factor(rock$Department)




#Chi-squared Test of Independence
#Two random variables x and y are called independent if 
#the probability distribution of one variable is not 
#affected by the presence of another.

#H0: the row and the column variables
#of the contingency table are independent.

#H1: row and column variables are dependent

library(MASS)
head(survey)

table(survey$Smoke)
table(survey$Exer)

kane = table(survey$Smoke, survey$Exer) 


bookert <- chisq.test(kane)
bookert


undertaker = cbind(kane[,"Freq"], kane[,"None"] +
 kane[,"Some"]) 

bookert <- chisq.test(undertaker)
bookert


kane <- read.table("housetasks.txt")

#The data is a contingency table containing 13 housetasks 
#and their distribution in the couple:

#rows are the different tasks
#values are the frequencies of the tasks done :
#by the wife only
#alternatively
#by the husband only
#or jointly

bookert <- chisq.test(housetasks)
bookert

#H0: the row and the column variables
#of the contingency table are independent.

# Observed counts
bookert$observed


round(bookert$expected,2)

#Expected counts are the projected frequencies in each cell
#if the null hypothesis is true 
#(aka, no association between the variables.)



round(bookert$residuals, 3)

library(corrplot)
corrplot(bookert$residuals, is.cor = FALSE)

#Positive residuals are in blue. Positive values in cells 
#specify an attraction (positive association) between the 
#corresponding row and column variables.
#In the image above, it’s evident that there are an 
#association between the column Wife and the rows Laundry, 
#Main_meal.
#There is a strong positive association between the column 
#Husband and the row Repair
#Negative residuals are in red. This implies a repulsion
#(negative association) between the corresponding row and
#column variables. For example the column Wife are 
#negatively associated (~ “not associated”) with the row 
#Repairs. There is a repulsion between the column Husband 
#and, the rows Laundry and Main_meal



john_cena <- 100*bookert$residuals^2/bookert$statistic
round(john_cena, 3)
corrplot(john_cena, is.cor = FALSE)

It can be seen that:

#Wife is strongly associated with Laundry, Main_meal, Dinner
#Husband is strongly associated with the row Repairs
#jointly is frequently associated with the row Holidays



# Correlation
# Ranges from -1  to +1
# 0 means no correlation

joyita <- mtcars

head(joyita)

# Check correlation between wt & mpg

								

# Normality test for mpg
shapiro.test(joyita$mpg) 
ad.test(joyita$mpg)

# Normality test for wt
shapiro.test(joyita$wt) 
ad.test(joyita$wt)


## Pearson correlation test

cor(joyita$wt, joyita$mpg, method = "pearson")

cor.test(joyita$wt, joyita$mpg, method="pearson")





## If they do not follow normal distribution

# Kendall rank correlation test
cor.test(joyita$wt, joyita$mpg, method="kendall")

# tau is the Kendall correlation coefficient.



#Spearman rank correlation coefficient


cor.test(joyita$wt, joyita$mpg, method="spearman")

#rho is the Spearman’s correlation coefficient.