Data import
Univariate analysis (not included in the report).
D <- 1:ncol(data)
options(digits=1)
Hmisc::describe(D)
## D
## n missing distinct Info Mean Gmd .05 .10
## 10 0 10 1 5.5 3.667 1.45 1.90
## .25 .50 .75 .90 .95
## 3.25 5.50 7.75 9.10 9.55
##
## lowest : 1 2 3 4 5, highest: 6 7 8 9 10
##
## Value 1 2 3 4 5 6 7 8 9 10
## Frequency 1 1 1 1 1 1 1 1 1 1
## Proportion 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1
par(mfrow=c(2,5))
for (j in D) {
hist(data[,j],col="blue", xlab = names(data)[j],main = "")
}
From these plots, we can see that some variables appear to follow a normal distribution.
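As a numerical complement (an illustrative sketch, not part of the original analysis), a Shapiro-Wilk normality test could be applied to each variable:
# Illustrative sketch: Shapiro-Wilk normality test p-value for each column
apply(data, 2, function(x) shapiro.test(x)$p.value)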
Bivariate analysis
Relationships among the predictors.
entrees <- as.data.frame(data[,-1])
cormat <- round(cor(data),2)
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
upper_tri <- get_upper_tri(cormat)
reorder_cormat <- function(cormat){
# Use the correlation between variables
# as a distance measure
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}
# Reorder the correlation matrix
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggplot2 heatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()+
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 0),
legend.position = c(0.6, 0.7),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5))
print(ggheatmap)
Commentary in the report.
Relationship between each predictor and the response variable
A <- ggplot(data = data, aes(x = T, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
B <- ggplot(data = data, aes(x = HR, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
C <- ggplot(data = data, aes(x = PR, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
D <- ggplot(data = data, aes(x = CL, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
E <- ggplot(data = data, aes(x = PD, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
F <- ggplot(data = data, aes(x = MA, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
G <- ggplot(data = data, aes(x = SI, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
H <- ggplot(data = data, aes(x = FM, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
I <- ggplot(data = data, aes(x = SH, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
grid.arrange(A, B, C, D, E, F, G, H, I, ncol=3, nrow = 3)
Commentary in the report.
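An equivalent, more compact way to build the same panel (an illustrative sketch, assuming tidyr is available; the grid.arrange version above is the one actually used):
# Illustrative sketch: same scatterplots via a long format and facetting
data_long <- tidyr::pivot_longer(data, cols = -TDM, names_to = "predictor", values_to = "value")
ggplot(data_long, aes(x = value, y = TDM)) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", se = FALSE) +
  facet_wrap(~ predictor, scales = "free_x")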
cor.test(data$T, data$TDM, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data$T and data$TDM
## t = -4, df = 53, p-value = 2e-04
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7 -0.3
## sample estimates:
## cor
## -0.5
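The same test can be applied to each of the other predictors against TDM (an illustrative sketch, not part of the original output):
# Illustrative sketch: Pearson correlation test p-values of each predictor against TDM
sapply(setdiff(names(data), "TDM"), function(v) cor.test(data[[v]], data$TDM, method = "pearson")$p.value)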
Standardisation of the data.
min <- apply(data,2,min)
max <- apply(data,2,max)
data_stand <- sweep(data,2,STATS=min,FUN="-")
s <- max-min
data_stand <- sweep(data_stand, 2, s, FUN="/")
head(data_stand)
## TDM T HR PR CL PD MA SI FM SH
## 1 0.3 0.4 0.3 0.2 0.0 0.2 0.3 0.4 0.5 0.12
## 2 0.0 1.0 0.5 0.5 0.2 0.5 0.0 0.4 0.5 0.27
## 3 0.3 0.5 0.6 0.4 0.3 0.7 0.3 0.8 0.9 0.49
## 4 0.3 0.2 0.6 0.4 0.2 0.2 0.4 0.5 0.7 0.08
## 5 0.2 0.7 0.5 0.7 0.4 0.4 0.3 0.8 0.5 0.41
## 6 0.2 0.6 0.4 0.2 0.0 0.4 0.6 0.7 0.4 0.30
The observations now all lie between 0 and 1, as expected.
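This can be verified directly (an illustrative check, not in the original script):
# Illustrative check: minimum and maximum of each standardised column
apply(data_stand, 2, range)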
Basis of polynomial functions of degree 3.
data_stand_multi <- data_stand %>%
  mutate(T_2 = T**2,   T_3 = T**3,
         HR_2 = HR**2, HR_3 = HR**3,
         PR_2 = PR**2, PR_3 = PR**3,
         CL_2 = CL**2, CL_3 = CL**3,
         PD_2 = PD**2, PD_3 = PD**3,
         MA_2 = MA**2, MA_3 = MA**3,
         SI_2 = SI**2, SI_3 = SI**3,
         FM_2 = FM**2, FM_3 = FM**3,
         SH_2 = SH**2, SH_3 = SH**3)
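The same degree-3 expansion could also be generated programmatically (an illustrative sketch assuming dplyr >= 1.0; the explicit mutate above is what is actually used):
# Illustrative sketch: squares and cubes of every predictor via across()
data_stand %>%
  mutate(across(-TDM, list(`2` = ~ .x^2, `3` = ~ .x^3)))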
Multivariate analysis
Comparison of criteria for all-subsets selection.
require(MASS)
B = 30
exhau.AIC <- rep(NA,B)
exhau.R2a <- rep(NA,B)
exhau.Pv <- rep(NA,B)
forw.AIC <- rep(NA,B)
forw.R2a <- rep(NA,B)
forw.Pv <- rep(NA,B)
back.AIC <- rep(NA,B)
back.R2a <- rep(NA,B)
back.Pv <- rep(NA,B)
set.seed(123)
for(i in 1:B){
# Random split of the data: 2/3 for training and 1/3 for testing.
n <- nrow(data_stand_multi)
tr <- sample(1:n, floor(2*n/3)) # 2/3 of the n observations for training
train <- data_stand_multi[tr,]
test <- data_stand_multi[-tr,]
Xtrain <- train[,-1]
Ytrain <- train[,1]
Xtest <- test[,-1]
Ytest <- test[,1]
# All-subsets selection
kmax=27
grille_mod <-1:kmax
err_AIC_mod <- rep(NA,length(grille_mod))
err_R2a_mod <- rep(NA,length(grille_mod))
err_Pv_mod <- rep(NA,length(grille_mod))
p=27
models <- regsubsets(TDM~.,data=train,nvmax=p,method=c("exhaustive"))
for(k in 1:length(grille_mod))
{
model_k <- summary(models)$which[k,-1]
nv <- names(which(model_k == TRUE))
nv<- paste(nv, collapse = "+")
nv <- paste0("TDM", "~", nv)
nv <- as.formula(nv)
model_lineaire <- lm(nv,data=train)
err_AIC_mod[k] <- AIC(model_lineaire)
err_R2a_mod[k] <- summary(model_lineaire)$adj.r.squared
err_Pv_mod[k] <- broom::glance(model_lineaire)$p.value
}
## AIC criterion
k_opt_AIC <- grille_mod[which.min(err_AIC_mod)]
model_k_opt_AIC <- summary(models)$which[k_opt_AIC,-1]
nv <- names(which(model_k_opt_AIC == TRUE))
nv<- paste(nv, collapse = "+")
nv <- paste0("TDM", "~", nv)
nv <- as.formula(nv)
model_lineaire_k_opt_AIC <- lm(nv,data=train)
prediction_model_lineaire_k_opt_AIC <- predict(model_lineaire_k_opt_AIC,Xtest)
exhau.AIC[i] <- sqrt(mean((Ytest - prediction_model_lineaire_k_opt_AIC)^2))
## Adjusted R2 criterion
k_opt_R2a <- grille_mod[which.max(err_R2a_mod)] # the adjusted R2 is maximised, not minimised
model_k_opt_R2a <- summary(models)$which[k_opt_R2a,-1]
nv <- names(which(model_k_opt_R2a == TRUE))
nv<- paste(nv, collapse = "+")
nv <- paste0("TDM", "~", nv)
nv <- as.formula(nv)
model_lineaire_k_opt_R2a <- lm(nv,data=train)
prediction_model_lineaire_k_opt_R2a <- predict(model_lineaire_k_opt_R2a,Xtest)
exhau.R2a[i] <- sqrt(mean((Ytest - prediction_model_lineaire_k_opt_R2a)^2))
## P-value criterion
k_opt_Pv <- grille_mod[which.min(err_Pv_mod)]
model_k_opt_Pv <- summary(models)$which[k_opt_Pv,-1]
nv <- names(which(model_k_opt_Pv == TRUE))
nv<- paste(nv, collapse = "+")
nv <- paste0("TDM", "~", nv)
nv <- as.formula(nv)
model_lineaire_k_opt_Pv <- lm(nv,data=train)
prediction_model_lineaire_k_opt_Pv <- predict(model_lineaire_k_opt_Pv,Xtest)
exhau.Pv[i] <- sqrt(mean((Ytest - prediction_model_lineaire_k_opt_Pv)^2))
# Forward stepwise selection
nullmodel = lm(TDM~1, train)
fullmodel = lm(TDM~., train)
scope = list(lower=formula(nullmodel),upper=formula(fullmodel))
fit.forw <- lm(TDM ~1, data = train)
fit.back <- lm(TDM ~., data = train)
## AIC criterion
fit_forw_AIC <- SignifReg(fit.forw, scope = scope, direction = "forward", trace = FALSE, criterion = "AIC")
pred_test_forw_AIC <- predict(fit_forw_AIC,Xtest)
forw.AIC[i] <- sqrt(mean((Ytest -pred_test_forw_AIC)^2))
## Adjusted R2 criterion
fit_forw_R2a <- SignifReg(fit.forw, scope = scope, direction = "forward", trace = FALSE, criterion = "r-adj")
pred_test_forw_R2a <- predict(fit_forw_R2a,Xtest)
forw.R2a[i] <- sqrt(mean((Ytest -pred_test_forw_R2a)^2))
## P-value criterion
fit_forw_Pv <- SignifReg(fit.forw, scope = scope, direction = "forward", trace = FALSE, criterion = "p-value")
pred_test_forw_Pv <- predict(fit_forw_Pv,Xtest)
forw.Pv[i] <- sqrt(mean((Ytest -pred_test_forw_Pv)^2))
# Backward stepwise selection
## AIC criterion
fit_back_AIC <- SignifReg(fit.back, scope = scope, direction = "backward", trace = FALSE, criterion = "AIC")
pred_test_back_AIC <- predict(fit_back_AIC,Xtest)
back.AIC[i] <- sqrt(mean((Ytest -pred_test_back_AIC)^2))
## Adjusted R2 criterion
fit_back_R2a <- SignifReg(fit.back, scope = scope, direction = "backward", trace = FALSE, criterion = "r-adj")
pred_test_back_R2a <- predict(fit_back_R2a,Xtest)
back.R2a[i] <- sqrt(mean((Ytest -pred_test_back_R2a)^2))
## P-value criterion
fit_back_Pv <- SignifReg(fit.back, scope = scope, direction = "backward", trace = FALSE, criterion = "p-value")
pred_test_back_Pv <- predict(fit_back_Pv,Xtest)
back.Pv[i] <- sqrt(mean((Ytest -pred_test_back_Pv)^2))
}
Comparison via the mean test errors
mean(exhau.AIC)
## [1] 0.2
mean(forw.AIC)
## [1] 0.1
mean(back.AIC)
## [1] 0.2
mean(exhau.R2a)
## [1] 0.1
mean(forw.R2a)
## [1] 0.1
mean(back.R2a)
## [1] 0.2
mean(exhau.Pv)
## [1] 0.1
mean(forw.Pv)
## [1] 0.1
mean(back.Pv)
## [1] 0.1
par(mfrow=c(1,1))
err_test_1<-cbind(exhau.AIC, exhau.R2a, exhau.Pv, forw.AIC, forw.R2a, forw.Pv)
boxplot(err_test_1, ylab="test error", col=rainbow(6))
Comparison of the 8 selection methods and the false negative rate
# Computation for the false negative rate
X <- data_stand[,-1]
Indbeta<-matrix(0,nrow=ncol(X),ncol = 1) # indicator of the truly nonzero betas (dimension depends on that of X)
rownames(Indbeta)<-colnames(X)
Indbeta<-rbind(0,Indbeta) # add the intercept
rownames(Indbeta)[1]<-"(Intercept)"
Ind<-which(rownames(Indbeta) == "T" )
Indbeta[Ind]<-1
# Error vectors
B = 30
exhau.Pv <- rep(NA,B)
back.Pv <- rep(NA,B)
forw.R2a <- rep(NA,B)
Lasso <- rep(NA,B)
Ridge <- rep(NA,B)
PCR <- rep(NA,B)
PLS <- rep(NA,B)
sPLS <- rep(NA,B)
FN<-matrix(NA,nrow=B,ncol=8) # false negative rate
colnames(FN)<-c("PCR", "PLS", "sPLS","Lasso","Ridge","Back.Pv","Forw.R2", "Exhau.Pv")
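# Note (illustrative sketch, not used in the computation below): the quantity stored in
# FN is the share of truly active coefficients (here only "T") that a method fails to
# select, i.e. something like:
fn_rate <- function(selected, truth) sum(truth == 1 & selected == 0) / sum(truth == 1)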
set.seed(123)
for(i in 1:B){
# Random split of the data: 2/3 for training and 1/3 for testing.
n <- nrow(data_stand_multi)
tr <- sample(1:n, floor(2*n/3)) # 2/3 of the n observations for training
train <- data_stand_multi[tr,]
test <- data_stand_multi[-tr,]
Xtrain <- train[,-1]
Ytrain <- train[,1]
Xtest <- test[,-1]
Ytest <- test[,1]
# All-subsets selection
kmax=27
grille_mod <-1:kmax
err_Pv_mod <- rep(NA,length(grille_mod))
p=27
models <- regsubsets(TDM~.,data=train,nvmax=p,method=c("exhaustive"))
for(k in 1:length(grille_mod))
{
model_k <- summary(models)$which[k,-1]
nv <- names(which(model_k == TRUE))
nv<- paste(nv, collapse = "+")
nv <- paste0("TDM", "~", nv)
nv <- as.formula(nv)
model_lineaire <- lm(nv,data=train)
err_Pv_mod[k] <- broom::glance(model_lineaire)$p.value
}
## P-value criterion
k_opt_Pv <- grille_mod[which.min(err_Pv_mod)]
model_k_opt_Pv <- summary(models)$which[k_opt_Pv,-1]
nv <- names(which(model_k_opt_Pv == TRUE))
nv<- paste(nv, collapse = "+")
nv <- paste0("TDM", "~", nv)
nv <- as.formula(nv)
model_lineaire_k_opt_Pv <- lm(nv,data=train)
prediction_model_lineaire_k_opt_Pv <- predict(model_lineaire_k_opt_Pv,Xtest)
exhau.Pv[i] <- sqrt(mean((Ytest - prediction_model_lineaire_k_opt_Pv)^2))
betahat.PvS<-model_lineaire_k_opt_Pv$coefficients
IndPvS<-which(rownames(Indbeta)%in%names(betahat.PvS))
Indbetahat.PvS<-Indbeta
Indbetahat.PvS[-IndPvS]<-0
Indbetahat.PvS[IndPvS]<-1
# Stepwise selection
nullmodel = lm(TDM~1, train)
fullmodel = lm(TDM~., train)
scope = list(lower=formula(nullmodel),upper=formula(fullmodel))
fit.forw <- lm(TDM ~1, data = train)
fit.back <- lm(TDM ~., data = train)
## Forward with the adjusted R2 criterion
fit_forw_R2a <- SignifReg(fit.forw, scope = scope, direction = "forward", trace = FALSE, criterion = "r-adj")
pred_test_forw_R2a <- predict(fit_forw_R2a,Xtest)
forw.R2a[i] <- sqrt(mean((Ytest -pred_test_forw_R2a)^2))
betahat.R2a<-fit_forw_R2a$coefficients
IndR2a<-which(rownames(Indbeta)%in%names(betahat.R2a))
Indbetahat.R2a<-Indbeta
Indbetahat.R2a[-IndR2a]<-0
Indbetahat.R2a[IndR2a]<-1
## Backward with the p-value criterion
fit_back_Pv <- SignifReg(fit.back, scope = scope, direction = "backward", trace = FALSE, criterion = "p-value")
pred_test_back_Pv <- predict(fit_back_Pv,Xtest)
back.Pv[i] <- sqrt(mean((Ytest -pred_test_back_Pv)^2))
betahat.PvB<-fit_back_Pv$coefficients
IndPvB<-which(rownames(Indbeta)%in%names(betahat.PvB))
Indbetahat.PvB<-Indbeta
Indbetahat.PvB[-IndPvB]<-0
Indbetahat.PvB[IndPvB]<-1
# Lasso regression
fit.cv.lasso <- cv.glmnet(x = as.matrix(Xtrain), y = Ytrain, family= "gaussian", alpha = 1, nfold =10,
standardize = FALSE, intercept = TRUE)
lambda.opt.cvm<-fit.cv.lasso$lambda.min
pred_test_Lasso <- predict(fit.cv.lasso, newx=as.matrix(Xtest),s=lambda.opt.cvm,type="response")
Lasso[i] <- sqrt(mean((Ytest-pred_test_Lasso)^2))
betahat.lasso <- predict(fit.cv.lasso,s=lambda.opt.cvm,type="coefficients")
Indbetahat.lasso<-betahat.lasso
Indbetahat.lasso[Indbetahat.lasso==0]<-0
Indbetahat.lasso[Indbetahat.lasso!=0]<-1
# Ridge regression
fit.cv.Ridge <- cv.glmnet(x = as.matrix(Xtrain), y = Ytrain, family= "gaussian", alpha = 0, nfold = 10,
standardize = FALSE, intercept = TRUE)
lambda.opt.cvm<-fit.cv.Ridge$lambda.min
pred_test_Ridge <- predict(fit.cv.Ridge, newx=as.matrix(Xtest),s=lambda.opt.cvm,type="response")
Ridge[i] <- sqrt(mean((Ytest-pred_test_Ridge)^2))
betahat.Ridge <- predict(fit.cv.Ridge,s=lambda.opt.cvm,type="coefficients")
Indbetahat.Ridge<-betahat.Ridge
Indbetahat.Ridge[Indbetahat.Ridge==0]<-0
Indbetahat.Ridge[Indbetahat.Ridge!=0]<-1
# Principal component regression (PCR)
pcr.fit <- pcr(TDM ~., data=train, scale=TRUE , validation="CV", segments=10)
k.pcr <- which.min(pcr.fit$validation$PRESS)
pred_test.pcr <- predict(pcr.fit,as.matrix(Xtest), ncomp=k.pcr)
PCR[i] <- sqrt(mean((Ytest-pred_test.pcr)^2))
betahat.pcr <- as.matrix(pcr.fit$coefficients[,,k.pcr],ncol=1) # extracting the coefficients does not return the intercept; it has to be added back
betahat.pcr <- rbind(predict(pcr.fit,t(as.matrix(rep(0,27))), ncomp=k.pcr),betahat.pcr) # recover the intercept by predicting on a vector of zeros
rownames(betahat.pcr)[1]<-"(Intercept)"
Indbetahat.pcr<-betahat.pcr
Indbetahat.pcr[Indbetahat.pcr==0]<-0
Indbetahat.pcr[Indbetahat.pcr!=0]<-1
# Partial least squares regression (PLSR)
pls.fit <- plsr(TDM ~., data=train, scale=TRUE , validation="CV", segments=10)
k.pls <- which.min(pls.fit$validation$PRESS)
pred_test.pls<-predict(pls.fit,as.matrix(Xtest), ncomp=k.pls)
PLS[i] <- sqrt(mean((Ytest-pred_test.pls)^2))
betahat.pls <- as.matrix(pls.fit$coefficients[,,k.pls],ncol=1) # extracting the coefficients does not return the intercept; it has to be added back
betahat.pls<-rbind(predict(pls.fit,t(as.matrix(rep(0,27))), ncomp=k.pls),betahat.pls) # recover the intercept by predicting on a vector of zeros
rownames(betahat.pls)[1]<-"(Intercept)"
Indbetahat.pls<-betahat.pls
Indbetahat.pls[Indbetahat.pls==0]<-0
Indbetahat.pls[Indbetahat.pls!=0]<-1
# Sparse PLS regression (sPLSR)
spls.cv <- cv.spls(Xtrain, Ytrain, eta = seq(0.1,0.9,0.1), K = c(1:27),plot.it= FALSE)
spls.fit <- spls(x=Xtrain, y=Ytrain, eta = spls.cv$eta.opt, K = spls.cv$K.opt,
scale.x=TRUE, scale.y=FALSE, trace=FALSE)
pred_test.spls <- predict(spls.fit,as.matrix(Xtest))
sPLS[i] <- sqrt(mean((Ytest-pred_test.spls)^2))
betahat.spls <-coef(spls.fit) # extracting the coefficients does not return the intercept; it has to be added back
betahat.spls<-rbind(predict(spls.fit,t(as.matrix(rep(0,27)))),betahat.spls) # recover the intercept by predicting on a vector of zeros
rownames(betahat.spls)[1]<-"(Intercept)"
Indbetahat.spls<-betahat.spls
Indbetahat.spls[Indbetahat.spls==0]<-0
Indbetahat.spls[Indbetahat.spls!=0]<-1
FN[i,1] <- sum(abs(Indbetahat.pcr[Ind,]-Indbeta[Ind,]))/length(Ind)
FN[i,2] <- sum(abs(Indbetahat.pls[Ind,]-Indbeta[Ind,]))/length(Ind)
FN[i,3] <- sum(abs(Indbetahat.spls[Ind,]-Indbeta[Ind,]))/length(Ind)
FN[i,4] <- sum(abs(Indbetahat.lasso[Ind]-Indbeta[Ind]))/length(Ind)
FN[i,5] <- sum(abs(Indbetahat.Ridge[Ind]-Indbeta[Ind]))/length(Ind)
FN[i,6] <- sum(abs(Indbetahat.PvB[Ind]-Indbeta[Ind]))/length(Ind)
FN[i,7] <- sum(abs(Indbetahat.R2a[Ind]-Indbeta[Ind]))/length(Ind)
FN[i,8] <- sum(abs(Indbetahat.PvS[Ind]-Indbeta[Ind]))/length(Ind)
}
Graphical comparison of the 8 methods
par(mfrow=c(1,1))
err_test_2<-cbind(exhau.Pv, forw.R2a, back.Pv, Lasso, Ridge, PCR, PLS, sPLS)
boxplot(err_test_2, ylab="test error", col=rainbow(8))
Graphical representation of the false negative rates
par(mfrow=c(1,1))
couleurs<-c("red","orange","purple","yellow","grey","blue","green","pink")
boxplot(FN, col=couleurs, main="FN rate", ylab="False negative rate", xlab="Method")