Data import

Univariate analysis (not included in the report).

D <- 1:ncol(data)
options(digits=1)
Hmisc::describe(D)
## D 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       10        0       10        1      5.5    3.667     1.45     1.90 
##      .25      .50      .75      .90      .95 
##     3.25     5.50     7.75     9.10     9.55 
## 
## lowest :  1  2  3  4  5, highest:  6  7  8  9 10
##                                                   
## Value        1   2   3   4   5   6   7   8   9  10
## Frequency    1   1   1   1   1   1   1   1   1   1
## Proportion 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1
par(mfrow=c(2,5))
for (j in D) {
  hist(data[,j],col="blue", xlab = names(data)[j],main = "")
}

These histograms suggest that some of the variables are approximately normally distributed.
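
As a complement to the visual check, a formal test could be applied to each variable; a minimal sketch using shapiro.test (not part of the original analysis):

# Shapiro-Wilk normality test for every column of data (illustrative only)
normality_pv <- sapply(seq_len(ncol(data)), function(j) shapiro.test(data[, j])$p.value)
names(normality_pv) <- names(data)
round(normality_pv, 3)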

Bivariate analysis

Relationships among the predictors.

entrees <- as.data.frame(data[,-1])
cormat <- round(cor(data),2)
get_upper_tri <- function(cormat){
  cormat[lower.tri(cormat)]<- NA
  return(cormat)
}
upper_tri <- get_upper_tri(cormat)
reorder_cormat <- function(cormat){
  # Use the correlation between variables
  # as a distance measure
  dd <- as.dist((1-cormat)/2)
  hc <- hclust(dd)
  cormat <-cormat[hc$order, hc$order]
}
# Reorder the correlation matrix
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix into long format
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Build the ggplot heatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
  geom_tile(color = "white")+
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab",
                       name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1))+
  coord_fixed()+ 
  geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5))
print(ggheatmap)

Commentary provided in the report.

Relationship between each predictor and the response variable

A <- ggplot(data = data, aes(x = T, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
B <- ggplot(data = data, aes(x = HR, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
C <- ggplot(data = data, aes(x = PR, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
D <- ggplot(data = data, aes(x = CL, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
E <- ggplot(data = data, aes(x = PD, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
F <- ggplot(data = data, aes(x = MA, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
G <- ggplot(data = data, aes(x = SI, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
H <- ggplot(data = data, aes(x = FM, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
I <- ggplot(data = data, aes(x = SH, y = TDM)) + geom_point() + geom_smooth(formula = y ~ x, method = "lm", se = FALSE)
grid.arrange(A, B, C, D, E, F, G, H, I, ncol=3, nrow = 3)

Commentary provided in the report.

cor.test(data$T, data$TDM, method = "pearson") 
## 
##  Pearson's product-moment correlation
## 
## data:  data$T and data$TDM
## t = -4, df = 53, p-value = 2e-04
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7 -0.3
## sample estimates:
##  cor 
## -0.5
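
The same test can be repeated for every predictor; a minimal sketch (only the test against T is reported above):

# Pearson correlation test between TDM and each predictor (illustrative only)
predictors <- setdiff(names(data), "TDM")
cor_pv <- sapply(predictors, function(v) cor.test(data[[v]], data$TDM, method = "pearson")$p.value)
round(cor_pv, 4)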

Rescaling the data to the [0,1] interval (min-max normalization).
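
For each column, every observation is mapped to the unit interval, which is exactly what the two sweep() calls below compute column by column:

\[ x'_{ij} \;=\; \frac{x_{ij} - \min_i x_{ij}}{\max_i x_{ij} - \min_i x_{ij}} \]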

min <- apply(data,2,min)
max <- apply(data,2,max)
data_stand <- sweep(data,2,STATS=min,FUN="-")
s <- max-min
data_stand <- sweep(data_stand, 2, s, FUN="/")

head(data_stand)
##   TDM   T  HR  PR  CL  PD  MA  SI  FM   SH
## 1 0.3 0.4 0.3 0.2 0.0 0.2 0.3 0.4 0.5 0.12
## 2 0.0 1.0 0.5 0.5 0.2 0.5 0.0 0.4 0.5 0.27
## 3 0.3 0.5 0.6 0.4 0.3 0.7 0.3 0.8 0.9 0.49
## 4 0.3 0.2 0.6 0.4 0.2 0.2 0.4 0.5 0.7 0.08
## 5 0.2 0.7 0.5 0.7 0.4 0.4 0.3 0.8 0.5 0.41
## 6 0.2 0.6 0.4 0.2 0.0 0.4 0.6 0.7 0.4 0.30

All values are indeed between 0 and 1.
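
A quick check confirms it (not part of the original code):

# Verify that every rescaled value lies in [0, 1]
all(data_stand >= 0 & data_stand <= 1)
apply(data_stand, 2, range)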

Polynomial basis of degree 3 for each predictor.

data_stand_multi <- data_stand %>%
  mutate(T_2 = T^2,   T_3 = T^3,
         HR_2 = HR^2, HR_3 = HR^3,
         PR_2 = PR^2, PR_3 = PR^3,
         CL_2 = CL^2, CL_3 = CL^3,
         PD_2 = PD^2, PD_3 = PD^3,
         MA_2 = MA^2, MA_3 = MA^3,
         SI_2 = SI^2, SI_3 = SI^3,
         FM_2 = FM^2, FM_3 = FM^3,
         SH_2 = SH^2, SH_3 = SH^3)
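
The same columns can be generated more compactly with dplyr::across; a sketch, assuming dplyr >= 1.0.0:

# Equivalent construction: squares and cubes of every predictor except the response TDM
data_stand_multi <- data_stand %>%
  mutate(across(-TDM, list(`2` = ~ .x^2, `3` = ~ .x^3), .names = "{.col}_{.fn}"))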

Multivariate analysis

Comparing the selection criteria for best-subset (exhaustive) selection.

require(MASS)
require(leaps)      # regsubsets()
require(SignifReg)  # SignifReg()
B = 30
exhau.AIC <- rep(NA,B)
exhau.R2a <- rep(NA,B)
exhau.Pv <- rep(NA,B)
forw.AIC <- rep(NA,B)
forw.R2a <- rep(NA,B)
forw.Pv <- rep(NA,B)
back.AIC <- rep(NA,B)
back.R2a <- rep(NA,B)
back.Pv <- rep(NA,B)
set.seed(123)
for(i in 1:B){
  
  # Random partition of the data: 2/3 for training and 1/3 for testing
  n <- nrow(data_stand_multi)
  tr <- sample(1:n, floor(2 * n / 3))
  train <- data_stand_multi[tr,]
  test <- data_stand_multi[-tr,]
  Xtrain <- train[,-1]
  Ytrain <- train[,1]
  Xtest <- test[,-1]
  Ytest <- test[,1]
  
  # Best-subset (exhaustive) selection
  kmax <- 27   # number of candidate predictors (9 variables plus their squares and cubes)
  grille_mod <- 1:kmax
  err_AIC_mod <- rep(NA, length(grille_mod))
  err_R2a_mod <- rep(NA, length(grille_mod))
  err_Pv_mod <- rep(NA, length(grille_mod))
  models <- regsubsets(TDM ~ ., data = train, nvmax = kmax, method = "exhaustive")
  for(k in 1:length(grille_mod))
  {
    model_k <- summary(models)$which[k,-1]
    nv <- names(which(model_k == TRUE))
    nv<- paste(nv, collapse = "+")
    nv <- paste0("TDM", "~", nv)
    nv <- as.formula(nv)
    model_lineaire <- lm(nv,data=train)
    err_AIC_mod[k] <- AIC(model_lineaire)
    err_R2a_mod[k] <- summary(model_lineaire)$adj.r.squared
    err_Pv_mod[k] <- broom::glance(model_lineaire)$p.value
  }
  
  ## AIC criterion
  k_opt_AIC <- grille_mod[which.min(err_AIC_mod)]
  model_k_opt_AIC <- summary(models)$which[k_opt_AIC,-1]
  nv <- names(which(model_k_opt_AIC == TRUE))
  nv<- paste(nv, collapse = "+")
  nv <- paste0("TDM", "~", nv)
  nv <- as.formula(nv)
  model_lineaire_k_opt_AIC <- lm(nv,data=train)
  prediction_model_lineaire_k_opt_AIC <- predict(model_lineaire_k_opt_AIC,Xtest)
  exhau.AIC[i] <- sqrt(mean((Ytest - prediction_model_lineaire_k_opt_AIC)^2))
  
  ## Adjusted R² criterion (to be maximized)
  k_opt_R2a <- grille_mod[which.max(err_R2a_mod)]
  model_k_opt_R2a <- summary(models)$which[k_opt_R2a,-1]
  nv <- names(which(model_k_opt_R2a == TRUE))
  nv<- paste(nv, collapse = "+")
  nv <- paste0("TDM", "~", nv)
  nv <- as.formula(nv)
  model_lineaire_k_opt_R2a <- lm(nv,data=train)
  prediction_model_lineaire_k_opt_R2a <- predict(model_lineaire_k_opt_R2a,Xtest)
  exhau.R2a[i] <- sqrt(mean((Ytest - prediction_model_lineaire_k_opt_R2a)^2))
  
  ## p-value criterion
  k_opt_Pv <- grille_mod[which.min(err_Pv_mod)]
  model_k_opt_Pv <- summary(models)$which[k_opt_Pv,-1]
  nv <- names(which(model_k_opt_Pv == TRUE))
  nv<- paste(nv, collapse = "+")
  nv <- paste0("TDM", "~", nv)
  nv <- as.formula(nv)
  model_lineaire_k_opt_Pv <- lm(nv,data=train)
  prediction_model_lineaire_k_opt_Pv <- predict(model_lineaire_k_opt_Pv,Xtest)
  exhau.Pv[i] <- sqrt(mean((Ytest - prediction_model_lineaire_k_opt_Pv)^2))
  
  # Forward stepwise selection
  nullmodel = lm(TDM~1, train)
  fullmodel = lm(TDM~., train)
  scope = list(lower=formula(nullmodel),upper=formula(fullmodel))
  fit.forw <- lm(TDM ~1, data = train)
  fit.back <- lm(TDM ~., data = train)
  
  ## AIC criterion
  fit_forw_AIC <-  SignifReg(fit.forw, scope = scope, direction = "forward", trace = FALSE, criterion = "AIC")
  pred_test_forw_AIC <- predict(fit_forw_AIC,Xtest)
  forw.AIC[i] <- sqrt(mean((Ytest -pred_test_forw_AIC)^2))

  ## Adjusted R² criterion
  fit_forw_R2a <-  SignifReg(fit.forw, scope = scope, direction = "forward", trace = FALSE, criterion = "r-adj")
  pred_test_forw_R2a <- predict(fit_forw_R2a,Xtest)
  forw.R2a[i] <- sqrt(mean((Ytest -pred_test_forw_R2a)^2))
  
  ## p-value criterion
  fit_forw_Pv <-  SignifReg(fit.forw, scope = scope, direction = "forward", trace = FALSE, criterion = "p-value")
  pred_test_forw_Pv <- predict(fit_forw_Pv,Xtest)
  forw.Pv[i] <- sqrt(mean((Ytest -pred_test_forw_Pv)^2))
  
  # Backward stepwise selection
  
  ## AIC criterion
  fit_back_AIC <-  SignifReg(fit.back, scope = scope, direction = "backward", trace = FALSE, criterion = "AIC")
  pred_test_back_AIC <- predict(fit_back_AIC,Xtest)
  back.AIC[i] <- sqrt(mean((Ytest -pred_test_back_AIC)^2))
  
  ## Adjusted R² criterion
  fit_back_R2a <-  SignifReg(fit.back, scope = scope, direction = "backward", trace = FALSE, criterion = "r-adj")
  pred_test_back_R2a <- predict(fit_back_R2a,Xtest)
  back.R2a[i] <- sqrt(mean((Ytest -pred_test_back_R2a)^2))
  
  ## p-value criterion
  fit_back_Pv <-  SignifReg(fit.back, scope = scope, direction = "backward", trace = FALSE, criterion = "p-value")
  pred_test_back_Pv <- predict(fit_back_Pv,Xtest)
  back.Pv[i] <- sqrt(mean((Ytest -pred_test_back_Pv)^2))

}

Comparison of the methods via their mean test error
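
For each of the B = 30 random splits, the error reported for a method is the root mean squared error on the test third, and the criteria are then compared through its average over the splits:

\[ \mathrm{RMSE}_b \;=\; \sqrt{\frac{1}{n_{\mathrm{test}}}\sum_{i \in \mathrm{test}} \bigl(y_i - \hat{y}_i\bigr)^2}, \qquad \overline{\mathrm{err}} \;=\; \frac{1}{B}\sum_{b=1}^{B}\mathrm{RMSE}_b \]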

mean(exhau.AIC)
## [1] 0.2
mean(forw.AIC)
## [1] 0.1
mean(back.AIC)
## [1] 0.2
mean(exhau.R2a)
## [1] 0.1
mean(forw.R2a)
## [1] 0.1
mean(back.R2a)
## [1] 0.2
mean(exhau.Pv)
## [1] 0.1
mean(forw.Pv)
## [1] 0.1
mean(back.Pv)
## [1] 0.1
par(mfrow=c(1,1))
err_test_1<-cbind(exhau.AIC, exhau.R2a, exhau.Pv, forw.AIC, forw.R2a, forw.Pv) 
boxplot(err_test_1, ylab = "test error", col = rainbow(6))

Comparing the 8 selection methods and their false-negative rates
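
With T taken as the only truly nonzero coefficient (see the construction of Indbeta below), the per-split false-negative rate of a method reduces to the indicator that T is absent from the selected model:

\[ \mathrm{FN}_b \;=\; \frac{1}{\lvert S\rvert}\sum_{j \in S}\mathbf{1}\{\hat{\beta}_j = 0\}, \qquad S = \{\, j : \beta_j \neq 0 \,\} = \{\mathrm{T}\} \]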

# Setup for the false-negative rate computation
X <- data_stand[,-1]
Indbeta <- matrix(0, nrow = ncol(X), ncol = 1) # indicator of the truly nonzero betas (dimension follows that of X)
rownames(Indbeta) <- colnames(X)
Indbeta <- rbind(0, Indbeta) # add the intercept
rownames(Indbeta)[1] <- "(Intercept)"
Ind <- which(rownames(Indbeta) == "T")
Indbeta[Ind] <- 1 # T is the only predictor taken to be truly relevant


# Packages needed below: glmnet (cv.glmnet), pls (pcr, plsr), spls (cv.spls, spls)
require(glmnet)
require(pls)
require(spls)

# Error vectors
B = 30
exhau.Pv <- rep(NA,B)
back.Pv <- rep(NA,B)
forw.R2a <- rep(NA,B)
Lasso <- rep(NA,B)
Ridge <- rep(NA,B)
PCR <- rep(NA,B)
PLS <- rep(NA,B)
sPLS <- rep(NA,B)
FN<-matrix(NA,nrow=B,ncol=8) # false negative rate
colnames(FN)<-c("PCR", "PLS", "sPLS","Lasso","Ridge","Back.Pv","Forw.R2", "Exhau.Pv")
set.seed(123)


for(i in 1:B){
  # Random partition of the data: 2/3 for training and 1/3 for testing
  n <- nrow(data_stand_multi)
  tr <- sample(1:n, floor(2 * n / 3))
  train <- data_stand_multi[tr,]
  test <- data_stand_multi[-tr,]
  Xtrain <- train[,-1]
  Ytrain <- train[,1]
  Xtest <- test[,-1]
  Ytest <- test[,1]
  
  # Best-subset (exhaustive) selection
  kmax <- 27
  grille_mod <- 1:kmax
  err_Pv_mod <- rep(NA, length(grille_mod))
  models <- regsubsets(TDM ~ ., data = train, nvmax = kmax, method = "exhaustive")
  for(k in 1:length(grille_mod))
  {
    model_k <- summary(models)$which[k,-1]
    nv <- names(which(model_k == TRUE))
    nv<- paste(nv, collapse = "+")
    nv <- paste0("TDM", "~", nv)
    nv <- as.formula(nv)
    model_lineaire <- lm(nv,data=train)
    err_Pv_mod[k] <- broom::glance(model_lineaire)$p.value
  }
  ## p-value criterion
  k_opt_Pv <- grille_mod[which.min(err_Pv_mod)]
  model_k_opt_Pv <- summary(models)$which[k_opt_Pv,-1]
  nv <- names(which(model_k_opt_Pv == TRUE))
  nv<- paste(nv, collapse = "+")
  nv <- paste0("TDM", "~", nv)
  nv <- as.formula(nv)
  model_lineaire_k_opt_Pv <- lm(nv,data=train)
  prediction_model_lineaire_k_opt_Pv <- predict(model_lineaire_k_opt_Pv,Xtest)
  exhau.Pv[i] <- sqrt(mean((Ytest - prediction_model_lineaire_k_opt_Pv)^2))
  betahat.PvS<-model_lineaire_k_opt_Pv$coefficients
  IndPvS<-which(rownames(Indbeta)%in%names(betahat.PvS))
  Indbetahat.PvS<-Indbeta
  Indbetahat.PvS[-IndPvS]<-0
  Indbetahat.PvS[IndPvS]<-1
  
  
  # Stepwise selection
  nullmodel = lm(TDM~1, train)
  fullmodel = lm(TDM~., train)
  scope = list(lower=formula(nullmodel),upper=formula(fullmodel))
  fit.forw <- lm(TDM ~1, data = train)
  fit.back <- lm(TDM ~., data = train)
  
  ## Forward selection with the adjusted R² criterion
  fit_forw_R2a <-  SignifReg(fit.forw, scope = scope, direction = "forward", trace = FALSE, criterion = "r-adj")
  pred_test_forw_R2a <- predict(fit_forw_R2a,Xtest)
  forw.R2a[i] <- sqrt(mean((Ytest -pred_test_forw_R2a)^2))
  betahat.R2a<-fit_forw_R2a$coefficients
  IndR2a<-which(rownames(Indbeta)%in%names(betahat.R2a))
  Indbetahat.R2a<-Indbeta
  Indbetahat.R2a[-IndR2a]<-0
  Indbetahat.R2a[IndR2a]<-1
  
  ## Backward selection with the p-value criterion
  fit_back_Pv <-  SignifReg(fit.back, scope = scope, direction = "backward", trace = FALSE, criterion = "p-value")
  pred_test_back_Pv <- predict(fit_back_Pv,Xtest)
  back.Pv[i] <- sqrt(mean((Ytest -pred_test_back_Pv)^2))
  betahat.PvB<-fit_back_Pv$coefficients
  IndPvB<-which(rownames(Indbeta)%in%names(betahat.PvB))
  Indbetahat.PvB<-Indbeta
  Indbetahat.PvB[-IndPvB]<-0
  Indbetahat.PvB[IndPvB]<-1

  
  # Lasso regression
  fit.cv.lasso <- cv.glmnet(x = as.matrix(Xtrain), y = Ytrain, family= "gaussian", alpha = 1, nfold =10, 
                            standardize = FALSE, intercept = TRUE)
  lambda.opt.cvm<-fit.cv.lasso$lambda.min
  pred_test_Lasso <- predict(fit.cv.lasso, newx=as.matrix(Xtest),s=lambda.opt.cvm,type="response")
  Lasso[i] <- sqrt(mean((Ytest-pred_test_Lasso)^2))
  betahat.lasso <- predict(fit.cv.lasso,s=lambda.opt.cvm,type="coefficients")
  Indbetahat.lasso<-betahat.lasso
  Indbetahat.lasso[Indbetahat.lasso==0]<-0
  Indbetahat.lasso[Indbetahat.lasso!=0]<-1
  
  # Ridge regression
  fit.cv.Ridge <- cv.glmnet(x = as.matrix(Xtrain), y = Ytrain, family= "gaussian", alpha = 0, nfold = 10, 
                            standardize = FALSE, intercept = TRUE)
  lambda.opt.cvm<-fit.cv.Ridge$lambda.min
  pred_test_Ridge <- predict(fit.cv.Ridge, newx=as.matrix(Xtest),s=lambda.opt.cvm,type="response")
  Ridge[i] <- sqrt(mean((Ytest-pred_test_Ridge)^2))
  betahat.Ridge <- predict(fit.cv.Ridge,s=lambda.opt.cvm,type="coefficients")
  Indbetahat.Ridge<-betahat.Ridge
  Indbetahat.Ridge[Indbetahat.Ridge==0]<-0
  Indbetahat.Ridge[Indbetahat.Ridge!=0]<-1
  
  
  # Principal component regression (PCR)
  pcr.fit <- pcr(TDM ~., data=train, scale=TRUE , validation="CV", segments=10)
  k.pcr <- which.min(pcr.fit$validation$PRESS)
  pred_test.pcr <- predict(pcr.fit,as.matrix(Xtest), ncomp=k.pcr)
  PCR[i] <- sqrt(mean((Ytest-pred_test.pcr)^2))
  betahat.pcr <- as.matrix(pcr.fit$coefficients[,,k.pcr],ncol=1) # coefficient extraction does not return the intercept; it must be added back
  betahat.pcr <- rbind(predict(pcr.fit,t(as.matrix(rep(0,27))), ncomp=k.pcr),betahat.pcr) # recover the intercept by predicting on a vector of zeros
  rownames(betahat.pcr)[1]<-"(Intercept)"
  Indbetahat.pcr<-betahat.pcr
  Indbetahat.pcr[Indbetahat.pcr==0]<-0
  Indbetahat.pcr[Indbetahat.pcr!=0]<-1
  
  # Partial least squares regression (PLSR)
  pls.fit <- plsr(TDM ~., data=train, scale=TRUE , validation="CV", segments=10)
  k.pls <- which.min(pls.fit$validation$PRESS)
  pred_test.pls<-predict(pls.fit,as.matrix(Xtest), ncomp=k.pls)
  PLS[i] <- sqrt(mean((Ytest-pred_test.pls)^2))
  betahat.pls <- as.matrix(pls.fit$coefficients[,,k.pls],ncol=1) # coefficient extraction does not return the intercept; it must be added back
  betahat.pls <- rbind(predict(pls.fit,t(as.matrix(rep(0,27))), ncomp=k.pls),betahat.pls) # recover the intercept by predicting on a vector of zeros
  rownames(betahat.pls)[1]<-"(Intercept)"
  Indbetahat.pls<-betahat.pls
  Indbetahat.pls[Indbetahat.pls==0]<-0
  Indbetahat.pls[Indbetahat.pls!=0]<-1
  
  
  # Sparse partial least squares regression (sPLSR)
  spls.cv <- cv.spls(Xtrain, Ytrain, eta = seq(0.1,0.9,0.1), K = c(1:27),plot.it= FALSE)
  spls.fit <- spls(x=Xtrain, y=Ytrain, eta = spls.cv$eta.opt, K = spls.cv$K.opt,
                   scale.x=TRUE, scale.y=FALSE, trace=FALSE)
  pred_test.spls <- predict(spls.fit,as.matrix(Xtest))
  sPLS[i] <- sqrt(mean((Ytest-pred_test.spls)^2))
  betahat.spls <- coef(spls.fit) # coefficient extraction does not return the intercept; it must be added back
  betahat.spls <- rbind(predict(spls.fit,t(as.matrix(rep(0,27)))),betahat.spls) # recover the intercept by predicting on a vector of zeros
  rownames(betahat.spls)[1]<-"(Intercept)"
  Indbetahat.spls<-betahat.spls
  Indbetahat.spls[Indbetahat.spls==0]<-0
  Indbetahat.spls[Indbetahat.spls!=0]<-1
  
  FN[i,1] <- sum(abs(Indbetahat.pcr[Ind,]-Indbeta[Ind,]))/length(Ind) 
  FN[i,2] <- sum(abs(Indbetahat.pls[Ind,]-Indbeta[Ind,]))/length(Ind) 
  FN[i,3] <- sum(abs(Indbetahat.spls[Ind,]-Indbeta[Ind,]))/length(Ind) 
  FN[i,4] <- sum(abs(Indbetahat.lasso[Ind]-Indbeta[Ind]))/length(Ind)  
  FN[i,5] <- sum(abs(Indbetahat.Ridge[Ind]-Indbeta[Ind]))/length(Ind)
  FN[i,6] <- sum(abs(Indbetahat.PvB[Ind]-Indbeta[Ind]))/length(Ind)
  FN[i,7] <- sum(abs(Indbetahat.R2a[Ind]-Indbeta[Ind]))/length(Ind) 
  FN[i,8] <- sum(abs(Indbetahat.PvS[Ind]-Indbeta[Ind]))/length(Ind)
  
  
}

Graphical comparison of the 8 methods

par(mfrow=c(1,1))
err_test_2<-cbind(exhau.Pv, forw.R2a, back.Pv, Lasso, Ridge, PCR, PLS, sPLS)
boxplot(err_test_2, ylab = "test error", col = rainbow(8))

Graphical representation of the false-negative rates

par(mfrow=c(1,1))
couleurs<-c("red","orange","purple","yellow","grey","blue","green","pink")
boxplot(FN, col = couleurs, main = "FN rate", ylab = "False-negative rate", xlab = "Method")
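
Alongside the boxplots, the per-method averages can be tabulated; a short summary not included in the original output:

# Mean test error and mean false-negative rate for each of the 8 methods
round(colMeans(err_test_2), 3)
round(colMeans(FN), 3)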