###### Module 6: Multivariate analysis ######
###### Instructor: Luis E. Nieto-Barajas ######

###### R LIBRARIES ######
library(TeachingDemos)
library(mvtnorm)
library(cluster)
library(MASS)
library(tree)

###### R COMMANDS ######

# (1) Reading the data ----
# First column of the file is used as row names.
marg <- read.table(
  "http://allman.rhon.itam.mx/~lnieto/index_archivos/Marg90.txt",
  header = TRUE, row.names = 1
)
head(marg)

# (2) Descriptive statistics ----
# (2.1) Means
apply(marg, 2, mean)
colMeans(marg)

# (2.2) Variance-covariance and correlation matrices
var(marg)
cor(marg)
marg.var <- var(marg[, 1:9])
marg.cor <- cor(marg[, 1:9])

# (2.3) Other summaries
apply(marg, 2, summary)
summary(marg)

# (3) Graphical analysis ----
# (3.1) Two-dimensional scatter plots
plot(marg[, 1], marg[, 2], xlab = "ANALF", ylab = "S.PRI")
# Same plot, but drawing the row names instead of points.
plot(marg[, 1], marg[, 2], type = "n", xlab = "ANALF", ylab = "S.PRI")
text(marg[, 1], marg[, 2], labels = rownames(marg))

# (3.2) Scatter-plot matrices
pairs(marg[, 1:9])
pairs(marg[, 1:4])
pairs(marg[, 5:9])

# (3.3) Three-dimensional scatter plots
# GUI menu path from the original notes (not R code):
#   Graph > 3D Plot > 3D Scatter Plot
# NOTE(review): brush() does not appear to be provided by any of the
# packages loaded above -- confirm where it is defined before running.
brush(marg[, 1:3])

# (3.4) Bubble plots (circle size given by column 3)
symbols(marg[, 1], marg[, 2], circles = marg[, 3],
        xlab = "ANALF", ylab = "S.PRI")
text(marg[, 1], marg[, 2], labels = rownames(marg))

# (3.5) Chernoff faces
faces(marg[, 1:9])
faces2(marg[, 1:9])

# (3.6) Star plots
stars(marg[, 1:9])

# (3.7) Standardization (centre and scale columns 1-9)
marg.est <- scale(marg[, 1:9])

# (3.8) Andrews plot
# NOTE(review): this executes code downloaded over plain HTTP; st.andrews()
# is expected to be defined by the sourced file -- confirm.
source("http://allman.rhon.itam.mx/~lnieto/index_archivos/Standrews.txt")
st.andrews(marg.est[, 1:9])

# (4) Multivariate normal distribution ----
# (4.1) Random sample from a standard bivariate normal
mu <- c(0, 0)
sigma <- matrix(c(1, 0, 0, 1), 2, 2)
x <- rmvnorm(100, mu, sigma)
plot(x)

# (4.2) Bivariate density plot
x1 <- seq(-5, 5, length.out = 50)
x2 <- seq(-5, 5, length.out = 50)

# Evaluate the bivariate normal density on the grid x1 x x2.
#
# Arguments:
#   x1, x2  numeric vectors of grid coordinates for each axis.
#   mu      mean vector passed to dmvnorm().
#   sigma   covariance matrix passed to dmvnorm().
# Returns:
#   A length(x1) x length(x2) matrix with z[i, j] equal to the density at
#   (x1[i], x2[j]), in the layout persp() and contour() expect.
dnbg <- function(x1, x2, mu, sigma) {
  z <- matrix(0, length(x1), length(x2))
  for (i in seq_along(x1)) {
    for (j in seq_along(x2)) {
      # Namespaced call so the function does not rely on require().
      z[i, j] <- mvtnorm::dmvnorm(c(x1[i], x2[j]), mu, sigma)
    }
  }
  z
}

z <- dnbg(x1, x2, mu, sigma)
par(mfrow = c(1, 2))
persp(x1, x2, z, theta = 120, phi = 15)
contour(x1, x2, z)

# (4.3) Univariate normality check
par(mfrow = c(2, 2))
qqnorm(marg[, 1])
hist(marg[, 1])
boxplot(marg[, 1])

# (4.4) Bivariate normality check
# Chi-square Q-Q plot of squared Mahalanobis distances for
# (log of column 1, column 2). Note: mu and sigma are overwritten here.
x <- cbind(log(marg[, 1]), marg[, 2])
mu <- colMeans(x)
sigma <- var(x)
n <- nrow(x)
# mahalanobis() returns (x - mu)' sigma^{-1} (x - mu) for every row,
# inverting sigma once instead of once per observation.
d <- mahalanobis(x, mu, sigma)
d2 <- qchisq((seq_len(n) - 0.5) / n, 2)
par(mfrow = c(1, 1))
qqplot(d, d2)

# (5) Inference for rho ----
# (5.1) Confidence interval for the correlation (Fisher z transform)
# The half-width uses the normal quantile qnorm(.975) (about 1.96); the
# previous pnorm(.975) is a probability (about 0.835) and gave an interval
# that was far too narrow.
tanh(atanh(marg.cor[1, 2]) +
       c(-1, 1) * qnorm(.975) / sqrt(nrow(marg) - 3))

# (5.2) Hypothesis tests on the correlation
cor.test(marg[, 1], marg[, 2])
# Upper triangle holds the p-value of cor.test() for each pair of the
# first 13 columns; the rest of the matrix stays at 0.
marg.test <- matrix(0, nrow = 13, ncol = 13)
for (j in 2:13) {
  for (i in 1:(j - 1)) {
    marg.test[i, j] <- cor.test(marg[, i], marg[, j])$p.value
  }
}
marg.test

# (6) Matrix algebra ----
marg.var <- var(marg[, 1:9])
# (6.1) Eigenvalues and eigenvectors
eigen(marg.var)
# (6.2) Determinant (also computed as the product of the eigenvalues)
det(marg.var)
prod(eigen(marg.var)$values)
# (6.3) Trace (computed as the sum of the eigenvalues)
sum(eigen(marg.var)$values)

# (7) Principal component analysis ----
# Based on the covariance matrix.
marg.pc <- princomp(marg[, 1:9])
print(marg.pc, loadings = TRUE)
summary(marg.pc, loadings = TRUE)
screeplot(marg.pc)
barplot(marg.pc$sdev^2 / sum(marg.pc$sdev^2),
        xlab = "Component", ylab = "Prop. Var.")
biplot(marg.pc)
biplot(marg.pc, choices = c(1, 3))
biplot(marg.pc, choices = 2:3)
# Sign-flipped first component scores against column 12.
plot(-marg.pc$scores[, 1], marg[, 12])

# Based on the correlation matrix.
marg.pc.cor <- princomp(marg[, 1:9], cor = TRUE)
summary(marg.pc.cor, loadings = TRUE)
screeplot(marg.pc.cor)
barplot(marg.pc.cor$sdev^2 / sum(marg.pc.cor$sdev^2),
        xlab = "Component", ylab = "Prop. Var.")
biplot(marg.pc.cor)
plot(-marg.pc.cor$scores[, 1], marg[, 12])

# (8) Cluster analysis ----
# (8.1) Distance
marg.dist <- dist(marg[, 1:9])

# (8.2) Linkage methods
# plclust() is defunct in current R; plot() on an hclust object draws the
# dendrogram instead.
marg.cl.com <- hclust(marg.dist, method = "com")
plot(marg.cl.com, labels = rownames(marg))
rect.hclust(marg.cl.com, k = 5, border = "red")
cutree(marg.cl.com, k = 5)
marg.cl.com.gr <- cutree(marg.cl.com, k = 5)
marg[marg.cl.com.gr == 1, ]
summary(marg[marg.cl.com.gr == 1, ])
by(marg[, 1:9], marg.cl.com.gr, summary)
# Cluster membership of the first 32 rows against column 13.
plot(marg.cl.com.gr[1:32], marg[, 13])
# NOTE(review): subtree() does not appear to be provided by any of the
# packages loaded above -- confirm where it is defined before running.
plot(subtree(marg.cl.com, c(2, 9, 19)))

# Same linkage on the standardized data.
marg.dist.est <- dist(marg.est[, 1:9])
marg.cl.com.est <- hclust(marg.dist.est, method = "com")
plot(marg.cl.com.est, labels = rownames(marg))
rect.hclust(marg.cl.com.est, k = 5, border = "red")

# (8.3) Ward's method
# NOTE(review): recent versions of hclust() document "ward" as renamed to
# "ward.D" and still accept the old name with a message -- confirm on the
# target R version.
marg.cl.ward <- hclust(marg.dist, method = "ward")
plot(marg.cl.ward)
rect.hclust(marg.cl.ward, k = 5, border = "red")

# (8.4) k-means
marg.kmeans <- kmeans(marg[, 1:9], 5)

# (8.5) Clustering of variables (dissimilarity = 1 - |correlation|)
marg.cor <- cor(marg[, 1:9])
marg.vcl.com <- hclust(as.dist(1 - abs(marg.cor)), method = "com")
plot(marg.vcl.com, labels = colnames(marg)[1:9])
rect.hclust(marg.vcl.com, k = 5, border = "red")

# (8.6) Divisive method
marg.cl.div <- diana(marg[, 1:9])
marg.cl.div$dc
plot(marg.cl.div)

# (9) Multidimensional scaling ----
marg.scale <- cmdscale(marg.dist)
plot(marg.scale, type = "n")
text(marg.scale[, 1], marg.scale[, 2], labels = rownames(marg))
# NOTE(review): this executes code downloaded over plain HTTP;
# cmdscale.gof() is expected to be defined by the sourced file -- confirm.
source("http://allman.rhon.itam.mx/~lnieto/index_archivos/Stress.txt")
marg.stress <- cmdscale.gof(marg.dist, k = 9)
par(mfrow = c(2, 1))
plot(marg.stress$gof1, main = "gof 1", ylab = "")
plot(marg.stress$gof2, main = "gof 2", ylab = "")

# (10) Factor analysis ----
# Three factors, unrotated, then rotated after the fact.
marg.fa3 <- factanal(marg[, 1:9], 3, rotation = "none")
summary(marg.fa3)
print(marg.fa3)
marg.fa3$loadings
varimax(marg.fa3$loadings)
promax(marg.fa3$loadings)
# Refit with varimax rotation and scores so biplots can be drawn.
marg.fa3 <- factanal(marg[, 1:9], 3, rotation = "varimax", scores = "reg")
par(mfrow = c(1, 1))
# (10) Factor analysis (continued) ----
# Biplots of scores and loadings for each pair of the three factors.
biplot(marg.fa3$scores[, 1:2], marg.fa3$loadings[, 1:2])
biplot(marg.fa3$scores[, c(1, 3)], marg.fa3$loadings[, c(1, 3)])
biplot(marg.fa3$scores[, c(2, 3)], marg.fa3$loadings[, c(2, 3)])

# Four factors: unrotated fit first, then varimax with scores.
marg.fa4 <- factanal(marg[, 1:9], 4, rotation = "none")
summary(marg.fa4)
print(marg.fa4)
marg.fa4 <- factanal(marg[, 1:9], 4, rotation = "varimax", scores = "reg")
marg.fa4$loadings
# One panel per pair of factors (6 pairs on a 2 x 3 layout).
par(mfrow = c(2, 3))
biplot(marg.fa4$scores[, 1:2], marg.fa4$loadings[, 1:2])
biplot(marg.fa4$scores[, c(1, 3)], marg.fa4$loadings[, c(1, 3)])
biplot(marg.fa4$scores[, c(1, 4)], marg.fa4$loadings[, c(1, 4)])
biplot(marg.fa4$scores[, 2:3], marg.fa4$loadings[, 2:3])
biplot(marg.fa4$scores[, c(2, 4)], marg.fa4$loadings[, c(2, 4)])
biplot(marg.fa4$scores[, c(3, 4)], marg.fa4$loadings[, c(3, 4)])

# Hierarchical clustering of the factor scores.
marg.dist.fa4 <- dist(marg.fa4$scores)
marg.clust.fa4 <- hclust(marg.dist.fa4, method = "com")
par(mfrow = c(1, 1))
# plclust() is defunct in current R; plot() on an hclust object draws the
# dendrogram instead.
plot(marg.clust.fa4)
rect.hclust(marg.clust.fa4, k = 4, border = "red")
marg.clust.fa4.cut <- cutree(marg.clust.fa4, k = 4)
by(marg.fa4$scores, marg.clust.fa4.cut, summary)

# (11) Discriminant analysis ----
# (11.1) Marginalization data: column 13 is the class, columns 1-9 the
# predictors.
# (11.1.A) Linear discriminant
marg.lda <- lda(marg[, 13] ~ marg[, 1] + marg[, 2] + marg[, 3] + marg[, 4] +
                  marg[, 5] + marg[, 6] + marg[, 7] + marg[, 8] + marg[, 9])
# Project the raw predictors onto the discriminant directions.
marg.dv <- as.matrix(marg[, 1:9]) %*% as.matrix(marg.lda$scaling)
plot(marg.dv[, 1], marg.dv[, 2], type = "n")
# Alternative labelling by row name:
# text(marg.dv[, 1], marg.dv[, 2], labels = rownames(marg))
text(marg.dv[, 1], marg.dv[, 2], labels = factor(marg[, 13]))
# Same fit with CV = TRUE; compare the resulting classes with the truth.
marg.lda.cv <- lda(marg[, 13] ~ marg[, 1] + marg[, 2] + marg[, 3] +
                     marg[, 4] + marg[, 5] + marg[, 6] + marg[, 7] +
                     marg[, 8] + marg[, 9], CV = TRUE)
marg.lda.cv$class
table(marg[, 13], marg.lda.cv$class)

# (11.1.B) Classification tree
marg.tree <- tree(factor(marg[, 13]) ~ marg[, 1] + marg[, 2] + marg[, 3] +
                    marg[, 4] + marg[, 5] + marg[, 6] + marg[, 7] +
                    marg[, 8] + marg[, 9])
summary(marg.tree)
plot(marg.tree)
text(marg.tree)
# Columns 1 and 8 with reference lines at fixed values.
# NOTE(review): the constants presumably are the split points of the
# fitted tree -- confirm against the tree output.
plot(marg[, 1], marg[, 8], type = "n")
text(marg[, 1], marg[, 8], labels = factor(marg[, 13]))
abline(h = 29.665)
abline(v = c(5.55, 12.48, 17.79))
marg.tree.p <- predict(marg.tree, marg[, 1:9])
# Predicted class = column index with the largest value in each row of the
# prediction matrix. order(p)[length(p)] keeps the original tie-breaking
# (last of the tied columns) without hard-coding the number of rows (32)
# or classes (5).
pclass <- unname(apply(marg.tree.p, 1, function(p) order(p)[length(p)]))
table(marg[, 13], pclass)

# (11.2) Credit data: column 1 is the class, columns 2-8 the predictors.
credit <- read.table(
  "http://allman.rhon.itam.mx/~lnieto/index_archivos/Credit.txt",
  header = TRUE
)
head(credit)
credit.est <- scale(credit)

# (11.2.A) Linear discriminant (standardized predictors)
credit.lda <- lda(credit[, 1] ~ credit.est[, 2] + credit.est[, 3] +
                    credit.est[, 4] + credit.est[, 5] + credit.est[, 6] +
                    credit.est[, 7] + credit.est[, 8])
credit.dv <- as.matrix(credit.est[, 2:8]) %*% as.matrix(credit.lda$scaling)
n <- nrow(credit.dv)
# Random vertical positions so the labels on the single discriminant axis
# do not all overlap.
y <- runif(n)
plot(credit.dv[, 1], y, type = "n")
text(credit.dv[, 1], y, labels = factor(credit[, 1]))
credit.lda.cv <- lda(credit[, 1] ~ credit.est[, 2] + credit.est[, 3] +
                       credit.est[, 4] + credit.est[, 5] + credit.est[, 6] +
                       credit.est[, 7] + credit.est[, 8], CV = TRUE)
credit.lda.cv$class
table(credit[, 1], credit.lda.cv$class)

# (11.2.B) Quadratic discriminant
credit.qda <- qda(credit[, 1] ~ credit.est[, 2] + credit.est[, 3] +
                    credit.est[, 4] + credit.est[, 5] + credit.est[, 6] +
                    credit.est[, 7] + credit.est[, 8])
credit.qda.cv <- qda(credit[, 1] ~ credit.est[, 2] + credit.est[, 3] +
                       credit.est[, 4] + credit.est[, 5] + credit.est[, 6] +
                       credit.est[, 7] + credit.est[, 8], CV = TRUE)
credit.qda.cv$class
table(credit[, 1], credit.qda.cv$class)

# (11.2.C) Classification tree (raw predictors)
credit.tree <- tree(factor(credit[, 1]) ~ credit[, 2] + credit[, 3] +
                      credit[, 4] + credit[, 5] + credit[, 6] +
                      credit[, 7] + credit[, 8])
summary(credit.tree)
plot(credit.tree)
text(credit.tree)
credit.tree.p <- predict(credit.tree, credit[, 2:8])
# Same arg-max rule as for the marginalization tree, without hard-coding
# the number of rows (113) or classes (2).
pclass <- unname(apply(credit.tree.p, 1, function(p) order(p)[length(p)]))
table(credit[, 1], pclass)