---
title : 'Homework on Bayesian Mixture Models for Classification of Banknotes - M4L7HW3'
subtitle : 'Bayesian Statistics: Mixture Models'
categories:
- Bayesian Statistics
keywords:
- Mixture Models
- Homework
- Density Estimation
- MCMC algorithm
- Location and scale of 2 Gaussians
---
::::: {.content-visible unless-profile="HC"}
::: {.callout-caution}
Section omitted to comply with the Honor Code
:::
:::::
::::: {.content-hidden unless-profile="HC"}
\index{dataset!banknotes }
The data set `banknote` contains six measurements (length of bill, width of left edge, width of right edge, bottom margin width, top margin width, and length of diagonal, all in mm) made on 100 genuine and 100 counterfeit old Swiss 1000-franc bank notes.
::: {.callout-note}
### Instructions
To load the dataset in R, use the command `load("banknoteclassification.Rdata")`, but first make sure that your working directory is set to the directory containing the file. You should see four objects:
- `banknote.training` contains the characteristics of the 30 notes (15 genuine and 15 counterfeit) in the training set.
- `banknote.training.labels` contains the labels ("genuine" or "counterfeit") for the 30 notes in the training set.
- `banknote.test` contains the characteristics of the 170 notes (85 genuine and 85 counterfeit) in the test set.
- `banknote.test.labels` contains the labels ("genuine" or "counterfeit") for the 170 notes in the test set. These are provided only for validation purposes.
You are asked to modify the MCMC algorithm in "Sample code for MCMC example 2" to create an algorithm for semi-supervised classification that is the Bayesian equivalent of that provided under "Sample EM algorithm for classification problems" and apply it to classify the observations contained in the test set. You are then asked to compare your results against those generated by the qda function in R.
As your priors, use distributions in the same families as in "Sample code for MCMC example 2". In particular, use a uniform distribution for the weights, multivariate normal distributions for the means of the components, and inverse Wishart priors for the variance-covariance matrices of the components. The parameters of the priors should be set using the same empirical Bayes approach used in that example.
:::
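For reference, the hierarchical model implied by these instructions (with $K = 2$ components and $p = 6$ features) can be sketched as

$$
\begin{aligned}
\mathbf{x}_i \mid c_i = k &\sim \mathsf{N}_p(\boldsymbol{\mu}_k, \Sigma_k), & \Pr(c_i = k) &= w_k, \\
(w_1, \ldots, w_K) &\sim \mathsf{Dirichlet}(1, \ldots, 1), & \boldsymbol{\mu}_k &\sim \mathsf{N}_p(\mathbf{d}, D), \qquad \Sigma_k \sim \mathsf{IWishart}(\nu, S),
\end{aligned}
$$

where $c_i$ is fixed at its known label for the 30 training notes and treated as an unknown quantity (to be sampled) for the 170 test notes, and $\mathbf{d}$, $D$, $\nu$, $S$ are set by empirical Bayes exactly as in the sample code below.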
::: {.callout-note}
### Grading overview
Reviewers will check that the code has been properly adapted and that the classification results you provide are correct.
:::
::: {.callout-note collapse="true"}
### Sample code for MCMC example 2
```{r}
#| label: lbl-sample-code-for-mcmc-sample-2
#### Example of an MCMC algorithm for fitting a mixtures of K p-variate Gaussian components
#### The algorithm is tested using simulated data
## Clear the environment and load required libraries
rm(list=ls())
library(mvtnorm)
library(ellipse)
library(MCMCpack)
## Generate data from a mixture with 3 components
KK = 3
p = 2
w.true = c(0.5,0.3,0.2) # True weights associated with the components
mu.true = array(0, dim=c(KK,p))
mu.true[1,] = c(0,0) #True mean for the first component
mu.true[2,] = c(5,5) #True mean for the second component
mu.true[3,] = c(-3,7) #True mean for the third component
Sigma.true = array(0, dim=c(KK,p,p))
Sigma.true[1,,] = matrix(c(1,0,0,1),p,p) #True variance for the first component
Sigma.true[2,,] = matrix(c(2,0.9,0.9,1),p,p) #True variance for the second component
Sigma.true[3,,] = matrix(c(1,-0.9,-0.9,4),p,p) #True variance for the third component
set.seed(63252) #Keep seed the same so that we can reproduce results
n = 120
cc.true = sample(1:3, n, replace=T, prob=w.true)
x = array(0, dim=c(n,p))
for(i in 1:n){
x[i,] = rmvnorm(1, mu.true[cc.true[i],], Sigma.true[cc.true[i],,])
}
par(mfrow=c(1,1))
plot(x[,1], x[,2], col=cc.true, xlab=expression(x[1]), ylab=expression(x[2]), type="n")
text(x[,1], x[,2], seq(1,n), col=cc.true, cex=0.6)
for(k in 1:KK){
lines(ellipse(x=Sigma.true[k,,], centre=mu.true[k,], level=0.50), col="grey", lty=2)
lines(ellipse(x=Sigma.true[k,,], centre=mu.true[k,], level=0.82), col="grey", lty=2)
lines(ellipse(x=Sigma.true[k,,], centre=mu.true[k,], level=0.95), col="grey", lty=2)
}
title(main="Data + True Components")
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #Cluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
Sigma[3,,] = var(x)/KK
cc = sample(1:KK, n, replace=TRUE, prob=w)
par(mfrow=c(1,1))
plot(x[,1], x[,2], col=cc.true, xlab=expression(x[1]), ylab=expression(x[2]))
for(k in 1:KK){
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.50), col="grey", lty=2, lwd=2)
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.82), col="grey", lty=2, lwd=2)
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.95), col="grey", lty=2, lwd=2)
}
title(main="Initial estimate + Observations")
# Priors
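# (Empirical Bayes choices: dd is the sample mean, DD is an inflated (10x) sample
# covariance, nu = p is a weakly informative degrees-of-freedom choice, and SS is
# the sample covariance divided by the number of components)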
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p
SS = var(x)/3
# Number of iteration of the sampler
rrr = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in 1:n){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + mvtnorm::dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v))) #Go from log weights to actual weights in a numerically stable manner
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:n){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + mvtnorm::dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + log(ddirichlet(w, aa))
for(k in 1:KK){
logpost[s] = logpost[s] + mvtnorm::dmvnorm(mu[k,], dd, DD, log=TRUE)
logpost[s] = logpost[s] + log(diwish(Sigma[k,,], nu, SS))
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## Plot the logposterior distribution for various samples
par(mfrow=c(1,1))
par(mar=c(4,4,1,1)+0.1)
plot(logpost, type="l", xlab="Iterations", ylab="Log posterior")
## Plot the density estimate for the last iteration of the MCMC
par(mfrow=c(1,1))
par(mar=c(4,4,2,1)+0.1)
plot(x[,1], x[,2], col=cc.true, main=paste("s =",s," logpost =", round(logpost[s],4)), xlab=expression(x[1]), ylab=expression(x[2]))
for(k in 1:KK){
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.50), col="grey", lty=2, lwd=2)
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.82), col="grey", lty=2, lwd=2)
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.95), col="grey", lty=2, lwd=2)
}
```
Sample code for classification of the wine dataset using a mixture model fitted with the EM algorithm (the "Sample EM algorithm for classification problems" referenced in the instructions):
```{r}
#| label: lbl-sample-code-for-wine-dataset
## Using mixture models for classification in the wine dataset
## Compare linear and quadratic discriminant analysis and a
## (semi-supervised) location and scale mixture model with K normals
## Comparing only against the EM algorithm
# Semi-supervised, quadratic discriminant analysis
### Loading data and setting up global variables
library(MASS)
library(mvtnorm)
wine.training = read.table("data/wine_training.txt", sep=",", header=TRUE)
wine.test = read.table("data/wine_test.txt", sep=",", header=TRUE)
n = dim(wine.training)[1] # Size of the training set
m = dim(wine.test)[1] # Size of the test set
x = rbind(as.matrix(wine.training[,-1]), as.matrix(wine.test[,-1])) # Create dataset of observations, first n belong to the training set, and the rest belong to the test set
p = dim(x)[2] # Number of features
KK = 3
epsilon = 0.00001
par(mfrow=c(1,1))
par(mar=c(2,2,2,2)+0.1)
colscale = c("black","red","blue")
pairs(wine.training[,-1], col=colscale[wine.training[,1]], pch=wine.training[,1])
# Initialize the parameters of the algorithm
set.seed(63252)
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #Cluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
Sigma[3,,] = var(x)/KK
sw = FALSE
KL = -Inf
KL.out = NULL
s = 0
while(!sw){
## E step
v = array(0, dim=c(n+m,KK))
for(k in 1:KK){ #Compute the log of the weights
v[1:n,k] = ifelse(wine.training[,1]==k,0,-Inf) # Training set
v[(n+1):(n+m),k] = log(w[k]) + mvtnorm::dmvnorm(x[(n+1):(n+m),], mu[k,], Sigma[k,,],log=TRUE) # Test set
}
for(i in 1:(n+m)){
v[i,] = exp(v[i,] - max(v[i,]))/sum(exp(v[i,] - max(v[i,]))) #Go from logs to actual weights in a numerically stable manner
}
## M step
w = apply(v,2,mean)
mu = matrix(0, nrow=KK, ncol=p)
for(k in 1:KK){
for(i in 1:(n+m)){
mu[k,] = mu[k,] + v[i,k]*x[i,]
}
mu[k,] = mu[k,]/sum(v[,k])
}
Sigma = array(0,dim=c(KK,p,p))
for(k in 1:KK){
for(i in 1:(n+m)){
Sigma[k,,] = Sigma[k,,] + v[i,k]*(x[i,] - mu[k,])%*%t(x[i,] - mu[k,])
}
Sigma[k,,] = Sigma[k,,]/sum(v[,k])
}
##Check convergence
KLn = 0
for(i in 1:(n+m)){
for(k in 1:KK){
KLn = KLn + v[i,k]*(log(w[k]) + mvtnorm::dmvnorm(x[i,],mu[k,],Sigma[k,,],log=TRUE))
}
}
if(abs(KLn-KL)/abs(KLn)<epsilon){
sw=TRUE
}
KL = KLn
KL.out = c(KL.out, KL)
s = s + 1
print(paste(s, KLn))
}
## Predicted labels
apply(v, 1, which.max)[(n+1):(n+m)]
## True labels
wine.test[,1]
## Comparison
apply(v, 1, which.max)[(n+1):(n+m)] == wine.test[,1]
sum(!(apply(v, 1, which.max)[(n+1):(n+m)] == wine.test[,1])) # One error
# Using the qda and lda functions in R
# qda
modqda = qda(grouping=wine.training[,1], x=wine.training[,-1], method="mle")
ccpredqda = predict(modqda,newdata=wine.test[,-1])
sum(!(ccpredqda$class == wine.test[,1])) # One error
# lda
modlda = lda(grouping=wine.training[,1], x=wine.training[,-1], method="mle")
ccpredlda = predict(modlda,newdata=wine.test[,-1])
sum(!(ccpredlda$class == wine.test[,1])) # No errors!!!
```
:::
## Solution
```{r}
#| label: lbl-loading-data-and-libraries
library(MASS)
library(mvtnorm)
library(MCMCpack)
load("data/banknoteclassification.Rdata")
```
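A quick sanity check of the loaded objects (the object names are those listed in the instructions):

```{r}
#| label: lbl-inspect-banknote-data
# Dimensions of the training and test sets, and label counts in each
dim(banknote.training)
dim(banknote.test)
table(banknote.training.labels)
table(banknote.test.labels)
```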
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
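Because all of the priors are conditionally conjugate, the sampler is a plain Gibbs sampler. As a sketch of the full conditionals it draws from (with $m_k$ the number of observations currently assigned to component $k$ and $\bar{\mathbf{x}}_k$ their sample mean):

$$
\begin{aligned}
\Pr(c_i = k \mid \cdots) &\propto w_k \, \mathsf{N}_p(\mathbf{x}_i \mid \boldsymbol{\mu}_k, \Sigma_k) && \text{(test observations only),}\\
\mathbf{w} \mid \cdots &\sim \mathsf{Dirichlet}(a_1 + m_1, \ldots, a_K + m_K), \\
\boldsymbol{\mu}_k \mid \cdots &\sim \mathsf{N}_p\!\left(\mathbf{d}_k^*, D_k^*\right), \quad
D_k^* = \left(m_k \Sigma_k^{-1} + D^{-1}\right)^{-1}, \quad
\mathbf{d}_k^* = D_k^*\!\left(\Sigma_k^{-1} m_k \bar{\mathbf{x}}_k + D^{-1}\mathbf{d}\right), \\
\Sigma_k \mid \cdots &\sim \mathsf{IWishart}\!\left(\nu + m_k,\; S + \textstyle\sum_{i:\,c_i = k} (\mathbf{x}_i - \boldsymbol{\mu}_k)(\mathbf{x}_i - \boldsymbol{\mu}_k)^\top\right).
\end{aligned}
$$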
```{r}
#| label: lbl-mcmc-banknote-classification
# Combine data for semi-supervised setup
n = nrow(banknote.training)
m = nrow(banknote.test)
x = as.matrix(rbind(banknote.training, banknote.test)) # Coerce to a matrix so dmvnorm gets numeric rows
p = ncol(banknote.training)
K = 2 # Two classes: genuine/counterfeit
# Convert labels to numeric codes (1 = genuine, 2 = counterfeit); index the map by
# character, not by the factor itself, since subsetting by a factor uses its integer codes
label_map = setNames(1:2, c("genuine", "counterfeit"))
train_labels = label_map[as.character(banknote.training.labels)]
test_labels = label_map[as.character(banknote.test.labels)]
# Priors from empirical Bayes
aa = rep(1, K)
dd = colMeans(x)
DD = 10 * var(x)
nu = p
SS = var(x) / 3
# Initial values
w = rep(1, K) / K
mu = rmvnorm(K, dd, var(x))
Sigma = array(0, dim=c(K, p, p))
for (k in 1:K) Sigma[k,,] = var(x) / K
cc = c(train_labels, sample(1:K, m, replace=TRUE))
# MCMC settings
iters = 1000
burn = 500
cc.out = array(0, dim=c(iters, n+m))
for (s in 1:iters) {
# 1. Sample class indicators (cc)
# - Fixed for training set, update for test set
for (i in (n+1):(n+m)) {
logp = sapply(1:K, function(k)
log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE))
p_i = exp(logp - max(logp)); p_i = p_i / sum(p_i)
cc[i] = sample(1:K, 1, prob=p_i)
}
# 2. Update weights
w = as.vector(rdirichlet(1, aa + tabulate(cc, nbins=K)))
# 3. Update means
for (k in 1:K) {
idx = which(cc == k)
mk = length(idx)
DD_st = solve(mk * solve(Sigma[k,,]) + solve(DD))
dd_st = DD_st %*% (solve(Sigma[k,,]) %*% colSums(x[idx,,drop=FALSE]) + solve(DD) %*% dd)
mu[k,] = as.vector(rmvnorm(1, dd_st, DD_st))
}
# 4. Update covariances
for (k in 1:K) {
idx = which(cc == k)
mk = length(idx)
xcensumk = matrix(0, p, p)
for (i in idx) {
d = x[i,] - mu[k,]
xcensumk = xcensumk + d %*% t(d)
}
Sigma[k,,] = riwish(nu + mk, SS + xcensumk)
}
cc.out[s,] = cc
}
```
What is the classification error for the test set?
- [ ] Is the classification error for the "genuine" class generated by the algorithm correct? It should be 0.
- [ ] Is the classification error for the "counterfeit" class generated by the algorithm correct? It should be 0.
```{r}
#| label: lbl-classification-error
# Posterior prediction: for each test sample, majority vote of assignments
cc.test.posterior = cc.out[burn:iters, (n+1):(n+m), drop=FALSE]
cc.test.pred = apply(cc.test.posterior, 2, function(z) as.integer(names(which.max(table(z)))))
classification_error = mean(cc.test.pred != test_labels)
cat("MCMC Test Classification Error:", classification_error, "\n")
```
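A per-class breakdown (confusion matrix) addresses the checklist items above directly:

```{r}
#| label: lbl-mcmc-confusion
# Rows are the true labels, columns are the MCMC predictions mapped back to label names
table(True = banknote.test.labels, Pred = names(label_map)[cc.test.pred])
```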
- [ ] If the R function `qda` (which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
- [ ] Is the classification error for the "genuine" class generated by the algorithm correct? It should be 0 out of 85.
- [ ] Is the classification error for the "counterfeit" class generated by the algorithm correct? It should be 3 out of 85, i.e. about 3.5%.
```{r}
#| label: lbl-qda-classification-error
mod_qda = qda(x=banknote.training, grouping=banknote.training.labels, method="mle")
qda_pred = predict(mod_qda, banknote.test)$class
qda_error = mean(qda_pred != banknote.test.labels)
cat("QDA Test Classification Error:", qda_error, "\n")
```
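And the corresponding per-class breakdown for `qda`:

```{r}
#| label: lbl-qda-confusion
# Rows are the true labels, columns are the qda predictions
table(True = banknote.test.labels, Pred = qda_pred)
```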
::: {.callout-note collapse="true"}
## Solution code
```{r}
#| label: lbl-solution-code
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
load("data/banknoteclassification.Rdata")
#load("banknoteclassification.Rdata")
x = as.matrix(rbind(banknote.training, banknote.test)) # Coerce to a matrix for the multivariate normal calls
## Problem dimensions
KK = length(unique(banknote.training.labels)) # Number of components (2 classes)
p = dim(banknote.training)[2] # Number of features
n = dim(banknote.training)[1] # Size of the training set
m = dim(banknote.test)[1] # Size of the test set
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #Cluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(factor(banknote.training.labels, levels=c("counterfeit","genuine"))), sample(1:KK, m, replace=TRUE, prob=w)) # counterfeit = 1, genuine = 2; test indicators start at random
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
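# Only the indicators of the test observations are sampled; the first n indicators
# (the known training labels) remain fixed, which is what makes the sampler semi-supervised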
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:(n+m)){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + log(ddirichlet(w, aa))
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD, log=TRUE)
logpost[s] = logpost[s] + log(diwish(Sigma[k,,], nu, SS))
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
```
```{r}
#| label: lbl-classification-error-2
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
probcounterfeit = rep(NA, m)
for(i in 1:m){
probcounterfeit[i] = sum(cc.out[-seq(1,burn),n+i]==1)/(rrr-burn)
}
# Confusion matrix using a 0.5 threshold on the posterior probability of "counterfeit"
predicted_class <- ifelse(probcounterfeit > 0.5, "counterfeit", "genuine")
# Compare with the true labels
table(True = banknote.test.labels, Pred = predicted_class)
classification_error_genuine = 0
classification_error_counterfeit = 0
for (i in 1:m){
if (banknote.test.labels[i] == "genuine" && probgenuine[i] < 0.5) {
classification_error_genuine = classification_error_genuine + 1
}
if (banknote.test.labels[i] == "counterfeit" && probcounterfeit[i] < 0.5) {
classification_error_counterfeit = classification_error_counterfeit + 1
}
}
classification_error = (classification_error_genuine + classification_error_counterfeit) / m
print(paste("Genuine error count: ", classification_error_genuine))
print(paste("Genuine error rate: ", classification_error_genuine/m))
print(paste("Counterfeit error count: ", classification_error_counterfeit))
print(paste("Counterfeit error rate: ", classification_error_counterfeit/m))
print(paste("Overall classification error: ", classification_error))
```
```{r}
#| label: lbl-qda-classification
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels, method="mle")
qda_preds = predict(banknote.qda, banknote.test)$class
table(True = banknote.test.labels, Pred = qda_preds)
qda_classification_error_genuine = 0
qda_classification_error_counterfeit = 0
for (i in 1:m){
if (banknote.test.labels[i] == "genuine" && qda_preds[i] == "counterfeit") {
qda_classification_error_genuine = qda_classification_error_genuine + 1
}
if (banknote.test.labels[i] == "counterfeit" && qda_preds[i] == "genuine") {
qda_classification_error_counterfeit = qda_classification_error_counterfeit + 1
}
}
print(paste("QDA genuine error count: ", qda_classification_error_genuine))
print(paste("QDA genuine error rate: ", qda_classification_error_genuine/m))
print(paste("QDA counterfeit error count: ", qda_classification_error_counterfeit))
print(paste("QDA counterfeit error rate: ", qda_classification_error_counterfeit/m))
```

```{r}
#| label: lbl-mcmc-per-class-error-rates
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
predicted_class <- ifelse(probgenuine > 0.5, 2, 1) # 1 = counterfeit, 2 = genuine
labmap <- c("counterfeit", "genuine")
predicted_labels <- labmap[predicted_class]
tab_mcmc <- table(True = banknote.test.labels, Pred = predicted_labels)
# Per-class error rates
genuine_error <- 1 - tab_mcmc["genuine", "genuine"] / sum(tab_mcmc["genuine", ])
counterfeit_error <- 1 - tab_mcmc["counterfeit", "counterfeit"] / sum(tab_mcmc["counterfeit", ])
print(paste("genuine: ", round(100 * genuine_error,2)))
print(paste("counterfeit: ", round(100 * counterfeit_error, 2)))
```
```{r}
#| label: lbl-qda-per-class-error-rates
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels, method="mle")
qda_preds = predict(banknote.qda, banknote.test)$class
tab_qda <- table(True = banknote.test.labels, Pred = qda_preds)
# Per-class error rates
genuine_error <- 1 - tab_qda["genuine", "genuine"] / sum(tab_qda["genuine", ])
counterfeit_error <- 1 - tab_qda["counterfeit", "counterfeit"] / sum(tab_qda["counterfeit", ])
print(paste("genuine: ", round(100 * genuine_error,2)))
print(paste("counterfeit: ", round(100 * counterfeit_error, 2)))
```
:::::