---
title : 'Homework on Bayesian Mixture Models for Classification of Banknotes - M4L7HW3'
subtitle : 'Bayesian Statistics: Mixture Models'
categories:
- Bayesian Statistics
keywords:
- Mixture Models
- Homework
- Density Estimation
- MCMC algorithm
- Location and scale of 2 Gaussians
---
::::: {.content-visible unless-profile="HC"}
::: {.callout-caution}
Section omitted to comply with the Honor Code
:::
:::::
::::: {.content-hidden unless-profile="HC"}
\index{dataset!banknotes }
The data set `banknote` contains six measurements (length of bill, width of left edge, width of right edge, bottom margin width, top margin width, and length of diagonal, all in mm) made on 100 genuine and 100 counterfeit old Swiss 1000-franc bank notes.
::: {.callout-note}
### Instructions
To load the dataset in R, use the command `load("banknoteclassification.Rdata")`, but first make sure that your working directory is set to the directory containing the file. You should see four objects:
- `banknote.training` contains the characteristics of the 30 notes (15 genuine and 15 counterfeit) in the training set.
- `banknote.training.labels` contains the labels ("genuine" or "counterfeit") for the 30 notes in the training set.
- `banknote.test` contains the characteristics of the 170 notes (85 genuine and 85 counterfeit) in the test set.
- `banknote.test.labels` contains the labels ("genuine" or "counterfeit") for the 170 notes in the test set. These are provided only for validation purposes.
You are asked to modify the MCMC algorithm in "Sample code for MCMC example 2" to create an algorithm for semi-supervised classification that is the Bayesian equivalent of that provided under "Sample EM algorithm for classification problems" and apply it to classify the observations contained in the test set. You are then asked to compare your results against those generated by the qda function in R.
As your priors, use distributions in the same families as in "Sample code for MCMC example 2". In particular, use a uniform distribution for the weights, multivariate normal distributions for the means of the components, and inverse Wishart priors for the variance-covariance matrices of the components. The parameters of the priors should be set using the same empirical Bayes approach used in that example.
:::
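For reference, the hierarchical model implied by these instructions (with $K = 2$ components and $p = 6$ features) can be sketched as

$$
\begin{aligned}
\mathbf{x}_i \mid c_i = k &\sim \mathsf{N}_p(\boldsymbol{\mu}_k, \Sigma_k), & \Pr(c_i = k) &= w_k, \\
(w_1, \ldots, w_K) &\sim \mathsf{Dirichlet}(1, \ldots, 1), & \boldsymbol{\mu}_k &\sim \mathsf{N}_p(\mathbf{d}, D), \qquad \Sigma_k \sim \mathsf{IWishart}(\nu, S),
\end{aligned}
$$

where $c_i$ is fixed at its known label for the 30 training notes and treated as an unknown quantity (to be sampled) for the 170 test notes, and $\mathbf{d}$, $D$, $\nu$, $S$ are set by empirical Bayes exactly as in the sample code below.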
::: {.callout-note}
### Grading overview
Reviewers will check that the code has been properly adapted and that the classification results you provide are correct.
:::
::: {.callout-note collapse="true"}
### Sample code for MCMC example 2
```{r}
#| label: lbl-sample-code-for-mcmc-sample-2
#### Example of an MCMC algorithm for fitting a mixtures of K p-variate Gaussian components
#### The algorithm is tested using simulated data
## Clear the environment and load required libraries
rm(list=ls())
library(mvtnorm)
library(ellipse)
library(MCMCpack)
## Generate data from a mixture with 3 components
KK = 3
p = 2
w.true = c(0.5,0.3,0.2) # True weights associated with the components
mu.true = array(0, dim=c(KK,p))
mu.true[1,] = c(0,0) #True mean for the first component
mu.true[2,] = c(5,5) #True mean for the second component
mu.true[3,] = c(-3,7) #True mean for the third component
Sigma.true = array(0, dim=c(KK,p,p))
Sigma.true[1,,] = matrix(c(1,0,0,1),p,p) #True variance for the first component
Sigma.true[2,,] = matrix(c(2,0.9,0.9,1),p,p) #True variance for the second component
Sigma.true[3,,] = matrix(c(1,-0.9,-0.9,4),p,p) #True variance for the third component
set.seed(63252) #Keep seed the same so that we can reproduce results
n = 120
cc.true = sample(1:3, n, replace=T, prob=w.true)
x = array(0, dim=c(n,p))
for(i in 1:n){
x[i,] = rmvnorm(1, mu.true[cc.true[i],], Sigma.true[cc.true[i],,])
}
par(mfrow=c(1,1))
plot(x[,1], x[,2], col=cc.true, xlab=expression(x[1]), ylab=expression(x[2]), type="n")
text(x[,1], x[,2], seq(1,n), col=cc.true, cex=0.6)
for(k in 1:KK){
lines(ellipse(x=Sigma.true[k,,], centre=mu.true[k,], level=0.50), col="grey", lty=2)
lines(ellipse(x=Sigma.true[k,,], centre=mu.true[k,], level=0.82), col="grey", lty=2)
lines(ellipse(x=Sigma.true[k,,], centre=mu.true[k,], level=0.95), col="grey", lty=2)
}
title(main="Data + True Components")
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #Cluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
Sigma[3,,] = var(x)/KK
cc = sample(1:KK, n, replace=TRUE, prob=w)
par(mfrow=c(1,1))
plot(x[,1], x[,2], col=cc.true, xlab=expression(x[1]), ylab=expression(x[2]))
for(k in 1:KK){
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.50), col="grey", lty=2, lwd=2)
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.82), col="grey", lty=2, lwd=2)
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.95), col="grey", lty=2, lwd=2)
}
title(main="Initial estimate + Observations")
# Priors
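# (Empirical Bayes choices: dd is the sample mean, DD is an inflated (10x) sample
# covariance, nu = p is a weakly informative degrees-of-freedom choice, and SS is
# the sample covariance divided by the number of components)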
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p
SS = var(x)/3
# Number of iteration of the sampler
rrr = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in 1:n){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + mvtnorm::dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v))) #Go from log weights to actual weights in a numerically stable manner
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:n){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + mvtnorm::dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + log(ddirichlet(w, aa))
for(k in 1:KK){
logpost[s] = logpost[s] + mvtnorm::dmvnorm(mu[k,], dd, DD, log=TRUE)
logpost[s] = logpost[s] + log(diwish(Sigma[k,,], nu, SS))
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## Plot the logposterior distribution for various samples
par(mfrow=c(1,1))
par(mar=c(4,4,1,1)+0.1)
plot(logpost, type="l", xlab="Iterations", ylab="Log posterior")
## Plot the density estimate for the last iteration of the MCMC
par(mfrow=c(1,1))
par(mar=c(4,4,2,1)+0.1)
plot(x[,1], x[,2], col=cc.true, main=paste("s =",s," logpost =", round(logpost[s],4)), xlab=expression(x[1]), ylab=expression(x[2]))
for(k in 1:KK){
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.50), col="grey", lty=2, lwd=2)
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.82), col="grey", lty=2, lwd=2)
lines(ellipse(x=Sigma[k,,], centre=mu[k,], level=0.95), col="grey", lty=2, lwd=2)
}
```
Sample code for classification of the wine dataset using a mixture model fitted with the EM algorithm (the "Sample EM algorithm for classification problems" referenced in the instructions):
```{r}
#| label: lbl-sample-code-for-wine-dataset
## Using mixture models for classification in the wine dataset
## Compare linear and quadratic discriminant analysis and a
## (semi-supervised) location and scale mixture model with K normals
## Comparing only against the EM algorithm
# Semi-supervised, quadratic discriminant analysis
### Loading data and setting up global variables
library(MASS)
library(mvtnorm)
wine.training = read.table("data/wine_training.txt", sep=",", header=TRUE)
wine.test = read.table("data/wine_test.txt", sep=",", header=TRUE)
n = dim(wine.training)[1] # Size of the training set
m = dim(wine.test)[1] # Size of the test set
x = rbind(as.matrix(wine.training[,-1]), as.matrix(wine.test[,-1])) # Create dataset of observations, first n belong to the training set, and the rest belong to the test set
p = dim(x)[2] # Number of features
KK = 3
epsilon = 0.00001
par(mfrow=c(1,1))
par(mar=c(2,2,2,2)+0.1)
colscale = c("black","red","blue")
pairs(wine.training[,-1], col=colscale[wine.training[,1]], pch=wine.training[,1])
# Initialize the parameters of the algorithm
set.seed(63252)
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #Cluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
Sigma[3,,] = var(x)/KK
sw = FALSE
KL = -Inf
KL.out = NULL
s = 0
while(!sw){
## E step
v = array(0, dim=c(n+m,KK))
for(k in 1:KK){ #Compute the log of the weights
v[1:n,k] = ifelse(wine.training[,1]==k,0,-Inf) # Training set
v[(n+1):(n+m),k] = log(w[k]) + mvtnorm::dmvnorm(x[(n+1):(n+m),], mu[k,], Sigma[k,,],log=TRUE) # Test set
}
for(i in 1:(n+m)){
v[i,] = exp(v[i,] - max(v[i,]))/sum(exp(v[i,] - max(v[i,]))) #Go from logs to actual weights in a numerically stable manner
}
## M step
w = apply(v,2,mean)
mu = matrix(0, nrow=KK, ncol=p)
for(k in 1:KK){
for(i in 1:(n+m)){
mu[k,] = mu[k,] + v[i,k]*x[i,]
}
mu[k,] = mu[k,]/sum(v[,k])
}
Sigma = array(0,dim=c(KK,p,p))
for(k in 1:KK){
for(i in 1:(n+m)){
Sigma[k,,] = Sigma[k,,] + v[i,k]*(x[i,] - mu[k,])%*%t(x[i,] - mu[k,])
}
Sigma[k,,] = Sigma[k,,]/sum(v[,k])
}
##Check convergence
KLn = 0
for(i in 1:(n+m)){
for(k in 1:KK){
KLn = KLn + v[i,k]*(log(w[k]) + mvtnorm::dmvnorm(x[i,],mu[k,],Sigma[k,,],log=TRUE))
}
}
if(abs(KLn-KL)/abs(KLn)<epsilon){
sw=TRUE
}
KL = KLn
KL.out = c(KL.out, KL)
s = s + 1
print(paste(s, KLn))
}
## Predicted labels
apply(v, 1, which.max)[(n+1):(n+m)]
## True labels
wine.test[,1]
## Comparison
apply(v, 1, which.max)[(n+1):(n+m)] == wine.test[,1]
sum(!(apply(v, 1, which.max)[(n+1):(n+m)] == wine.test[,1])) # One error
# Using the qda and lda functions in R
# qda
modqda = qda(grouping=wine.training[,1], x=wine.training[,-1], method="mle")
ccpredqda = predict(modqda,newdata=wine.test[,-1])
sum(!(ccpredqda$class == wine.test[,1])) # One error
# lda
modlda = lda(grouping=wine.training[,1], x=wine.training[,-1], method="mle")
ccpredlda = predict(modlda,newdata=wine.test[,-1])
sum(!(ccpredlda$class == wine.test[,1])) # No errors!!!
```
:::
## Solution
```{r}
#| label: lbl-loading-data-and-libraries
library(MASS)
library(mvtnorm)
library(MCMCpack)
load("data/banknoteclassification.Rdata")
```
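A quick sanity check of the loaded objects (the object names are those listed in the instructions):

```{r}
#| label: lbl-inspect-banknote-data
# Dimensions of the training and test sets, and label counts in each
dim(banknote.training)
dim(banknote.test)
table(banknote.training.labels)
table(banknote.test.labels)
```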
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
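Because all of the priors are conditionally conjugate, the sampler is a plain Gibbs sampler. As a sketch of the full conditionals it draws from (with $m_k$ the number of observations currently assigned to component $k$ and $\bar{\mathbf{x}}_k$ their sample mean):

$$
\begin{aligned}
\Pr(c_i = k \mid \cdots) &\propto w_k \, \mathsf{N}_p(\mathbf{x}_i \mid \boldsymbol{\mu}_k, \Sigma_k) && \text{(test observations only),}\\
\mathbf{w} \mid \cdots &\sim \mathsf{Dirichlet}(a_1 + m_1, \ldots, a_K + m_K), \\
\boldsymbol{\mu}_k \mid \cdots &\sim \mathsf{N}_p\!\left(\mathbf{d}_k^*, D_k^*\right), \quad
D_k^* = \left(m_k \Sigma_k^{-1} + D^{-1}\right)^{-1}, \quad
\mathbf{d}_k^* = D_k^*\!\left(\Sigma_k^{-1} m_k \bar{\mathbf{x}}_k + D^{-1}\mathbf{d}\right), \\
\Sigma_k \mid \cdots &\sim \mathsf{IWishart}\!\left(\nu + m_k,\; S + \textstyle\sum_{i:\,c_i = k} (\mathbf{x}_i - \boldsymbol{\mu}_k)(\mathbf{x}_i - \boldsymbol{\mu}_k)^\top\right).
\end{aligned}
$$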
```{r}
#| label: lbl-mcmc-banknote-classification
# Combine data for semi-supervised setup
n = nrow(banknote.training)
m = nrow(banknote.test)
x = as.matrix(rbind(banknote.training, banknote.test)) # Coerce to a matrix so dmvnorm gets numeric rows
p = ncol(banknote.training)
K = 2 # Two classes: genuine/counterfeit
# Convert labels to numeric codes (1 = genuine, 2 = counterfeit); index the map by
# character, not by the factor itself, since subsetting by a factor uses its integer codes
label_map = setNames(1:2, c("genuine", "counterfeit"))
train_labels = label_map[as.character(banknote.training.labels)]
test_labels = label_map[as.character(banknote.test.labels)]
# Priors from empirical Bayes
aa = rep(1, K)
dd = colMeans(x)
DD = 10 * var(x)
nu = p
SS = var(x) / 3
# Initial values
w = rep(1, K) / K
mu = rmvnorm(K, dd, var(x))
Sigma = array(0, dim=c(K, p, p))
for (k in 1:K) Sigma[k,,] = var(x) / K
cc = c(train_labels, sample(1:K, m, replace=TRUE))
# MCMC settings
iters = 1000
burn = 500
cc.out = array(0, dim=c(iters, n+m))
for (s in 1:iters) {
# 1. Sample class indicators (cc)
# - Fixed for training set, update for test set
for (i in (n+1):(n+m)) {
logp = sapply(1:K, function(k)
log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE))
p_i = exp(logp - max(logp)); p_i = p_i / sum(p_i)
cc[i] = sample(1:K, 1, prob=p_i)
}
# 2. Update weights
w = as.vector(rdirichlet(1, aa + tabulate(cc, nbins=K)))
# 3. Update means
for (k in 1:K) {
idx = which(cc == k)
mk = length(idx)
DD_st = solve(mk * solve(Sigma[k,,]) + solve(DD))
dd_st = DD_st %*% (solve(Sigma[k,,]) %*% colSums(x[idx,,drop=FALSE]) + solve(DD) %*% dd)
mu[k,] = as.vector(rmvnorm(1, dd_st, DD_st))
}
# 4. Update covariances
for (k in 1:K) {
idx = which(cc == k)
mk = length(idx)
xcensumk = matrix(0, p, p)
for (i in idx) {
d = x[i,] - mu[k,]
xcensumk = xcensumk + d %*% t(d)
}
Sigma[k,,] = riwish(nu + mk, SS + xcensumk)
}
cc.out[s,] = cc
}
```
What is the classification error for the test set?
- [ ] Is the classification error for the "genuine" class generated by the algorithm correct? It should be 0.
- [ ] Is the classification error for the "counterfeit" class generated by the algorithm correct? It should be 0.
```{r}
#| label: lbl-classification-error
# Posterior prediction: for each test sample, majority vote of assignments
cc.test.posterior = cc.out[burn:iters, (n+1):(n+m), drop=FALSE]
cc.test.pred = apply(cc.test.posterior, 2, function(z) as.integer(names(which.max(table(z)))))
classification_error = mean(cc.test.pred != test_labels)
cat("MCMC Test Classification Error:", classification_error, "\n")
```
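A per-class breakdown (confusion matrix) addresses the checklist items above directly:

```{r}
#| label: lbl-mcmc-confusion
# Rows are the true labels, columns are the MCMC predictions mapped back to label names
table(True = banknote.test.labels, Pred = names(label_map)[cc.test.pred])
```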
- [ ] If the R function `qda` (which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
- [ ] Is the classification error for the "genuine" class generated by the algorithm correct? It should be 0 out of 85.
- [ ] Is the classification error for the "counterfeit" class generated by the algorithm correct? It should be 3 out of 85, i.e. about 3.5%.
```{r}
#| label: lbl-qda-classification-error
mod_qda = qda(x=banknote.training, grouping=banknote.training.labels, method="mle")
qda_pred = predict(mod_qda, banknote.test)$class
qda_error = mean(qda_pred != banknote.test.labels)
cat("QDA Test Classification Error:", qda_error, "\n")
```
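And the corresponding per-class breakdown for `qda`:

```{r}
#| label: lbl-qda-confusion
# Rows are the true labels, columns are the qda predictions
table(True = banknote.test.labels, Pred = qda_pred)
```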
::: {.callout-note collapse="true"}
## Solution code
```{r}
#| label: lbl-solution-code
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
load("data/banknoteclassification.Rdata")
#load("banknoteclassification.Rdata")
x = as.matrix(rbind(banknote.training, banknote.test)) # Coerce to a matrix for the multivariate normal calls
## Problem dimensions
KK = length(unique(banknote.training.labels)) # Number of components (2 classes)
p = dim(banknote.training)[2] # Number of features
n = dim(banknote.training)[1] # Size of the training set
m = dim(banknote.test)[1] # Size of the test set
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #Cluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(factor(banknote.training.labels, levels=c("counterfeit","genuine"))), sample(1:KK, m, replace=TRUE, prob=w)) # counterfeit = 1, genuine = 2; test indicators start at random
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
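# Only the indicators of the test observations are sampled; the first n indicators
# (the known training labels) remain fixed, which is what makes the sampler semi-supervised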
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:(n+m)){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + log(ddirichlet(w, aa))
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD, log=TRUE)
logpost[s] = logpost[s] + log(diwish(Sigma[k,,], nu, SS))
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
```
```{r}
#| label: lbl-classification-error-2
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
probcounterfeit = rep(NA, m)
for(i in 1:m){
probcounterfeit[i] = sum(cc.out[-seq(1,burn),n+i]==1)/(rrr-burn)
}
# Confusion matrix using a 0.5 threshold on the posterior probability of "counterfeit"
predicted_class <- ifelse(probcounterfeit > 0.5, "counterfeit", "genuine")
# Compare with the true labels
table(True = banknote.test.labels, Pred = predicted_class)
classification_error_genuine = 0
classification_error_counterfeit = 0
for (i in 1:m){
if (banknote.test.labels[i] == "genuine" && probgenuine[i] < 0.5) {
classification_error_genuine = classification_error_genuine + 1
}
if (banknote.test.labels[i] == "counterfeit" && probcounterfeit[i] < 0.5) {
classification_error_counterfeit = classification_error_counterfeit + 1
}
}
classification_error = (classification_error_genuine + classification_error_counterfeit) / m
print(paste("Genuine error count: ", classification_error_genuine))
print(paste("Genuine error rate: ", classification_error_genuine/m))
print(paste("Counterfeit error count: ", classification_error_counterfeit))
print(paste("Counterfeit error rate: ", classification_error_counterfeit/m))
print(paste("Overall classification error: ", classification_error))
```
```{r}
#| label: lbl-qda-classification
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels, method="mle")
qda_preds = predict(banknote.qda, banknote.test)$class
table(True = banknote.test.labels, Pred = qda_preds)
qda_classification_error_genuine = 0
qda_classification_error_counterfeit = 0
for (i in 1:m){
if (banknote.test.labels[i] == "genuine" && qda_preds[i] == "counterfeit") {
qda_classification_error_genuine = qda_classification_error_genuine + 1
}
if (banknote.test.labels[i] == "counterfeit" && qda_preds[i] == "genuine") {
qda_classification_error_counterfeit = qda_classification_error_counterfeit + 1
}
}
print(paste("QDA genuine error count: ", qda_classification_error_genuine))
print(paste("QDA genuine error rate: ", qda_classification_error_genuine/m))
print(paste("QDA counterfeit error count: ", qda_classification_error_counterfeit))
print(paste("QDA counterfeit error rate: ", qda_classification_error_counterfeit/m))
```

```{r}
#| label: lbl-mcmc-per-class-error-rates
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
predicted_class <- ifelse(probgenuine > 0.5, 2, 1) # 1 = counterfeit, 2 = genuine
labmap <- c("counterfeit", "genuine")
predicted_labels <- labmap[predicted_class]
tab_mcmc <- table(True = banknote.test.labels, Pred = predicted_labels)
# Per-class error rates
genuine_error <- 1 - tab_mcmc["genuine", "genuine"] / sum(tab_mcmc["genuine", ])
counterfeit_error <- 1 - tab_mcmc["counterfeit", "counterfeit"] / sum(tab_mcmc["counterfeit", ])
print(paste("genuine: ", round(100 * genuine_error,2)))
print(paste("counterfeit: ", round(100 * counterfeit_error, 2)))
```
```{r}
#| label: lbl-qda-per-class-error-rates
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels, method="mle")
qda_preds = predict(banknote.qda, banknote.test)$class
tab_qda <- table(True = banknote.test.labels, Pred = qda_preds)
# Per-class error rates
genuine_error <- 1 - tab_qda["genuine", "genuine"] / sum(tab_qda["genuine", ])
counterfeit_error <- 1 - tab_qda["counterfeit", "counterfeit"] / sum(tab_qda["counterfeit", ])
print(paste("genuine: ", round(100 * genuine_error,2)))
print(paste("counterfeit: ", round(100 * counterfeit_error, 2)))
```
:::::