The issue lies in how the computer represents numbers. With only a finite number of bits available, quantities that are very large or very small overflow or underflow, and calculations that combine them lose precision or return outright wrong answers (0, Inf, or NaN). For example, a normal density evaluated 50 standard deviations from the mean is roughly 10^-543, far below the smallest positive double-precision number (about 10^-308), so it evaluates to exactly zero.
The solution is to use logarithmic transformations. On the log scale, extremely small or large numbers become moderately sized ones, and subtracting the maximum log value before exponentiating guarantees that the largest term is exp(0) = 1, so the subsequent normalization is numerically stable.
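Concretely, the pattern used throughout the code below is to keep every quantity on the log scale, subtract the maximum, and only then exponentiate and normalize. A minimal sketch (the helper name `log_normalize` is ours, not part of the course code):

```r
## Stable normalization of values given on the log scale (hypothetical helper)
log_normalize = function(lw){
  b = max(lw)        # shifting by the max makes the largest term exp(0) = 1
  w = exp(lw - b)    # no overflow; tiny terms may underflow harmlessly to 0
  w/sum(w)
}
log_normalize(dnorm(50, 0:1, 1, log=TRUE))   # ~ c(3.18e-22, 1)
```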
79.1.2 Sample code to illustrate numerical stability issues 🗒️
```r
## Consider a mixture of two normal distributions with equal weights (w1 = w2 = 1/2)
## Component 1 has mean 0 and standard deviation 1
## Component 2 has mean 1 and standard deviation 1
## The observation is x = 50
## What is Pr(c = 1 | x)?
dnorm(50, 0, 1)                                       # underflows to 0
dnorm(50, 1, 1)                                       # underflows to 0
dnorm(50, 0, 1)/(dnorm(50, 0, 1) + dnorm(50, 1, 1))   # 0/0 = NaN
```
```r
## What if x = 3? Two ways to do the calculation
## One way: Direct calculation
z1 = dnorm(3, 0, 1)
z2 = dnorm(3, 1, 1)
z1/(z1 + z2)
```

```
[1] 0.07585818
```
```r
## A second way: Compute in the logarithm scale, add b
## to all values, and then exponentiate before standardizing
lz1 = dnorm(3, 0, 1, log=TRUE)
lz2 = dnorm(3, 1, 1, log=TRUE)
b = 3
exp(lz1 + b)/(exp(lz1 + b) + exp(lz2 + b))
```

```
[1] 0.07585818
```
```r
## Going back to the case x = 50. Here the shift must be tied to the
## values themselves (b = max), and the logs must be computed directly.
## Wrong: dnorm(50, 0, 1) underflows to 0, so its log is -Inf
lz1 = log(dnorm(50, 0, 1))
lz2 = log(dnorm(50, 1, 1))
b = max(lz1, lz2)
exp(lz1 - b)/(exp(lz1 - b) + exp(lz2 - b))

## Also wrong: exp(-0.5*50^2) already underflows before the log is taken
lz1 = log(exp(-0.5*50^2)/sqrt(2*pi))
lz2 = log(exp(-0.5*49^2)/sqrt(2*pi))
b = max(lz1, lz2)
exp(lz1 - b)/(exp(lz1 - b) + exp(lz2 - b))

## Right: compute the log density directly
lz1 = dnorm(50, 0, 1, log=TRUE)
lz2 = dnorm(50, 1, 1, log=TRUE)
b = max(lz1, lz2)
exp(lz1 - b)/(exp(lz1 - b) + exp(lz2 - b))

## Also right (just more cumbersome): write out the log density by hand
lz1 = -0.5*log(2*pi) - 0.5*50^2
lz2 = -0.5*log(2*pi) - 0.5*49^2
b = max(lz1, lz2)
exp(lz1 - b)/(exp(lz1 - b) + exp(lz2 - b))
```

```
[1] 3.179971e-22
```
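As a sanity check, the two log densities differ by lz1 - lz2 = -0.5(50^2 - 49^2) = -49.5, so the posterior probability can be computed directly:

```r
## Analytic check of the result above
1/(1 + exp(49.5))   # Pr(c = 1 | x = 50)
```

```
[1] 3.179971e-22
```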
79.1.3 Computational issues associated with multimodality 🎥
Figure 79.2: multimodality issues
79.1.4 Sample code to illustrate multimodality issues 1 🗒️
```r
## Illustrating the fact that the likelihood for a mixture model is multimodal
### Loading data and setting up global variables
rm(list=ls())
library(mclust)
library(mvtnorm)
```

```
Package 'mclust' version 6.1.1
Type 'citation("mclust")' for citing this R package in publications.

Attaching package: 'mvtnorm'

The following object is masked from 'package:mclust':

    dmvnorm
```
```r
### Defining a custom function to create pair plots
### This is an alternative to the R function pairs() that allows for
### more flexibility. In particular, it allows us to use text to label
### the points
pairs2 = function(x, col="black", pch=16, labels=NULL, names=colnames(x)){
  n = dim(x)[1]
  p = dim(x)[2]
  par(mfrow=c(p,p))
  for(k in 1:p){
    for(l in 1:p){
      if(k!=l){
        par(mar=c(3,3,1,1)+0.1)
        plot(x[,k], x[,l], type="n", xlab="", ylab="")
        if(is.null(labels)){
          points(x[,k], x[,l], pch=pch, col=col)
        }else{
          text(x[,k], x[,l], labels=labels, col=col)
        }
      }else{
        plot(seq(0,5), seq(0,5), type="n", xlab="", ylab="", axes=FALSE)
        text(2.5, 2.5, names[k], cex=1.2)
      }
    }
  }
}

## Setup data
data(iris)
x = as.matrix(iris[,-5])
n = dim(x)[1]
p = dim(x)[2]        # Number of features
KK = 3
epsilon = 0.0000001
par(mfrow=c(1,1))
par(mar=c(4,4,1,1))
colscale = c("black","blue","red")
shortnam = c("s","c","g")

# Initialize the parameters of the algorithm
set.seed(63252)
numruns = 15
v.sum = array(0, dim=c(numruns, n, KK))
QQ.sum = rep(0, numruns)

for(ss in 1:numruns){
  w = rep(1,KK)/KK   # Assign equal weight to each component to start with
  #mu = as.matrix(aggregate(x, list(iris[,5]), mean)[,2:5])   # Initialize in the true values
  #Sigma = array(0, dim=c(KK,p,p))   # Initial variances are assumed to be the same
  #Sigma[1,,] = var(x[iris[,5]=="setosa",])
  #Sigma[2,,] = var(x[iris[,5]=="versicolor",])
  #Sigma[3,,] = var(x[iris[,5]=="virginica",])
  mu = rmvnorm(KK, apply(x,2,mean), 3*var(x))   # Cluster centers randomly spread over the support of the data
  Sigma = array(0, dim=c(KK,p,p))   # Initial variances are assumed to be the same
  Sigma[1,,] = var(x)
  Sigma[2,,] = var(x)
  Sigma[3,,] = var(x)

  sw = FALSE
  QQ = -Inf
  QQ.out = NULL
  s = 0

  while(!sw){
    ## E step
    v = array(0, dim=c(n,KK))
    for(k in 1:KK){   # Compute the log of the weights
      v[,k] = log(w[k]) + dmvnorm(x, mu[k,], Sigma[k,,], log=TRUE)
    }
    for(i in 1:n){
      v[i,] = exp(v[i,] - max(v[i,]))/sum(exp(v[i,] - max(v[i,])))   # Go from logs to actual weights in a numerically stable manner
    }

    ## M step
    w = apply(v,2,mean)
    mu = matrix(0, nrow=KK, ncol=p)
    for(k in 1:KK){
      for(i in 1:n){
        mu[k,] = mu[k,] + v[i,k]*x[i,]
      }
      mu[k,] = mu[k,]/sum(v[,k])
    }
    Sigma = array(0, dim=c(KK, p, p))
    for(k in 1:KK){
      for(i in 1:n){
        Sigma[k,,] = Sigma[k,,] + v[i,k]*(x[i,] - mu[k,])%*%t(x[i,] - mu[k,])
      }
      Sigma[k,,] = Sigma[k,,]/sum(v[,k])
    }

    ## Check convergence
    QQn = 0
    for(i in 1:n){
      for(k in 1:KK){
        QQn = QQn + v[i,k]*(log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE))
      }
    }
    if(abs(QQn-QQ)/abs(QQn) < epsilon){
      sw = TRUE
    }
    QQ = QQn
    QQ.out = c(QQ.out, QQ)
    s = s + 1
  }

  v.sum[ss,,] = v
  QQ.sum[ss] = QQ.out[s]
  print(paste("ss =", ss))
}
```
```r
## Boxplot of final values of the Q function for all runs of the algorithm
par(mfrow=c(1,1))
par(mar=c(4,4,1,1))
boxplot(QQ.sum, ylab="Q", las=2)   # QQ.sum holds the final Q value of each run
```
```r
## Graphical representation of the best solution
cc = apply(v.sum[which.max(QQ.sum),,], 1, which.max)
colscale = c("black","blue","red")
#pairs(x, col=colscale[cc], pch=cc)
pairs2(x, col=colscale[cc], labels=cc)
```
```r
## Graphical representation of the worst solution
cc = apply(v.sum[which.min(QQ.sum),,], 1, which.max)
colscale = c("black","blue","red")
#pairs(x, col=colscale[cc], pch=cc)
pairs2(x, col=colscale[cc], labels=cc)
```
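As a cross-check (not part of the original course code), one might compare the best run's assignments against mclust's own EM fit; this sketch assumes the standard `Mclust()` interface and reuses `x` and `cc` from above:

```r
## Hypothetical cross-check against mclust's built-in EM
fit = Mclust(x, G=3, modelNames="VVV")   # 3 components, unconstrained covariances
table(cc, fit$classification)            # compare hard assignments
fit$loglik                               # maximized log-likelihood
```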
79.1.5 Sample code to illustrate multimodality issues 2 🗒️
This code fails for numerical reasons rather than converging: one of the components becomes “numerically empty”, meaning that no data point is assigned any appreciable probability of belonging to it. The M-step updates for that component then degenerate (a division of zero by zero), which breaks the convergence check. This can happen when the initial values for the means are too far from the data or when the data are not well separated; a minimal sketch after the error output below illustrates the mechanism.
```r
## Illustrating that the EM might fail for numerical reasons if a component is "numerically empty"
### Loading data and setting up global variables
rm(list=ls())
library(mclust)
library(mvtnorm)
library(ellipse)
```

```
Attaching package: 'ellipse'

The following object is masked from 'package:graphics':

    pairs
```
```r
## Setup data
data(iris)
x = as.matrix(iris[,-5])
n = dim(x)[1]
p = dim(x)[2]        # Number of features
KK = 3
epsilon = 0.00000001

# Initialize the parameters of the algorithm
w = rep(1,KK)/KK     # Assign equal weight to each component to start with
mu = matrix(0, KK, p)   # Means: one at the data mean, two pushed far away from the data
mu[1,] = apply(x, 2, mean)
mu[2,] = apply(x, 2, mean) + c(2.2, 2.2, 2.2, 2.2)
mu[3,] = apply(x, 2, mean) + c(-2.2, -2.2, -2.2, -2.2)
Sigma = array(0, dim=c(KK,p,p))   # Initial variances are assumed to be the same
Sigma[1,,] = var(x)/3
Sigma[2,,] = var(x)/3
Sigma[3,,] = var(x)/3

# Plot the data along with the estimates of the components
colscale = c("black","blue","red")
par(mfrow=c(p,p))
for(k in 1:p){
  for(l in 1:p){
    if(k!=l){
      par(mar=c(3,3,1,1)+0.1)
      plot(x[,k], x[,l], type="n", xlab="", ylab="",
           xlim=c(min(c(x[,k], mu[,k])), max(c(x[,k], mu[,k]))),
           ylim=c(min(c(x[,l], mu[,l])), max(c(x[,l], mu[,l]))))
      for(r in 1:KK){
        lines(ellipse(x=Sigma[r,c(k,l),c(k,l)], centre=mu[r,c(k,l)], level=0.50),
              col="gold1", lty=1, lwd=1)
        lines(ellipse(x=Sigma[r,c(k,l),c(k,l)], centre=mu[r,c(k,l)], level=0.82),
              col="gold1", lty=1, lwd=1)
        lines(ellipse(x=Sigma[r,c(k,l),c(k,l)], centre=mu[r,c(k,l)], level=0.95),
              col="gold1", lty=1, lwd=1)
      }
      text(x[,k], x[,l], labels=as.numeric(iris[,5]), col=colscale[iris[,5]])
      points(mu[,k], mu[,l], pch=19, col="gold1", cex=2)
    }else{
      plot(seq(0,5), seq(0,5), type="n", xlab="", ylab="", axes=FALSE)
      text(2.5, 2.5, colnames(x)[k], cex=1.5)
    }
  }
}
```
```r
## Run the EM algorithm. It will fail in the first iterations
sw = FALSE
QQ = -Inf
QQ.out = NULL
s = 0

while(!sw){
  ## E step
  v = array(0, dim=c(n,KK))
  for(k in 1:KK){   # Compute the log of the weights
    v[,k] = log(w[k]) + dmvnorm(x, mu[k,], Sigma[k,,], log=TRUE)
  }
  for(i in 1:n){
    v[i,] = exp(v[i,] - max(v[i,]))/sum(exp(v[i,] - max(v[i,])))   # Go from logs to actual weights in a numerically stable manner
  }

  ## M step
  w = apply(v,2,mean)
  mu = matrix(0, nrow=KK, ncol=p)
  for(k in 1:KK){
    for(i in 1:n){
      mu[k,] = mu[k,] + v[i,k]*x[i,]
    }
    mu[k,] = mu[k,]/sum(v[,k])
  }
  Sigma = array(0, dim=c(KK, p, p))
  for(k in 1:KK){
    for(i in 1:n){
      Sigma[k,,] = Sigma[k,,] + v[i,k]*(x[i,] - mu[k,])%*%t(x[i,] - mu[k,])
    }
    Sigma[k,,] = Sigma[k,,]/sum(v[,k])
  }

  ## Check convergence
  QQn = 0
  for(i in 1:n){
    for(k in 1:KK){
      QQn = QQn + v[i,k]*(log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE))
    }
  }
  if(abs(QQn-QQ)/abs(QQn) < epsilon){
    sw = TRUE
  }
  QQ = QQn
  QQ.out = c(QQ.out, QQ)
  s = s + 1
}
```
```
Error in if (abs(QQn - QQ)/abs(QQn) < epsilon) {: missing value where TRUE/FALSE needed
```
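The error arises because the responsibilities for the empty component underflow to zero, so its weight becomes 0, log(w[k]) becomes -Inf, and the resulting NaNs propagate into the Q function, where if() cannot evaluate NaN < epsilon. A minimal sketch with made-up numbers (not the course code) reproduces the mechanism:

```r
## Failure mechanism in miniature (made-up values)
v_k = rep(0, 5)                               # responsibilities of an "empty" component
mu_k = sum(v_k*c(1, 2, 3, 4, 5))/sum(v_k)     # M-step update: 0/0 = NaN
mu_k
QQn = v_k[1]*log(sum(v_k))                    # 0 * (-Inf) = NaN, as in the Q computation
# if(abs(QQn) < 1e-8){...}                    # errors: missing value where TRUE/FALSE needed
```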