# Simulate a two-cluster data set in two dimensions: n1 points for the first
# cluster followed by n2 points for the second, stacked row-wise into X.
n1 <- 100
n2 <- 400
Ca <- cbind(
  rnorm(n1, mean = 5, sd = 0.5),
  rnorm(n1, mean = 4, sd = 0.5)
)
Cb <- cbind(
  rnorm(n2, mean = 14, sd = 1),
  rnorm(n2, mean = 7, sd = 1)
)
X <- rbind(Ca, Cb)

# Show the first rows and the size of the data set.
cat(crayon::bold(" Glimpse of Dataset X: \n"))
print(X[1:5, ])
cat(
  crayon::bold("\n Dimension of Dataset: "),
  " Samples:", nrow(X),
  "\t Features:", ncol(X)
)

# Ground-truth labels (1 for the first n1 rows, 2 for the remaining n2 rows);
# they also pick the fill colour and plotting symbol of every point.
true <- rep(c(1, 2), times = c(n1, n2))
colvec <- c("coral3", "darkseagreen3")[true]
pchs <- c(22, 24)[true]
options(repr.plot.width = 5, repr.plot.height = 5)
plot(
  X,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Feature1", ylab = "Feature2",
  main = "Scatter plot of Data"
)
n <- nrow(X)
$\mathbf{W}_{ij} = \exp\left(-\frac{\parallel x_i -x_j \parallel^2}{2 \sigma^2}\right)$
# Gaussian (RBF) similarity matrix between the rows of a data matrix.
#
# Dmat  : numeric matrix with one sample per row.
# rbfsg : kernel width sigma (default 2).
#
# Returns an n x n symmetric matrix W with
#   W[i, j] = exp(-||x_i - x_j||^2 / (2 * rbfsg^2)),
# where n is the number of rows of Dmat. A zero-row input yields a 0 x 0
# matrix.
Similarity <- function(Dmat, rbfsg = 2) {
  n <- nrow(Dmat)
  W <- matrix(0, n, n)
  # The denominator does not depend on the pair, so compute it once.
  two_sigma_sq <- 2 * rbfsg * rbfsg
  # seq_len() leaves the loops empty when n is 0 (1:n would count 1, 0).
  for (i in seq_len(n)) {
    # Only the lower triangle is visited; the value is mirrored to [j, i].
    for (j in seq_len(i)) {
      W[i, j] <- W[j, i] <- exp(-sum((Dmat[i, ] - Dmat[j, ])^2) / two_sigma_sq)
    }
  }
  W
}
# Build the RBF similarity matrix of X and print two 5 x 5 windows of it.
W <- Similarity(X)
cat(crayon::bold("\nDimension of W: "), nrow(W), "x", ncol(W))

# Window over rows 1-5 and columns 1-5.
rs <- 1
re <- 5
cs <- 1
ce <- 5
cat(crayon::bold("\nPairwise similarity W[", rs, ":", re, ",", cs, ":", ce, "]\n\n"))
print(W[rs:re, cs:ce])

# Window over rows 1-5 and columns 98-102.
rs <- 1
re <- 5
cs <- 98
ce <- 102
cat(crayon::bold("\nPairwise similarity W[", rs, ":", re, ",", cs, ":", ce, "]\n\n"))
print(W[rs:re, cs:ce])

# Heatmap of the full matrix, without dendrograms or row/column labels.
library(RColorBrewer)
library(repr)
options(repr.plot.width = 3, repr.plot.height = 3)
cat(crayon::bold("\nHeatmap of similarity matrix:\n"))
heatmap(
  W,
  Colv = NA, Rowv = NA,
  labRow = FALSE, labCol = FALSE,
  margins = c(0.2, 0.2),
  col = brewer.pal(9, "Oranges")
)
$\mathbf{L} = \mathbf{D}^{-\frac{1}{2}} (\mathbf{D} - \mathbf{W}) \mathbf{D}^{-\frac{1}{2}} = \mathbf{I}_n - \mathbf{D}^{-\frac{1}{2}} \mathbf{W} \mathbf{D}^{-\frac{1}{2}}$
# Symmetric normalized graph Laplacian L = I - D^{-1/2} W D^{-1/2}.
#
# W : square similarity (adjacency) matrix.
#
# Returns a list with
#   L  : the normalized Laplacian,
#   D  : the vector of degrees (row sums of W),
#   WN : the normalized similarity D^{-1/2} W D^{-1/2}.
#
# NOTE: a row of W that sums to zero gives an infinite D^(-0.5) entry, so the
# result contains non-finite values for such input.
Laplacian <- function(W) {
  n <- nrow(W)
  D <- rowSums(W)
  # nrow = n is needed for a 1 x 1 input: diag() called with only a
  # length-one vector treats the value as a matrix size, not as the
  # diagonal entry.
  DH <- diag(D^(-0.5), nrow = n)
  WN <- DH %*% W %*% DH
  L <- diag(n) - WN
  list(L = L, D = D, WN = WN)
}
# Normalized Laplacian of the similarity graph, with two 5 x 5 windows printed.
L <- Laplacian(W)[["L"]]
cat(crayon::bold("\nDimension of L: "), nrow(L), "x", ncol(L))

# Window over rows 1-5 and columns 1-5.
rs <- 1
re <- 5
cs <- 1
ce <- 5
cat(crayon::bold("\nNormalized Laplacian L[", rs, ":", re, ",", cs, ":", ce, "]\n\n"))
print(L[rs:re, cs:ce])

# Window over rows 1-5 and columns 98-102.
rs <- 1
re <- 5
cs <- 98
ce <- 102
cat(crayon::bold("\nNormalized Laplacian L[", rs, ":", re, ",", cs, ":", ce, "]\n\n"))
print(L[rs:re, cs:ce])

# Heatmap of the full Laplacian, without dendrograms or labels.
options(repr.plot.width = 3, repr.plot.height = 3)
cat(crayon::bold("\nHeatmap of graph Laplacian:\n"))
heatmap(
  L,
  Colv = NA, Rowv = NA,
  labRow = FALSE, labCol = FALSE,
  margins = c(0.2, 0.2),
  col = brewer.pal(9, "Oranges")
)
$\mathbf{L} = U \Sigma U^T$
# Eigen-decomposition of the Laplacian. The code below reads the smallest
# eigenvalues from the end of the vector (positions n, n - 1, n - 2).
eg <- eigen(L)
eVals <- eg[["values"]]
eVecs <- eg[["vectors"]]
cat(crayon::bold("\nNumber of eigenvalues of L: "), length(eVals))

# Print the first and the last `sh` eigenvalues.
sh <- 5
cat(
  crayon::bold("\nEigenvalues of L: "),
  eVals[1:sh],
  "...",
  eVals[(n - sh + 1):n]
)
cat(crayon::bold("\n\nSmallest eigenvalue of L: ", eVals[n]))
cat(crayon::bold("\nSecond smallest eigenvalue of L: ", eVals[n - 1]))
cat(crayon::bold("\n\nThird smallest eigenvalue of L: ", eVals[n - 2]))
$1. \text{ All the eigenvalues are} \geq 0$
$2. \text{ } \mathbf{D}^{\frac{1}{2}} \mathbf{1} \text{ is an eigenvector of } \mathbf{L} \text{ with eigenvalue 0}$
# Compare the unit-normalized diagonal of D^{1/2} with the eigenvector stored
# in the last column of eVecs.
D <- Laplacian(W)[["D"]]
DH <- diag(D^(0.5))
cat(crayon::bold("\n\nSmallest eigenvalue of L: ", eVals[n]))

rs <- 1
re <- 5
cs <- 1
ce <- 5
cat(crayon::bold("\nPairwise similarity W[", rs, ":", re, ",", cs, ":", ce, "]\n\n"))
print(W[rs:re, cs:ce])
cat(crayon::bold("\nDegree values di's: "), D[1:5])
cat(crayon::bold("\nMatrix D^{1/2}: \n"))
print(DH[rs:re, cs:ce])

# Rescale so that the diagonal, read as a vector, has Euclidean norm one.
nrm <- sqrt(sum(diag(DH)^2))
DH <- DH / nrm
cat(crayon::bold("\nMatrix D^{1/2} with diagonal being unit norm: \n"))
print(DH[rs:re, cs:ce])

cat(crayon::bold("\nSmallest eigenvector of L: \n"))
print(eVecs[1:5, n, drop = FALSE])
$\lambda_2 = \underset{z \in \Re^{n}} {\mathrm{minimize}} \text{ } \frac{z^T \mathbf{L} z}{z^T z} \text{ such that } z^T \left(\mathbf{D}^{\frac{1}{2}} \mathbf{1}\right) = 0$
$u_2 = \underset{z \in \Re^{n}} {\text{arg min}} \text{ } \frac{z^T \mathbf{L} z}{z^T z} \text{ such that } z^T \left(\mathbf{D}^{\frac{1}{2}} \mathbf{1}\right) = 0$
# Second-smallest eigenpair of the Laplacian (column n - 1 of eVecs).
cat(crayon::bold("\n\nSecond smallest eigenvalue of L: ", eVals[n - 1]))
cat(crayon::bold("\nSecond smallest eigenvector of L: \n"))
print(eVecs[1:5, n - 1, drop = FALSE])

# Plot the entries of that eigenvector against the sample index.
u2 <- eVecs[, n - 1]
options(repr.plot.width = 10, repr.plot.height = 5)
plot(
  seq_len(n), u2,
  pch = 22,
  xlab = "Samples", ylab = "Element of eigenvector",
  col = "black", bg = "orange"
)

# Cluster the samples with k-means on this single eigenvector.
lowRank <- eVecs[, n - 1, drop = FALSE]
km <- kmeans(lowRank, centers = 2, iter.max = 100, nstart = 10)
cat(crayon::bold("\nCluster assignments: \n"), km$cluster)

# Ground-truth picture.
colvec <- c("coral3", "darkseagreen3")[true]
pchs <- c(22, 24)[true]
options(repr.plot.width = 5, repr.plot.height = 5)
plot(
  X,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Feature1", ylab = "Feature2",
  main = "Ground-truth assignment"
)

# Picture of the clusters found.
colvec <- c("darkblue", "orange")[km$cluster]
options(repr.plot.width = 5, repr.plot.height = 5)
plot(
  X,
  col = "black", bg = colvec, pch = 22,
  xlab = "Feature1", ylab = "Feature2",
  main = "Cluster assignment"
)
# Pair-counting agreement between two labelings of the same samples.
#
# true  : vector of reference labels.
# clust : vector of cluster assignments, same length as `true`.
#
# Returns list(Rand = Rand index, ARI = adjusted Rand index), where over all
# unordered sample pairs
#   n11 = pairs together in both labelings,
#   n10 = pairs together in `true` only,
#   n01 = pairs together in `clust` only,
#   n00 = pairs separated in both labelings.
# The counts come from the contingency table of the two labelings rather than
# from a loop over all pairs, so the cost is linear in the number of samples.
# When there are no pairs to compare (fewer than two samples) the ratios are
# 0 / 0 and both indices are NaN.
randind <- function(true, clust) {
  stopifnot(
    length(true) == length(clust),
    !anyNA(true),
    !anyNA(clust)
  )
  n <- length(true)

  # Recode both labelings as integers 1..k by exact value matching.
  true_levels <- unique(true)
  clust_levels <- unique(clust)
  true_id <- match(true, true_levels)
  clust_id <- match(clust, clust_levels)
  k_true <- length(true_levels)
  k_clust <- length(clust_levels)

  # Cell, row and column counts of the contingency table.
  cell_counts <- tabulate((clust_id - 1L) * k_true + true_id, nbins = k_true * k_clust)
  true_counts <- tabulate(true_id, nbins = k_true)
  clust_counts <- tabulate(clust_id, nbins = k_clust)

  n11 <- sum(choose(cell_counts, 2))
  n10 <- sum(choose(true_counts, 2)) - n11
  n01 <- sum(choose(clust_counts, 2)) - n11
  n00 <- choose(n, 2) - n11 - n10 - n01

  Rand <- (n11 + n00) / (n11 + n10 + n01 + n00)
  ARI <- (2 * (n00 * n11 - n01 * n10)) /
    ((n00 + n01) * (n01 + n11) + (n00 + n10) * (n10 + n11))
  list(Rand = Rand, ARI = ARI)
}
# Score the current cluster assignment against the ground-truth labels.
eval <- randind(true, km$cluster)
cat(crayon::bold("\nRand index: ", eval$Rand, "\t\tARI: ", eval$ARI))

# Read the circle data set and its labels from the working directory.
X <- as.matrix(
  read.table("CircleData", sep = " ", header = FALSE, row.names = NULL)
)
cat("\nDimension of Data: ", dim(X))
true <- as.numeric(readLines("CircleLabels"))
cat("\nDim Label=", length(true))
# Shift labels by one; they are used below to index colour / symbol vectors
# (presumably the file stores labels starting at 0 - verify against the data).
true <- true + 1
n <- nrow(X)

# Reorder the samples so that rows are grouped by label.
srt <- sort(true, index.return = TRUE)
true <- srt$x
index <- srt$ix
X <- X[index, ]

cat(crayon::bold("\nGlimpse of Dataset X: \n"))
print(X[1:5, ])
cat(
  crayon::bold("\n Dimension of Dataset: "),
  " Samples:", nrow(X),
  "\t Features:", ncol(X)
)

# Scatter plot coloured by ground-truth label.
colvec <- c("coral3", "darkseagreen3")[true]
pchs <- c(22, 24)[true]
options(repr.plot.width = 4, repr.plot.height = 4)
plot(
  X,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Feature1", ylab = "Feature2",
  main = "Scatter plot of Data"
)

# Baseline: k-means directly on the raw features.
km <- kmeans(X, centers = 2, iter.max = 100, nstart = 10)
cat(crayon::bold("\nCluster assignments for k-Means: \n"), km$cluster)
colvec <- c("darkblue", "orange")[km$cluster]
options(repr.plot.width = 4, repr.plot.height = 4)
plot(
  X,
  col = "black", bg = colvec, pch = 22,
  xlab = "Feature1", ylab = "Feature2",
  main = "Cluster assignment for k-Means"
)
eval <- randind(true, km$cluster)
cat(crayon::bold("\nFor k-Means\nRand index: ", eval$Rand, "\t\tARI: ", eval$ARI))
Using $k$ Nearest Neighbors similarity
# Symmetric k-nearest-neighbour adjacency matrix.
#
# X  : numeric matrix with one sample per row.
# nn : number of nearest neighbours per sample (default 10). It is capped at
#      nrow(X) - 1 because a sample is never counted as its own neighbour.
#
# Returns an n x n 0/1 matrix whose entry [i, j] is 1 when x_j is among the
# nn nearest neighbours of x_i, or x_i is among those of x_j.
SimilarityKNN <- function(X, nn = 10) {
  n <- nrow(X)
  D <- as.matrix(dist(X)) # Euclidean distances between the rows of X
  knn_mat <- matrix(0, nrow = n, ncol = n)
  k <- max(0, min(nn, n - 1))
  for (i in seq_len(n)) {
    # Rank only the other samples. Dropping "the first entry of order()"
    # instead would keep sample i itself whenever an exact duplicate with a
    # smaller index ties with it at distance zero.
    others <- seq_len(n)[-i]
    neighbor_index <- others[order(D[i, others])][seq_len(k)]
    knn_mat[i, neighbor_index] <- 1
  }
  # Symmetrize: x_i and x_j are connected iff K[i, j] = 1 or K[j, i] = 1.
  knn_mat <- knn_mat + t(knn_mat)
  knn_mat[knn_mat == 2] <- 1
  knn_mat
}
# Spectral clustering of the current data set on a k-nearest-neighbour graph
# (default number of neighbours).
W <- SimilarityKNN(X)
library(RColorBrewer)
options(repr.plot.width = 3, repr.plot.height = 3)
cat(crayon::bold("\nHeatmap of similarity matrix:\n"))
heatmap(
  W,
  Colv = NA, Rowv = NA,
  labRow = FALSE, labCol = FALSE,
  margins = c(0.2, 0.2),
  col = brewer.pal(9, "Oranges")
)

# Normalized Laplacian and its eigen-decomposition.
L <- Laplacian(W)[["L"]]
eg <- eigen(L)
eVals <- eg[["values"]]
eVecs <- eg[["vectors"]]

# Entries of the eigenvector in column n - 1, plotted per sample.
u2 <- eVecs[, n - 1]
options(repr.plot.width = 10, repr.plot.height = 4)
plot(
  seq_len(n), u2,
  pch = 22,
  xlab = "Samples", ylab = "Element of eigenvector",
  col = "black", bg = "brown"
)

# k-means on that single eigenvector gives the cluster assignment.
lowRank <- eVecs[, n - 1, drop = FALSE]
km <- kmeans(lowRank, centers = 2, iter.max = 100, nstart = 10)
cat(crayon::bold("\nCluster assignments for spectral clustering: \n"), km$cluster)
colvec <- c("darkblue", "orange")[km$cluster]
options(repr.plot.width = 4.5, repr.plot.height = 4.5)
plot(
  X,
  col = "black", bg = colvec, pch = 22,
  xlab = "Feature1", ylab = "Feature2",
  main = "Cluster assignment for spectral clustering"
)
eval <- randind(true, km$cluster)
cat(crayon::bold("\nFor spectral clustering\nRand index: ", eval$Rand, "\t\tARI: ", eval$ARI))
# Read the half-moon data set and its labels from the working directory.
X <- as.matrix(
  read.table("HalfMoonData", sep = " ", header = FALSE, row.names = NULL)
)
cat("\nDimension of Data: ", dim(X))
true <- as.numeric(readLines("HalfMoonLabels"))
cat("\nDim Label=", length(true))
# Shift labels by one; they are used below to index colour / symbol vectors
# (presumably the file stores labels starting at 0 - verify against the data).
true <- true + 1
n <- nrow(X)

# Reorder the samples so that rows are grouped by label.
srt <- sort(true, index.return = TRUE)
true <- srt$x
index <- srt$ix
X <- X[index, ]

cat(crayon::bold("\nGlimpse of Dataset X: \n"))
print(X[1:5, ])
cat(
  crayon::bold("\n Dimension of Dataset: "),
  " Samples:", nrow(X),
  "\t Features:", ncol(X)
)

# Scatter plot coloured by ground-truth label.
colvec <- c("coral3", "darkseagreen3")[true]
pchs <- c(22, 24)[true]
options(repr.plot.width = 4, repr.plot.height = 4)
plot(
  X,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Feature1", ylab = "Feature2",
  main = "Scatter plot of Data"
)

# Baseline: k-means directly on the raw features.
km <- kmeans(X, centers = 2, iter.max = 100, nstart = 10)
cat(crayon::bold("\nCluster assignments for k-Means: \n"), km$cluster)
colvec <- c("darkblue", "orange")[km$cluster]
options(repr.plot.width = 4, repr.plot.height = 4)
plot(
  X,
  col = "black", bg = colvec, pch = 22,
  xlab = "Feature1", ylab = "Feature2",
  main = "Cluster assignment for k-Means"
)
eval <- randind(true, km$cluster)
cat(crayon::bold("\nFor k-Means\nRand index: ", eval$Rand, "\t\tARI: ", eval$ARI))
# Spectral clustering of the current data set on a 5-nearest-neighbour graph.
W <- SimilarityKNN(X, nn = 5)
library(RColorBrewer)
options(repr.plot.width = 3, repr.plot.height = 3)
cat(crayon::bold("\nHeatmap of similarity matrix:\n"))
heatmap(
  W,
  Colv = NA, Rowv = NA,
  labRow = FALSE, labCol = FALSE,
  margins = c(0.2, 0.2),
  col = brewer.pal(9, "Oranges")
)

# Normalized Laplacian and its eigen-decomposition.
L <- Laplacian(W)[["L"]]
eg <- eigen(L)
eVals <- eg[["values"]]
eVecs <- eg[["vectors"]]

# Entries of the eigenvector in column n - 1, plotted per sample.
u2 <- eVecs[, n - 1]
options(repr.plot.width = 10, repr.plot.height = 4)
plot(
  seq_len(n), u2,
  pch = 22,
  xlab = "Samples", ylab = "Element of eigenvector",
  col = "black", bg = "brown"
)

# k-means on that single eigenvector gives the cluster assignment.
lowRank <- eVecs[, n - 1, drop = FALSE]
km <- kmeans(lowRank, centers = 2, iter.max = 100, nstart = 10)
cat(crayon::bold("\nCluster assignments for spectral clustering: \n"), km$cluster)
colvec <- c("darkblue", "orange")[km$cluster]
options(repr.plot.width = 4.5, repr.plot.height = 4.5)
plot(
  X,
  col = "black", bg = colvec, pch = 22,
  xlab = "Feature1", ylab = "Feature2",
  main = "Cluster assignment for spectral clustering"
)
eval <- randind(true, km$cluster)
cat(crayon::bold("\nFor spectral clustering\nRand index: ", eval$Rand, "\t\tARI: ", eval$ARI))
# Read the LGG data matrix (first column holds the row names) and the label
# table; the labels are taken from the second column of the label file.
X <- as.matrix(
  read.table("LGGDNA267", sep = " ", header = TRUE, row.names = 1)
)
truedata <- read.table("LGGLabels267", stringsAsFactors = FALSE)
true <- as.vector(truedata[, 2], mode = "numeric")
n <- nrow(X)
k <- max(true)

# Reorder the samples so that rows are grouped by label.
srt <- sort(true, index.return = TRUE)
true <- srt$x
index <- srt$ix
X <- X[index, ]

cat(crayon::bold("\nGlimpse of Dataset X: \n"))
print(X[1:10, 1:5])
# X=log(1+X,base=10)
cat(
  crayon::bold("\n Dimension of Dataset: "),
  " Samples:", nrow(X),
  "\t Features/DNA locations:", ncol(X)
)

# Scatter plot of the first two columns, coloured by ground-truth label.
colvec <- c("coral3", "darkseagreen3", "slateblue")[true]
pchs <- c(22, 24, 21)[true]
options(repr.plot.width = 4, repr.plot.height = 4)
plot(
  X[, 1:2],
  col = "black", bg = colvec, pch = pchs,
  xlab = "DNA cg07346310", ylab = "DNA cg12315302",
  main = "Scatter plot of Data"
)

# Baseline: k-means with three centres on all features.
km <- kmeans(X, centers = 3, iter.max = 100, nstart = 10)
cat(crayon::bold("\nCluster assignments for k-Means: \n"), km$cluster)
colvec <- c("darkblue", "orange", "brown")[km$cluster]
options(repr.plot.width = 4, repr.plot.height = 4)
plot(
  X,
  col = "black", bg = colvec, pch = 22,
  xlab = "Feature1", ylab = "Feature2",
  main = "Cluster assignment for k-Means"
)
eval <- randind(true, km$cluster)
cat(crayon::bold("\nFor k-Means\nRand index: ", eval$Rand, "\t\tARI: ", eval$ARI))
# Spectral clustering of the current data set with an RBF similarity whose
# width is half of the largest pairwise distance between samples.
# TRUE / FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
Datas <- scale(X, center = TRUE, scale = FALSE)
Dist <- max(as.numeric(dist(Datas)))
sigma <- 0.5 * Dist
W <- Similarity(X, rbfsg = sigma)

library(RColorBrewer)
options(repr.plot.width = 3, repr.plot.height = 3)
cat(crayon::bold("\nHeatmap of similarity matrix:\n"))
heatmap(
  W,
  Colv = NA, Rowv = NA,
  labRow = FALSE, labCol = FALSE,
  margins = c(0.2, 0.2),
  col = brewer.pal(9, "Oranges")
)

# Normalized Laplacian and its eigen-decomposition.
L <- Laplacian(W)$L
eg <- eigen(L)
eVals <- eg$values
eVecs <- eg$vectors
cat(crayon::bold("\nEigenvalues of Laplacian: \n"), eVals[1:5])

# Take the last 2 * k columns of eVecs, starting from column n and moving
# backwards, as the low-dimensional representation.
ind <- seq(n, n - 2 * k + 1)
cat("\n indexes: ", ind)
lowRank <- eVecs[, ind, drop = FALSE]

# NOTE(review): the number of centres is the literal 3 rather than k - confirm
# that k equals 3 for this data set.
km <- kmeans(lowRank, 3, iter.max = 100, nstart = 10)
cat(crayon::bold("\nCluster assignments for spectral clustering: \n"), km$cluster)
colvec <- c("darkblue", "orange", "brown")[km$cluster]
options(repr.plot.width = 4.5, repr.plot.height = 4.5)
# Show the samples in the plane of the 2nd and 3rd selected eigenvectors.
plot(
  lowRank[, 2:3],
  col = "black", bg = colvec, pch = 22,
  xlab = "Feature1", ylab = "Feature2",
  main = "Cluster assignment for spectral clustering"
)
eval <- randind(true, km$cluster)
cat(crayon::bold("\nFor spectral clustering\nRand index: ", eval$Rand, "\t\tARI: ", eval$ARI))
Tutorial: U. von Luxburg, “A tutorial on spectral clustering,” Statistics and Computing, vol. 17, no. 4, pp. 395–416, 2007.
Aparajita Khan
Senior Research Fellow
Indian Statistical Institute
E-mail: aparajitakhan1107@gmail.com