Here's the actual code file:
#plot the clusters
#
PlotClusters = function(obs, clust, iter, t)
{
plot(
xlim = c(0, 100),
ylim = c(0, 100),
obs[,'X'], obs[,'Y'],
asp = 1.0,
pch = 20, #small points
cex = .5,
main = sprintf("Interation: %d; t = %6.3f", iter, t),
frame.plot = TRUE,
xlab = "",
ylab = ""
)
rect(clust[,'Xl'], clust[,'Yl'], clust[,'Xh'], clust[,'Yh'])
}
#
# initialize data frames for points and clusters
#
InitPoints = function(x, y)
{
data.frame(X=x, Y=y, Dist=rep(1,length(x)), Cluster=rep(0,length(x)))
}
InitClusters = function()
{
clusters = structure(
list(
Xl=double(),
Xh=double(),
Yl=double(),
Yh=double()
), class="data.frame"
)
}
#
# generate seeds for clusters from points currently in default bucket
#
Seeds = function(obs)
{
subset(obs, obs$Cluster == 0 & runif(length(obs$Cluster)) < t/10)
}
#
# turn seeds into clusters
#
NewClusters = function(seeds, border)
{
data.frame(Xl=seeds$X-border, Xh=seeds$X+border, Yl=seeds$Y-border, Yh=seeds$Y+border)
}
#
# compute distance from fixed point to all cluster
#
ClusterDistance = function(x, y, cluster)
{
max(0, x-cluster['Xh'], cluster['Xl']-x) + max(0, y-cluster['Yh'], cluster['Yl']-y)
}
#
# find best fit for point
#
BestFit = function(obs, clusters)
{
costs = apply(clusters, 1, function(cluster) ClusterDistance(obs['X'], obs['Y'], cluster))
return( c(min(costs), which.min(costs)))
}
#
# vectorize best fit across all points
#
AssignClusters = function(observations, clusters)
{
t(apply(observations, 1, function(obs) BestFit(obs, clusters)))
}
#
# determine points to be dropped because they are too far away
#
DropVector = function(obs, t)
{
obs$Dist > (rexp(length(obs$Dist), 1/t) * 10)
}
#
# reshape clusters around points
# on first pass (t>0), trim the boundaries so the border points drop off
# (allows big reductions in parition area if just one point is holding it)
# also check that cluster is viable (n>2, prior to dropping border)
# on second pass (t=0), don't trim so any border points added back stay
# always drop the default cluster, since it's not really a cluster
#
ReformClusters = function(obs, t)
{
xl = tapply(obs$X, obs$Cluster, min) + (.01*t)
xh = tapply(obs$X, obs$Cluster, max) - (.01*t)
yl = tapply(obs$Y, obs$Cluster, min) + (.01*t)
yh = tapply(obs$Y, obs$Cluster, max) - (.01*t)
keep =
(tapply(obs$Cluster, obs$Cluster, min)!= 0) & #not default
( (t > 0) | (table(obs$Cluster) > 2) ) #check size only if first pass
data.frame(Xl = xl[keep], Xh = xh[keep], Yl = yl[keep], Yh = yh[keep], row.names = NULL)
}
And, here's the interactive session that actually runs the algorithm:
#
# running algorithm in interactive session to make it easier to
# generate intermediate plots
#
#
# data setup for uniform random in (0,100)x(0,100)
#
set.seed(100)
x = runif(100, max=100)
y = runif(100, max=100)
#
# with some actual clusters
#
set.seed(200)
x = c(
runif(20, min=5, max=10),
runif(30, min=50, max=80),
runif(30, min=40, max=45),
runif(20, max=100) #noise
)
y = c(
runif(20, min=90, max=99),
runif(30, min=20, max=30),
runif(30, min=60, max=70),
runif(20, max=100) #noise
)
#
# clustering along a single dimension
#
set.seed(300)
x = c(
runif(80, min=10, max=15),
runif(80, min=60, max=70),
runif(40, max=100) #noise
)
y = runif(100, max=100)
#
# after choosing the data scenario, run from here
# choose while condition to suit needs
#
#initialize
observations = InitPoints(x, y)
clusters = InitClusters()
t = .99
i = 0
#loop
j=2 #number of intermediate steps
while (j > 0) #for intermediate steps
#while (t > 0.25) #to run until desired temperature
{
#add in new seeds
clusters=rbind(clusters,NewClusters(Seeds(observations), 1.0))
#find nearest cluster
observations[,c("Dist","Cluster")] = AssignClusters(observations, clusters)
#drop observations that are too far away into default bucket
observations$Cluster[DropVector(observations, t)] = 0
#reform boundaries to be just smaller than current to kick out edge cases
clusters = ReformClusters(observations, t)
#reassign to reformed clusters
observations[,c("Dist","Cluster")] = AssignClusters(observations, clusters)
#kick out anybody not actually in the reformed cluster
observations$Cluster[observations$Dist > 0] = 0
#and now reset the bourdaries for only the points still in there
clusters = ReformClusters(observations, 0)
#cool it down and go again
t = t*.99
i = i+1
j = j-1
}
PlotClusters(observations, clusters, i, t)
No comments:
Post a Comment