Wednesday, April 27, 2016

R code for SA Clustering project

Here's the actual code file:

#plot the clusters
#
PlotClusters = function(obs, clust, iter, t)
{
  plot(
    xlim = c(0, 100),
    ylim = c(0, 100),
    obs[,'X'], obs[,'Y'],
    asp = 1.0,
    pch = 20, #small points
    cex = .5,
    main = sprintf("Interation: %d; t = %6.3f", iter, t),
    frame.plot = TRUE,
    xlab = "",
    ylab = ""
  )
  rect(clust[,'Xl'], clust[,'Yl'], clust[,'Xh'], clust[,'Yh'])
}
#
# initialize data frames for points and clusters
#
InitPoints = function(x, y)
{
  data.frame(X=x, Y=y, Dist=rep(1,length(x)), Cluster=rep(0,length(x)))
}

InitClusters = function()
{
  clusters = structure(
    list(
      Xl=double(),
      Xh=double(),
      Yl=double(),
      Yh=double()
    ), class="data.frame"
  )
}
#
# generate seeds for clusters from points currently in default bucket
#
Seeds = function(obs)
{
  subset(obs, obs$Cluster == 0 & runif(length(obs$Cluster)) < t/10)
}
#
# turn seeds into clusters
#
NewClusters = function(seeds, border)
{
  data.frame(Xl=seeds$X-border, Xh=seeds$X+border, Yl=seeds$Y-border, Yh=seeds$Y+border)
}
#
# compute distance from fixed point to all cluster
#
ClusterDistance = function(x, y, cluster)
{
  max(0, x-cluster['Xh'], cluster['Xl']-x) + max(0, y-cluster['Yh'], cluster['Yl']-y)
}
#
# find best fit for point
#
BestFit = function(obs, clusters)
{
  costs = apply(clusters, 1, function(cluster) ClusterDistance(obs['X'], obs['Y'], cluster))
  return( c(min(costs), which.min(costs)))
}
#
# vectorize best fit across all points
#
AssignClusters = function(observations, clusters)
{
  t(apply(observations, 1, function(obs) BestFit(obs, clusters)))
}
#
# determine points to be dropped because they are too far away
#
DropVector = function(obs, t)
{
  obs$Dist > (rexp(length(obs$Dist), 1/t) * 10)
}
#
# reshape clusters around points
# on first pass (t>0), trim the boundaries so the border points drop off
#  (allows big reductions in parition area if just one point is holding it)
#  also check that cluster is viable (n>2, prior to dropping border)
# on second pass (t=0), don't trim so any border points added back stay
# always drop the default cluster, since it's not really a cluster
#
ReformClusters = function(obs, t)
{
  xl = tapply(obs$X, obs$Cluster, min) + (.01*t)
  xh = tapply(obs$X, obs$Cluster, max) - (.01*t)
  yl = tapply(obs$Y, obs$Cluster, min) + (.01*t)
  yh = tapply(obs$Y, obs$Cluster, max) - (.01*t)
  keep =
    (tapply(obs$Cluster, obs$Cluster, min)!= 0) & #not default
    ( (t > 0) | (table(obs$Cluster) > 2) )        #check size only if first pass
  data.frame(Xl = xl[keep], Xh = xh[keep], Yl = yl[keep], Yh = yh[keep], row.names = NULL)
}

And, here's the interactive session that actually runs the algorithm:

#
# running algorithm in interactive session to make it easier to
#  generate intermediate plots
#

#
# data setup for uniform random in (0,100)x(0,100)
#
set.seed(100)
x = runif(100, max=100)
y = runif(100, max=100)

#
# with some actual clusters
#
set.seed(200)
x = c(
  runif(20, min=5, max=10),
  runif(30, min=50, max=80),
  runif(30, min=40, max=45),
  runif(20, max=100) #noise
)
y = c(
  runif(20, min=90, max=99),
  runif(30, min=20, max=30),
  runif(30, min=60, max=70),
  runif(20, max=100) #noise
)

#
# clustering along a single dimension
#
set.seed(300)
x = c(
  runif(80, min=10, max=15),
  runif(80, min=60, max=70),
  runif(40, max=100) #noise
)
y = runif(100, max=100)

#
# after choosing the data scenario, run from here
# choose while condition to suit needs
#
#initialize
observations = InitPoints(x, y)
clusters = InitClusters()
t = .99
i = 0

#loop
j=2 #number of intermediate steps
while (j > 0) #for intermediate steps
#while (t > 0.25) #to run until desired temperature
{
  #add in new seeds
  clusters=rbind(clusters,NewClusters(Seeds(observations), 1.0))

  #find nearest cluster
  observations[,c("Dist","Cluster")] = AssignClusters(observations, clusters)

  #drop observations that are too far away into default bucket
  observations$Cluster[DropVector(observations, t)] = 0

  #reform boundaries to be just smaller than current to kick out edge cases
  clusters = ReformClusters(observations, t)

  #reassign to reformed clusters
  observations[,c("Dist","Cluster")] = AssignClusters(observations, clusters)

  #kick out anybody not actually in the reformed cluster
  observations$Cluster[observations$Dist > 0] = 0

  #and now reset the bourdaries for only the points still in there
  clusters = ReformClusters(observations, 0)

  #cool it down and go again
  t = t*.99
  i = i+1
  j = j-1
}

PlotClusters(observations, clusters, i, t)

No comments:

Post a Comment