import math
import random

### code, slightly modified, from Ch 19, Guttag Python textbook
###  "Introduction to Computation and Programming Using Python"
###

# Returns the population standard deviation of the numbers in nums
# (divides by len(nums), not len(nums) - 1).
def stdDev(nums):
    mean = float(sum(nums))/len(nums)
    total = 0.0
    for num in nums:
        total = total + (num - mean)**2
    return math.sqrt(total/len(nums)) 
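
# A quick sanity check for stdDev (hypothetical values, not from the textbook):
# [2, 4, 4, 4, 5, 5, 7, 9] has mean 5 and population standard deviation 2.
#
#   >>> stdDev([2, 4, 4, 4, 5, 5, 7, 9])
#   2.0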

# when p == 2, returns Euclidean distance between v1 and v2
# when p == 1, returns Manhattan distance between v1 and v2
#
def minkowskiDist(v1, v2, p):
    dist = 0.0
    for i in range(len(v1)):
        dist = dist + abs(v1[i] - v2[i])**p
    return dist**(1.0/p)
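
# A quick check (hypothetical values): for the classic 3-4-5 right triangle,
# the Euclidean and Manhattan distances between the same two points differ.
#
#   >>> minkowskiDist([0, 0], [3, 4], 2)
#   5.0
#   >>> minkowskiDist([0, 0], [3, 4], 1)
#   7.0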

# a generic class to represent objects/samples/examples to be clustered.
#
class Example(object):
    
    def __init__(self, name, features, label = None):
        self.name = name
        self.features = features
        self.label = label
        
    def dimensionality(self):
        return len(self.features)
    
    def getFeatures(self):
        return self.features
    
    def getLabel(self):
        return self.label
    
    def getName(self):
        return self.name
    
    def distance(self, other):
        return minkowskiDist(self.features, other.getFeatures(), 2)
    
    def __repr__(self):
        return self.name +':'+ str(self.features) + ':' + str(self.label)
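
# Example usage (a minimal sketch with made-up points):
#
#   >>> p1 = Example('p1', [0.0, 0.0])
#   >>> p2 = Example('p2', [3.0, 4.0])
#   >>> p1.distance(p2)   # Euclidean (distance uses p = 2)
#   5.0
#   >>> p2
#   p2:[3.0, 4.0]:None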

class Cluster(object):
    
    def __init__(self, examples):
        self.examples = examples
        self.centroid = self.computeCentroid()

    # Replace the examples in the cluster with the given new examples
    # and return how much the centroid has changed.
    def update(self, examples):
        oldCentroid = self.centroid
        self.examples = examples
        if len(examples) > 0:
            self.centroid = self.computeCentroid()
            return oldCentroid.distance(self.centroid)
        else:
            return 0.0
        
    def members(self):
        return self.examples[:]
        
    def size(self):
        return len(self.examples)
    
    def getCentroid(self):
        return self.centroid
    
    def computeCentroid(self):
        dim = self.examples[0].dimensionality()
        totVals = [0.0]*dim

        for e in self.examples:
            features = e.getFeatures()
            for i in range(dim):
                totVals[i] = totVals[i] + features[i]

        numEx = float(len(self.examples))
        for i in range(dim):
            totVals[i] = totVals[i]/numEx
        centroid = Example('centroid',totVals)
        return centroid
    
    # Note: despite its name, this returns the square root of the sum of
    # squared distances from the centroid (a measure of cluster spread),
    # not the statistical variance.
    def variance(self):
        totDist = 0.0
        for e in self.examples:
            totDist += (e.distance(self.centroid))**2
        return totDist**0.5
    
    def __repr__(self):
        names = []
        for e in self.examples:
            names.append(e.getName())
        names.sort()
        result = 'Cluster with centroid '\
                 + str(self.centroid.getFeatures()) + ' contains:\n  '
        for e in names:
            result = result + e + ', '
        return result[:-2]
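
# A small illustration (made-up points): the centroid's features are the
# per-dimension means of the member examples.
#
#   >>> c = Cluster([Example('p1', [0.0, 0.0]), Example('p2', [2.0, 4.0])])
#   >>> c.getCentroid().getFeatures()
#   [1.0, 2.0]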

# Assumes:
#     examples is a list of Examples - the objects/samples to be clustered
#     k is a positive integer specifying the number of clusters to find
#     verbose is a Boolean. If verbose is True, cluster and other information
#                   is printed after each iteration of the algorithm.
#  Returns a list containing k clusters.
#
def kmeans(examples, k, verbose):

    # Get k randomly chosen initial centroids
    initialCentroids = random.sample(examples, k)
    print('Initial centroids:', end=' ')
    for c in initialCentroids:
        print(c.features, end=" ")
    print()
    
    # Create k clusters, each initially containing one of the initial centroids
    clusters = []
    for e in initialCentroids:
        clusters.append(Cluster([e]))
        
    # Iterate until centroids do not change
    #  (converged will be True at the end of a loop iteration if centroids do not change)
    converged = False
    numIterations = 0
    while not converged:
        numIterations += 1

        newClusters = []
        for i in range(k):
            newClusters.append([])

        # Associate each example with the closest centroid
        for e in examples:
            # Find the centroid closest to e
            smallestDistance = e.distance(clusters[0].getCentroid())
            index = 0
            for i in range(1, k):
                distance = e.distance(clusters[i].getCentroid())
                if distance < smallestDistance:
                    smallestDistance = distance
                    index = i
            # Add e to the list of examples for the appropriate cluster
            newClusters[index].append(e)
            
        # Update each cluster by replacing the "old" set of examples with the newly computed one
        # Set converged to False if at least one centroid has changed
        converged = True
        for i in range(len(clusters)):
            if clusters[i].update(newClusters[i]) > 0.0:
                converged = False
               
        if verbose:
            print('At end of iteration #' + str(numIterations))
            print('  New centroids:', end=' ')
            for c in clusters:
                print(c.centroid.features, end=' ')
            print()
            for c in clusters:
                print(c)     
    return clusters
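
# A minimal usage sketch for kmeans (made-up 1D data; the exact result depends
# on which examples are randomly chosen as initial centroids):
#
#   >>> pts = [Example(str(i), [float(i)]) for i in [0, 1, 10, 11]]
#   >>> for c in kmeans(pts, 2, False):
#   ...     print(c)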


# Returns the sum of the clusters' "variances" (see Cluster.variance);
# trykmeans uses this to compare the results of different random restarts.
def dissimilarity(clusters):
    totDist = 0.0
    for c in clusters:
        totDist += c.variance()
    return totDist

# Executes kmeans numTrials times and
# returns the result with the lowest dissimilarity
#
def trykmeans(examples, numClusters, numTrials,
              verbose = False):
    best = kmeans(examples, numClusters, verbose)
    minDissimilarity = dissimilarity(best)
    for trial in range(1, numTrials):
        clusters = kmeans(examples, numClusters, verbose)
        currDissimilarity = dissimilarity(clusters)
        print("New dissimilarity: {}, current best: {}".format(currDissimilarity, minDissimilarity))
        if currDissimilarity < minDissimilarity:
            best = clusters
            minDissimilarity = currDissimilarity
    return best
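
# Typical call (sketch): 5 random restarts looking for 2 clusters, keeping
# the clustering with the smallest total dissimilarity.
#
#   >>> best = trykmeans(examples, 2, 5)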


# plotSamples uses pylab (matplotlib's pylab interface) to plot the data in
# contrivedTest, contrivedTest2, and test2D. If you don't have it installed,
# comment out the import below and the plotting calls in those functions.

import pylab

def plotSamples(samples, marker):
    xVals, yVals = [], []
    for s in samples:
        x = s.getFeatures()[0]
        y = s.getFeatures()[1]
        pylab.annotate(s.getName(), xy = (x, y),
                       xytext = (x+0.13, y-0.07),
                       fontsize = 'x-large')
        xVals.append(x)
        yVals.append(y)
    pylab.plot(xVals, yVals, marker)
    return
    
def genDistribution(xMean, xSD, yMean, ySD, n, namePrefix):
    samples = []
    for s in range(n):
        x = random.gauss(xMean, xSD)
        y = random.gauss(yMean, ySD)
        samples.append(Example(namePrefix+str(s), [x, y]))
    return samples
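
# For example, genDistribution(0, 1, 0, 1, 3, 'p') returns three Examples
# named 'p0', 'p1', 'p2' whose features are drawn from a 2D Gaussian
# centered at (0, 0) with standard deviation 1 in each dimension.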

# First, randomly generate two sets of 2D points from Gaussian distributions.
# Then run kmeans numTrials times, using k as the number of clusters to find.
# The first set of points will be clustered around the x,y point (xMean1, yMean1)
# and the second set around (xMean2, yMean2).
# The "tightness" of the point sets is influenced by the standard deviations, xSD and ySD.
# You can modify the local variables xMean1, yMean1, xMean2, yMean2, xSD and ySD
# to change the centers of the sets and how tightly clustered their points are.
# For very tight sets, for example, you could set xSD and ySD to 0.1.
#
# Example use: contrivedTest(1, 2, True)
#  - just do one "trial" of finding 2 clusters from the data
#
def contrivedTest(numTrials, k, verbose):
    xMean1 = 1
    yMean1 = 1
    xMean2 = xMean1 + 2
    yMean2 = yMean1 + 3
    xSD = 1
    ySD = 1
    n = 10
    pylab.clf()
    aSamples = genDistribution(xMean1, xSD, yMean1, ySD, n, 'a.')
    plotSamples(aSamples, 'b^')
    bSamples = genDistribution(xMean2, xSD, yMean2, ySD, n, 'b.')
    plotSamples(bSamples, 'ro')
    clusters = trykmeans(aSamples + bSamples, k,
                         numTrials, verbose)
    print('Final result')
    for c in clusters:
        print(c)

# Example use: contrivedTest2(1, 2, True)
#  do one "trial" of finding 2 clusters from the data.
#  In this case, although three sets of data are generated, they are forced
#  to be grouped into just two clusters.
# Or, contrivedTest2(1, 3, True)
#  do one "trial" of finding 3 clusters from the data.
#  In this case, if the standard deviations are not too big, you should expect
#  the points from each of the three original data sets to mostly be assigned
#  to three corresponding clusters.
# 
def contrivedTest2(numTrials, k, verbose):
    xMean1 = 1
    yMean1 = 1
    xMean2 = xMean1 + 5
    yMean2 = yMean1
    xMean3 = xMean1
    yMean3 = yMean1 + 5
    xSD = 1
    ySD = 1
    n = 8
    pylab.clf()
    aSamples = genDistribution(xMean1,xSD, yMean1, ySD, n, 'a.')
    plotSamples(aSamples, 'b^')
    bSamples = genDistribution(xMean2,xSD,yMean2, ySD, n, 'b.')
    plotSamples(bSamples, 'ro')
    cSamples = genDistribution(xMean3, xSD, yMean3, ySD, n, 'c.')
    plotSamples(cSamples, 'gd')
    clusters = trykmeans(aSamples + bSamples + cSamples,
                         k, numTrials, verbose)
    print('Final result')
    for c in clusters:
        print(c)

def test2D(numTrials, k, verbose):
    pylab.clf()
    aSamples = [Example('a1', [1, 1]), Example('a2', [1.1, 1]),
                Example('a3', [1, 1.1]), Example('a4', [1.1, 1.1])]
    plotSamples(aSamples, 'b^')
    bSamples = [Example('b1', [2, 1]), Example('b2', [2.1, 1]),
                Example('b3', [2, 1.1])]
    plotSamples(bSamples, 'ro')
    cSamples = [Example('c1', [2, 2]), Example('c2', [2.1, 2.1]),
                Example('c3', [2, 2.6])]
    plotSamples(cSamples, 'gd')
    dSamples = [Example('d1', [1, 2]), Example('d2', [1.1, 2.1]),
                Example('d3', [1, 2.6])]
    plotSamples(dSamples, 'kv')
   
    clusters = trykmeans(aSamples + bSamples + cSamples + dSamples,
                         k, numTrials, verbose)
    print('Final result')
    for c in clusters:
        print(c)

