import os
import string
import sys
import re
import random


def load(filename, K):
    """Load a whitespace/character separated text file as rows of floats.

    Each data line is split on the current separator (a single space by
    default).  A line starting with ``#`` is a comment; if it contains
    ``separator="X"`` the text between the first pair of double quotes
    becomes the separator for the following lines.

    The number of fields of the first non-empty data line fixes the expected
    width; lines with a different number of fields are ignored.  Fields that
    cannot be parsed as a float are stored as ``0.0``.  Only rows holding
    exactly ``K`` values are kept.

    :param filename: path of the file to read.
    :param K: number of floats expected per row.
    :return: list of rows, each a list of ``K`` floats.  An empty list is
             returned (and a message printed) when the file cannot be opened.
    """
    data = []
    width = -1
    try:
        handle = open(filename, "r")
    except IOError:
        print("Error, cannot open the file [%s]" % filename)
        return data

    sep = " "
    # The context manager guarantees the file is closed, even on error.
    with handle:
        for line in handle:
            if line.startswith('#'):
                # Comment line: look for an optional separator directive.
                parts = line.split("separator=")
                if len(parts) > 1:
                    quoted = parts[1].split('"')
                    # Ignore a malformed (unquoted) or empty separator
                    # instead of crashing on it.
                    if len(quoted) > 1 and quoted[1]:
                        sep = quoted[1]
                continue

            tokens = epure(line.split(sep))
            if not tokens:
                # Blank line: nothing to parse, and it must not fix the width.
                continue
            if width == -1:
                width = len(tokens)
            if len(tokens) != width:
                continue

            vect = []
            for token in tokens:
                try:
                    value = float(token)
                except ValueError:
                    # A field that is not a number is stored as zero.
                    value = 0.0
                vect.append(value)
            if len(vect) == K:
                data.append(vect)
    return data


def dist_hamming(obs1, obs2, error=0.10):
    """Return a Hamming-like distance between two observations.

    The distance starts at the difference of the two lengths, then adds one
    for every shared index whose values differ by more than ``error`` times
    the magnitude of the value in ``obs1`` (the tolerance is relative to
    ``obs1``, so the function is not symmetric).

    :param obs1: object exposing ``__len__`` and ``get(i)``.
    :param obs2: object exposing ``__len__`` and ``get(i)``.
    :param error: relative tolerance; 0.10 means 10%.
    :return: number of differing components (int).
    """
    dist = abs(len(obs1) - len(obs2))
    for i in range(min(len(obs1), len(obs2))):
        reference = obs1.get(i)
        if abs(reference - obs2.get(i)) > abs(error * reference):
            dist += 1
    return dist


def epure(t):
    """Strip newline characters from every item and drop the empty ones.

    :param t: iterable of strings.
    :return: new list of the non-empty, newline-free strings.
    """
    stripped = [k.replace('\n', '') for k in t]
    return [k for k in stripped if k != '']


class observation:
    """One observation: a thin wrapper around a sequence of numeric values."""

    def __init__(self, line=None):
        """Store ``line`` as the observation values.

        The source declared two constructors, the second silently replacing
        the first; they are merged here.  Without argument the observation
        holds ``['']``, as the shadowed constructor did.
        """
        self.obs = [''] if line is None else line

    def get(self, i):
        """Return the value stored at index ``i``."""
        return self.obs[i]

    def __eq__(self, line, error=3):
        """Two observations are equal when their Hamming-like distance is 0.

        ``error`` is forwarded to ``dist_hamming`` as its relative tolerance.
        NOTE(review): the default of 3 means a 300% tolerance; it looks
        inherited from an earlier absolute-error comparison -- confirm.
        """
        return not dist_hamming(self, line, error)

    def __ne__(self, line):
        """Negation of ``__eq__`` (Python 2 does not derive it)."""
        return not self.__eq__(line)

    def __repr__(self):
        """Return the values formatted with ``%f``, e.g. ``{1.000000,2.500000}``."""
        return "{" + ",".join("%f" % k for k in self.obs) + "}"

    def __str__(self):
        return self.__repr__()

    def data(self):
        """Return the underlying sequence of values."""
        return self.obs

    def __len__(self):
        return len(self.obs)


def build_structure(data, K):
    """Wrap each row in an ``observation`` and distribute them into clusters.

    The observations are shuffled, the first one seeds cluster 0, and every
    following one is appended to the first non-empty cluster if it equals one
    of its members; otherwise it goes to a cluster drawn at random between
    that cluster index and ``K - 1``.

    :param data: iterable of rows (sequences of numbers).
    :param K: number of clusters.
    :return: dict mapping a cluster number to its list of observations.
    """
    observations = [observation(row) for row in data]
    print("construction 1, shake the list...")
    random.shuffle(observations)

    # Push the first observation in the first cluster.
    s = {0: []}
    first = True
    for obs in observations:
        if first:
            s[0].append(obs)
            first = False
            continue

        # Try to put it in an existing cluster.
        # NOTE(review): as reconstructed from the collapsed source, cluster 0
        # is never empty at this point, so the search always stops at k == 0
        # and clusters k > 0 are never tested for a match -- confirm intent.
        for k in range(K):
            if k in s and len(s[k]):
                # The cluster exists and is not empty.
                if any(elmt == obs for elmt in s[k]):
                    s[k].append(obs)
                else:
                    k_temp = random.randint(k, K - 1)
                    if k_temp not in s:
                        s[k_temp] = []
                    s[k_temp].append(obs)
                break
            # Missing or empty cluster: make sure it exists, try the next.
            s[k] = []
    return s