Package ete2 :: Package clustering :: Module clustvalidation
[hide private]
[frames] | no frames]

Source Code for Module ete2.clustering.clustvalidation

  1  __VERSION__="ete2-2.0rev104"  
  2  # #START_LICENSE########################################################### 
  3  # 
  4  # Copyright (C) 2009 by Jaime Huerta Cepas. All rights reserved.   
  5  # email: jhcepas@gmail.com 
  6  # 
  7  # This file is part of the Environment for Tree Exploration program (ETE).  
  8  # http://ete.cgenomics.org 
  9  #   
 10  # ETE is free software: you can redistribute it and/or modify it 
 11  # under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation, either version 3 of the License, or 
 13  # (at your option) any later version. 
 14  #   
 15  # ETE is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19  #   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with ETE.  If not, see <http://www.gnu.org/licenses/>. 
 22  # 
 23  # #END_LICENSE############################################################# 
 24   
 25  import numpy 
 26  from math import sqrt  
 27   
def safe_mean(values):
    """Return the mean and standard deviation of the finite items in `values`.

    Items for which ``numpy.isfinite`` is false (NaN, +inf, -inf) are
    discarded before computing the statistics.

    :param values: iterable of numbers.
    :returns: a ``(mean, std)`` tuple computed with ``numpy.mean`` and
      ``numpy.std`` over the finite items. When no finite item is
      left, both members are NaN.
    """
    finite_values = [v for v in values if numpy.isfinite(v)]
    if not finite_values:
        # numpy.mean([]) / numpy.std([]) also yield NaN, but only after
        # emitting RuntimeWarnings; return the same values quietly.
        return numpy.float64("nan"), numpy.float64("nan")
    return numpy.mean(finite_values), numpy.std(finite_values)
35
def safe_mean_vector(vectors):
    """Return the per-position mean and standard deviation of `vectors`.

    For every position, only the finite values (``numpy.isfinite``)
    found at that position are taken into account.

    :param vectors: non-empty sequence of equally indexed vectors. The
      number of positions is taken from the first vector.
    :returns: a ``(mean_vector, std_vector)`` tuple. When a single
      vector is given, that same vector object is returned as the mean,
      together with a zero-filled array as the deviation. Otherwise both
      members are numpy arrays; a position holding no finite value gets
      NaN in both.
    """
    # With only one vector, the average is the vector itself.
    if len(vectors) == 1:
        return vectors[0], numpy.zeros(len(vectors[0]))

    # The vector length is taken from the first item.
    length = len(vectors[0])
    nan = numpy.float64("nan")

    means = []
    stds = []
    for pos in range(length):
        finite = [v[pos] for v in vectors if numpy.isfinite(v[pos])]
        if finite:
            means.append(numpy.mean(finite))
            stds.append(numpy.std(finite))
        else:
            # Same values numpy would produce for an empty list, minus
            # the RuntimeWarnings.
            means.append(nan)
            stds.append(nan)
    return numpy.array(means), numpy.array(stds)
56
def get_silhouette_width(fdist, cluster):
    """Compute silhouette statistics of `cluster` against its sisters.

    For every sister node that has a profile, and for every leaf under
    `cluster` whose ``_profile`` is set, three values are collected:

    * ``a``: twice the distance between the leaf profile and the
      profile of `cluster` (centroid diameter),
    * ``b``: the distance between the leaf profile and the sister
      profile (centroid linkage),
    * ``s``: ``(b - a) / max(a, b)``, or 0.0 when ``a`` equals ``b``.

    :param fdist: distance function taking two profiles.
    :param cluster: node providing ``get_sisters()``, ``iter_leaves()``
      and ``profile``.
    :returns: ``(silhouette, intracluster_dist, intercluster_dist)``,
      the ``safe_mean`` averages of the collected ``s``, ``a`` and
      ``b`` values respectively.
    """
    widths = []
    intra_values = []
    inter_values = []

    for sister in cluster.get_sisters():
        if sister.profile is None:
            continue
        for leaf in cluster.iter_leaves():
            # Leaves without a profile do not contribute.
            if leaf._profile is None:
                continue
            # Leaf to own centroid -> centroid diameter.
            intra = fdist(leaf.profile, cluster.profile) * 2
            # Leaf to sister centroid -> centroid linkage.
            inter = fdist(leaf.profile, sister.profile)

            gap = inter - intra
            width = 0.0 if gap == 0.0 else gap / max(intra, inter)

            intra_values.append(intra)
            inter_values.append(inter)
            widths.append(width)

    # Only the mean of each series is reported; deviations are dropped.
    silhouette, _unused = safe_mean(widths)
    intracluster_dist, _unused = safe_mean(intra_values)
    intercluster_dist, _unused = safe_mean(inter_values)
    return silhouette, intracluster_dist, intercluster_dist
88
def get_avg_profile(node):
    """Update and return the mean profile associated with `node`.

    * Internal node: ``node._profile`` and ``node._std_profile`` are
      replaced by the result of ``safe_mean_vector`` over the
      ``_profile`` of every leaf under the node that has one. If no
      leaf has a profile, both are set to None.
    * Leaf: ``node._profile`` is left untouched and
      ``node._std_profile`` becomes a zero-filled list of the same
      length. A leaf without a profile gets ``_std_profile = None``
      instead of failing on ``len(None)``.

    :param node: object providing ``is_leaf()``, ``get_leaves()`` and
      the ``_profile`` / ``_std_profile`` attributes.
    :returns: a ``(profile, std_profile)`` tuple, ``(None, None)`` when
      no profile is available.
    """
    if node.is_leaf():
        if node._profile is None:
            # Mirror the internal-node branch: no data, no statistics.
            node._std_profile = None
            return None, None
        node._std_profile = [0.0] * len(node._profile)
        # A fresh list is returned so that callers cannot alter the
        # deviation stored in the node through the returned object.
        return node._profile, [0.0] * len(node._profile)

    leaf_vectors = [leaf._profile for leaf in node.get_leaves()
                    if leaf._profile is not None]
    if leaf_vectors:
        node._profile, node._std_profile = safe_mean_vector(leaf_vectors)
    else:
        node._profile, node._std_profile = None, None
    return node._profile, node._std_profile
104 105
def get_dunn_index(fdist, *clusters):
    """Return the Dunn index for the given selection of nodes.

    The index is computed as the smallest distance between the
    profiles of any two clusters (centroid linkage) divided by the
    largest cluster diameter, where the diameter contribution of a
    leaf is twice its distance to the profile of its cluster
    (centroid diameter). Leaves without a profile are skipped, as in
    ``get_silhouette_width``.

    J.C. Dunn. Well separated clusters and optimal fuzzy
    partitions. 1974. J.Cybern. 4. 95-104.

    :param fdist: distance function taking two profiles.
    :param clusters: two or more nodes providing ``get_leaves()`` and
      ``profile``.
    :returns: the Dunn index, or 0.0 when the largest diameter is 0.
    :raises ValueError: if fewer than two clusters are given, or if no
      leaf with a profile is found.
    """
    if len(clusters) < 2:
        raise ValueError("At least 2 clusters are required")

    intra_dist = []
    for cluster in clusters:
        for leaf in cluster.get_leaves():
            # Leaves lacking a profile cannot be measured.
            if leaf is None or leaf._profile is None:
                continue
            # Item intracluster distance -> centroid diameter.
            intra_dist.append(fdist(leaf.profile, cluster.profile) * 2)
    if not intra_dist:
        raise ValueError("No leaf with a profile found in the given clusters")
    max_a = numpy.max(intra_dist)

    # Intercluster distance -> centroid linkage, once per unordered pair.
    inter_dist = []
    for i, ci in enumerate(clusters):
        for cj in clusters[i + 1:]:
            inter_dist.append(fdist(ci.profile, cj.profile))
    min_b = numpy.min(inter_dist)

    if max_a == 0.0:
        return 0.0
    return min_b / max_a
# ####################
# distance functions
# ####################
def pearson_dist(v1, v2):
    """Return a distance based on the Pearson correlation of two vectors.

    The distance is ``1 - r``, where ``r`` is the first item returned
    by ``stats.pearsonr``. Element-wise identical vectors yield 0.0
    without calling ``stats``.

    :param v1: numpy array or plain sequence of numbers.
    :param v2: numpy array or plain sequence of numbers.
    :returns: 0.0 for identical vectors, ``1.0 - r`` otherwise.

    The ``stats`` module is bound by the import block at the end of
    this module.
    """
    # asanyarray lets plain lists through the element-wise comparison
    # while handing ndarray subclasses back unchanged.
    if (numpy.asanyarray(v1) == numpy.asanyarray(v2)).all():
        return 0.0
    return 1.0 - stats.pearsonr(list(v1), list(v2))[0]
151
def spearman_dist(v1, v2):
    """Return a distance based on the Spearman correlation of two vectors.

    The distance is ``1 - rho``, where ``rho`` is the first item
    returned by ``stats.spearmanr``. Element-wise identical vectors
    yield 0.0 without calling ``stats``.

    :param v1: numpy array or plain sequence of numbers.
    :param v2: numpy array or plain sequence of numbers.
    :returns: 0.0 for identical vectors, ``1.0 - rho`` otherwise.

    The ``stats`` module is bound by the import block at the end of
    this module.
    """
    # asanyarray lets plain lists through the element-wise comparison
    # while handing ndarray subclasses back unchanged.
    if (numpy.asanyarray(v1) == numpy.asanyarray(v2)).all():
        return 0.0
    return 1.0 - stats.spearmanr(list(v1), list(v2))[0]
157
def euclidean_dist(v1, v2):
    """Return the square root of ``square_euclidean_dist(v1, v2)``.

    Element-wise identical vectors yield 0.0 directly.

    :param v1: numpy array or plain sequence of numbers.
    :param v2: numpy array or plain sequence of numbers.
    :returns: 0.0 for identical vectors, otherwise the square root of
      the value computed by ``square_euclidean_dist``.
    :raises ValueError: propagated from ``square_euclidean_dist`` when
      no position holds a finite value in both vectors.
    """
    # Convert once so plain lists are accepted both here and by the
    # helper; ndarray subclasses are passed through unchanged.
    a = numpy.asanyarray(v1)
    b = numpy.asanyarray(v2)
    if (a == b).all():
        return 0.0
    return sqrt(square_euclidean_dist(a, b))
163
def square_euclidean_dist(v1, v2):
    """Return the mean squared difference between two vectors.

    Only positions where both vectors hold a finite value
    (``numpy.isfinite``) are considered. The sum of squared differences
    over those positions is divided by their number, so the result is
    an average rather than a plain sum.

    :param v1: numpy array or plain sequence of numbers; its length
      sets the number of positions inspected.
    :param v2: numpy array or plain sequence of numbers.
    :returns: 0.0 for element-wise identical vectors, otherwise the
      mean squared difference over the valid positions.
    :raises ValueError: if no position is finite in both vectors.
    """
    # asanyarray lets plain lists through the element-wise comparison
    # while handing ndarray subclasses back unchanged.
    a = numpy.asanyarray(v1)
    b = numpy.asanyarray(v2)
    if (a == b).all():
        return 0.0

    valids = 0
    distance = 0.0
    for i in range(len(a)):
        if numpy.isfinite(a[i]) and numpy.isfinite(b[i]):
            valids += 1
            d = a[i] - b[i]
            distance += d * d
    if valids == 0:
        raise ValueError("Cannot calculate values")
    return distance / valids
# Bind the `stats` module used by pearson_dist / spearman_dist and pick
# the default distance function accordingly.
try:
    from scipy import stats
except ImportError:
    # scipy is missing: try a standalone `stats` module instead.
    try:
        import stats
    except ImportError:
        # No stats implementation could be imported: use the only
        # distance that does not need one.
        default_dist = euclidean_dist
    else:
        default_dist = spearman_dist
else:
    default_dist = spearman_dist