Package ete2 :: Package clustering :: Module clustvalidation
[hide private]
[frames] | no frames]

Source Code for Module ete2.clustering.clustvalidation

  1  __VERSION__="ete2-2.0rev86"  
  2  # #START_LICENSE########################################################### 
  3  # 
  4  # Copyright (C) 2009 by Jaime Huerta Cepas. All rights reserved.   
  5  # email: jhcepas@gmail.com 
  6  # 
  7  # This file is part of the Environment for Tree Exploration program (ETE).  
  8  # http://ete.cgenomics.org 
  9  #   
 10  # ETE is free software: you can redistribute it and/or modify it 
 11  # under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation, either version 3 of the License, or 
 13  # (at your option) any later version. 
 14  #   
 15  # ETE is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19  #   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with ETE.  If not, see <http://www.gnu.org/licenses/>. 
 22  # 
 23  # #END_LICENSE############################################################# 
 24   
 25  import numpy 
 26   
def safe_mean(values):
    """ Returns the mean and the standard deviation of the finite
    values found in the input.

    Non finite items (NaN, +inf, -inf) are discarded before the
    statistics are computed.

    Arguments:
      values -- iterable of numbers.

    Returns a (mean, std) tuple. When no finite value is available,
    both items are NaN.
    """
    valid_values = [v for v in values if numpy.isfinite(v)]
    if not valid_values:
        # numpy.mean/numpy.std of an empty list also produce NaN, but
        # only after emitting a RuntimeWarning. Return the same result
        # explicitly and quietly.
        return numpy.nan, numpy.nan
    return numpy.mean(valid_values), numpy.std(valid_values)
34
def safe_mean_vector(vectors):
    """ Returns the mean profile discarding non finite values.

    Arguments:
      vectors -- non-empty sequence of numeric vectors. The length of
                 the first vector is taken as the profile length, so
                 every other vector must provide at least that many
                 items.

    Returns a (mean, std) tuple. With a single input vector, that very
    vector is returned as the mean (unfiltered, not copied) together
    with an array of zeros. Otherwise both items are numpy arrays
    computed position by position from the finite values only; a
    position without any finite value yields NaN in both arrays.
    """
    # If only one vector, avg = itself
    if len(vectors) == 1:
        return vectors[0], numpy.zeros(len(vectors[0]))

    # Takes the vector length from the first item
    length = len(vectors[0])

    # Local names deliberately differ from the module level safe_mean()
    # function to avoid shadowing it.
    mean_profile = []
    std_profile = []

    for pos in range(length):
        column = [v[pos] for v in vectors if numpy.isfinite(v[pos])]
        if column:
            mean_profile.append(numpy.mean(column))
            std_profile.append(numpy.std(column))
        else:
            # Same NaN numpy would compute for an empty list, without
            # triggering its RuntimeWarning.
            mean_profile.append(numpy.nan)
            std_profile.append(numpy.nan)
    return numpy.array(mean_profile), numpy.array(std_profile)
55
def get_silhouette_width(fdist, cluster):
    """ Returns the silhouette width of a cluster node, measured
    against each of its sister nodes.

    Arguments:
      fdist   -- distance function taking two profiles.
      cluster -- node providing get_sisters(), iter_leaves() and a
                 profile attribute.

    Returns a (silhouette, intracluster_dist, intercluster_dist)
    tuple. Each item is the safe_mean() of the values collected for
    every (sister, leaf) pair in which both have a profile.
    """
    silhouette_values = []
    intra_values = []
    inter_values = []

    for sister in cluster.get_sisters():
        # Sisters without profile cannot be compared
        if sister.profile is None:
            continue
        for leaf in cluster.iter_leaves():
            # Skip leaves without profile
            if leaf._profile is None:
                continue

            # Item intracluster distance -> Centroid Diameter
            intra = fdist(leaf.profile, cluster.profile) * 2
            # Item intercluster distance -> Centroid Linkage
            inter = fdist(leaf.profile, sister.profile)

            gap = inter - intra
            width = 0.0 if gap == 0.0 else gap / max(intra, inter)

            intra_values.append(intra)
            inter_values.append(inter)
            silhouette_values.append(width)

    # safe_mean() returns (mean, std); only the mean is reported
    silhouette = safe_mean(silhouette_values)[0]
    intracluster_dist = safe_mean(intra_values)[0]
    intercluster_dist = safe_mean(inter_values)[0]
    return silhouette, intracluster_dist, intercluster_dist
87
def get_avg_profile(node):
    """ This internal function updates the mean profile
    associated to a node and returns it.

    Arguments:
      node -- node providing is_leaf(), get_leaves() and the _profile
              and _std_profile attributes.

    Returns a (profile, std_profile) tuple:
      - Internal nodes: _profile and _std_profile are recomputed with
        safe_mean_vector() from the leaves having a profile, or set to
        None when no leaf has one.
      - Leaves: the profile is left untouched and the deviation is a
        list of zeros of the same length. A leaf without profile
        yields (None, None).
    """
    if node.is_leaf():
        if node._profile is None:
            # Nothing to measure. Report it the same way as an internal
            # node lacking profiled leaves instead of failing on
            # len(None).
            node._std_profile = None
            return None, None
        size = len(node._profile)
        node._std_profile = [0.0] * size
        # A separate list is returned so the caller does not share the
        # object stored in the node.
        return node._profile, [0.0] * size

    leaf_vectors = [n._profile for n in node.get_leaves()
                    if n._profile is not None]
    if leaf_vectors:
        node._profile, node._std_profile = safe_mean_vector(leaf_vectors)
    else:
        node._profile, node._std_profile = None, None
    return node._profile, node._std_profile
103 104
def get_dunn_index(fdist, *clusters):
    """
    Returns the Dunn index for the given selection of nodes.

    J.C. Dunn. Well separated clusters and optimal fuzzy
    partitions. 1974. J.Cybern. 4. 95-104.

    Arguments:
      fdist    -- distance function taking two profiles.
      clusters -- two or more nodes providing get_leaves() and a
                  profile attribute.

    Returns the smallest distance between cluster profiles divided by
    the largest intracluster distance (twice the distance from a leaf
    to its cluster profile), or 0.0 when that largest intracluster
    distance is 0.

    Raises ValueError if fewer than 2 clusters are given.
    """
    if len(clusters) < 2:
        # Call syntax is valid for both Python 2 and Python 3
        raise ValueError("At least 2 clusters are required")

    # Item intracluster distance -> Centroid Diameter
    intra_dist = [fdist(leaf.profile, c.profile) * 2
                  for c in clusters
                  for leaf in c.get_leaves()
                  if leaf is not None]
    max_a = numpy.max(intra_dist)

    # Intercluster distance -> Centroid Linkage, every pair taken once
    inter_dist = [fdist(ci.profile, cj.profile)
                  for pos, ci in enumerate(clusters)
                  for cj in clusters[pos + 1:]]
    min_b = numpy.min(inter_dist)

    if max_a == 0.0:
        return 0.0
    return min_b / max_a
138 139 140 141 # #################### 142 # distance functions 143 # #################### 144
def pearson_dist(v1, v2):
    """ Returns the Pearson correlation distance between two vectors,
    computed as 1 - r.

    Identical vectors return 0.0 without computing the correlation.
    Relies on the module level 'stats' name (scipy.stats), so scipy
    is required.
    """
    # Compare through numpy so that plain Python sequences work too:
    # list == list yields a single bool, which has no all() method.
    if numpy.all(numpy.asarray(v1) == numpy.asarray(v2)):
        return 0.0
    return 1.0 - stats.pearsonr(v1, v2)[0]
150
def spearman_dist(v1, v2):
    """ Returns the Spearman rank correlation distance between two
    vectors, computed as 1 - rho.

    Identical vectors return 0.0 without computing the correlation.
    Relies on the module level 'stats' name (scipy.stats), so scipy
    is required.
    """
    # Compare through numpy so that plain Python sequences work too:
    # list == list yields a single bool, which has no all() method.
    if numpy.all(numpy.asarray(v1) == numpy.asarray(v2)):
        return 0.0
    return 1.0 - stats.spearmanr(v1, v2)[0]
156
def euclidean_dist(v1, v2):
    """ Returns the square root of square_euclidean_dist(v1, v2).

    Identical vectors return 0.0 without further computation.
    """
    # The module itself never imports math, so it is imported here.
    import math

    # Compare through numpy so that plain Python sequences work too:
    # list == list yields a single bool, which has no all() method.
    if numpy.all(numpy.asarray(v1) == numpy.asarray(v2)):
        return 0.0
    return math.sqrt(square_euclidean_dist(v1, v2))
162
def square_euclidean_dist(v1, v2):
    """ Returns the mean squared difference between two vectors.

    Only positions where both vectors hold finite values are used:
    their squared differences are summed and the sum is divided by
    the number of such positions. Identical vectors return 0.0.

    Raises ValueError if no position is finite in both vectors.
    """
    # Compare through numpy so that plain Python sequences work too:
    # list == list yields a single bool, which has no all() method.
    if numpy.all(numpy.asarray(v1) == numpy.asarray(v2)):
        return 0.0

    valids = 0
    distance = 0.0
    for pos, item in enumerate(v1):
        if numpy.isfinite(item) and numpy.isfinite(v2[pos]):
            valids += 1
            d = item - v2[pos]
            distance += d * d
    if valids == 0:
        # Call syntax is valid for both Python 2 and Python 3
        raise ValueError("Cannot calculate values")
    return distance / valids
# Correlation based distances need scipy. When it cannot be imported,
# the euclidean distance becomes the default distance function.
try:
    from scipy import stats
    default_dist = spearman_dist
except ImportError:
    # Single-argument print(...) gives the same output on Python 2 and
    # is valid syntax on Python 3.
    print("'scipy' module is not found in your system.")
    print("Correlation based distances will not be avaliable.")
    default_dist = euclidean_dist