Package ete2 :: Package parser :: Module phylip
[hide private]
[frames | no frames]

Source Code for Module ete2.parser.phylip

  1  __VERSION__="ete2-2.0rev104"  
  2  # #START_LICENSE########################################################### 
  3  # 
  4  # Copyright (C) 2009 by Jaime Huerta Cepas. All rights reserved.   
  5  # email: jhcepas@gmail.com 
  6  # 
  7  # This file is part of the Environment for Tree Exploration program (ETE).  
  8  # http://ete.cgenomics.org 
  9  #   
 10  # ETE is free software: you can redistribute it and/or modify it 
 11  # under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation, either version 3 of the License, or 
 13  # (at your option) any later version. 
 14  #   
 15  # ETE is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19  #   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with ETE.  If not, see <http://www.gnu.org/licenses/>. 
 22  # 
 23  # #END_LICENSE############################################################# 
 24   
 25  import os 
 26  import re 
 27  from sys import stderr as STDERR 
 28   
# Header line: "<ntax> <nchar>", optionally preceded by whitespace.
_PHYLIP_HEADER_RE = re.compile(r"^\s*(\d+)\s+(\d+)")
# Record line: a fixed 10-character name field followed by sequence data.
_PHYLIP_RECORD_RE = re.compile(r"^(.{10})(.+)")
_PHYLIP_SPACE_RE = re.compile(r"\s")


def _open_universal(path):
    """Open *path* for reading text with universal-newline handling."""
    try:
        return open(path, "rU")
    except ValueError:
        # Python >= 3.11 rejects the "U" flag; plain text mode already
        # translates newlines there.
        return open(path, "r")


def _unique_name(SG, name):
    """Return *name*, tagged with a numeric prefix if SG already holds it.

    A warning is written to STDERR whenever a rename takes place.
    """
    if name not in SG.name2id:
        return name
    tag = str(len([k for k in SG.name2id.keys() if k.endswith(name)]))
    # Tag goes at the beginning so that it is not cut off by the
    # 10 chars limit of the name field.
    new_name = tag + "_" + name
    STDERR.write("Duplicated entry [%s] was renamed to [%s]\n"
                 % (name, new_name))
    return new_name


def _parse_phylip_lines(lines, SG, interleaved):
    """Fill SG from an iterable of phylip lines; return (ntax, nchar)."""
    nchar, ntax = None, None
    id_counter = 0
    name = None
    for line in lines:
        line = line.strip("\n")
        # Passes comments and blank lines
        if not line or line[0] == "#":
            continue

        # Reads head: the first meaningful line must hold the dimensions
        if not nchar or not ntax:
            m = _PHYLIP_HEADER_RE.match(line)
            if not m:
                raise Exception(
                    "A first line with the alignment dimension is required")
            ntax = int(m.group(1))
            nchar = int(m.group(2))
            continue

        if not interleaved:
            # Sequential: a record starts with a name line and continues
            # on the following lines until nchar residues have been read.
            if SG.id2name.get(id_counter, None) is None:
                m = _PHYLIP_RECORD_RE.match(line)
                if not m:
                    raise Exception("Wrong phylip sequencial format.")
                name = _unique_name(SG, m.group(1).strip())
                SG.id2name[id_counter] = name
                SG.name2id[name] = id_counter
                SG.id2seq[id_counter] = ""
                line = m.group(2)
            SG.id2seq[id_counter] += _PHYLIP_SPACE_RE.sub("", line)
            current_len = len(SG.id2seq[id_counter])
            if current_len == nchar:
                id_counter += 1
                name = None
            elif current_len > nchar:
                raise Exception(
                    "Unexpected length of sequence [%s] [%s]."
                    % (name, SG.id2seq[id_counter]))
        elif len(SG) < ntax:
            # Interleaved, first block: every line carries a name.
            m = _PHYLIP_RECORD_RE.match(line)
            if not m:
                raise Exception("Unexpected number of sequences.")
            # Resolve duplicates *before* storing the name so that
            # id2name and name2id always agree.
            name = _unique_name(SG, m.group(1).strip())
            SG.id2seq[id_counter] = _PHYLIP_SPACE_RE.sub("", m.group(2))
            SG.id2name[id_counter] = name
            SG.name2id[name] = id_counter
            id_counter += 1
        else:
            # Interleaved, later blocks: lines are appended to the
            # sequences in the same order as the first block.
            if id_counter == len(SG):
                id_counter = 0
            SG.id2seq[id_counter] += _PHYLIP_SPACE_RE.sub("", line)
            id_counter += 1
    return ntax, nchar


def read_phylip(source, interleaved=True, obj=None):
    """Read a phylip alignment and return it as a sequence group.

    source: path of an existing file, or a string holding the alignment
        text itself.
    interleaved: True to parse the interleaved layout, False for the
        sequential one.
    obj: object to fill in. It must provide the ``id2name``, ``name2id``
        and ``id2seq`` mappings and support ``len()``. When None, a new
        ``ete2.coretype.SeqGroup`` is created.

    Returns the filled object. Blank lines and lines starting with "#"
    are ignored. Duplicated names are renamed with a numeric prefix and
    reported on STDERR.

    Raises Exception when the header line is missing, a record line is
    malformed, or the number/length of the sequences does not match the
    header.
    """
    if obj is None:
        from ete2.coretype import SeqGroup
        SG = SeqGroup()
    else:
        SG = obj

    # Prepares handle from which read sequences
    if os.path.isfile(source):
        handle = _open_universal(source)
        try:
            ntax, nchar = _parse_phylip_lines(handle, SG, interleaved)
        finally:
            # Release the file even when parsing fails.
            handle.close()
    else:
        ntax, nchar = _parse_phylip_lines(source.split("\n"), SG,
                                          interleaved)

    if len(SG) != ntax:
        raise Exception("Unexpected number of sequences.")

    # Check length of all seqs
    for seq_id in SG.id2seq.keys():
        if len(SG.id2seq[seq_id]) != nchar:
            raise Exception(
                "Unexpected lenght of sequence [%s]" % SG.id2name[seq_id])

    return SG
132
def write_phylip(aln, outfile=None, interleaved=True):
    """Serialize an alignment in phylip format.

    aln: alignment object providing the ``id2name`` and ``id2seq``
        mappings, ``len()`` and, for the sequential layout,
        ``iter_entries()`` yielding (name, seq, comments) tuples.
        The interleaved layout looks sequences up by the ids
        0 .. len(aln)-1.
    outfile: path to write the text to. When None, the text is returned
        instead.
    interleaved: True for the interleaved layout, False for sequential.

    Returns the phylip text when ``outfile`` is None, otherwise None.
    Names longer than 10 characters are truncated and a warning is
    written to STDERR.

    Raises Exception if the sequences do not all have the same length.
    """
    width = 60

    show_name_warning = False
    lengths = set(len(seq) for seq in aln.id2seq.values())
    if len(lengths) > 1:
        raise Exception("Phylip format requires sequences of equal lenght.")
    seqlength = lengths.pop()

    # Pieces are collected and joined once at the end instead of growing
    # one string inside the nested loops.
    pieces = [" %d %d\n" % (len(aln), seqlength)]
    if interleaved:
        for start in range(0, seqlength, width):
            for seq_id in range(len(aln)):
                name = aln.id2name[seq_id]
                if len(name) > 10:
                    name = name[:10]
                    show_name_warning = True

                block = aln.id2seq[seq_id][start:start + width]
                if start == 0:
                    # Names are only written in the first block.
                    # NOTE(review): name field + separator is 11 columns
                    # here while later blocks are padded with 13 -
                    # confirm the intended separator width.
                    pieces.append("%10s " % name)
                else:
                    pieces.append(" " * 13)

                pieces.append(" ".join(
                    [block[k:k + 10] for k in range(0, len(block), 10)]))
                pieces.append("\n")
            pieces.append("\n")
    else:
        first = width - 13
        for entry in aln.iter_entries():
            name, seq = entry[0], entry[1]
            if len(name) > 10:
                name = name[:10]
                show_name_warning = True
            # The first row shares the line with the name, so it holds
            # fewer residues than the following rows.
            rest = "\n".join(
                [seq[k:k + width] for k in range(first, len(seq), width)])
            pieces.append("%10s %s\n%s\n" % (name, seq[0:first], rest))
    alg_text = "".join(pieces)

    if show_name_warning:
        STDERR.write(
            "Warning! Some seqnames were truncated to 10 characters\n")

    if outfile is None:
        return alg_text

    OUT = open(outfile, "w")
    try:
        OUT.write(alg_text)
    finally:
        # Close the file even if the write fails.
        OUT.close()
180