Package ete2 :: Package parser :: Module fasta
[hide private]
[frames] | no frames]

Source Code for Module ete2.parser.fasta

  1  __VERSION__="ete2-2.0rev90"  
  2  # #START_LICENSE########################################################### 
  3  # 
  4  # Copyright (C) 2009 by Jaime Huerta Cepas. All rights reserved.   
  5  # email: jhcepas@gmail.com 
  6  # 
  7  # This file is part of the Environment for Tree Exploration program (ETE).  
  8  # http://ete.cgenomics.org 
  9  #   
 10  # ETE is free software: you can redistribute it and/or modify it 
 11  # under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation, either version 3 of the License, or 
 13  # (at your option) any later version. 
 14  #   
 15  # ETE is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19  #   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with ETE.  If not, see <http://www.gnu.org/licenses/>. 
 22  # 
 23  # #END_LICENSE############################################################# 
 24   
 25  import os 
 26  import string 
 27  from sys import stderr as STDERR 
 28   
def read_fasta(source, obj=None):
    """Read a collection of sequences encoded in FASTA format.

    Parameters:
        source: either the path of an existing file, or a string holding
            the FASTA text itself (used when no file exists at that path).
        obj: optional container to fill. It must expose the ``id2seq``,
            ``id2name``, ``name2id`` and ``id2comment`` mappings. When it is
            None a new ``ete2.coretype.seqgroup.SeqGroup`` is created.

    Lines that are blank or start with '#' are skipped. Header lines start
    with '>' and are split on tabs: the first field is the sequence name,
    the remaining fields are stored as its comments. Spaces inside sequence
    lines are removed. A repeated name is renamed to "<n>_<name>" and a
    notice is written to standard error.

    Returns:
        The filled container, or None (after writing a message to standard
        error) when the last named entry has no sequence.

    Raises:
        Exception: if an entry other than the last one has no sequence, or
            if sequence data appears before any header line.
    """
    if obj is None:
        from ete2.coretype import seqgroup
        SC = seqgroup.SeqGroup()
    else:
        SC = obj

    names = set()
    seq_id = -1
    seq_name = None
    # Pieces of the sequence currently being read; joined once per entry
    # instead of growing the stored string line by line.
    chunks = []

    # Prepares the line source: an open file or the lines of the string.
    if os.path.isfile(source):
        import sys
        # "rU" gives universal newlines on Python 2; Python 3 does this by
        # default and rejects the "U" flag from version 3.11 on.
        if sys.version_info[0] < 3:
            mode = "rU"
        else:
            mode = "r"
        handle = open(source, mode)
        lines = handle
    else:
        handle = None
        lines = source.split("\n")

    try:
        for line in lines:
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            if line.startswith(">"):
                # Closes the previous entry, which must have a sequence.
                if seq_id > -1:
                    if not chunks:
                        raise Exception("No sequence found for " + seq_name)
                    SC.id2seq[seq_id] = "".join(chunks)
                    chunks = []

                seq_id += 1
                # Takes header info
                seq_header_fields = [field.strip()
                                     for field in line[1:].split("\t")]
                seq_name = seq_header_fields[0]

                # Checks for duplicated seq names
                if seq_name in names:
                    tag = str(len([k for k in SC.name2id.keys()
                                   if k.endswith(seq_name)]))
                    old_name = seq_name
                    seq_name = tag + "_" + seq_name
                    STDERR.write(
                        "Duplicated entry [%s] was renamed to [%s]\n"
                        % (old_name, seq_name))

                # Registers the new entry with an empty sequence for now.
                SC.id2seq[seq_id] = ""
                SC.id2name[seq_id] = seq_name
                SC.name2id[seq_name] = seq_id
                SC.id2comment[seq_id] = seq_header_fields[1:]
                names.add(seq_name)
            else:
                if seq_name is None:
                    raise Exception("Error readind sequences: Wrong format.")

                # Removes all spaces in the line and keeps the piece.
                chunks.append(line.replace(" ", ""))
    finally:
        # Always release the file, even when parsing fails.
        if handle is not None:
            handle.close()

    # Stores the sequence of the last entry.
    if seq_id > -1:
        SC.id2seq[seq_id] = "".join(chunks)

    if seq_name and SC.id2seq[seq_id] == "":
        STDERR.write("%s has no sequence\n" % seq_name)
        return None

    # Everything ok
    return SC
93
def write_fasta(sequences, outfile=None, seqwidth=80):
    """Write sequences using FASTA format.

    Parameters:
        sequences: iterable of ``(name, seq, comment)`` triples, where
            ``comment`` is a sequence of strings appended to the header
            line after the name, separated by tabs.
        outfile: path of the file to write. When None the FASTA text is
            returned instead of being written.
        seqwidth: maximum number of characters per sequence line.

    Returns:
        The FASTA text when ``outfile`` is None, otherwise None.
    """
    records = []
    for name, seq, comment in sequences:
        header = "\t".join([name] + list(comment))
        # seqwidth is forwarded so the requested line width is honoured.
        records.append(">%s\n%s" % (header, _seq2str(seq, seqwidth)))
    text = "\n".join(records)

    if outfile is None:
        return text

    OUT = open(outfile, "w")
    try:
        OUT.write(text)
    finally:
        # Close the file even if the write fails.
        OUT.close()
106
107 -def _seq2str(seq, seqwidth = 80):
108 sequence = "" 109 for i in xrange(0,len(seq),seqwidth): 110 sequence+= seq[i:i+seqwidth] + "\n" 111 return sequence
112