Package ete2 :: Package phylomedb :: Module phylomeDB
[hide private]
[frames] | no frames]

Source Code for Module ete2.phylomedb.phylomeDB

  1  __VERSION__="ete2-2.0rev86"  
  2  # #START_LICENSE########################################################### 
  3  # 
  4  # Copyright (C) 2009 by Jaime Huerta Cepas. All rights reserved.   
  5  # email: jhcepas@gmail.com 
  6  # 
  7  # This file is part of the Environment for Tree Exploration program (ETE).  
  8  # http://ete.cgenomics.org 
  9  #   
 10  # ETE is free software: you can redistribute it and/or modify it 
 11  # under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation, either version 3 of the License, or 
 13  # (at your option) any later version. 
 14  #   
 15  # ETE is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19  #   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with ETE.  If not, see <http://www.gnu.org/licenses/>. 
 22  # 
 23  # #END_LICENSE############################################################# 
 24   
 25   
 26  """ 
 27  'phylomedb' provides an access API to the data stored in the 
 28  phylogenetic database PhylomeDB *[1]. 
 29   
 30  All methods to perform queries are implemented within the 
 31  PhylomeDBConnector class. 
 32   
 33   *[1] PhylomeDB: a database for genome-wide collections of gene 
 34   phylogenies Jaime Huerta-Cepas, Anibal Bueno, Joaquin Dopazo and Toni 
 35   Gabaldon. 
 36   
 37        PhylomeDB is a database of complete phylomes derived for 
 38        different genomes within a specific taxonomic range. All 
 39        phylomes in the database are built using a high-quality 
 40        phylogenetic pipeline that includes evolutionary model testing 
 41        and alignment trimming phases. For each genome, PhylomeDB 
 42        provides the alignments, phylogenetic trees and tree-based 
 43        orthology predictions for every single encoded protein. 
 44  """ 
 45  import re 
 46  from string import strip 
 47  import MySQLdb 
 48  from ete2 import PhyloTree 
 49   
 50  __all__ = ["PhylomeDBConnector", "ROOTED_PHYLOMES"] 
 51   
 52  # This dictionary sets the default age dictionary (if any) must be 
 53  # used to root certain phylome trees. 
 54  ROOTED_PHYLOMES = { 
 55      1: { 
 56          # Basal eukariotes 
 57          "Ath":10, # Arabidopsis thaliana 
 58          "Cre":10, 
 59          "Pfa":10, 
 60          "Pyo":10, 
 61          "Ddi":10, 
 62          "Gth":10, 
 63          "Lma":10, # leismania 
 64          "Pte":10, 
 65          # Fungi 
 66          "Ago":9, 
 67          "Cal":9, 
 68          "Cgl":9, 
 69          "Cne":9, 
 70          "Dha":9, 
 71          "Ecu":9, 
 72          "Gze":9, 
 73          "Kla":9, 
 74          "Ncr":9, 
 75          "Sce":9, # S.cerevisiae 
 76          "Spb":9, 
 77          "Yli":9, 
 78          # metazoa non chordates 
 79          "Aga":8, # Anopheles 
 80          "Dme":8, # Drosophila melanogaster 
 81          "Ame":8, # Apis meliferae 
 82          "Cel":8, # Caenorabditis elegans 
 83          "Cbr":8, # Caenorabditis brigssae 
 84          # chordates non vertebrates 
 85          "Cin":7, # Ciona intestinalis 
 86          # vertebrates non tetrapodes 
 87          "Dre":6, # Danio rerio 
 88          "Tni":6, # Tetraodom 
 89          "Fru":6, # Fugu rubripes 
 90          # tetrapodes non birds nor mammals 
 91          "Xtr":5, # Xenopus 
 92          # birds 
 93          "Gga":4, # Chicken 
 94          # Mammals non primates 
 95          "Mdo":3, # Monodelphis domestica 
 96          "Mms":3, # Mouse 
 97          "Rno":3, # Rat 
 98          "Cfa":3, # Dog 
 99          "Bta":3, # Cow 
100          # primates non hominids 
101          "Ptr":2, # chimp 
102          "Mmu":2, # Macaca 
103          # hominids 
104          "Hsa":1, # human 
105          }, 
106   
107      # Pea aphid Phylome 
108      16: { 
109          "Cel" :10, # C.elegans outgroup 
110          "Hsa" :9,  # Human outgroup 
111          "Cin" :9,  # Ciona outgroup 
112          "Dpu":7, 
113          "Dps":5, 
114          "Tca":5, 
115          "Phu":5, 
116          "Dme":5, 
117          "Api":5, 
118          "Dmo":5, 
119          "Nvi":5, 
120          "Dya":5, 
121          "Aga":5, 
122          "Cpi":5, 
123          "Bom":5, 
124          "Ame":5, 
125          "Aae":5 
126          } 
127      } 
128   
class PhylomeDBConnector(object):
    """ Connector to a phylomeDB database.

    ARGUMENTS:
    ==========
      host: hostname in which phylomeDB is hosted.
      db: name of the database to connect to.
      user: username to the database.
      passwd: password to connect database.
      port: port used to connect database.

    RETURNS:
    ========
      An object whose methods can be used to query the database.

    Values supplied by callers are always handed to the database driver
    as query parameters. Only the table names computed in __init__ are
    concatenated into the SQL text.
    """

    # Filters used by search_id() to reject odd query IDs which would
    # create slow or invalid MySQL queries.
    _GENERAL_ID_RE = re.compile(r"^[\w\d\-_,;:.|#@\/\\()'<>!]+$")
    _INTERNAL_ID_RE = re.compile(r"^\w{3}\d{7}$")

    def __init__(self, host="84.88.66.245", db="phylomeDB", user="public",
                 passwd="public", port=3306):
        """ Connects to a phylomeDB database and returns an object to
        perform custom queries on it. """

        self._SQLconnection = MySQLdb.connect(host=host,
                                              user=user,
                                              passwd=passwd,
                                              db=db,
                                              port=int(port))
        self._SQL = self._SQLconnection.cursor()

        # "phyAdmin" works on the base tables; any other user works on
        # tables carrying its own name as a suffix.
        if user == "phyAdmin":
            suffix = ""
        else:
            suffix = "_" + user
        self._trees_table = "trees" + suffix
        self._algs_table = "algs" + suffix
        self._phylomes_table = "phylomes" + suffix

    @staticmethod
    def _longest_isoforms(entries):
        """ Given (species, protid, gene, seq) rows, keeps the row with
        the longest sequence for each gene and returns the kept rows as
        a list.

        Rows without a gene name (empty, blank or NULL) are never
        grouped together: each one receives its own "phyunknownN"
        placeholder name, which is stored in the returned row.
        """
        best = {}
        unknown_counter = 0
        for spcs, protid, gene, seq in entries:
            # A NULL gene comes back as None; treat it like an empty name
            gene = (gene or "").strip()
            # The counter is bumped for every row and once more for
            # every unnamed gene, so placeholder numbers stay the same
            # as they have always been.
            unknown_counter += 1
            if not gene:
                gene = "phyunknown%d" % unknown_counter
                unknown_counter += 1
            if gene not in best or len(seq) > len(best[gene][3]):
                best[gene] = (spcs, protid, gene, seq)
        return list(best.values())

    def _execute_block(self, cmd):
        """ Executes a multi-statement SQL command and returns the
        number of rows reported for the last statement executed (0 if
        the command contains no statement at all).

        Statements are separated by a plain split on ";", so a
        semicolon inside a quoted string is not supported. Database
        errors are propagated to the caller.
        """
        rows = 0
        for statement in cmd.split(";"):
            statement = statement.strip()
            if statement:
                rows = self._SQL.execute(statement + ";")
        return rows

    # Access API methods
    def get_longest_isoform(self, phyID):
        """ Returns the phylomeDB ID of the longest protein sharing
        gene and proteome with the given phyID (i.e. Hsa0000001), or
        None if the query returns no rows.

        Raises ValueError if the numeric part of phyID is not an
        integer.
        """
        try:
            spc_code = phyID[:3]
            prot_number = int(phyID[3:])
        except ValueError:
            raise ValueError("invalid phylome protein ID")

        # When the gene name is empty the query ID itself is reported,
        # since proteins without a gene name cannot be told to be
        # isoforms of each other.
        cmd = ("SELECT species, IF(gene='' OR gene IS NULL, %s, protid) "
               "FROM proteins WHERE species=%s AND (gene,proteome_id)="
               "(SELECT gene, proteome_id FROM proteins "
               "WHERE species=%s AND protid=%s) "
               "ORDER BY length(seq) DESC LIMIT 1")
        args = (prot_number, spc_code, spc_code, prot_number)
        if self._SQL.execute(cmd, args):
            spc, protid = self._SQL.fetchone()
            return "%s%07d" % (spc, protid)
        return None

    def get_id_by_external(self, external):
        """ Returns the list of phylomeDB IDs (longest isoforms)
        registered for the given external ID. """

        command = ("SELECT species,protid FROM id_conversion "
                   "WHERE external_id=%s")
        ids = []
        if self._SQL.execute(command, (external,)):
            # Rows are fetched before looping because
            # get_longest_isoform() reuses the same cursor.
            for spc, protid in self._SQL.fetchall():
                phyID = self.get_longest_isoform("%s%07d" % (spc, protid))
                if phyID:
                    ids.append(phyID)
        return ids

    def get_id_translations(self, seqid):
        """ Returns all the registered translations of a given seqid as
        a dictionary (key=external database, value=list of external
        IDs). """

        cmd = ("SELECT external_db,external_id FROM id_conversion "
               "WHERE species=%s AND protid=%s")
        conversion = {}
        if self._SQL.execute(cmd, (seqid[:3], int(seqid[3:]))):
            for db, eid in self._SQL.fetchall():
                conversion.setdefault(db, []).append(eid)
        return conversion

    def search_id(self, queryID):
        """ Returns a list of phylome protein IDs associated to the
        given external queryID. If queryID is a phylomeDB id, it
        returns the longest isoform associated to the queryID's gene.

        IDs for which no isoform can be found are left out, and a
        queryID rejected by the ID filters yields an empty list.
        """
        queryID = queryID.strip()
        phyID_matches = []

        # First check if id is a phylomeID
        if self._INTERNAL_ID_RE.match(queryID):
            phyID = self.get_longest_isoform(queryID)
            if phyID:
                phyID_matches.append(phyID)

        elif self._GENERAL_ID_RE.match(queryID):
            # Second, check if id is the original name or gene of a
            # phylome protein
            cmd = "SELECT species,protid FROM proteins WHERE name=%s OR gene=%s"
            if self._SQL.execute(cmd, (queryID, queryID)):
                for spc_code, protid in self._SQL.fetchall():
                    phyID = self.get_longest_isoform(
                        "%s%07d" % (spc_code, protid))
                    if phyID:
                        phyID_matches.append(phyID)

            # Last, check if id is in the id conversion table and add
            # the resulting matches
            phyID_matches.extend(self.get_id_by_external(queryID))

        return phyID_matches

    def get_proteomes(self):
        """ Returns all current available proteomes (all rows of the
        proteomes table), or None if the table is empty. """
        if self._SQL.execute("SELECT * FROM proteomes"):
            return self._SQL.fetchall()
        return None

    def get_species(self):
        """ Returns all current available species (all rows of the
        species table), or None if the table is empty. """
        if self._SQL.execute("SELECT * FROM species"):
            return self._SQL.fetchall()
        return None

    def get_phylomes(self):
        """ Returns all current available phylomes as a dictionary
        (key=phylome id, value=dictionary of phylome attributes). """
        cmd = ("SELECT phylome_id,seed_proteome,proteomes,DATE(ts),name,"
               "description,comments FROM " + self._phylomes_table)
        phylomes = {}
        if self._SQL.execute(cmd):
            for (phylome_id, seed_proteome, proteomes, date, name,
                 description, comments) in self._SQL.fetchall():
                phylomes[phylome_id] = {
                    "seed_proteome": seed_proteome,
                    # The species code is the first 3 letters of the
                    # proteome identifier
                    "seed_species": seed_proteome[:3],
                    "proteomes": proteomes,
                    "name": name,
                    "description": description,
                    "date": date,
                    "comments": comments,
                }
        return phylomes

    def get_proteomes_in_phylome(self, phylome_id):
        """ Returns a list of proteomes associated to a given
        phylome_id, or None if the phylome is not found. """

        cmd = ("SELECT proteomes FROM " + self._phylomes_table +
               " WHERE phylome_id=%s")
        self._SQL.execute(cmd, (phylome_id,))
        entries = self._SQL.fetchone()
        if not entries:
            return None
        # Proteomes are stored as a comma separated string
        return [name.strip() for name in entries[0].split(",")]

    def get_seqids_in_proteome(self, proteome_id, filter_isoforms=True):
        """ Returns the IDs of all sequences of a given proteome. If
        filter_isoforms is True, only the longest isoform of each gene
        is reported. """

        # A proteome id is a species code (3 letters) followed by the
        # proteome number
        args = (proteome_id[3:], proteome_id[:3])
        seqids = []
        if filter_isoforms:
            cmd = ("SELECT species,protid,gene,seq FROM proteins "
                   "WHERE proteome_id=%s AND species=%s")
            if self._SQL.execute(cmd, args):
                isoforms = self._longest_isoforms(self._SQL.fetchall())
                seqids = ["%s%07d" % (v[0], v[1]) for v in isoforms]
        else:
            cmd = ("SELECT species,protid FROM proteins "
                   "WHERE proteome_id=%s AND species=%s")
            if self._SQL.execute(cmd, args):
                seqids = ["%s%07d" % (spc, protid)
                          for spc, protid in self._SQL.fetchall()]
        return seqids

    def get_seqs_in_proteome(self, proteome_id, filter_isoforms=True):
        """ Returns the (species, protid, gene, seq) rows of a given
        proteome, or None if the proteome has no sequences. If
        filter_isoforms is True, only the longest isoform of each gene
        is reported. """
        cmd = ("SELECT species,protid,gene,seq FROM proteins "
               "WHERE proteome_id=%s AND species=%s")
        if not self._SQL.execute(cmd, (proteome_id[3:], proteome_id[:3])):
            return None
        entries = self._SQL.fetchall()
        if filter_isoforms:
            return self._longest_isoforms(entries)
        return entries

    def get_proteome_info(self, proteome_id):
        """ Returns all info about a registered proteome as a
        dictionary (empty if the proteome is not found). """

        cmd = ("SELECT proteome_id,species,source,comments,date "
               "FROM proteomes WHERE proteome_id=%s AND species=%s")
        info = {}
        if self._SQL.execute(cmd, (proteome_id[3:], proteome_id[:3])):
            keys = ("proteome_id", "species", "source", "comments", "date")
            info = dict(zip(keys, self._SQL.fetchone()))
        return info

    def get_seqid_info(self, protid):
        """ Returns original info about a given protid as a dictionary
        (empty if the protein is not found). """
        cmd = ("SELECT species,protid,proteome_id,name,gene,comments,seq "
               "FROM proteins WHERE species=%s AND protid=%s")
        info = {}
        if self._SQL.execute(cmd, (protid[:3], protid[3:])):
            keys = ("species", "seqid", "proteome_id", "name", "gene",
                    "comments", "seq")
            info = dict(zip(keys, self._SQL.fetchone()))
        return info

    def get_phylome_info(self, phylomeid):
        """ Returns info on a given phylome as a dictionary (empty if
        the phylome is not found). """
        cmd = ("SELECT seed_proteome,proteomes,DATE(ts),name,description,"
               "comments FROM " + self._phylomes_table +
               " WHERE phylome_id=%s")
        info = {}
        if self._SQL.execute(cmd, (phylomeid,)):
            (seed_proteome, proteomes, date, name,
             description, comments) = self._SQL.fetchone()
            info = {
                "id": int(phylomeid),
                "seed_proteome": seed_proteome,
                "seed_species": seed_proteome[:3],
                "proteomes": proteomes,
                "name": name,
                "description": description,
                "date": date,
                "comments": comments,
            }
        return info

    def get_species_info(self, taxid_or_code):
        """ Returns all information on a given species, identified
        either by its taxid or by its species code.

        NOTE: for historical reasons the result type depends on which
        column matched: a (taxid, code, name) row when the taxid
        matches, a dictionary with the keys "taxid", "code" and "name"
        when the code matches, and an empty dictionary when nothing
        matches.
        """
        by_taxid = "SELECT taxid,code,name FROM species WHERE taxid=%s"
        if self._SQL.execute(by_taxid, (taxid_or_code,)):
            return self._SQL.fetchone()

        by_code = "SELECT taxid,code,name FROM species WHERE code=%s"
        info = {}
        if self._SQL.execute(by_code, (taxid_or_code,)):
            info = dict(zip(("taxid", "code", "name"), self._SQL.fetchone()))
        return info

    def get_seed_ids(self, phylome_id, filter_isoforms=True):
        """ Returns the sequence IDs of the seed proteome of the given
        phylome (empty list if the phylome is not found). """
        # WORKS VERY SLOW !!
        cmd = ("SELECT seed_proteome FROM " + self._phylomes_table +
               " WHERE phylome_id=%s")
        if not self._SQL.execute(cmd, (phylome_id,)):
            return []
        seed_proteome = self._SQL.fetchone()[0]
        return self.get_seqids_in_proteome(seed_proteome,
                                           filter_isoforms=filter_isoforms)

    def get_collateral_seeds(self, seqid):
        """ Returns the (seed_id, phylome_id) rows registered in the
        seed_friends table for the given seqid (empty list if none). """
        cmd = ("SELECT seed_id, phylome_id FROM seed_friends "
               "WHERE species=%s AND protid=%s")
        if self._SQL.execute(cmd, (seqid[:3], int(seqid[3:]))):
            return self._SQL.fetchall()
        return []

    def get_available_trees(self, seqid, collateral=True):
        """ Returns the tree methods available for seqid, sorted by
        seed and phylome: {seed_id: {phylome_id: [method, ...]}}.

        The entry for seqid itself is always present. If collateral is
        True, trees of the seeds returned by get_collateral_seeds() are
        also reported.
        """
        trees = {seqid: {}}
        cmd = ("SELECT phylome_id, method FROM " + self._trees_table +
               " WHERE species=%s AND protid=%s")
        if self._SQL.execute(cmd, (seqid[:3], seqid[3:])):
            for phylome_id, method in self._SQL.fetchall():
                trees[seqid].setdefault(phylome_id, []).append(method)

        if collateral:
            cmd = ("SELECT method FROM " + self._trees_table +
                   " WHERE species=%s AND protid=%s AND phylome_id=%s")
            for cseed, phyid in self.get_collateral_seeds(seqid):
                if self._SQL.execute(cmd, (cseed[:3], cseed[3:], phyid)):
                    # The same collateral seed may show up in several
                    # phylomes, so its entry must never be reset.
                    trees.setdefault(cseed, {})[phyid] = \
                        [row[0] for row in self._SQL.fetchall()]
        return trees

    def get_available_trees_by_phylome(self, seqid, collateral=True):
        """ Returns the tree methods available for seqid, sorted by
        phylome and seed: {phylome_id: {seed_id: [method, ...]}}.

        If collateral is True, trees of the seeds returned by
        get_collateral_seeds() are also reported.
        """
        cmd = ("SELECT phylome_id, method FROM " + self._trees_table +
               " WHERE species=%s AND protid=%s")
        phyid2trees = {}
        if self._SQL.execute(cmd, (seqid[:3], seqid[3:])):
            for phylome_id, method in self._SQL.fetchall():
                seed2methods = phyid2trees.setdefault(phylome_id, {})
                seed2methods.setdefault(seqid, []).append(method)

        if collateral:
            cmd = ("SELECT method FROM " + self._trees_table +
                   " WHERE species=%s AND protid=%s AND phylome_id=%s")
            for cseed, phyid in self.get_collateral_seeds(seqid):
                if self._SQL.execute(cmd, (cseed[:3], cseed[3:], phyid)):
                    phyid2trees.setdefault(phyid, {})[cseed] = \
                        [row[0] for row in self._SQL.fetchall()]
        return phyid2trees

    def get_available_trees_in_phylome(self, seqid, phylomeid):
        """ Returns a dictionary (key=method, value=lk) with the trees
        available for seqid in the given phylome. """
        cmd = ("SELECT method, lk FROM " + self._trees_table +
               " WHERE species=%s AND protid=%s AND phylome_id=%s")
        if self._SQL.execute(cmd, (seqid[:3], seqid[3:], phylomeid)):
            return dict(self._SQL.fetchall())
        return {}

    def get_tree(self, protid, method, phylome_id):
        """ Returns the method-tree associated to a given protid as a
        (PhyloTree, lk) tuple, or (None, None) if there is no such
        tree. """

        cmd = ("SELECT newick,lk FROM " + self._trees_table +
               " WHERE phylome_id=%s AND species=%s AND protid=%s"
               " AND method=%s")
        args = (phylome_id, protid[:3], protid[3:], method)
        if not self._SQL.execute(cmd, args):
            return None, None
        nw, lk = self._SQL.fetchone()[:2]
        lk = float(lk)
        return PhyloTree(nw), lk

    def get_best_tree(self, protid, phylome_id):
        """ Returns the winner ML tree as a (winner_method, likelihoods,
        tree) tuple, where likelihoods maps each method to its lk.

        Only trees with a negative lk value are taken into account. If
        none is found, (None, {}, None) is returned.
        """
        likelihoods = {}
        winner_model = None
        winner_lk = None
        winner_newick = None

        command = ("SELECT newick,method,lk FROM " + self._trees_table +
                   " WHERE phylome_id=%s AND species=%s AND protid=%s")
        self._SQL.execute(command, (phylome_id, protid[:3], protid[3:]))
        for nw, m, lk in self._SQL.fetchall():
            if lk < 0:
                likelihoods[m] = lk
                if winner_lk is None or lk > winner_lk:
                    winner_lk = lk
                    winner_model = m
                    winner_newick = nw

        t = None
        if winner_newick:
            t = PhyloTree(winner_newick)
        return winner_model, likelihoods, t

    def get_algs(self, protid, phylome_id):
        """ Given a protID, it returns a tuple with the raw_alg,
        clean_alg and the number of seqs included (None if no alignment
        is found). """

        command = ("SELECT raw_alg,clean_alg,seqnumber FROM " +
                   self._algs_table +
                   " WHERE phylome_id=%s AND species=%s AND protid=%s")
        self._SQL.execute(command, (phylome_id, protid[:3], protid[3:]))
        return self._SQL.fetchone()

    def get_raw_alg(self, protid, phylome_id):
        """ Given a protID, it returns a tuple with the raw_alg and
        the number of seqs included (None if no alignment is found).
        """

        command = ("SELECT raw_alg,seqnumber FROM " + self._algs_table +
                   " WHERE phylome_id=%s AND species=%s AND protid=%s")
        if self._SQL.execute(command, (phylome_id, protid[:3], protid[3:])):
            return self._SQL.fetchone()
        return None

    def get_clean_alg(self, protid, phylome_id):
        """ Given a protID, it returns a tuple with the clean_alg and
        the number of seqs included (None if no alignment is found).
        """

        command = ("SELECT clean_alg,seqnumber FROM " + self._algs_table +
                   " WHERE phylome_id=%s AND species=%s AND protid=%s")
        if self._SQL.execute(command, (phylome_id, protid[:3], protid[3:])):
            return self._SQL.fetchone()
        return None

    def get_phylome_trees(self, phylomeid):
        """ Returns a dictionary (key=method, value=list of seqids)
        with all the trees stored for the given phylome. """
        cmd = ("SELECT species, protid, method FROM " + self._trees_table +
               " WHERE phylome_id=%s")
        method2seqid = {}
        if self._SQL.execute(cmd, (phylomeid,)):
            for sp, protid, method in self._SQL.fetchall():
                method2seqid.setdefault(method, []).append(
                    "%s%07d" % (sp, protid))
        return method2seqid

    def get_phylome_algs(self, phylomeid):
        """ Returns the (species, protid) rows of all the alignments
        stored for the given phylome (empty tuple if none). """
        cmd = ("SELECT species, protid FROM " + self._algs_table +
               " WHERE phylome_id=%s")
        if self._SQL.execute(cmd, (phylomeid,)):
            return self._SQL.fetchall()
        return ()

    def count_trees(self, phylomeid):
        """ Returns a dictionary (key=method, value=number of trees)
        for the given phylome. """
        cmd = ("SELECT method,count(*) FROM " + self._trees_table +
               " WHERE phylome_id=%s GROUP BY method")
        counter = {}
        if self._SQL.execute(cmd, (phylomeid,)):
            for method, n in self._SQL.fetchall():
                counter[method] = n
        return counter

    def count_algs(self, phylomeid):
        """ Returns the number of alignments stored for the given
        phylome. """
        cmd = ("SELECT count(*) FROM " + self._algs_table +
               " WHERE phylome_id=%s")
        if self._SQL.execute(cmd, (phylomeid,)):
            return self._SQL.fetchone()[0]
        return 0

    def get_orthologs(self, seqid, sp2age=None):
        """ Returns the orthology predictions of the given seqid in all
        phylomes.

        Only seed trees will be used to detect orthologies, and trees
        will be rooted as the default policy defined in the API. If
        phylome has an associated dictionary of species ages,
        root_to_farthest_oldest_leaf algorithm will be applied.
        Otherwise, midpoint is used.

        You can also provide your own species age dictionary to force
        the rooting of the trees according to such data.

        ARGUMENTS:
        ==========

          seqid: the ID of a sequence in the phylomeDB format
          (i.e. Hsa0000001), or a collection of such IDs.

          sp2age: a dictionary of species code ages (key=species_code,
          value=age). i.e. {"Hsa":1, "Dme":4, "Ath":10}

        RETURNS:
        =========

          A dictionary (key=phylome id) holding, for each scanned
          phylome, a list of three dictionaries: orthologs sorted by
          species, inparalogs sorted by species and inparalogs sorted
          by ortholog. Phylomes in which no usable tree is found are
          skipped. When several seqids are given, results of a later
          seqid replace those of an earlier one for the same phylome.
        """
        phylome2or = {}
        # Accept a single ID given as a byte or unicode string
        if isinstance(seqid, (str, type(u""))):
            seqid = [seqid]

        for sid in seqid:
            # Only seed trees are used, so collateral trees do not
            # need to be looked up at all.
            seed_trees = self.get_available_trees(sid, collateral=False)
            for phyid in seed_trees.get(sid, {}):
                # Get the tree for each seed id
                method, lks, t = self.get_best_tree(sid, phyid)
                if t is None:
                    continue

                # Roots the tree according to a predefined criterion
                if sp2age is not None:
                    outgroup = t.get_farthest_oldest_leaf(sp2age)
                elif phyid in ROOTED_PHYLOMES:
                    outgroup = t.get_farthest_oldest_leaf(
                        ROOTED_PHYLOMES[phyid])
                else:
                    outgroup = t.get_midpoint_outgroup()
                t.set_outgroup(outgroup)

                # Catches the node for our id and obtains its evol
                # events
                seed_node = t.get_leaves_by_name(sid)[0]
                evol_events = seed_node.get_my_evol_events()

                # Predictions are sorted by species.
                sp2or = {}
                sp2in = {}
                or2in = {}
                for e in evol_events:
                    if e.etype != "S":
                        continue
                    for o in e.orthologs:
                        sp = o[:3]
                        # orthologs sorted by species
                        sp2or.setdefault(sp, set()).add(o)
                        # inparalogs sorted by orthologs
                        or2in.setdefault(o, set()).update(e.inparalogs)
                        # inparalogs sorted by species
                        sp2in.setdefault(sp, set()).update(e.inparalogs)

                phylome2or[phyid] = [sp2or, sp2in, or2in]
        return phylome2or