1 __VERSION__="ete2-2.0rev86"
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 """
27 'phylomedb' provides an access API to the data stored in the
28 phylogenetic database PhylomeDB *[1].
29
30 All methods to perform queries are implemented within the
31 PhylomeDBConnector class.
32
33 *[1] PhylomeDB: a database for genome-wide collections of gene
34 phylogenies Jaime Huerta-Cepas, Anibal Bueno, Joaquin Dopazo and Toni
35 Gabaldon.
36
37 PhylomeDB is a database of complete phylomes derived for
38 different genomes within a specific taxonomic range. All
39 phylomes in the database are built using a high-quality
40 phylogenetic pipeline that includes evolutionary model testing
41 and alignment trimming phases. For each genome, PhylomeDB
42 provides the alignments, phylogenetic trees and tree-based
43 orthology predictions for every single encoded protein.
44 """
45 import re
46 from string import strip
47 import MySQLdb
48 from ete2 import PhyloTree
49
50 __all__ = ["PhylomeDBConnector", "ROOTED_PHYLOMES"]
51
52
53
# Species ages used to root trees, keyed by phylome id.  Each inner
# dictionary maps a three-letter species code to its age.  When orthologs
# are predicted, the dictionary registered for a phylome is handed to
# PhyloTree.get_farthest_oldest_leaf() to choose the outgroup; phylomes
# without an entry here are rooted at their midpoint instead.
ROOTED_PHYLOMES = {
    1: {
        "Ath": 10,
        "Cre": 10,
        "Pfa": 10,
        "Pyo": 10,
        "Ddi": 10,
        "Gth": 10,
        "Lma": 10,
        "Pte": 10,

        "Ago": 9,
        "Cal": 9,
        "Cgl": 9,
        "Cne": 9,
        "Dha": 9,
        "Ecu": 9,
        "Gze": 9,
        "Kla": 9,
        "Ncr": 9,
        "Sce": 9,
        "Spb": 9,
        "Yli": 9,

        "Aga": 8,
        "Dme": 8,
        "Ame": 8,
        "Cel": 8,
        "Cbr": 8,

        "Cin": 7,

        "Dre": 6,
        "Tni": 6,
        "Fru": 6,

        "Xtr": 5,

        "Gga": 4,

        "Mdo": 3,
        "Mms": 3,
        "Rno": 3,
        "Cfa": 3,
        "Bta": 3,

        "Ptr": 2,
        "Mmu": 2,

        "Hsa": 1,
    },

    16: {
        "Cel": 10,
        "Hsa": 9,
        "Cin": 9,
        "Dpu": 7,
        "Dps": 5,
        "Tca": 5,
        "Phu": 5,
        "Dme": 5,
        "Api": 5,
        "Dmo": 5,
        "Nvi": 5,
        "Dya": 5,
        "Aga": 5,
        "Cpi": 5,
        "Bom": 5,
        "Ame": 5,
        "Aae": 5,
    },
}
128
130 """ Returns a connector to a phylomeDB database.
131
132 ARGUMENTS:
133 ==========
134 host: hostname in which phylomeDB is hosted.
135 user: username to the database.
136 passwd: password to connect database.
137 port: port used to connect database.
138
139 RETURNS:
140 ========
141 An object whose methods can be used to query the database.
142 """
def __init__(self, host="84.88.66.245",
             db="phylomeDB",
             user="public",
             passwd="public",
             port=3306):
    """ Connects to a phylomeDB database and prepares the object to
    perform custom queries on it.

    ARGUMENTS:
    ==========
      host: hostname in which phylomeDB is hosted.
      db: name of the database to select once connected.
      user: username to the database.
      passwd: password to connect database.
      port: port used to connect database. It is converted with
        int(), so a numeric string is also accepted.
    """
    self._SQLconnection = MySQLdb.connect(host=host,
                                          user=user,
                                          passwd=passwd,
                                          db=db,
                                          port=int(port))

    # A single cursor is created here and reused by every query method.
    self._SQL = self._SQLconnection.cursor()

    # The "phyAdmin" account reads the base tables; any other account
    # reads tables whose names carry "_<user>" as a suffix.
    if user == "phyAdmin":
        self._trees_table = "trees"
        self._algs_table = "algs"
        self._phylomes_table = "phylomes"
    else:
        self._trees_table = "trees_" + user
        self._algs_table = "algs_" + user
        self._phylomes_table = "phylomes_" + user
169
171 """ Executes a multi-line SQL command and returns the number of
172 affected rows. """
173 commands = cmd.split(";")
174 for c in commands:
175 c = c.strip()
176 if c != "":
177 try:
178 rows = self._SQL.execute(c+";")
179 except MySQLdb.Error:
180 raise
181 return rows
182
183
199
201 """ Returns the phylomeID of the given external ID"""
202
203 command = 'SELECT species,protid from id_conversion where external_id="%s"' % (external)
204 ids = []
205 if self._SQL.execute(command):
206 matches = self._SQL.fetchall()
207
208 for m in matches:
209 phyID = self.get_longest_isoform("%s%07d" % (m[0],m[1]))
210 if phyID:
211 ids.append( phyID )
212 return ids
213
215 """ returns all the registered translations of a given seqid """
216
217 cmd = 'SELECT external_db,external_id from id_conversion where species="%s" and protid=%d' % (seqid[:3],int(seqid[3:]))
218 conversion = {}
219 if self._SQL.execute(cmd):
220 extids = self._SQL.fetchall()
221 for db, eid in extids:
222 conversion.setdefault(db, []).append(eid)
223 return conversion
224
226 """ Returns a list of phylome protein Ids associated to the
227 given external queryID. If queryID is a phylomeDB id, it
228 returns the longest isoform associated to the queryID's gene
229 """
230 queryID = queryID.strip()
231
232
233
234 QUERYID_GENERAL_REGEXP_FILTER = "^[\w\d\-_,;:.|#@\/\\\()'<>!]+$"
235 QUERYID_INTERNAL_ID_REGEXP_FILTER = "^\w{3}\d{7}$"
236
237 phyID_matches = []
238
239 if re.match(QUERYID_INTERNAL_ID_REGEXP_FILTER, queryID):
240 phyID = self.get_longest_isoform(queryID)
241 phyID_matches.append(phyID)
242
243 elif re.match(QUERYID_GENERAL_REGEXP_FILTER, queryID):
244
245 cmd = 'SELECT species,protid from proteins where name="%s" or gene="%s"' % (queryID,queryID)
246 if self._SQL.execute(cmd):
247 for spc_code, protid in self._SQL.fetchall():
248 phyID = self.get_longest_isoform("%s%07d" % (spc_code,protid))
249 phyID_matches.append( phyID )
250
251
252 hits = self.get_id_by_external(queryID)
253 if hits:
254 phyID_matches.extend(hits)
255
256 return phyID_matches
257
259 """ Returns all current available proteomes"""
260 cmd = 'SELECT * FROM proteomes'
261 if self._SQL.execute(cmd):
262 return self._SQL.fetchall()
263
265 """ Returns all current available species"""
266
267 cmd = 'SELECT * FROM species'
268 if self._SQL.execute(cmd):
269 return self._SQL.fetchall()
271 """ Returns all current available phylomes """
272 cmd = 'SELECT phylome_id,seed_proteome,proteomes,DATE(ts),name,description,comments FROM %s' %(self._phylomes_table)
273 phylomes = {}
274 if self._SQL.execute(cmd):
275 for phylo in self._SQL.fetchall():
276 phylome_id = phylo[0]
277 phylomes[phylome_id]={}
278 phylomes[phylome_id]["seed_proteome"] = phylo[1]
279 phylomes[phylome_id]["seed_species"] = phylo[1][:3]
280 phylomes[phylome_id]["proteomes"] = phylo[2]
281 phylomes[phylome_id]["name"] = phylo[4]
282 phylomes[phylome_id]["description"] = phylo[5]
283 phylomes[phylome_id]["date"] = phylo[3]
284 phylomes[phylome_id]["comments"] = phylo[6]
285 return phylomes
286
288 """ Returns a list of proteomes associated to a given phylome_id"""
289
290 cmd = 'SELECT proteomes FROM %s WHERE phylome_id="%s" ' \
291 % (self._phylomes_table, phylome_id)
292 self._SQL.execute(cmd)
293 entries = self._SQL.fetchone()
294 if entries:
295 proteomes_string = map(strip, entries[0].split(","))
296 else:
297 proteomes_string = None
298 return proteomes_string
299
301 """ Returns all sequences of a given proteome """
302
303 seqids = []
304 if filter_isoforms:
305 cmd = 'SELECT species,protid,gene,seq FROM proteins WHERE proteome_id="%s" AND species="%s" ' \
306 % (proteome_id[3:],proteome_id[:3])
307 if self._SQL.execute(cmd):
308 entries = self._SQL.fetchall()
309 largest_isoforms = {}
310 unknown_counter = 0
311 for spcs,protid,gene,seq in entries:
312 gene = gene.strip()
313 unknown_counter += 1
314 if not gene:
315 gene="phyunknown%d" % unknown_counter
316 unknown_counter += 1
317 if gene not in largest_isoforms:
318 largest_isoforms[gene] = (spcs,protid,gene,seq)
319 elif len(seq) > len(largest_isoforms[gene][3]):
320 largest_isoforms[gene] = (spcs,protid,gene,seq)
321 seqids = ["%s%07d"%(v[0], v[1]) for v in largest_isoforms.values()]
322 else:
323 cmd = 'SELECT species,protid FROM proteins WHERE proteome_id="%s" AND species="%s" ' \
324 % (proteome_id[3:],proteome_id[:3])
325 if self._SQL.execute(cmd):
326 seqids = ["%s%07d"%(spc,protid) for spc,protid in self._SQL.fetchall()]
327
328 return seqids
329
331 cmd = 'SELECT species,protid,gene,seq FROM proteins WHERE proteome_id="%s" AND species="%s" ' \
332 % (proteome_id[3:],proteome_id[:3])
333 if self._SQL.execute(cmd):
334 entries = self._SQL.fetchall()
335 if filter_isoforms:
336 largest_isoforms = {}
337 unknown_counter = 0
338 for spcs,protid,gene,seq in entries:
339 gene = gene.strip()
340 unknown_counter += 1
341 if not gene:
342 gene="phyunknown%d" % unknown_counter
343 unknown_counter += 1
344 if gene not in largest_isoforms:
345 largest_isoforms[gene] = (spcs,protid,gene,seq)
346 elif len(seq) > len(largest_isoforms[gene][3]):
347 largest_isoforms[gene] = (spcs,protid,gene,seq)
348 seqs = largest_isoforms.values()
349 else:
350 seqs = entries
351 else:
352 seqs = None
353
354 return seqs
355
357 """ Returns all info about a registered proteome"""
358
359 cmd = 'SELECT proteome_id,species,source,comments,date FROM proteomes WHERE proteome_id ="%s" AND species="%s" ' \
360 % (proteome_id[3:],proteome_id[:3])
361 info = {}
362 if self._SQL.execute(cmd):
363 entry = self._SQL.fetchone()
364 info["proteome_id"] = entry[0]
365 info["species"] = entry[1]
366 info["source"] = entry[2]
367 info["comments"] = entry[3]
368 info["date"] = entry[4]
369 return info
370
372 """ Returns original info about a given protid"""
373 cmd = 'SELECT species,protid,proteome_id,name,gene,comments,seq FROM proteins WHERE species="%s" and protid="%s"' \
374 % (protid[:3],protid[3:])
375
376 info = {}
377 if self._SQL.execute(cmd):
378 entry = self._SQL.fetchone()
379 info["species"] = entry[0]
380 info["seqid"] = entry[1]
381 info["proteome_id"] = entry[2]
382 info["name"] = entry[3]
383 info["gene"] = entry[4]
384 info["comments"] = entry[5]
385 info["seq"] = entry[6]
386 return info
387
389 """ Returns info on a given phylome"""
390 cmd = 'SELECT seed_proteome,proteomes,DATE(ts),name,description,comments FROM %s WHERE phylome_id="%s" ' %\
391 (self._phylomes_table, phylomeid)
392 info = {}
393 if self._SQL.execute(cmd):
394 all_info = self._SQL.fetchone()
395 info["id"] = int(phylomeid)
396 info["seed_proteome"] = all_info[0]
397 info["seed_species"] = all_info[0][:3]
398 info["proteomes"] = all_info[1]
399 info["name"] = all_info[3]
400 info["description"] = all_info[4]
401 info["date"] = all_info[2]
402 info["comments"] = all_info[5]
403 return info
404
406 """ Returns all information on a given species_code"""
407
408 command = 'SELECT taxid,code,name from species where taxid="%s"' % (taxid_or_code)
409 if self._SQL.execute(command):
410 return self._SQL.fetchone()
411 else:
412 command = 'SELECT taxid,code,name from species where code="%s"' % (taxid_or_code)
413 info = {}
414 if self._SQL.execute(command):
415 entry = self._SQL.fetchone()
416 info["taxid"] = entry[0]
417 info["code"] = entry[1]
418 info["name"] = entry[2]
419 return info
420
422
423 cmd = 'SELECT seed_proteome FROM %s WHERE phylome_id="%s";' % (self._phylomes_table, phylome_id)
424 if self._SQL.execute(cmd):
425 seed_proteome = self._SQL.fetchone()[0]
426 seedids = self.get_seqids_in_proteome(seed_proteome, filter_isoforms=filter_isoforms)
427 else:
428 seedids = []
429 return seedids
430
432 cmd = 'SELECT seed_id, phylome_id FROM seed_friends WHERE species="%s" and protid="%s";' %\
433 (seqid[:3],int(seqid[3:]))
434 if self._SQL.execute(cmd):
435 return self._SQL.fetchall()
436 else:
437 return []
438
440 trees = {seqid:{}}
441 cmd = 'SELECT phylome_id, method FROM %s WHERE species="%s" AND protid="%s" ' \
442 %(self._trees_table, seqid[:3], seqid[3:])
443 if self._SQL.execute(cmd):
444 for phylome_id, method in self._SQL.fetchall():
445 if phylome_id in trees[seqid]:
446 trees[seqid][phylome_id].append(method)
447 else:
448 trees[seqid][phylome_id] = [method]
449
450 if collateral:
451 for cseed, phyid in self.get_collateral_seeds(seqid):
452 cmd = 'SELECT method FROM %s WHERE species="%s" AND protid="%s" and phylome_id ="%s" ' \
453 %(self._trees_table, cseed[:3], cseed[3:], phyid)
454 if self._SQL.execute(cmd):
455 trees[cseed] = {}
456 trees[cseed][phyid] = [method[0] for method in self._SQL.fetchall()]
457 return trees
458
460 trees = {seqid:{}}
461 cmd = 'SELECT phylome_id, method FROM %s WHERE species="%s" AND protid="%s" ' \
462 %(self._trees_table, seqid[:3], seqid[3:])
463
464 phyid2trees = {}
465 if self._SQL.execute(cmd):
466 for phylome_id, method in self._SQL.fetchall():
467 if phylome_id not in phyid2trees:
468 phyid2trees[phylome_id] = {seqid: [method]}
469 elif seqid in phyid2trees[phylome_id]:
470 phyid2trees[phylome_id][seqid].append(method)
471 elif seqid not in phyid2trees[phylome_id]:
472 phyid2trees[phylome_id][seqid] = [method]
473
474 if collateral:
475 for cseed, phyid in self.get_collateral_seeds(seqid):
476 cmd = 'SELECT method FROM %s WHERE species="%s" AND protid="%s" and phylome_id ="%s" ' \
477 %(self._trees_table, cseed[:3], cseed[3:], phyid)
478 if self._SQL.execute(cmd):
479 phyid2trees.setdefault(phyid, {})[cseed] = [method[0] for method in self._SQL.fetchall()]
480
481 return phyid2trees
482
484 trees = {seqid:{}}
485 cmd = 'SELECT method, lk FROM %s WHERE species="%s" AND protid="%s" AND phylome_id=%s' \
486 %(self._trees_table, seqid[:3], seqid[3:], phylomeid)
487 if self._SQL.execute(cmd):
488 return dict(self._SQL.fetchall())
489 else:
490 return {}
491
def get_tree(self, protid, method, phylome_id):
    """ Returns the method-tree associated to a given protid.

    ARGUMENTS:
    ==========
      protid: sequence ID in the phylomeDB format, i.e. Hsa0000001.
        The first three characters are matched against the "species"
        column and the remainder against the "protid" column.
      method: value matched against the "method" column.
      phylome_id: ID of the phylome the tree belongs to.

    RETURNS:
    ========
      A (tree, lk) tuple, where tree is a PhyloTree built from the
      stored newick string and lk is the stored "lk" value converted
      to float. (None, None) is returned when no row matches.
    """
    # Only the table name is interpolated into the statement. Values
    # are handed to the driver as parameters so that they are escaped
    # instead of being pasted into the SQL text.
    cmd = ('SELECT newick,lk FROM %s WHERE phylome_id=%%s AND species=%%s '
           'AND protid=%%s AND method=%%s' % self._trees_table)
    args = (phylome_id, protid[:3], protid[3:], method)

    if not self._SQL.execute(cmd, args):
        return None, None

    newick, raw_lk = self._SQL.fetchone()
    lk = float(raw_lk)
    return PhyloTree(newick), lk
507 """ Returns the winner ML tree"""
508
509 likelihoods = {}
510 winner_model = None
511 winner_lk = None
512 winner_newick = None
513 t = None
514 command ='SELECT newick,method,lk FROM %s WHERE phylome_id=%s AND species="%s" and protid="%s";' \
515 % (self._trees_table,phylome_id, protid[:3], protid[3:])
516 self._SQL.execute(command)
517 result = self._SQL.fetchall()
518 for r in result:
519 nw,m,lk = r
520 if lk < 0:
521 likelihoods[m] = lk
522 if winner_lk==None or lk > winner_lk:
523 winner_lk = lk
524 winner_model = m
525 winner_newick = nw
526 if winner_newick:
527 t = PhyloTree(winner_newick)
528 return winner_model,likelihoods,t
def get_algs(self, protid, phylome_id):
    """ Given a protID, it returns a tuple with the raw_alg, clean_alg and
    the number of seqs included.

    ARGUMENTS:
    ==========
      protid: sequence ID in the phylomeDB format, i.e. Hsa0000001.
        The first three characters are matched against the "species"
        column and the remainder against the "protid" column.
      phylome_id: ID of the phylome the alignments belong to.

    RETURNS:
    ========
      The first matching row as returned by the cursor, i.e.
      (raw_alg, clean_alg, seqnumber), or the cursor's "no row" value
      (None in the Python DB-API) when nothing matches.
    """
    # Only the table name is interpolated into the statement. Values
    # are handed to the driver as parameters so that they are escaped
    # instead of being pasted into the SQL text.
    command = ('SELECT raw_alg,clean_alg,seqnumber FROM %s WHERE phylome_id=%%s '
               'AND species=%%s AND protid=%%s' % self._algs_table)
    self._SQL.execute(command, (phylome_id, protid[:3], protid[3:]))
    return self._SQL.fetchone()
538
540 """ Given a protID, it returns a tuple with the raw_alg and
541 the number of seqs included.
542 """
543
544 command = 'SELECT raw_alg,seqnumber FROM %s WHERE phylome_id=%s AND species="%s" AND protid="%s"' %\
545 (self._algs_table, phylome_id, protid[:3],protid[3:])
546 if self._SQL.execute(command):
547 return self._SQL.fetchone()
548
550 """ Given a protID, it returns a tuple with the clean_alg and
551 the number of seqs included.
552 """
553
554 command = 'SELECT clean_alg,seqnumber FROM %s WHERE phylome_id=%s AND species="%s" AND protid="%s"' %\
555 (self._algs_table, phylome_id, protid[:3],protid[3:])
556 if self._SQL.execute(command):
557 return self._SQL.fetchone()
558
559
561 cmd = 'SELECT species, protid, method from %s where phylome_id=%s' \
562 %(self._trees_table,phylomeid)
563 method2seqid = {}
564 if self._SQL.execute(cmd):
565 for sp, protid, method in self._SQL.fetchall():
566 method2seqid.setdefault(method, []).append("%s%07d" %(sp, protid))
567 return method2seqid
569 cmd = 'SELECT species, protid from %s where phylome_id =%s' \
570 %(self._algs_table, phylomeid)
571 if self._SQL.execute(cmd):
572 return self._SQL.fetchall()
573 else:
574 return ()
575
577 cmd = 'SELECT method,count(*) from %s where phylome_id=%s GROUP BY method' \
578 %(self._trees_table,phylomeid)
579 counter = {}
580 if self._SQL.execute(cmd):
581 for method, n in self._SQL.fetchall():
582 counter[method] = n
583 return counter
584
586 cmd = 'SELECT count(*) from %s where phylome_id=%s;' \
587 %(self._algs_table,phylomeid)
588 if self._SQL.execute(cmd):
589 return self._SQL.fetchone()[0]
590 else:
591 return 0
592
594 """ Returns the orthology predictions of the given seqid in all
595 phylomes.
596
597 Only seed trees will be used to detect orthologies, and trees will
598 be rooted as the default policy defined in the API. If phylome has
599 an associated dictionary of species ages,
600 root_to_farthest_oldest_leaf algorithm will be applied. Otherwise,
601 midpoint is used.
602
603 You can also provide your own species age dictionary to force the
604 rooting of the trees according to such data.
605
606
607 ARGUMENTS:
608 ==========
609
610 seqid: the ID of a sequence in the phylomeDB format.
611 i.e. Hsa0000001
612
613 sp2age: a dictionary of species code ages (key=species_code,
614 value=age). i.e. {"Hsa":1, "Dme":4, "Ath":10}
615
616 RETURNS:
617 =========
618
619 A dictionary of orthologs and inparalogs found in each scanned
620 phylomes.
621
622 """
623 phylome2or = {}
624 if type(seqid) == str:
625 seqid = [seqid]
626 for sid in seqid:
627 avail_trees = self.get_available_trees(sid)
628 for seedid, phylomes in avail_trees.iteritems():
629 if seedid != sid:
630 continue
631 for phyid in phylomes:
632
633 method, lks, t = self.get_best_tree(seedid, phyid)
634
635 if sp2age is not None:
636 outgroup = t.get_farthest_oldest_leaf(sp2age)
637 t.set_outgroup(outgroup)
638 elif phyid in ROOTED_PHYLOMES:
639 outgroup = t.get_farthest_oldest_leaf( ROOTED_PHYLOMES[phyid] )
640 t.set_outgroup(outgroup)
641 else:
642 t.set_outgroup(t.get_midpoint_outgroup())
643
644
645
646 seed_node = t.get_leaves_by_name(sid)[0]
647 evol_events = seed_node.get_my_evol_events()
648
649 sp2or = {}
650 sp2in = {}
651 or2in = {}
652 for e in filter(lambda x: x.etype=="S", evol_events):
653 for o in e.orthologs:
654 sp = o[:3]
655
656 sp2or.setdefault(sp, set([])).add(o)
657
658
659 or2in.setdefault(o, set([])).update(e.inparalogs)
660
661
662 sp2in.setdefault(sp, set([])).update(e.inparalogs)
663
664 phylome2or[phyid] = [sp2or, sp2in, or2in]
665 return phylome2or
666