Module littletable
[frames] | no frames]

Source Code for Module littletable

   1  # 
   2  # littletable.py 
   3  #  
   4  # littletable is a simple in-memory database for ad-hoc or user-defined objects, 
   5  # supporting simple query and join operations - useful for ORM-like access 
   6  # to a collection of data objects, without dealing with SQL 
   7  # 
   8  # 
   9  # Copyright (c) 2010  Paul T. McGuire 
  10  # 
  11  # Permission is hereby granted, free of charge, to any person obtaining 
  12  # a copy of this software and associated documentation files (the 
  13  # "Software"), to deal in the Software without restriction, including 
  14  # without limitation the rights to use, copy, modify, merge, publish, 
  15  # distribute, sublicense, and/or sell copies of the Software, and to 
  16  # permit persons to whom the Software is furnished to do so, subject to 
  17  # the following conditions: 
  18  # 
  19  # The above copyright notice and this permission notice shall be 
  20  # included in all copies or substantial portions of the Software. 
  21  # 
  22  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
  23  # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
  24  # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
  25  # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
  26  # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
  27  # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
  28  # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
  29  # 
  30   
  31  __doc__ = """\ 
  32   
  33  C{littletable} - a Python module to give ORM-like access to a collection of objects 
  34   
  35  The C{littletable} module provides a low-overhead, schema-less, in-memory database access to a  
  36  collection of user objects.  C{littletable} provides a L{DataObject} class for ad hoc creation 
  37  of semi-immutable objects that can be stored in a C{littletable} L{Table}. 
  38   
  39  In addition to basic ORM-style insert/remove/query/delete access to the contents of a  
  40  Table, C{littletable} offers: 
  41   - simple indexing for improved retrieval performance, and optional enforcing key uniqueness 
  42   - access to objects using indexed attributes 
  43   - simplified joins using '+' operator syntax between annotated Tables 
  44   - the result of any query or join is a new first-class C{littletable} Table 
  45   
  46  C{littletable} Tables do not require an upfront schema definition, but simply work off of the 
  47  attributes in the stored values, and those referenced in any query parameters. 
  48   
  49  Here is a simple C{littletable} data storage/retrieval example:: 
  50   
  51      from littletable import Table, DataObject 
  52   
  53      customers = Table('customers') 
  54      customers.create_index("id", unique=True) 
  55      customers.insert(DataObject(id="0010", name="George Jetson")) 
  56      customers.insert(DataObject(id="0020", name="Wile E. Coyote")) 
  57      customers.insert(DataObject(id="0030", name="Jonny Quest")) 
  58   
  59      catalog = Table('catalog') 
  60      catalog.create_index("sku", unique=True) 
  61      catalog.insert(DataObject(sku="ANVIL-001", descr="1000lb anvil", unitofmeas="EA",unitprice=100)) 
  62      catalog.insert(DataObject(sku="BRDSD-001", descr="Bird seed", unitofmeas="LB",unitprice=3)) 
  63      catalog.insert(DataObject(sku="MAGNT-001", descr="Magnet", unitofmeas="EA",unitprice=8)) 
  64      catalog.insert(DataObject(sku="MAGLS-001", descr="Magnifying glass", unitofmeas="EA",unitprice=12)) 
  65   
  66      wishitems = Table('wishitems') 
  67      wishitems.create_index("custid") 
  68      wishitems.create_index("sku") 
  69      wishitems.insert(DataObject(custid="0020", sku="ANVIL-001")) 
  70      wishitems.insert(DataObject(custid="0020", sku="BRDSD-001")) 
  71      wishitems.insert(DataObject(custid="0020", sku="MAGNT-001")) 
  72      wishitems.insert(DataObject(custid="0030", sku="MAGNT-001")) 
  73      wishitems.insert(DataObject(custid="0030", sku="MAGLS-001")) 
  74   
  75      # print a particular customer name  
  76      # (unique indexes will return a single item; non-unique 
  77      # indexes will return a list of all matching items) 
  78      print customers.id["0030"].name 
  79   
  80      # print all items sold by the pound 
  81      for item in catalog.query(unitofmeas="LB"): 
  82          print item.sku, item.descr 
  83   
  84      # print all items that cost more than 10 
  85      for item in catalog.where(lambda o : o.unitprice>10): 
  86          print item.sku, item.descr, item.unitprice 
  87   
  88      # join tables to create queryable wishlists collection 
  89      wishlists = customers.join_on("id") + wishitems.join_on("custid") + catalog.join_on("sku") 
  90   
  91      # print all wishlist items with price > 10 
  92      bigticketitems = wishlists().where(lambda ob : ob.unitprice > 10) 
  93      for item in bigticketitems: 
  94          print item 
  95   
  96      # list all wishlist items in descending order by price 
   97      for item in wishlists().query(_orderby="unitprice desc"): 
  98          print item 
  99  """ 
 100   
 101  __version__ = "0.4" 
 102  __versionTime__ = "29 Jun 2011 16:36" 
 103  __author__ = "Paul McGuire <ptmcg@users.sourceforge.net>" 
 104   
 105  import sys 
 106  from collections import defaultdict 
 107  from itertools import groupby,ifilter,islice,starmap,repeat 
 108  import csv 
 109   
 110  try: 
 111      from itertools import product 
 112  except ImportError: 
113 - def product(aseq,bseq):
114 for a in aseq: 115 for b in bseq: 116 yield a,b

# Interpreters without a builtin ``basestring`` get it aliased to ``str`` so
# that the isinstance() string checks further down keep working.
try:
    t = basestring
except NameError:
    basestring = str

# names exported by ``from littletable import *``
__all__ = [
    "DataObject",
    "Table",
    "JoinTerm",
    "PivotTable",
]

def _object_attrnames(obj):
    """Return the names of the data attributes carried by C{obj}.

    Three kinds of object are recognized, checked in this order:
     - an object with an instance C{__dict__} - its keys are returned
     - a namedtuple - its C{_fields} tuple is returned
     - an object declaring C{__slots__} - the slots are returned

    @raise ValueError: if C{obj} is none of the above
    """
    # normal object
    if hasattr(obj, "__dict__"):
        return obj.__dict__.keys()

    # namedtuple
    if isinstance(obj, tuple) and hasattr(obj, "_fields"):
        return obj._fields

    if hasattr(obj, "__slots__"):
        return obj.__slots__

    raise ValueError("object with unknown attributes")
136
class DataObject(object):
    """A generic semi-mutable object for storing data values in a table.

    Attributes can be supplied as named arguments to the constructor, or
    assigned afterwards as C{object.attribute = value}.  Every attribute is
    write-once: a new attribute can be added at any time, but assigning to
    an attribute that already exists is silently ignored.  Table joins are
    returned as a Table of DataObjects.
    """

    def __init__(self, **kwargs):
        # load the instance dict directly; no attribute exists yet, so the
        # write-once check in __setattr__ has nothing to guard here
        if kwargs:
            vars(self).update(kwargs)

    def __repr__(self):
        return repr(vars(self))

    def __setattr__(self, attr, val):
        # make all attributes write-once
        if attr in vars(self):
            return
        super(DataObject, self).__setattr__(attr, val)

    def __getitem__(self, k):
        """Dict-style read access, C{obj["name"]}, to an attribute.

        @raise KeyError: if the object has no attribute named C{k}
        """
        if not hasattr(self, k):
            raise KeyError("object has no such attribute " + k)
        return getattr(self, k)
156
class _ObjIndex(object):
    """Non-unique index: maps each value of one attribute to the list of
    objects that carry that value.
    """

    def __init__(self, attr):
        self.attr = attr
        self.obs = defaultdict(list)
        self.is_unique = False

    def __setitem__(self, k, v):
        self.obs[k].append(v)

    def __getitem__(self, k):
        # .get() rather than [] so that a miss does not plant an empty
        # bucket in the defaultdict
        return self.obs.get(k, [])

    def __len__(self):
        """Return the number of distinct key values in the index."""
        return len(self.obs)

    def __iter__(self):
        return iter(self.obs)

    def keys(self):
        return sorted(self.obs.keys())

    def items(self):
        return self.obs.items()

    def remove(self, obj):
        """Remove C{obj} from the index; objects that are not indexed are
        ignored.  A key whose last object is removed is dropped, so that
        C{keys()}, C{len()} and C{in} only report keys that still have
        objects.
        """
        try:
            k = getattr(obj, self.attr)
        except AttributeError:
            return
        # look up with .get(): indexing the defaultdict directly would
        # create a phantom empty bucket for an unknown key
        bucket = self.obs.get(k)
        if bucket is None:
            return
        try:
            bucket.remove(obj)
        except ValueError:
            return
        if not bucket:
            del self.obs[k]

    def __contains__(self, key):
        return key in self.obs

    def copy_template(self):
        """Return a new, empty index on the same attribute."""
        return self.__class__(self.attr)


class _UniqueObjIndex(_ObjIndex):
    """Unique index: maps each value of one attribute to a single object.

    Falsy key values (C{None}, blank, zero) are never stored as keys; the
    objects carrying them are kept together in C{none_values} instead.
    """

    def __init__(self, attr, accept_none=False):
        self.attr = attr
        self.obs = {}
        self.is_unique = True
        self.accept_none = accept_none
        self.none_values = set()

    def __setitem__(self, k, v):
        """Index C{v} under key C{k}.

        @raise KeyError: if C{k} is already present in the index
        """
        if k:
            if k in self.obs:
                # wrap k in a tuple so that a tuple-valued key formats too
                raise KeyError("duplicate key value %s" % (k,))
            self.obs[k] = v
        else:
            self.none_values.add(v)

    def __getitem__(self, k):
        """Return a list holding the object for key C{k} (empty if there is
        none), or all of the None-keyed objects when C{k} is falsy.
        """
        if k:
            return [self.obs[k]] if k in self.obs else []
        return list(self.none_values)

    def __contains__(self, k):
        if k:
            return k in self.obs
        return bool(self.accept_none and self.none_values)

    def keys(self):
        return sorted(self.obs.keys()) + ([None] if self.none_values else [])

    def items(self):
        return [(k, [v]) for k, v in self.obs.items()]

    def remove(self, obj):
        """Remove C{obj} from the index; objects that are not indexed are
        ignored.
        """
        # an object lacking the attribute is filed with the None-keyed values
        k = getattr(obj, self.attr, None)
        if k:
            # only drop the entry if it really is this object - a different
            # object carrying the same key must not evict the indexed one
            if k in self.obs and self.obs[k] == obj:
                del self.obs[k]
        else:
            self.none_values.discard(obj)

    def copy_template(self):
        """Return a new, empty index on the same attribute, with the same
        C{accept_none} setting.
        """
        return self.__class__(self.attr, self.accept_none)
221
class _ObjIndexWrapper(object):
    """Wrapper handed out for attribute-style access to a non-unique index,
    as in C{table.attrname[key]}.  A keyed lookup is answered with a Table
    of the matching objects; any other attribute is forwarded to the
    wrapped index.
    """

    def __init__(self, ind):
        self._index = ind

    def __getattr__(self, attr):
        # reached only for names not found on the wrapper itself
        return getattr(self._index, attr)

    def __getitem__(self, k):
        """Return a new Table holding the objects indexed under C{k}; the
        Table is empty if the key is not in the index.
        """
        matches = Table()
        if k not in self._index:
            return matches
        matches.insert_many(self._index[k])
        return matches

    def __contains__(self, k):
        return k in self._index
234
class _UniqueObjIndexWrapper(object):
    """Wrapper handed out for attribute-style access to a unique index,
    as in C{table.attrname[key]}.  A keyed lookup is answered with the
    single matching object; any other attribute is forwarded to the
    wrapped index.
    """

    def __init__(self, ind):
        self._index = ind

    def __getattr__(self, attr):
        # reached only for names not found on the wrapper itself
        return getattr(self._index, attr)

    def __contains__(self, k):
        return k in self._index

    def __getitem__(self, k):
        """Return the object indexed under key C{k}.

        A falsy key (C{None}, blank) is not unique, so for those a Table of
        all such objects is returned instead (possibly empty).

        @raise KeyError: if C{k} is a non-blank key that is not in the index
        """
        if k:
            matches = self._index[k]
            if not matches:
                # the index answers a miss with an empty list; report it as
                # the KeyError that dict-style access is documented to raise
                raise KeyError("no object with key %r" % (k,))
            return matches[0]

        ret = Table()
        if k in self._index:
            ret.insert_many(self._index[k])
        return ret
250 251
class Table(object):
    """Table is the main class in C{littletable}, for representing a collection of DataObjects or
    user-defined objects with publicly accessible attributes or properties.  Tables can be:
     - created, with an optional name, using standard Python L{C{Table() constructor}<__init__>}
     - indexed, with multiple indexes, with unique or non-unique values, see L{create_index}
     - queried, specifying values to exact match in the desired records, see L{query}
     - filtered (using L{where}), using a simple predicate function to match desired records;
       useful for selecting using inequalities or compound conditions
     - accessed directly for keyed values, using C{table.indexattribute[key]} - see L{__getattr__}
     - joined, using L{join_on} to identify attribute to be used for joining with another table, and
       L{join} or operator '+' to perform the actual join
     - pivoted, using L{pivot} to create a nested structure of sub-tables grouping objects
       by attribute values
     - L{imported<csv_import>}/L{exported<csv_export>} to CSV-format files
    Queries and joins return their results as new Table objects, so that queries and joins can
    be easily performed as a succession of operations.
    """

    def __init__(self, table_name=''):
        """Create a new, empty Table.
        @param table_name: name for Table
        @type table_name: string (optional)
        """
        self.table_name = table_name
        self.obs = []
        self._indexes = {}

    def __len__(self):
        """Return the number of objects in the Table."""
        return len(self.obs)

    def __iter__(self):
        """Create an iterator over the objects in the Table."""
        return iter(self.obs)

    def __getitem__(self, i):
        """Provides direct indexed/sliced access to the Table's underlying list of objects."""
        return self.obs[i]

    def __getattr__(self, attr):
        """A quick way to query for matching records using their indexed attributes. The attribute
        name is used to locate the index, and returns a wrapper on the index.  This wrapper provides
        dict-like access to the underlying records in the table, as in::

           employees.socsecnum["000-00-0000"]
           customers.zipcode["12345"]

        The behavior differs slightly for unique and non-unique indexes:
          - if the index is unique, then retrieving a matching object, will return just the object;
            if there is no matching object, C{KeyError} is raised
          - if the index is non-unique, then all matching objects will be returned in a new Table,
            just as if a regular query had been performed; if no objects match the key value, an empty
            Table is returned and no exception is raised.

        If there is no index defined for the given attribute, then C{AttributeError} is raised.
        """
        # This hook only runs after normal lookup has failed.  Read our own
        # state through __dict__ so that an instance whose __init__ has not
        # run cannot re-enter this method endlessly.
        state = self.__dict__
        indexes = state.get('_indexes', {})
        if attr in indexes:
            ret = indexes[attr]
            # test the unique subclass first - it is also an _ObjIndex
            if isinstance(ret, _UniqueObjIndex):
                return _UniqueObjIndexWrapper(ret)
            if isinstance(ret, _ObjIndex):
                return _ObjIndexWrapper(ret)
            return ret
        raise AttributeError("Table '%s' has no index '%s'" %
                             (state.get('table_name', ''), attr))

    def __bool__(self):
        return bool(self.obs)

    # Python 2 spelling of the truth-value hook
    __nonzero__ = __bool__

    def __call__(self, table_name):
        """A simple way to assign a name to a table, such as those
        dynamically created by joins and queries.
        @param table_name: name for Table
        @type table_name: string
        @returns: this Table, to allow chaining
        """
        self.table_name = table_name
        return self

    def copy_template(self):
        """Create empty copy of the current table, with copies of all
        index definitions.
        """
        ret = Table(self.table_name)
        for k, v in self._indexes.items():
            ret._indexes[k] = v.copy_template()
        return ret

    def clone(self):
        """Create full copy of the current table, including table contents
        and index definitions.
        """
        ret = self.copy_template()
        ret.insert_many(self.obs)
        return ret

    def create_index(self, attr, unique=False, accept_none=False):
        """Create a new index on a given attribute.
        If C{unique} is True and records are found in the table with duplicate
        attribute values, the index is deleted and C{KeyError} is raised.

        If the table already has an index on the given attribute, then no
        action is taken and no exception is raised.
        @param attr: the attribute to be used for indexed access and joins
        @type attr: string
        @param unique: flag indicating whether the indexed field values are
            expected to be unique across table entries
        @type unique: boolean
        @param accept_none: flag indicating whether None is an acceptable
            value for this attribute
        @type accept_none: boolean
        @raise KeyError: if a unique index finds duplicate values, or finds a
            None/blank value while C{accept_none} is False
        """
        if attr in self._indexes:
            return

        if unique:
            ind = _UniqueObjIndex(attr, accept_none)
        else:
            ind = _ObjIndex(attr)
            # a non-unique index takes any value, None included
            accept_none = True
        self._indexes[attr] = ind

        try:
            for obj in self.obs:
                # Index under the actual value, exactly as insert() does, so
                # that objects already in the table and objects added later
                # are keyed alike; a missing attribute counts as None.
                obval = getattr(obj, attr, None)
                if obval or accept_none:
                    ind[obval] = obj
                else:
                    raise KeyError("None is not an allowed key")
        except KeyError:
            # do not leave a half-built index behind
            del self._indexes[attr]
            raise

    def delete_index(self, attr):
        """Deletes an index from the Table.  Can be used to drop and rebuild an index,
        or to convert a non-unique index to a unique index, or vice versa.
        @param attr: name of an indexed attribute
        @type attr: string
        """
        if attr in self._indexes:
            del self._indexes[attr]

    def insert(self, obj):
        """Insert a new object into this Table.
        @param obj: any Python object
        Objects can be constructed using the defined DataObject type, or they can
        be any Python object that does not use the Python C{__slots__} feature; C{littletable}
        introspect's the object's C{__dict__} or C{_fields} attributes to obtain join and
        index attributes and values.

        If the table contains a unique index, and the record to be inserted would add
        a duplicate value for the indexed attribute, then C{KeyError} is raised, and the
        object is not inserted.

        If the table has no unique indexes, then it is possible to insert duplicate
        objects into the table.

        An object that lacks an indexed attribute is indexed as if the attribute
        were None.
        """
        # Check every unique index before changing any state, so that a
        # rejected object leaves both the table and its indexes untouched.
        for ind in self._indexes.values():
            if not ind.is_unique:
                continue
            key = getattr(obj, ind.attr, None)
            if key is None:
                if not ind.accept_none:
                    raise KeyError("unique key cannot be None or blank for index %s" % ind.attr, obj)
            elif key and key in ind:
                # blank keys are held in the index's None bucket, which may
                # hold any number of objects, so only real keys can clash
                raise KeyError("duplicate unique key value '%s' for index %s" % (key, ind.attr), obj)

        self.obs.append(obj)
        for attr, ind in self._indexes.items():
            ind[getattr(obj, attr, None)] = obj

    def insert_many(self, it):
        """Inserts a collection of objects into the table."""
        for ob in it:
            self.insert(ob)

    def remove(self, ob):
        """Removes an object from the table. If object is not in the table, then
        no action is taken and no exception is raised."""
        # remove from main object list first; if the object is not there,
        # the indexes must not be touched either
        try:
            self.obs.remove(ob)
        except ValueError:
            return

        # remove from indexes
        for ind in self._indexes.values():
            ind.remove(ob)

    def remove_many(self, it):
        """Removes a collection of objects from the table."""
        # materialize first: the collection may be computed lazily from this
        # table's own contents, which are modified as objects are removed
        for ob in list(it):
            self.remove(ob)

    def _query_attr_sort_fn(self, attr_val):
        """Sort key for query criteria: the number of objects an indexed
        criterion would match, or a very large number for a criterion on an
        attribute that has no index.
        """
        attr, v = attr_val
        if attr not in self._indexes:
            return 1e9
        idx = self._indexes[attr]
        if v in idx:
            return len(idx[v])
        return 0

    def query(self, **kwargs):
        """Retrieves matching objects from the table, based on given
        named parameters.  If multiple named parameters are given, then
        only objects that satisfy all of the query criteria will be returned.

        Special kwargs:
         - C{_orderby="attr,..."} - resulting table should sort content objects
           by the C{attr}s given in a comma-separated string; to sort in
           descending order, reference the attribute as C{attr desc}.

        @param **kwargs: attributes for selecting records, given as additional
            named arguments of the form C{attrname="attrvalue"}.
        @return: a new Table containing the matching objects
        """
        # separate meta keys (leading underscore) from the selection criteria
        flags = [(k, v) for k, v in kwargs.items() if k.startswith("_")]
        criteria = [(k, v) for k, v in kwargs.items() if not k.startswith("_")]

        if criteria:
            # order query criteria in ascending order of number of matching items
            # for each individual given attribute; this will minimize the number
            # of filtering records that each subsequent attribute will have to
            # handle
            if len(criteria) > 1 and len(self.obs) > 100:
                criteria.sort(key=self._query_attr_sort_fn)

            ret = self.copy_template()
            first = True
            for k, v in criteria:
                if first:
                    # the first criterion selects from the full table
                    if k in self._indexes:
                        ret.insert_many(self._indexes[k][v])
                    else:
                        ret.insert_many(r for r in self.obs
                                        if hasattr(r, k) and getattr(r, k) == v)
                    first = False
                elif k in ret._indexes:
                    # later criteria narrow down the running result
                    newret = ret.copy_template()
                    newret.insert_many(ret._indexes[k][v])
                    ret = newret
                else:
                    ret.remove_many([o for o in ret.obs
                                     if not hasattr(o, k) or getattr(o, k) != v])
        else:
            ret = self.clone()

        for f, v in flags:
            if f == "_orderby":
                attrs = [s.strip() for s in v.split(',')]
                # default each attribute to ascending; apply the sorts from the
                # least significant attribute to the most, relying on sort
                # stability to produce the combined ordering
                attr_orders = [(a.split() + ['asc'])[:2] for a in attrs][::-1]
                for attr, order in attr_orders:
                    ret.obs.sort(key=lambda ob: getattr(ob, attr),
                                 reverse=(order.lower() == "desc"))

        return ret

    def delete(self, **kwargs):
        """Deletes matching objects from the table, based on given
        named parameters.  If multiple named parameters are given, then
        only objects that satisfy all of the query criteria will be removed.
        @param **kwargs: attributes for selecting records, given as additional
            named arguments of the form C{attrname="attrvalue"}.
        @return: the number of objects removed from the table
        """
        if not kwargs:
            return 0

        affected = self.query(**kwargs)
        self.remove_many(affected)
        return len(affected)

    def where(self, wherefn, maxrecs=0):
        """An alternative to L{query}, using a matching predicate function to
        determine whether a given object matches the query or not.  You must use
        C{where} in place of C{query} if you want to query using inequalities or more
        complex matching criteria than simple C{attribute=value}.
        @param wherefn: a method or lambda that returns a boolean result, as in::

           lambda ob : ob.unitprice > 10

        @type wherefn: callable(object) returning boolean
        @param maxrecs: if only the first 'n' records are needed, then C{where} will
            stop after locating 'n' matching records
        @type maxrecs: int
        @returns: a new Table containing the matching records
        """
        ret = self.copy_template()
        # lazy, so that with maxrecs the scan stops at the n'th match
        matches = (ob for ob in self.obs if wherefn(ob))
        if maxrecs:
            matches = islice(matches, 0, maxrecs)
        ret.insert_many(matches)
        return ret

    def join(self, other, attrlist=None, **kwargs):
        """
        Join the objects of one table with the objects of another, based on the given
        matching attributes in the named arguments.  The attrlist specifies the attributes to
        be copied from the source tables - if omitted, all attributes will be copied.  Entries
        in the attrlist may be single attribute names, or if there are duplicate names in both
        tables, then a C{(table,attributename)} tuple can be given to disambiguate which
        attribute is desired. A C{(table,attributename,alias)} tuple can also be passed, to
        rename an attribute from a source table.

        This method may be called directly, or can be constructed using the L{join_on} method and
        the '+' operator.  Using this syntax, the join is specified using C{table.join_on("xyz")}
        to create a JoinTerm containing both table and joining attribute.  Multiple JoinTerm
        or tables can be added to construct a compound join expression.  When complete, the
        join expression gets executed by calling the resulting join definition,
        using C{join_expression([attrlist])}.

        @param other: other table to join to
        @param attrlist: list of attributes to be copied to the new joined table; if
            none provided, all attributes of both tables will be used (taken from the first
            object in each table)
        @type attrlist: string, or list of strings or C{(table,attribute[,alias])} tuples
            (list may contain both strings and tuples)
        @param **kwargs: attributes to join on, given as additional named arguments
            of the form C{table1attr="table2attr"}, or a dict mapping attribute names.
        @returns: a new Table containing the joined data as new DataObjects
        @raise ValueError: if no join attributes are given, or if either join
            attribute is not indexed
        """
        if not kwargs:
            raise ValueError("join requires the attributes to join on, as thisattr='otherattr'")
        # only the first attribute pairing is used
        thiscol, othercol = list(kwargs.items())[0]

        retname = ("(%s:%s^%s:%s)" %
                   (self.table_name, thiscol, other.table_name, othercol))
        # make sure both tables contain records to join - if not, just return empty list
        if not (self.obs and other.obs):
            return Table(retname)

        if isinstance(attrlist, basestring):
            attrlist = attrlist.split()

        # expand attrlist to full (table, name, alias) tuples
        thisnames = set(_object_attrnames(self.obs[0]))
        othernames = set(_object_attrnames(other.obs[0]))
        fullcols = []
        if attrlist is not None:
            for col in attrlist:
                if isinstance(col, tuple):
                    # assume col contains at least (table, colname), fill in alias if missing
                    # to be same as colname
                    fullcols.append((col + (col[1],))[:3])
                elif col in thisnames:
                    fullcols.append((self, col, col))
                elif col in othernames:
                    fullcols.append((other, col, col))
                # a name found in neither table is skipped
        else:
            fullcols = [(self, n, n) for n in thisnames]
            fullcols += [(other, n, n) for n in othernames]

        thiscols = [c for c in fullcols if c[0] is self]
        othercols = [c for c in fullcols if c[0] is other]

        # Compare against None explicitly: the truth value of an index is
        # its key count, so an index without keys would otherwise be
        # mistaken for a missing index.
        thiscolindex = self._indexes.get(thiscol)
        othercolindex = other._indexes.get(othercol)
        if thiscolindex is None or othercolindex is None:
            raise ValueError("can only join on indexed attributes")

        # use table with fewer keys to drive join
        if len(thiscolindex) < len(othercolindex):
            shortindex, longindex = (thiscolindex, othercolindex)
            swap = False
        else:
            shortindex, longindex = (othercolindex, thiscolindex)
            swap = True

        # find matching rows, always paired as (rows of self, rows of other)
        matchingrows = []
        for key, rows in shortindex.items():
            if key in longindex:
                if swap:
                    matchingrows.append((longindex[key], rows))
                else:
                    matchingrows.append((rows, longindex[key]))

        joinrows = []
        for thisrows, otherrows in matchingrows:
            for trow, orow in product(thisrows, otherrows):
                retobj = DataObject()
                # DataObject attributes are write-once, so on a name clash
                # the value taken from this table wins
                for _, c, a in thiscols:
                    setattr(retobj, a, getattr(trow, c))
                for _, c, a in othercols:
                    setattr(retobj, a, getattr(orow, c))
                joinrows.append(retobj)

        ret = Table(retname)
        for tbl, collist in zip([self, other], [thiscols, othercols]):
            for _, c, a in collist:
                if c in tbl._indexes:
                    ret.create_index(a)  # no unique indexes in join results
        ret.insert_many(joinrows)
        return ret

    def join_on(self, attr):
        """Creates a JoinTerm in preparation for joining with another table, to
        indicate what attribute should be used in the join.  Only indexed attributes
        may be used in a join.
        @param attr: attribute name to join from this table (may be different
            from the attribute name in the table being joined to)
        @type attr: string
        @returns: L{JoinTerm}
        @raise ValueError: if C{attr} is not indexed"""
        if attr not in self._indexes:
            raise ValueError("can only join on indexed attributes")
        return JoinTerm(self, attr)

    def pivot(self, attrlist):
        """Pivots the data using the given attributes, returning a L{PivotTable}.
        @param attrlist: list of attributes to be used to construct the pivot table
        @type attrlist: list of strings, or string of space-delimited attribute names
        @raise ValueError: if any of the attributes is not indexed
        """
        if isinstance(attrlist, basestring):
            attrlist = attrlist.split()
        if not all(a in self._indexes for a in attrlist):
            raise ValueError("pivot can only be called using indexed attributes")
        return PivotTable(self, [], attrlist)

    def csv_import(self, csv_source, transforms=None):
        """Imports the contents of a CSV-formatted file into this table.
        @param csv_source: CSV file - if a string is given, the file with that name will be
            opened, read, and closed; if a file object is given, then that object
            will be read as-is, and left for the caller to be closed.
        @type csv_source: string or file
        @param transforms: dict of functions by attribute name; if given, each
            attribute will be transformed using the corresponding transform; if there is no
            matching transform, the attribute will be read as a string (default); the
            transform function can also be defined as a (function, default-value) tuple; if
            there is an Exception raised by the transform function, then the attribute will
            be set to the given default value
        @type transforms: dict (optional)
        """
        # normalize every transform to an (attribute, function, default) triple
        xforms = []
        for attr, fn in (transforms or {}).items():
            default = None
            if isinstance(fn, tuple):
                fn, default = fn
            xforms.append((attr, fn, default))

        close_on_exit = False
        if isinstance(csv_source, basestring):
            csv_source = open(csv_source)
            close_on_exit = True
        try:
            for row in csv.DictReader(csv_source):
                # Convert the values before the object is built and inserted,
                # so that indexes are keyed on the converted values and the
                # objects already in the table are left alone.
                for attr, fn, default in xforms:
                    try:
                        row[attr] = fn(row[attr])
                    except Exception:
                        row[attr] = default
                self.insert(DataObject(**row))
        finally:
            if close_on_exit:
                csv_source.close()

    def csv_export(self, csv_dest, fieldnames=None):
        """Exports the contents of the table to a CSV-formatted file.
        @param csv_dest: CSV file - if a string is given, the file with that name will be
            opened, written, and closed; if a file object is given, then that object
            will be written as-is, and left for the caller to be closed.
        @type csv_dest: string or file
        @param fieldnames: attribute names to be exported; can be given as a single
            string with space-delimited names, or as a list of attribute names;
            if omitted, the attributes of the first object in the table are used
        """
        close_on_exit = False
        if isinstance(csv_dest, basestring):
            # the csv module wants a binary file on Python 2, and a text
            # file with newline translation turned off on Python 3
            if sys.version_info[0] >= 3:
                csv_dest = open(csv_dest, 'w', newline='')
            else:
                csv_dest = open(csv_dest, 'wb')
            close_on_exit = True
        try:
            if fieldnames is None:
                # an empty table has no first object to take the names from
                if self.obs:
                    fieldnames = list(_object_attrnames(self.obs[0]))
                else:
                    fieldnames = []
            if isinstance(fieldnames, basestring):
                fieldnames = fieldnames.split()

            csv_dest.write(','.join(fieldnames) + '\n')
            csvout = csv.DictWriter(csv_dest, fieldnames, extrasaction='ignore')
            for o in self.obs:
                if hasattr(o, "__dict__"):
                    row = o.__dict__
                else:
                    # namedtuple or slotted object: build the row by hand
                    row = dict((fld, getattr(o, fld)) for fld in fieldnames)
                csvout.writerow(row)
        finally:
            if close_on_exit:
                csv_dest.close()

    def compute(self, attrname, fn, default=None):
        """Computes a new attribute for each object in table, or replaces an
        existing attribute in each record with a computed value
        @param attrname: attribute to compute for each object
        @type attrname: string
        @param fn: function used to compute new attribute value, based on
            other values in the object
        @type fn: function(obj) returns value
        @param default: value to use if an exception is raised while trying
            to evaluate fn
        """
        # NOTE: the table's indexes are not updated here; if attrname is
        # indexed, drop and re-create that index after calling compute.
        for rec in self:
            try:
                val = fn(rec)
            except Exception:
                val = default
            if isinstance(rec, DataObject):
                # go around DataObject's write-once __setattr__ so that an
                # existing attribute can be replaced
                object.__setattr__(rec, attrname, val)
            else:
                setattr(rec, attrname, val)
776 777
778 -class PivotTable(Table):
779 """Enhanced Table containing pivot results from calling table.pivot(). 780 """
781 - def __init__(self, parent, attr_val_path, attrlist):
782 """PivotTable initializer - do not create these directly, use 783 L{Table.pivot}. 784 """ 785 super(PivotTable,self).__init__() 786 self._attr_path = attr_val_path[:] 787 self._pivot_attrs = attrlist[:] 788 self._subtable_dict = {} 789 790 for k,v in parent._indexes.items(): 791 self._indexes[k] = v.copy_template() 792 if not attr_val_path: 793 self.insert_many(parent.obs) 794 else: 795 attr,val = attr_val_path[-1] 796 self.insert_many(parent.query(**{attr:val})) 797 parent._subtable_dict[val] = self 798 799 if len(attrlist) > 0: 800 this_attr = attrlist[0] 801 sub_attrlist = attrlist[1:] 802 ind = parent._indexes[this_attr] 803 self.subtables = [ PivotTable(self, 804 attr_val_path + [(this_attr,k)], 805 sub_attrlist) for k in sorted(ind.keys()) ] 806 else: 807 self.subtables = []
808
809 - def __getitem__(self,val):
810 if self._subtable_dict: 811 return self._subtable_dict[val] 812 else: 813 return super(PivotTable,self).__getitem__(val)
814
815 - def keys(self):
816 return sorted(self._subtable_dict.keys())
817
818 - def items(self):
819 return sorted(self._subtable_dict.items())
820
821 - def values(self):
822 return self._subtable_dict.values()
823
824 - def pivot_key(self):
825 """Return the set of attribute-value pairs that define the contents of this 826 table within the original source table. 827 """ 828 return self._attr_path
829
830 - def pivot_key_str(self):
831 """Return the pivot_key as a displayable string. 832 """ 833 return '/'.join("%s:%s" % (attr,key) for attr,key in self._attr_path)
834
835 - def has_subtables(self):
836 """Return whether this table has further subtables. 837 """ 838 return bool(self.subtables)
839
840 - def dump(self, out=sys.stdout, row_fn=repr, maxrecs=-1, indent=0):
841 """Dump out the contents of this table in a nested listing. 842 @param out: output stream to write to 843 @param row_fn: function to call to display individual rows 844 @param maxrecs: number of records to show at deepest level of pivot (-1=show all) 845 @param indent: current nesting level 846 """ 847 NL = '\n' 848 if indent: 849 out.write(" "*indent + self.pivot_key_str()) 850 else: 851 out.write("Pivot: %s" % ','.join(self._pivot_attrs)) 852 out.write(NL) 853 if self.has_subtables(): 854 for sub in self.subtables: 855 if sub: 856 sub.dump(out, row_fn, maxrecs, indent+1) 857 else: 858 if maxrecs >= 0: 859 showslice = slice(0,maxrecs) 860 else: 861 showslice = slice(None,None) 862 for r in self.obs[showslice]: 863 out.write(" "*(indent+1) + row_fn(r) + NL) 864 out.flush()
865
def dump_counts(self, out=sys.stdout):
    """Dump out the summary counts of entries in this pivot table as a tabular listing.
       A 1-attribute pivot is listed as one "key count" line per subtable.  A
       2-attribute pivot is listed as a grid with one row per first-attribute
       value, one column per second-attribute value, and row and column totals.
       A pivot with no subtables produces just the heading (and, for the
       2-attribute form, an empty grid with a zero total).
       @param out: output stream to write to
       @raise ValueError: if the pivot was not made on exactly 1 or 2 attributes
    """
    depth = len(self._pivot_attrs)
    if depth not in (1, 2):
        raise ValueError("can only dump summary counts for 1 or 2-attribute pivots")

    out.write("Pivot: %s\n" % ','.join(self._pivot_attrs))
    # 'or [0]' keeps max() from failing when the pivot has no keys at all
    maxkeylen = max([len(str(k)) for k in self.keys()] or [0])

    if depth == 1:
        for sub in self.subtables:
            out.write("%-*.*s " % (maxkeylen,maxkeylen,sub._attr_path[-1][1]))
            out.write("%7d\n" % len(sub))
        return

    # 2-attribute pivot: column headings are taken from the first subtable
    maxkeylen = max(maxkeylen,5)
    if self.subtables:
        colkeys = self.subtables[0].keys()
    else:
        colkeys = []
    maxvallen = max([len(str(k)) for k in colkeys] + [7])
    keytally = dict((k,0) for k in colkeys)
    out.write("%*s " % (maxkeylen,''))
    out.write(' '.join("%*.*s" % (maxvallen,maxvallen,k) for k in colkeys))
    out.write(' Total\n')
    for sub in self.subtables:
        out.write("%-*.*s " % (maxkeylen,maxkeylen,sub._attr_path[-1][1]))
        for ssub in sub.subtables:
            out.write("%*d " % (maxvallen,len(ssub)))
            # accumulate the per-column total as each cell is written
            keytally[ssub._attr_path[-1][1]] += len(ssub)
        out.write("%7d\n" % len(sub))
    out.write('%-*.*s ' % (maxkeylen,maxkeylen,"Total"))
    out.write(' '.join("%*d" % (maxvallen,tally) for k,tally in sorted(keytally.items())))
    out.write(" %7d\n" % sum(keytally.values()))
895
def summary_counts(self,fn=None,col=None):
    """Return the summary of this pivot table as a new Table.
       The returned table is indexed on each pivot attribute and holds one
       record per deepest-level subtable.  Each record carries that subtable's
       pivot attribute values plus either a C{Count} attribute (when C{fn} is
       None) or an attribute named after C{fn} holding the reduction of
       C{fn} over the C{col} values of the subtable's records.
       @param fn: optional two-argument function used to reduce the C{col}
           values of each subtable; if None, record counts are reported
       @param col: key used to fetch a value from each record (as C{rec[col]})
           when C{fn} is given
       @raise ValueError: if the pivot was not made on 1, 2 or 3 attributes
    """
    # reduce is a builtin only in Python 2; functools has it in both 2.6+ and 3
    from functools import reduce

    depth = len(self._pivot_attrs)
    if not 1 <= depth <= 3:
        raise ValueError("can only dump summary counts for 1, 2, or 3-attribute pivots")

    ret = Table()
    for attr in self._pivot_attrs:
        ret.create_index(attr)

    # walk down one subtable level per pivot attribute to reach the leaf tables
    leaves = [self]
    for _ in range(depth):
        leaves = [sub for tbl in leaves for sub in tbl.subtables]

    for leaf in leaves:
        attrdict = dict(leaf._attr_path)
        if fn is None:
            attrdict['Count'] = len(leaf)
        else:
            vals = [rec[col] for rec in leaf]
            # reduce() with no initial value fails on an empty sequence, so a
            # leaf with no records reports None instead
            if vals:
                attrdict[fn.__name__] = reduce(fn, vals)
            else:
                attrdict[fn.__name__] = None
        ret.insert(DataObject(**attrdict))
    return ret
932
class JoinTerm(object):
    """Temporary object created while composing a join across tables using
       L{Table.join_on} and '+' addition. JoinTerm's are usually created by
       calling join_on on a Table object, as in::

         customers.join_on("id") + orders.join_on("custid")

       This join expression would set up the join relationship
       equivalent to::

         customers.join(orders, id="custid")

       If tables are being joined on attributes that have the same name in
       both tables, then a join expression could be created by adding a
       JoinTerm of one table directly to the other table::

         customers.join_on("custid") + orders

       Once the join expression is composed, the actual join is performed
       using function call notation::

         customerorders = customers.join_on("custid") + orders
         for custord in customerorders():
             print custord

       When calling the join expression, you can optionally specify a
       list of attributes as defined in L{Table.join}.
    """
    def __init__(self, sourceTable, joinfield):
        """Create a join term for one side of a join.
           @param sourceTable: the table this term reads from
           @param joinfield: name of the attribute in C{sourceTable} to join on
        """
        self.sourcetable = sourceTable
        self.joinfield = joinfield
        # the other side of the join; filled in by '+'
        self.jointo = None

    def __add__(self, other):
        """Combine this term with a Table or another JoinTerm.
           A Table operand is first converted to a JoinTerm on this term's
           join field.  If this term has no join target yet, C{other} (or the
           result of evaluating it, if it already has a target of its own)
           becomes the target and this term is returned.  Otherwise this term
           is evaluated and the result is added to C{other}.
           @raise ValueError: if C{other} is neither a Table nor a JoinTerm
        """
        if isinstance(other, Table):
            other = other.join_on(self.joinfield)
        if isinstance(other, JoinTerm):
            if self.jointo is None:
                if other.jointo is None:
                    self.jointo = other
                else:
                    self.jointo = other()
                return self
            else:
                if other.jointo is None:
                    return self() + other
                else:
                    return self() + other()
        raise ValueError("cannot add object of type '%s' to JoinTerm" % other.__class__.__name__)

    def __radd__(self, other):
        """Support C{table + jointerm} by joining the table on this term's field.
           @raise ValueError: if C{other} is not a Table
        """
        if isinstance(other, Table):
            return other.join_on(self.joinfield) + self
        raise ValueError("cannot add object of type '%s' to JoinTerm" % other.__class__.__name__)

    def __call__(self, attrs=None):
        """Perform the join composed so far.
           @param attrs: optional list of attributes, passed through to the
               source table's C{join} method
           @return: the result of joining the source table to the join target,
               or the result of the source table's C{query()} if no join
               target has been set
        """
        # Compare against None explicitly: the target may be a Table produced
        # by an earlier join, and an empty one must still be joined against
        # rather than being mistaken for "no target".
        if self.jointo is not None:
            other = self.jointo
            if isinstance(other, Table):
                other = other.join_on(self.joinfield)
            ret = self.sourcetable.join(other.sourcetable, attrs,
                                        **{self.joinfield : other.joinfield})
            return ret
        else:
            return self.sourcetable.query()

    def join_on(self, col):
        """Evaluate this join expression and start a new join term on the
           result, using C{col} as the join field.
        """
        return self().join_on(col)
# Demonstration script, run only when the module is executed directly: builds
# two small tables and exercises querying, unique-index enforcement, join
# expressions and pivot count dumps, printing the results.
if __name__ == "__main__":

    # import json in Python 2 or 3 compatible forms
    # (simplejson is preferred if installed; the two branches pass different
    # indent arguments)
    from functools import partial
    try:
        import simplejson as json
        json_dumps = partial(json.dumps, indent=' ')
    except ImportError:
        import json
        json_dumps = partial(json.dumps, indent=2)


    # one station per line, as colon-separated city:state:zip:stn
    rawdata = """\
Phoenix:AZ:85001:KPHX
Phoenix:AZ:85001:KPHY
Phoenix:AZ:85001:KPHA
Dallas:TX:75201:KDFW""".splitlines()

    # load miniDB
    stations = Table()
    #~ stations.create_index("city")
    stations.create_index("stn", unique=True)

    # build one DataObject per raw line, setting each field as an attribute
    fields = "city state zip stn".split()
    for d in rawdata:
        ob = DataObject()
        for k,v in zip(fields, d.split(':')):
            setattr(ob,k,v.strip())
        stations.insert(ob)

    # perform some queries and deletes
    # (each query prints its arguments, the result count, then the records)
    for queryargs in [
        dict(city="Phoenix"),
        dict(city="Phoenix", stn="KPHX"),
        dict(stn="KPHA", city="Phoenix"),
        dict(state="TX"),
        dict(city="New York"),
        dict(city="Phoenix", _orderby="stn"),
        dict(city="Phoenix", _orderbydesc="stn"),
        ]:
        print queryargs,
        result = stations.query(**queryargs)
        print len(result)
        for r in result: print r
        print
    #~ print stations.delete(city="Phoenix")
    #~ print stations.delete(city="Boston")
    print list(stations.query())
    print

    # second table, keyed by the same station id, giving each station's band
    amfm = Table()
    amfm.create_index("stn", unique=True)
    amfm.insert(DataObject(stn="KPHY", band="AM"))
    amfm.insert(DataObject(stn="KPHX", band="FM"))
    amfm.insert(DataObject(stn="KPHA", band="FM"))
    amfm.insert(DataObject(stn="KDFW", band="FM"))

    # KPHA was already inserted above; with the unique index on stn this
    # insert is expected to be rejected with KeyError
    try:
        amfm.insert(DataObject(stn="KPHA", band="AM"))
    except KeyError:
        print "duplicate key not allowed"

    # join the two tables on stn with an explicit attribute list, then order
    # the joined records by the AMFM attribute
    # NOTE(review): the tuples look like (table, attribute[, alias]) - confirm
    # against Table.join
    print
    for rec in (stations.join_on("stn") + amfm.join_on("stn")
                    )(["stn", "city", (amfm,"band","AMFM"),
                       (stations,"state","st")]).query(_orderby="AMFM"):
        print repr(rec)

    # same join, with no third element given for the band attribute; records
    # are printed as JSON
    print
    for rec in (stations.join_on("stn") + amfm.join_on("stn")
                    )(["stn", "city", (amfm,"band"), (stations,"state","st")]):
        print json_dumps(rec.__dict__)

    # same join with no attribute list given
    print
    for rec in (stations.join_on("stn") + amfm.join_on("stn"))():
        print json_dumps(rec.__dict__)

    # single-attribute pivot: an index on the attribute is created before pivoting
    print
    stations.create_index("state")
    pivot = stations.pivot("state")
    pivot.dump_counts()

    # two-attribute pivot over the joined table; here a Table is added directly
    # to a JoinTerm, so amfm is joined on the same field name ("stn")
    print
    amfm.create_index("band")
    pivot = (stations.join_on("stn") + amfm)().pivot("state band")
    pivot.dump_counts()