Module record
[hide private]
[frames] | no frames]

Source Code for Module record

   1    
   2  from baseObjects import Record 
   3  from c3errors import C3Exception 
   4  import types, utils, os, re 
   5  from Ft.Xml.Domlette import implementation, Print 
   6  from cStringIO import StringIO 
   7  from xml.sax.saxutils import escape 
   8  from PyZ3950.zmarc_relaxed import MARC 
   9  from xml.sax import ContentHandler 
  10   
  11  from utils import Context, flattenTexts 
  12   
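  13  # Serialized SAX event formats (the leading digit is the event type):
  14  #   1 <name> <attrHash> <parent> <predicate> <end>    Element start
  15  #   4 <ns>, <name>, <qname>, <attrHash> <parent> <predicate> <end>
  16  #                                                      Namespaced element start
  17  #   2 <name> <startLine>                               End element
  18  #   5 <ns>, <name>, <qname> <startLine>                End namespaced element
  19  #   3 <text>                                           Characters
  20  #   6 <prefix>, <uri>                                  Namespace prefix mapping
  21  #   9 <element hash>                                   Hash of element locations
  22  # <parent> and <startLine> are indexes into the event list (-1 = no parent);
  23  # <end> is the index of the matching end event; <predicate> is the 1-based
  24  # position of the element among same-named siblings.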
  25   
  26  # Split to separate object to allow for DOM->SAX direct conversion 
  27  # by throwing events from DOM tree to handler. 
class SaxContentHandler(ContentHandler):
    """Serialize SAX parse events into a flat list of event strings.

    Every callback appends one string to ``currentText``; the index of a
    string in that list is its "line" number.  Element start lines are
    patched with the line number of their end event once the element is
    closed.  ``elementHash`` maps element names (and, for attributes listed
    in ``hashAttributesNames``, strings such as ``name[@attr='value']``) to
    lists of ``[startLine, endLine]`` pairs.
    """

    # Class-level defaults.  The mutable ones are rebound per instance by
    # __init__/reinit so that handlers never share state.
    currentText = []
    currentPath = []
    pathLines = []
    currentLine = -1
    recordSize = 0
    elementHash = {}
    elementIndexes = []
    namespaces = []
    hashAttributesNames = {}
    hashAttributes = []
    stripWS = 0
    saveElementIndexes = 1

    def __init__(self):
        ContentHandler.__init__(self)
        self.saveElementIndexes = 1
        # element name (or '*') -> attribute names whose values are hashed
        self.hashAttributesNames = {}
        self.hashAttributes = []
        self.stripWS = 0
        self.reinit()

    def reinit(self):
        """Reset all per-document state so the handler can be reused."""
        self.currentText = []
        self.currentPath = []
        self.pathLines = []
        self.currentLine = -1
        self.recordSize = 0
        self.elementHash = {}
        self.elementIndexes = []
        self.namespaces = []
        # Pending (startLine, keys) entries only make sense within a single
        # parse; drop leftovers from a parse that was aborted mid-element.
        self.hashAttributes = []

    # -- internal helpers ---------------------------------------------------

    def _openElement(self):
        """Advance to a new line, push it on the path; return the parent line.

        Returns -1 when the new element has no open ancestor.
        """
        self.currentLine += 1
        self.pathLines.append(self.currentLine)
        if len(self.pathLines) >= 2:
            return self.pathLines[-2]
        return -1

    def _nextPredicate(self, name):
        """Return the 1-based position of *name* among same-named siblings.

        ``elementIndexes`` is a stack with one counter dict per open level;
        a fresh (empty) level is pushed for the children of this element.
        """
        if self.elementIndexes:
            siblings = self.elementIndexes[-1]
            npred = siblings.get(name, 0) + 1
            siblings[name] = npred
        else:
            # Nothing open yet: this is the first element seen.
            npred = 1
            self.elementIndexes = [{name: npred}]
        self.elementIndexes.append({})
        return npred

    def _queueHashAttributes(self, name, attrHash):
        """Remember extra elementHash keys for the element just opened.

        Attributes configured under the element's own name and under '*'
        in ``hashAttributesNames`` are turned into ``name[@a='v']`` /
        ``*[@a='v']`` keys; they are added to ``elementHash`` when the
        element is closed.
        """
        saveAttrs = []
        for label in (name, '*'):
            for a in self.hashAttributesNames.get(label, ()):
                if a in attrHash:
                    saveAttrs.append("%s[@%s='%s']" % (label, a, attrHash[a]))
        if saveAttrs:
            self.hashAttributes.append((self.currentLine, saveAttrs))

    def _closeElement(self, name, start):
        """Patch the start line with the end line and record the span."""
        self.currentText[start] = "%s %d" % (self.currentText[start],
                                             self.currentLine)
        self.elementIndexes.pop()
        self.elementHash.setdefault(name, []).append([start, self.currentLine])
        if self.hashAttributes and self.hashAttributes[-1][0] == start:
            for sa in self.hashAttributes.pop()[1]:
                self.elementHash.setdefault(sa, []).append(
                    [start, self.currentLine])

    # -- SAX callbacks ------------------------------------------------------

    def startPrefixMapping(self, pfx, uri):
        """Emit a type 6 line: ``6 <prefix>, <uri>`` (reprs)."""
        self.currentLine += 1
        if pfx is None:
            pfx = ''
        self.currentText.append("6 %r, %r" % (pfx, uri))

    # We want to fwd elems to NS elem handlers with default NS?
    def startElement(self, name, attrs):
        """Emit a type 1 line: ``1 name {attrs} parent npred``.

        Attribute values are XML-escaped before being stored.
        """
        parent = self._openElement()
        attrHash = {}
        if (attrs):
            for k in attrs.keys():
                attrHash[k] = escape(attrs[k])
        npred = self._nextPredicate(name)
        self.currentText.append("1 %s %s %d %d"
                                % (name, repr(attrHash), parent, npred))
        self._queueHashAttributes(name, attrHash)

    def endElement(self, name):
        """Emit a type 2 line: ``2 name startLine``."""
        self.currentLine += 1
        start = self.pathLines.pop()
        self.currentText.append("2 %s %d" % (name, start))
        self._closeElement(name, start)

    def startElementNS(self, name, qname, attrs):
        """Emit a type 4 line: ``4 ns, name, qname, {attrs} parent npred``.

        *name* is a ``(namespaceURI, localName)`` pair.  Unlike
        startElement, attribute values are stored as given (not escaped).
        """
        parent = self._openElement()
        # Convert from the SAX attributes object to a plain dict
        attrHash = {}
        if (attrs):
            for k in attrs.keys():
                attrHash[k] = attrs[k]
        simpleName = name[1]
        npred = self._nextPredicate(simpleName)
        self.currentText.append("4 %r, %r, %r, %r %d %d"
                                % (name[0], simpleName, qname, attrHash,
                                   parent, npred))
        self._queueHashAttributes(simpleName, attrHash)

    def endElementNS(self, name, qname):
        """Emit a type 5 line: ``5 ns, name, qname startLine``."""
        self.currentLine += 1
        start = self.pathLines.pop()
        self.currentText.append("5 %r, %r, %r %d"
                                % (name[0], name[1], qname, start))
        self._closeElement(name[1], start)

    def characters(self, text, start=0, length=-1):
        """Emit a type 3 line and add the word count to ``recordSize``.

        Whitespace-only text is dropped when ``stripWS`` is set.  When the
        previous event was also text, a separating space is inserted so
        adjacent chunks do not run together.
        """
        if self.stripWS and text.isspace():
            return
        # Guard against text arriving before any other event; indexing an
        # empty event list used to raise IndexError here.
        if self.currentText:
            prev = self.currentText[-1]
        else:
            prev = ""
        self.currentLine += 1
        if (prev and len(text) != 1 and len(prev) != 3 and prev[0] == "3"
                and prev[-1] not in [' ', '-']):
            # Adjacent lines of text, ensure spaces
            text = ' ' + text
        self.currentText.append("3 %s" % (text))
        self.recordSize += len(text.split())

    def ignorableWhitespace(self, ws):
        # ... ignore! :D
        pass

    def processingInstruction(self, target, data):
        pass

    def skippedEntity(self, name):
        pass
226
class SaxToDomHandler:
    """Build a Domlette DOM document from SAX-style callbacks.

    Call initState() before replaying each document, feed the events, then
    fetch the result with getRootNode().
    """

    nodeStack = []
    document = None
    currText = ""

    def __init__(self):
        # Give every instance its own state.  Without this, instances shared
        # the class-level nodeStack list and had no `top` attribute until
        # initState() was called explicitly.
        self.initState()

    def initState(self):
        """Forget any previous document and open-element stack."""
        self.nodeStack = []
        self.document = None
        self.top = None

    def startElement(self, name, attribs=None):
        """Create an element with no namespace and make it the current node.

        The first element seen creates the document itself and becomes its
        root; *attribs* maps attribute names to values.
        """
        if attribs is None:
            attribs = {}
        if (not self.document):
            self.document = implementation.createDocument(None, name, None)
            elem = self.document.childNodes[0]
        else:
            elem = self.document.createElementNS(None, name)
        for a in attribs:
            elem.setAttributeNS(None, a, attribs[a])
        if (self.nodeStack):
            self.nodeStack[-1].appendChild(elem)
        else:
            self.document.appendChild(elem)
        self.nodeStack.append(elem)

    def endElement(self, foo):
        """Close the current element; the name argument is ignored."""
        self.nodeStack.pop()

    def characters(self, text, zero=0, length=0):
        """Append a text node to the current element.

        Text arriving while no element is open is dropped, and
        whitespace-only text is collapsed to a single space.
        """
        if (self.nodeStack):
            if (text.isspace()):
                text = " "
            # Is this escape necessary?
            # NOTE(review): DOM text nodes normally hold unescaped text, so
            # this may double-escape on serialization -- confirm against the
            # producers of the event list before changing it.
            text = escape(text)
            d = self.document.createTextNode(text)
            self.nodeStack[-1].appendChild(d)

    def startElementNS(self, name, qname, attribs):
        """Namespaced variant of startElement.

        *name* is a ``(namespaceURI, localName)`` pair and the keys of
        *attribs* are pairs of the same shape; *qname* is not used.
        """
        if (not self.document):
            self.document = implementation.createDocument(name[0], name[1], None)
            elem = self.document.childNodes[0]
        else:
            elem = self.document.createElementNS(name[0], name[1])

        for a in attribs:
            elem.setAttributeNS(a[0], a[1], attribs[a])
        if (self.nodeStack):
            self.nodeStack[-1].appendChild(elem)
        else:
            self.document.appendChild(elem)
        self.nodeStack.append(elem)

    def endElementNS(self, name, qname):
        """Close the current element; both arguments are ignored."""
        self.nodeStack.pop()

    def startPrefixMapping(self, pref, uri):
        # Prefix mappings are not needed to build the DOM.
        pass

    def getRootNode(self):
        """Return the document built so far (None before the first element)."""
        return self.document


# Shared module-level handler instance.
s2dhandler = SaxToDomHandler()
class SaxToXmlHandler:
    """Serialize SAX-style callbacks back into an XML string.

    Call initState() before replaying each document, feed the events, then
    fetch the text with get_raw().
    """

    xml = []
    currNs = 0
    newNamespaces = {}

    def __init__(self):
        # Create per-instance state.  `namespaces` used to exist only after
        # an explicit initState() call, so a fresh handler failed on
        # startPrefixMapping.
        self.initState()

    def initState(self):
        """Reset the output buffer and all namespace bookkeeping."""
        self.xml = []
        # namespace URI -> prefix, for every namespace seen so far
        self.namespaces = {}
        # counter used to invent prefixes ("ns1", "ns2", ...)
        self.currNs = 0
        # prefix -> URI, still to be declared on the next namespaced start tag
        self.newNamespaces = {}

    def startPrefixMapping(self, pref, uri):
        """Register a prefix; it is declared on the next namespaced start tag."""
        self.namespaces[uri] = pref
        self.newNamespaces[pref] = uri

    def startElement(self, name, attribs=None):
        """Write a start tag for a non-namespaced element.

        Only double quotes in attribute values are escaped here: the values
        recorded by SaxContentHandler.startElement already have ``&``, ``<``
        and ``>`` escaped, so escaping those again would double-escape.
        """
        if attribs is None:
            attribs = {}
        attrs = []
        for a in attribs:
            value = ("%s" % (attribs[a],)).replace('"', '&quot;')
            attrs.append('%s="%s"' % (a, value))
        attribtxt = ' '.join(attrs)
        if (attribtxt):
            attribtxt = " " + attribtxt
        self.xml.append("<%s%s>" % (name, attribtxt))

    def endElement(self, name):
        """Write an end tag for a non-namespaced element."""
        self.xml.append("</%s>" % (name))

    def _getPrefix(self, ns):
        """Return the prefix for namespace URI *ns*, inventing one if needed.

        An empty or None namespace yields "" (no prefix).
        """
        if (not ns):
            return ""
        pref = self.namespaces.get(ns, None)
        if pref is None:
            self.currNs += 1
            pref = "ns%d" % (self.currNs)
            self.namespaces[ns] = pref
            self.newNamespaces[pref] = ns
        return pref

    def _escapeAttr(self, value):
        """Escape a raw value for use inside a double-quoted attribute."""
        return escape("%s" % (value,), {'"': '&quot;'})

    def startElementNS(self, n, qn=None, attrs=None):
        """Write a start tag for a namespaced element.

        *n* is a ``(namespaceURI, localName)`` pair and the keys of *attrs*
        are pairs of the same shape.  Pending namespace declarations are
        written after the attributes.  Attribute values and namespace URIs
        are escaped, since the namespaced events carry them unescaped.
        """
        if attrs is None:
            attrs = {}
        pref = self._getPrefix(n[0])
        if (pref):
            name = "%s:%s" % (pref, n[1])
        else:
            name = n[1]
        attrlist = []
        for ns, aname in attrs:
            p2 = self._getPrefix(ns)
            if (p2):
                nsaname = "%s:%s" % (p2, aname)
            else:
                nsaname = aname
            attrlist.append('%s="%s"'
                            % (nsaname, self._escapeAttr(attrs[(ns, aname)])))
        for x in self.newNamespaces.items():
            if (x[0]):
                attrlist.append('xmlns:%s="%s"'
                                % (x[0], self._escapeAttr(x[1])))
            else:
                attrlist.append('xmlns="%s"' % (self._escapeAttr(x[1])))
        self.newNamespaces = {}
        attribtxt = ' '.join(attrlist)
        if (attribtxt):
            attribtxt = " " + attribtxt
        self.xml.append("<%s%s>" % (name, attribtxt))

    def endElementNS(self, n, qn=None):
        """Write an end tag for a namespaced element."""
        pref = self._getPrefix(n[0])
        if (pref):
            name = "%s:%s" % (pref, n[1])
        else:
            name = n[1]
        self.xml.append("</%s>" % (name))

    def characters(self, text, zero=0, length=0):
        """Write escaped character data."""
        text = escape(text)
        self.xml.append(text)

    def get_raw(self):
        """Return everything written so far as a single string."""
        return ''.join(self.xml)


# Shared module-level handler instance.
s2xhandler = SaxToXmlHandler()
class NumericPredicateException(C3Exception):
    """C3Exception subclass for numeric XPath predicate problems.

    NOTE(review): nothing in the code visible here raises it; the purpose
    is inferred from the name -- confirm against callers.
    """
class DomRecord(Record):
    """Base class for records that wrap an already-parsed DOM tree.

    Subclasses supply ``_walkTop``/``_walk``, which replay the tree as SAX
    events into ``self.handler`` while get_sax() is running.
    """

    context = None
    size = 0

    def __init__(self, domNode, xml="", docid=None):
        self.id = docid
        self.xml = xml
        self.dom = domNode
        self.context = None
        self.parent = ('', '', -1)
        # Size is the number of whitespace-separated words in the text.
        words = flattenTexts(domNode).split()
        self.size = len(words)

    def _walk(self, node):
        # Overridden by subclasses that know how to traverse their DOM.
        pass

    def get_sax(self):
        """Return the record as a list of SAX event strings, built lazily.

        The list is cached on ``self.sax`` and ends with a type 9 line
        holding the repr of the handler's element hash.
        """
        if self.sax:
            return self.sax
        # XXX Turn DOM into SAX and cache
        self.handler = SaxContentHandler()
        for child in self.dom.childNodes:
            self._walkTop(child)
        self.sax = self.handler.currentText
        self.sax.append("9 %r" % self.handler.elementHash)
        self.handler = None
        return self.sax

    def get_dom(self):
        """Return the wrapped DOM node."""
        return self.dom
class MinidomRecord(DomRecord):
    """DomRecord whose tree is walked through the minidom-style node API."""

    useNamespace = 1

    def get_xml(self):
        """Return the XML text, serializing the DOM on first use."""
        if not self.xml:
            self.xml = self.dom.toxml()
        return self.xml

    def _walkTop(self, node):
        """Start the walk at a top-level node.

        Only element nodes are processed.  Whether namespaced events are
        generated is decided by whether this element has a namespace URI.
        """
        # top level node
        if node.nodeType != utils.elementType:
            return
        self.namespaces = node.namespaceURI != None
        self._walk(node)

    def _walk(self, node):
        """Recursively replay *node* as SAX events on ``self.handler``."""
        if (node.nodeType == utils.elementType):
            localName = node.localName
            nsUri = node.namespaceURI
            useNs = self.namespaces
            attrHash = {}
            attributes = node.attributes
            for idx in range(attributes.length):
                attr = attributes.item(idx)
                if not useNs:
                    attrHash[attr.localName] = attr.value
                elif attr.namespaceURI == 'http://www.w3.org/2000/xmlns/':
                    # Namespace declarations become prefix-mapping events
                    # instead of attributes.
                    self.handler.startPrefixMapping(attr.localName, attr.value)
                else:
                    attrHash[(attr.namespaceURI, attr.localName)] = attr.value

            if useNs:
                self.handler.startElementNS((nsUri, localName), None, attrHash)
            else:
                self.handler.startElement(localName, attrHash)

            for child in node.childNodes:
                self._walk(child)

            if self.namespaces:
                self.handler.endElementNS((node.namespaceURI, node.localName), None)
            else:
                self.handler.endElement(node.localName)
        elif node.nodeType == utils.textType:
            self.handler.characters(node.data)

    def process_xpath(self, tuple):
        # XXX Does PyXML provide XPath yet?
        raise NotImplementedError
class FtDomRecord(DomRecord):
    """DomRecord backed by a 4Suite (Ft) Domlette tree."""

    def get_xml(self):
        """Return the XML text, printing the DOM to a buffer on first use."""
        if (self.xml):
            return self.xml
        stream = StringIO()
        Print(self.dom, stream)
        stream.seek(0)
        self.xml = stream.read()
        return self.xml

    def process_xpath(self, tuple):
        """Evaluate a compiled XPath (first item of *tuple*) on the DOM.

        The evaluation context is created on first use and then reused.
        """
        xp = tuple[0]
        if (not self.context):
            self.context = Context.Context(self.dom)
        return xp.evaluate(self.context)


# LxmlRecord is only functional when lxml can be imported; otherwise an
# empty placeholder subclass is defined so the name still exists.
try:
    from lxml import etree, sax
except ImportError:
    class LxmlRecord(DomRecord):
        pass
else:
    class LxmlRecord(DomRecord):
        """DomRecord backed by an lxml element tree."""

        def __repr__(self):
            if self.recordStore is not None:
                return "%s/%s" % (self.recordStore, self.id)
            else:
                return "Record-%d" % self.id

        def process_xpath(self, xpath, maps=None):
            """Evaluate *xpath* against the tree and return lxml's result.

            *xpath* may be a string, or a list whose first item's repr is
            the XPath string.  Paths that neither start with "/" nor end
            with ")" are searched from anywhere ("//" is prepended).
            *maps* maps namespace prefixes to URIs.
            """
            if (isinstance(xpath, list)):
                xpath = repr(xpath[0])
            if xpath[0] != "/" and xpath[-1] != ')':
                xpath = "//" + xpath
            if maps:
                # Pass the namespace map by keyword, not by position.
                return self.dom.xpath(xpath, namespaces=maps)
            else:
                return self.dom.xpath(xpath)

        def get_xml(self):
            """Serialize the tree with etree.tostring."""
            return etree.tostring(self.dom)

        def get_sax(self):
            """Return the record as a list of SAX event strings, built lazily."""
            if (not self.sax):
                # XXX Turn DOM into SAX and cache
                handler = SaxContentHandler()
                sax.saxify(self.dom, handler)
                # Read the results from the handler that received the
                # events; this used to read self.handler, which is never
                # set on this class.
                self.sax = handler.currentText
                self.sax.append("9 %r" % handler.elementHash)
            return self.sax
class SaxRecord(Record):
    """Record stored as a list of serialized SAX event strings.

    Each string starts with an event type digit (1/4 element start,
    2/5 element end, 3 text, 6 prefix mapping, 9 element hash).  Element
    start lines end with ``parent npred end``: the line of the parent
    element (-1 for none), the position among same-named siblings, and the
    line of the matching end event.  ``elementHash`` maps element names to
    lists of ``[startLine, endLine]`` pairs and is used to find candidate
    elements without scanning the whole list.
    """

    def __repr__(self):
        if self.recordStore is not None:
            return "%s/%s" % (self.recordStore, self.id)
        else:
            return "Record-%d" % self.id

    def __init__(self, saxList, xml="", docid=None, recordSize=0):
        self.sax = saxList
        self.id = docid
        self.xml = xml
        self.history = []
        self.rights = []
        self.elementHash = {}
        self.size = recordSize
        self.parent = ('', '', -1)
        # Pulls u'name': u'value' pairs out of the repr of an attribute dict
        # on a type 1 line (much faster than eval).
        self.attrRe = re.compile("u['\"](.+?)['\"]: u['\"](.*?)['\"](, |})")
        self.recordStore = ""

    def process_xpath(self, xpTuple, maps=None):
        """Evaluate an XPath against the SAX event list.

        *xpTuple* is either a raw XPath (passed through
        utils.verifyXPaths) or an already verified list whose second item
        is the list of parsed steps.  *maps* maps namespace prefixes to
        URIs.

        Returns a list of matches: event sub-lists for elements, strings
        for attributes and for a trailing text() step, or an int for
        count().  Paths this code cannot handle (signalled internally by
        NotImplementedError) are re-evaluated against a DOM built from the
        events; if that fails too, [] is returned.
        """
        if maps is None:
            maps = {}
        if (not isinstance(xpTuple, list)):
            # Raw XPath
            c = utils.verifyXPaths([xpTuple])
            if (not c or not c[0][1]):
                print("BAD XPATH")
                return []
            else:
                xpTuple = c[0]

        xp = xpTuple[1]
        try:
            flatten = 0
            # Bound up front so the final return is safe on every branch;
            # the "/" branch used to hit an unbound name when the record
            # contained no element start event.
            data = []
            if xp[0][0] == "FUNCTION" and xp[0][1] == 'count':
                # process xpath and return number of matches
                if isinstance(xp[0][2][0], str) and xp[0][2][0] != '/':
                    data = self.process_xpath([None, [xp[0][2]]], maps)
                else:
                    data = self.process_xpath([None, xp[0][2]], maps)
                return len(data)

            if (xp[-1][0] == 'child' and xp[-1][1] == "__text()"):
                flatten = 1
                xp = xp[:-1]

            if (xp[-1][0] == 'attribute'):
                return self._handleAttribute(xp, maps)
            elif (xp[-1][0] == "/"):
                # Return top level element
                for x in range(len(self.sax)):
                    if self.sax[x][0] in ['1', '4']:
                        return self.sax[x:]
            elif (xp[-1][0] in ['child', 'descendant']):
                # Extracting element
                elemName = xp[-1][1]

                nselem = elemName.split(":")
                if (len(nselem) == 2):
                    # Namespaced.
                    nsUri = maps[nselem[0]]
                    elemName = nselem[1]
                else:
                    nsUri = ""

                attr = xp[-1][2]
                elemLines = []
                if elemName == '*' and attr:
                    # Wildcard with name() predicates: collect candidates
                    # from every element name that matches.
                    for p in attr:
                        if p[0] == 'FUNCTION' and p[2] == '__name()':
                            names = self.elementHash.keys()
                            if p[1] == 'starts-with':
                                for x in names:
                                    if x.find(p[3]) == 0:
                                        elemLines.extend(self.elementHash[x])
                            elif p[1] == 'regexp':
                                for x in names:
                                    if p[3].search(x):
                                        elemLines.extend(self.elementHash[x])
                elif elemName not in self.elementHash:
                    return []

                if (len(attr) == 1 and isinstance(attr[0], list)
                        and attr[0][1] == "="):
                    # Single equality predicate: try the pre-hashed
                    # name[@attr='value'] entry first.
                    n = u"%s[@%s='%s']" % (elemName, attr[0][0], attr[0][2])
                    elemLines = self.elementHash.get(n, [])

                if elemLines == []:
                    # might really be empty
                    elemLines = self.elementHash.get(elemName, [])

                for e in elemLines:
                    if (not nsUri
                            or self.sax[e[0]][4:4 + len(nsUri)] == nsUri):
                        match = self._checkSaxXPathLine(xp, e[0])
                        if (match):
                            # Return event chunk
                            l = self.sax[e[0]]
                            end = int(l[l.rfind(' ') + 1:])
                            data.append(self.sax[e[0]:end + 1])
            else:
                # Unsupported final axis
                raise NotImplementedError

            if flatten and data:
                # Flatten to text nodes
                ndata = []
                for match in data:
                    txt = []
                    for ev in match:
                        if ev[0] == '3':
                            txt.append(ev[2:])
                    ndata.append(''.join(txt))
                return ndata
            else:
                return data
        except NotImplementedError:
            # Convert to DOM (slow) and reapply (slower still)
            dom = self.get_dom()
            xp = xpTuple[0]
            try:
                return utils.evaluateXPath(xp, dom)
            except Exception:
                print("Buggy Xpath!...")
                return []
        # Otherwise just fall over as we've hit a real bug

    def _lookupAttribute(self, attrs, attrName, nsUri):
        """Return the value of an attribute, or None when it is absent.

        Tries the plain name, then the ``(nsUri, name)`` pair; when no
        namespace was requested, falls back to any namespaced attribute
        with the same local name.
        """
        if attrName in attrs:
            return attrs[attrName]
        qualified = (nsUri, attrName)
        if qualified in attrs or nsUri:
            return attrs.get(qualified, None)
        # step through and take first
        for key in attrs:
            if isinstance(key, tuple) and key[1] == attrName:
                return attrs[key]
        return None

    def _handleAttribute(self, xp, maps=None):
        """Resolve a path whose final step is on the attribute axis.

        Returns a list of attribute values.  Raises NotImplementedError
        for shapes left to the DOM fallback in process_xpath.
        """
        if maps is None:
            maps = {}
        attrName = xp[-1][1]
        nselem = attrName.split(":")
        if (len(nselem) == 2):
            # Namespaced attribute
            nsUri = maps[nselem[0]]
            attrName = nselem[1]
        else:
            nsUri = None

        data = []

        if (len(xp) == 1):
            # Extracting all occs of attribute anywhere!?
            # Check predicates... (only support one numeric predicate)
            preds = xp[0][2]
            if (len(preds) == 1 and isinstance(preds[0], float)):
                nth = int(preds[0])
            elif (len(preds)):
                # Non index or multiple predicates??
                raise NotImplementedError
            else:
                nth = 0

            currn = 0
            for l in self.sax:
                if (l[0] == "1"):
                    (name, attrs) = self._convert_elem(l)
                    if attrName in attrs:
                        currn += 1
                        content = attrs[attrName]
                        if (currn == nth):
                            data.append(content)
                            break
                        elif (not nth):
                            data.append(content)

        else:
            elemName = xp[-2][1]
            if (elemName == "*"):
                # Let DOM code handle this monstrosity :P
                raise NotImplementedError

            nselem = elemName.split(":")
            if (len(nselem) == 2):
                # Namespaced.
                elemNsUri = maps[nselem[0]]
                elemName = nselem[1]
            else:
                elemNsUri = ""

            for e in self.elementHash.get(elemName, []):
                line = self.sax[e[0]]
                if (elemNsUri
                        and line[4:4 + len(elemNsUri)] != elemNsUri):
                    continue
                (name, attrs) = self._convert_elem(line)
                if (attrName == '*'):
                    # All attributes' values
                    if self._checkSaxXPathLine(xp[:-1], e[0]):
                        for k in attrs.keys():
                            data.append(attrs[k])
                else:
                    # Resolve the key per element.  attrName used to be
                    # rebound to a tuple inside this loop, which broke the
                    # lookup for every following element.
                    content = self._lookupAttribute(attrs, attrName, nsUri)
                    if (content):
                        # Now check rest of path
                        if self._checkSaxXPathLine(xp[:-1], e[0]):
                            data.append(content)

        return data

    def _parentLineOf(self, elem):
        """Return the parent line number stored on an element start line."""
        start = elem.rfind("}") + 2
        end = elem.find(" ", start)
        return int(elem[start:end])

    def _checkSaxXPathLine(self, xp, line):
        """Check that the element at *line* matches *xp*, walking up the tree.

        Steps are consumed from the last to the first while moving to the
        parent element each time.  After a 'descendant' step a mismatch
        does not fail immediately: ancestors are tried until one matches
        or the top is reached.  Returns 1 on match, 0 otherwise.
        """
        # Work on a copy: steps are popped as they are consumed.
        xpath = xp[:]
        climb = False
        while (xpath):
            posn = len(xpath)
            node = xpath.pop()
            if (line == -1):
                # Above the root: only a root step can still match.
                if node != "/" and node != ['/']:
                    return 0
            else:
                elem = self.sax[line]
                (name, attrs) = self._convert_elem(elem)
                match = self._checkSaxXPathNode(node, name, attrs, line, posn)
                if not match:
                    if not climb:
                        return 0
                    # Previous was a descendant, keep looking
                    while not match:
                        line = self._parentLineOf(elem)
                        if line == -1:
                            return 0
                        elem = self.sax[line]
                        (name, attrs) = self._convert_elem(elem)
                        match = self._checkSaxXPathNode(node, name, attrs,
                                                        line, posn)

                if xpath:
                    line = self._parentLineOf(elem)
                climb = (node and node[0] == "descendant")

        return 1

    def _checkSaxXPathNode(self, step, name, attrs, line, posn):
        """Check one path step against one element.

        Returns 1 on match and 0 otherwise; raises NotImplementedError for
        axes other than child/descendant.
        """
        # name already checked, strip
        if step in ['/', ['/']] and name:
            return 0
        stepName = step[1]
        if (stepName != name and stepName != '*'
                and stepName[stepName.find(":") + 1:] != name):
            return 0
        elif (step[0] not in ['child', 'descendant']):
            # Unsupported axis
            raise NotImplementedError
        elif (step[2]):
            # Check predicates
            predPosn = 0
            for pred in (step[2]):
                predPosn += 1
                m = self._checkSaxXPathPredicate(pred, name, attrs, line,
                                                 posn, predPosn)
                if (not m):
                    return 0
        return 1

    def _checkSaxXPathPredicate(self, pred, name, attrs, line, posn, predPosn):
        """Evaluate one predicate of a step against an element.

        *posn* is the 1-based position of the step in the path and
        *predPosn* the 1-based position of the predicate within the step.
        Raises NotImplementedError for unsupported predicate shapes.
        """
        if not isinstance(pred, list):
            # Numeric Predicate. (eg /foo/bar[1])
            if (predPosn != 1):
                # Can't do numeric predicate on already predicated nodeset
                # eg: text[@type='main'][2]
                raise NotImplementedError

            if (posn == 1):
                # First position in relative path.
                # Check against position in elementHash
                if name in self.elementHash:
                    occurrences = self.elementHash[name]
                    p = int(pred)
                    if (len(occurrences) < p):
                        return 0
                    return occurrences[p - 1][0] == line
                return 0
            else:
                # Not first position, so it applies to parent elem
                # Which we record during parsing
                elem = self.sax[line]
                end = elem.rfind("}") + 2
                start = elem.find(' ', end) + 1
                end = elem.find(' ', start)
                npred = float(elem[start:end])
                return npred == pred
        elif (pred[1] in ['=', '!=', '<', '>', '<=', '>=']):
            # Single attribute
            return self._checkSaxXPathAttr(pred, attrs)
        elif (pred[1] in ['and', 'or']):
            # Attribute combinations
            left = self._checkSaxXPathPredicate(pred[0], name, attrs, line,
                                                posn, predPosn)
            right = self._checkSaxXPathPredicate(pred[2], name, attrs, line,
                                                 posn, predPosn)
            if (pred[1] == 'and' and left and right):
                return 1
            elif (pred[1] == 'or' and (left or right)):
                return 1
            return 0
        elif (pred[0] == 'attribute'):
            # Attribute exists test
            return pred[1] in attrs
        elif (pred[0] == 'FUNCTION'):
            if pred[2] == "__name()":
                # name() tests were applied when candidates were selected.
                return True
            if pred[1] == 'starts-with':
                if pred[2] in attrs:
                    val = attrs[pred[2]]
                    # find() returns 0 exactly when val starts with the prefix
                    return not val.find(pred[3])
                else:
                    return False
            elif pred[1] == 'regexp':
                if pred[2] in attrs:
                    return pred[3].search(attrs[pred[2]]) is not None
                else:
                    return False
            raise NotImplementedError
        else:
            # No idea!!
            raise NotImplementedError

    def _checkSaxXPathAttr(self, pred, attrs):
        """Evaluate an ``[attr, operator, value]`` comparison predicate.

        Returns a false value when the attribute is missing.  The
        attribute value is converted to float when the predicate value is
        a float.
        """
        # Namespacey: fall back to the (None, name) key.  Use a local key
        # rather than rewriting pred[0]; the predicate belongs to a parsed
        # XPath that is reused, and mutating it broke later lookups.
        key = pred[0]
        if key not in attrs:
            key = (None, pred[0])
            if key not in attrs:
                return 0
        rel = pred[1]
        target = pred[2]

        # -Much- faster than eval
        if isinstance(target, float):
            attrValue = float(attrs[key])
        else:
            attrValue = attrs[key]

        if rel == "=":
            return attrValue == target
        elif rel == ">":
            return attrValue > target
        elif rel == "<":
            return attrValue < target
        elif rel == "<=":
            return attrValue <= target
        elif rel == ">=":
            return attrValue >= target
        elif rel == "!=":
            return attrValue != target
        else:
            raise NotImplementedError

    def _convert_elem(self, line):
        """Return ``[name, attrs]`` parsed from an element start line.

        Raises ValueError when *line* is not a type 1 or type 4 event.
        """
        # Currently: 1 name {attrs} parent npred end
        if (line[0] == '1'):
            start = line.find("{")
            name = line[2:start - 1]
            attrs = {}
            if line[start + 1] != '}':
                for m in self.attrRe.findall(line):
                    attrs[unicode(m[0])] = unicode(m[1])
                # XXX eval of the dict repr is VERY VERY SLOW, hence the regex
            return [name, attrs]
        elif (line[0] == '4'):
            end = line.rfind("}")
            # SECURITY NOTE(review): eval() on stored event text; only safe
            # while the event lists come from a trusted store.
            stuff = eval(line[2:end + 1])
            return [stuff[1], stuff[3]]
        else:
            raise ValueError("Called convert on non element.")

    def saxify(self, handler=None, sax=None):
        """Replay SAX events to *handler*.

        *handler* defaults to this record; *sax* defaults to the record's
        own event list.  Raises ValueError on an unknown event type.
        """
        if handler is None:
            handler = self
        if not sax:
            sax = self.get_sax()

        for line in sax:
            kind = line[0]
            if kind == "1":
                # String manipulation method
                (name, attrs) = self._convert_elem(line)
                handler.startElement(name, attrs)
            elif kind == "3":
                handler.characters(line[2:], 0, len(line) - 2)
            elif kind == "2":
                end = line.rfind(' ')
                handler.endElement(line[2:end])
            elif kind == "9":
                pass
            elif kind == '4':
                # 4 ns, name, qname, {attrs} parent npred end
                # Strip the three trailing integers before evaluating.
                idx = line.rfind(' ')
                idx = line[:idx].rfind(' ')
                idx = line[:idx].rfind(' ')
                line = line[:idx]
                # SECURITY NOTE(review): eval() on stored event text.
                (ns, name, qname, attrs) = eval(line[2:])
                handler.startElementNS((ns, name), qname, attrs)
            elif kind == '5':
                # 5 ns, name, qname startLine
                idx = line.rfind(' ')
                line = line[:idx]
                # SECURITY NOTE(review): eval() on stored event text.
                (ns, name, qname) = eval(line[2:])
                handler.endElementNS((ns, name), qname)
            elif kind == '6':
                # 6 pref, uri
                # SECURITY NOTE(review): eval() on stored event text.
                pref, uri = eval(line[2:])
                handler.startPrefixMapping(pref, uri)
            else:
                # Unknown type
                raise ValueError(line)

    def get_dom(self):
        """Return a DOM for the record, building and caching it on first use."""
        if (self.dom):
            return self.dom
        else:
            # Turn SAX into DOM and cache
            s2dhandler.initState()
            self.saxify(s2dhandler)
            self.dom = s2dhandler.getRootNode()
            return self.dom

    def get_xml(self, events=None):
        """Return XML text for the record, or for *events* when given.

        The whole-record result is cached on ``self.xml``; the result for
        an explicit *events* list is not.
        """
        if (not events and self.xml):
            return self.xml
        # Turn SAX into XML and cache
        if events:
            process = events
        else:
            process = self.sax
        s2xhandler.initState()
        self.saxify(s2xhandler, process)
        if events:
            return s2xhandler.get_raw()
        self.xml = s2xhandler.get_raw()
        return self.xml

    def get_sax(self):
        """Return the underlying list of SAX event strings."""
        return self.sax
class MarcRecord(Record):
    """Record wrapping a MARC object parsed from a document's raw data."""

    # Named parts of field 008 -> index or slice into the field's string.
    _FIELD_008_PARTS = {
        'lang': slice(35, 38),
        'date': slice(0, 6),
        'pubStatus': 6,
        'date1': slice(7, 11),
        'date2': slice(11, 15),
        'pubPlace': slice(15, 18),
    }

    def __repr__(self):
        if self.recordStore is not None:
            return "%s/%s" % (self.recordStore, self.id)
        else:
            return "Record-%d" % self.id

    def __init__(self, doc, docid=0, store=""):
        txt = doc.get_raw()
        self.marc = MARC(txt)
        self.id = docid
        self.recordStore = store
        # Estimate number of words: words in the display form, minus two
        # per display line.
        display = str(self.marc)
        self.size = len(display.split()) - (len(display.split('\n')) * 2)

    def process_xpath(self, xpTuple, maps=None):
        """Extract values for a path of the form ``fldXXX`` or ``fldXXX/sub``.

        *xpTuple* is a raw path (passed through utils.verifyXPaths) or an
        already verified list whose second item holds the parsed steps.
        *maps* is accepted for signature compatibility and is not used.

        The second step may be a subfield code, ``ind1``/``ind2`` or, for
        field 008, one of lang, date, pubStatus, date1, date2, pubPlace.
        Returns a list with one entry per matching field or subfield
        occurrence; [] when the path is invalid or the field is absent.
        """
        if (not isinstance(xpTuple, list)):
            # Raw XPath
            c = utils.verifyXPaths([xpTuple])
            if (not c or not c[0][1]):
                return []
            else:
                xpTuple = c[0]

        xp = xpTuple[1]
        # format: fldXXX/a
        fld = int(xp[0][1][3:])
        if fld not in self.marc.fields:
            return []
        data = self.marc.fields[fld]
        if len(xp) > 1:
            subfield = xp[1][1]
        else:
            subfield = ""

        vals = []
        for d in data:
            if fld == 8 and not subfield:
                # Whole 008 field.  This case used to be unreachable: the
                # generic "no subfield" branch below caught it first and
                # treated the value as (ind1, ind2, subfields).
                vals.append(d)
            elif not subfield:
                vals.append(' '.join([x[1] for x in d[2]]))
            elif subfield == 'ind1':
                vals.append(d[0])
            elif subfield == 'ind2':
                vals.append(d[1])
            elif fld == 8:
                if subfield in self._FIELD_008_PARTS:
                    vals.append(d[self._FIELD_008_PARTS[subfield]])
            else:
                for x in d[2]:
                    try:
                        if x[0] == subfield:
                            vals.append(x[1])
                    except (IndexError, KeyError, TypeError):
                        # broken subfield entry; skip it
                        pass
        return vals

    def get_dom(self):
        raise NotImplementedError

    def get_sax(self):
        raise NotImplementedError

    def get_xml(self):
        """Return the record serialized by the MARC object's toMARCXML()."""
        return self.marc.toMARCXML()