Trees | Index | Help |
---|
|
from configParser import C3Object
from baseObjects import Parser
from xml.dom.minidom import parseString as domParseString
from record import SaxRecord, MinidomRecord, FtDomRecord, SaxContentHandler, LxmlRecord

from xml.sax import ContentHandler, make_parser, parseString as saxParseString, ErrorHandler, InputSource as SaxInput
from xml.sax.saxutils import escape


from utils import flattenTexts, elementType
import re
import cStringIO
import StringIO


# NOTE(review): this module was recovered from a flattened source listing in
# which the collapsed ``class``/``def`` header lines were lost.  Every header
# tagged "reconstructed" below is inferred from how the surviving bodies use
# it and must be confirmed against the real module.


# NOTE(review): reconstructed header -- LxmlParser subclasses BaseParser and
# SaxParser.__init__ calls Parser.__init__, so Parser is presumably the base.
class BaseParser(Parser):

    # NOTE(review): reconstructed header -- every caller uses
    # self.copy_data(doc, rec).  Listing lines 19-20 were not recovered, so a
    # statement may be missing from the top of this method.
    def copy_data(self, doc, rec):
        """Utility function to update data on record from document.

        Copies ``filename`` and ``schema`` from ``doc`` onto ``rec``, hands
        over the process history with this object's ``id`` appended, and sets
        ``rec.parent`` from the document's store or, failing that, from the
        document's own parent.
        """
        rec.filename = doc.filename
        rec.schema = doc.schema
        # The list is shared, not copied: the append below is also visible
        # through doc.processHistory.
        rec.processHistory = doc.processHistory
        rec.processHistory.append(self.id)
        if doc.documentStore:
            rec.parent = ('document', doc.documentStore, doc.id)
        elif doc.parent:
            rec.parent = doc.parent


# NOTE(review): listing lines 29-40 were not present in the recovered text
# and are not reproduced here.


# NOTE(review): reconstructed header -- class name and base are inferred from
# the docstring and from the use of self.copy_data below.
class SaxParser(BaseParser):
    """ Default SAX based parser. Creates SaxRecord """

    # NOTE(review): reconstructed header -- the parameter order simply mirrors
    # the Parser.__init__ call and needs confirming.
    def __init__(self, session, parent, config):
        """Create the SAX parser, its handlers and the reusable input source.

        Settings read through ``self.get_setting``:

        * ``namespaces`` -- when true, turns on the SAX namespaces feature.
        * ``attrHash`` -- whitespace separated ``left@right`` pairs; each
          ``right`` is appended to ``hashAttributesNames[left]`` on the
          content handler.
        * ``stripWhitespace`` -- when true, sets ``stripWS`` on the handler.
        """
        Parser.__init__(self, session, parent, config)
        self.parser = make_parser()
        self.errorHandler = ErrorHandler()
        self.parser.setErrorHandler(self.errorHandler)
        self.inputSource = SaxInput()
        ch = SaxContentHandler()
        self.contentHandler = ch
        self.parser.setContentHandler(ch)
        self.keepError = 1

        if self.get_setting(session, 'namespaces'):
            self.parser.setFeature('http://xml.org/sax/features/namespaces', 1)
        p = self.get_setting(session, 'attrHash')
        if p:
            for pair in p.split():
                (a, b) = pair.split("@")
                try:
                    ch.hashAttributesNames[a].append(b)
                except KeyError:
                    # First value seen for this key.  Only a missing key is
                    # expected here; anything else should surface, not be
                    # silently overwritten.
                    ch.hashAttributesNames[a] = [b]
        if self.get_setting(session, 'stripWhitespace'):
            ch.stripWS = 1

    # NOTE(review): reconstructed header -- same signature as the visible
    # LxmlParser.process_document.
    def process_document(self, session, doc):
        """Parse the raw XML of ``doc`` and return a SaxRecord.

        Any exception raised by the SAX parser is re-raised.  Before that,
        if ``self.keepError`` is true the content handler is left as it was
        at the point of failure and ``self.errorPath`` is set to a
        '/'-joined path of ``name[@SAXID='n']`` steps built from the
        handler's ``pathLines``; otherwise the handler is reinitialised.
        """
        xml = doc.get_raw()
        self.inputSource.setByteStream(cStringIO.StringIO(xml))
        ch = self.contentHandler
        ch.reinit()
        try:
            self.parser.parse(self.inputSource)
        except:
            # Splat. Reset self and reraise
            # (bare except is deliberate: everything is re-raised below)
            if self.keepError:
                # Work out path
                path = []
                for lineNo in ch.pathLines:
                    line = ch.currentText[lineNo]
                    # Element name sits between the 2-character prefix and
                    # the character before the first '{'.
                    elemName = line[2:line.index('{')-1]
                    path.append("%s[@SAXID='%s']" % (elemName, lineNo))
                self.errorPath = '/'.join(path)
            else:
                ch.reinit()

            raise
        rec = SaxRecord(ch.currentText, xml, recordSize=ch.recordSize)
        rec.elementHash = ch.elementHash
        self.copy_data(doc, rec)
        # Leave the shared handler clean for the next document.
        ch.reinit()
        return rec


try:
    from lxml import etree

    class LxmlParser(BaseParser):
        """ lxml based Parser. Creates LxmlRecords """
        def process_document(self, session, doc):
            """Parse the raw data of ``doc`` with lxml; return an LxmlRecord."""
            # input must be stream
            data = doc.get_raw()
            et = etree.XML(data)
            rec = LxmlRecord(et)
            self.copy_data(doc, rec)
            return rec
# NOTE(review): the clause closing this ``try`` lies beyond the recovered
# listing; an ImportError guard is assumed so that the module still loads
# when lxml is not installed.
except ImportError:
    pass
Trees | Index | Help |
---|
Generated by Epydoc 3.0alpha2 on Wed Aug 9 18:11:02 2006 | http://epydoc.sf.net |