Module parser
[hide private]
[frames] | no frames]

Source Code for Module parser

  1   
  2  from configParser import C3Object 
  3  from baseObjects import Parser 
  4  from xml.dom.minidom import parseString as domParseString 
  5  from record import SaxRecord, MinidomRecord, FtDomRecord, SaxContentHandler, LxmlRecord 
  6   
  7  from xml.sax import ContentHandler, make_parser, parseString as saxParseString, ErrorHandler, InputSource as SaxInput 
  8  from xml.sax.saxutils import escape 
  9   
 10   
 11  from utils import flattenTexts, elementType 
 12  import re 
 13  import cStringIO, StringIO 
 14   
 15   
 16  # utility function to update data on record from document 
 17       
 18   
class BaseParser(Parser):
    """ Shared base class giving parsers a document-to-record copy helper """

    def copy_data(self, doc, rec):
        """ Copy bookkeeping attributes from ``doc`` onto ``rec``.

        Copies ``filename``, ``schema`` and ``processHistory``, appends this
        object's ``id`` to the history, and sets ``rec.parent`` either to a
        ``('document', documentStore, id)`` triple or to ``doc.parent``.
        Nothing is returned; ``rec`` is modified in place.
        """
        for attrName in ('filename', 'schema', 'processHistory'):
            setattr(rec, attrName, getattr(doc, attrName))
        # NOTE: rec.processHistory is the very same list object as
        # doc.processHistory, so this append is visible through doc as well.
        rec.processHistory.append(self.id)

        store = doc.documentStore
        if store:
            # Document lives in a store: point back at it
            rec.parent = ('document', store, doc.id)
        elif doc.parent:
            # Otherwise inherit whatever parent the document recorded
            rec.parent = doc.parent
29 30
class MinidomParser(BaseParser):
    """ Use default Python Minidom implementation to parse document """

    def process_document(self, session, doc):
        """ Parse the raw data of ``doc`` with minidom.

        Returns a MinidomRecord built from the DOM tree and the raw XML,
        with bookkeeping data copied from ``doc`` via copy_data().
        ``session`` is not used here.
        """
        raw = doc.get_raw()
        rec = MinidomRecord(domParseString(raw), raw)
        self.copy_data(doc, rec)
        return rec
40 41
class SaxParser(BaseParser):
    """ Default SAX based parser. Creates SaxRecord """

    def __init__(self, session, parent, config):
        """ Build the SAX parser and content handler and apply settings.

        Settings read via get_setting():
          namespaces      -- if true, turn on the SAX namespaces feature
          attrHash        -- whitespace separated ``element@attribute``
                             entries, stored in the content handler's
                             ``hashAttributesNames`` mapping
          stripWhitespace -- if true, set ``stripWS`` on the content handler
        """
        Parser.__init__(self, session, parent, config)
        self.parser = make_parser()
        self.errorHandler = ErrorHandler()
        self.parser.setErrorHandler(self.errorHandler)
        self.inputSource = SaxInput()
        ch = SaxContentHandler()
        self.contentHandler = ch
        self.parser.setContentHandler(ch)
        self.keepError = 1

        if self.get_setting(session, 'namespaces'):
            self.parser.setFeature('http://xml.org/sax/features/namespaces', 1)
        attrHash = self.get_setting(session, 'attrHash')
        if attrHash:
            for spec in attrHash.split():
                # An entry without exactly one "@" raises ValueError here
                (elemName, attrName) = spec.split("@")
                try:
                    ch.hashAttributesNames[elemName].append(attrName)
                except KeyError:
                    # First attribute seen for this element.  Only the
                    # missing-key case is handled; anything else propagates
                    # instead of being silently swallowed.
                    ch.hashAttributesNames[elemName] = [attrName]
        if self.get_setting(session, 'stripWhitespace'):
            ch.stripWS = 1

    def process_document(self, session, doc):
        """ Parse the raw data of ``doc`` and return a SaxRecord.

        On a parse failure the exception is re-raised.  While ``keepError``
        is true the content handler state is left in place and
        ``self.errorPath`` is set to a '/'-joined path built from the
        handler's ``pathLines``; otherwise the handler is reset first.
        """
        xml = doc.get_raw()
        self.inputSource.setByteStream(cStringIO.StringIO(xml))
        ch = self.contentHandler
        ch.reinit()
        try:
            self.parser.parse(self.inputSource)
        except:
            # Splat. Reset self and reraise
            if self.keepError:
                # Work out path
                path = []
                for l in ch.pathLines:
                    line = ch.currentText[l]
                    # Element name sits between a two character prefix and
                    # the character before the first '{' of the event line
                    elemName = line[2:line.index('{')-1]
                    path.append("%s[@SAXID='%s']" % (elemName, l))
                self.errorPath = '/'.join(path)
            else:
                ch.reinit()

            raise
        rec = SaxRecord(ch.currentText, xml, recordSize=ch.recordSize)
        rec.elementHash = ch.elementHash
        self.copy_data(doc, rec)
        # Leave the shared handler clean for the next document
        ch.reinit()
        return rec
try:
    from lxml import etree
except ImportError:
    # lxml is optional: without it, expose an empty stand-in so the name
    # LxmlParser still exists.  Only a failed import is handled here; any
    # other error is allowed to surface.
    class LxmlParser(Parser):
        pass
else:
    class LxmlParser(BaseParser):
        """ lxml based Parser.  Creates LxmlRecords """

        def process_document(self, session, doc):
            """ Parse the raw data of ``doc`` with lxml; return an LxmlRecord """
            # input must be stream
            data = doc.get_raw()
            et = etree.XML(data)
            rec = LxmlRecord(et)
            self.copy_data(doc, rec)
            return rec


# Placeholders with no behaviour of their own.  Defined unconditionally so
# the names exist whether or not lxml could be imported.
class LxmlSchemaParser(Parser):
    pass


class LxmlRelaxNGParser(Parser):
    pass


from Ft.Xml import Sax, InputSource as FtInput
from Ft.Xml.Domlette import NonvalidatingReaderBase

class FtParser(BaseParser, NonvalidatingReaderBase):
    """ 4Suite based Parser.  Creates FtDomRecords """

    def __init__(self, session, config, parent):
        """ Initialise both base classes.

        NOTE(review): the arguments are forwarded positionally and are named
        (config, parent) here but (parent, config) in SaxParser -- confirm
        which order Parser.__init__ actually expects.
        """
        Parser.__init__(self, session, config, parent)
        NonvalidatingReaderBase.__init__(self)

    def process_document(self, session, doc):
        """ Parse the raw data of ``doc`` and wrap it in an FtDomRecord.

        The raw data is parsed with the inherited parseString() using the
        fixed URI 'urn:foo'; bookkeeping data is then copied from ``doc``.
        """
        raw = doc.get_raw()
        rec = FtDomRecord(self.parseString(raw, 'urn:foo'), raw)
        self.copy_data(doc, rec)
        return rec
137
class FtSaxParser(BaseParser):
    """ 4Suite SAX based Parser.  Creates SaxRecords """

    def __init__(self, session, parent, config):
        """ Build the 4Suite SAX parser and content handler, apply settings.

        Settings read via get_setting():
          attrHash        -- whitespace separated ``element@attribute``
                             entries, stored in the content handler's
                             ``hashAttributesNames`` mapping
          stripWhitespace -- if true, set ``stripWS`` on the content handler
        """
        Parser.__init__(self, session, parent, config)
        self.parser = Sax.CreateParser()
        ch = SaxContentHandler()
        self.contentHandler = ch
        self.parser.setContentHandler(ch)
        attrHash = self.get_setting(session, 'attrHash')
        if attrHash:
            for spec in attrHash.split():
                # An entry without exactly one "@" raises ValueError here
                (elemName, attrName) = spec.split("@")
                try:
                    ch.hashAttributesNames[elemName].append(attrName)
                except KeyError:
                    # First attribute seen for this element.  Only the
                    # missing-key case is handled; anything else propagates
                    # instead of being silently swallowed.
                    ch.hashAttributesNames[elemName] = [attrName]
        if self.get_setting(session, 'stripWhitespace'):
            ch.stripWS = 1

    def process_document(self, session, doc):
        """ Parse the raw data of ``doc`` and return a SaxRecord.

        If parsing fails the content handler is reset and the exception is
        re-raised unchanged.
        """
        xml = doc.get_raw()
        src = FtInput.InputSource(StringIO.StringIO(xml))
        ch = self.contentHandler
        ch.reinit()
        try:
            self.parser.parse(src)
        except:
            # Splat. Reset self and reraise
            ch.reinit()
            raise

        rec = SaxRecord(ch.currentText, xml, recordSize=ch.recordSize)
        rec.elementHash = ch.elementHash
        self.copy_data(doc, rec)
        return rec
177 178
class PassThroughParser(BaseParser):
    """ Copy the data from a document (eg list of sax events or a dom tree) into an appropriate record object """

    def process_document(self, session, doc):
        """ Wrap already-parsed document data in a record.

        A list is treated as SAX events and wrapped in a SaxRecord; anything
        else is handed to DomRecord.  Bookkeeping data is then copied from
        ``doc`` via copy_data().
        """
        # Simply copy data into a record of appropriate type
        data = doc.get_raw()
        # The previous test used typeof() and types.ListType, neither of
        # which is defined in this module, so it raised NameError.
        if isinstance(data, list):
            rec = SaxRecord(data)
        else:
            # DomRecord is not among the names imported at module level.
            # NOTE(review): assumed to be provided by the record module
            # alongside the other record classes -- confirm.
            from record import DomRecord
            rec = DomRecord(data)
        self.copy_data(doc, rec)
        return rec
191 192 193 # Copy 194 from record import MarcRecord
class MarcParser(BaseParser):
    """ Creates MarcRecords which fake the Record API for Marc """

    def process_document(self, session, doc):
        """ Wrap ``doc`` in a MarcRecord and return it.

        The document object itself (not its raw data) is passed to
        MarcRecord, and copy_data() is not called here.
        """
        marcRec = MarcRecord(doc)
        return marcRec
199 200 201 202 203 204 from utils import nonTextToken 205 # Semi-Worthless
class XmlRecordStoreParser(BaseParser):
    """ Metadata wrapping Parser for RecordStores.  Not recommended """

    # We take in stuff and return a Record, that makes us a Parser.
    # Retrieve metadata and sax list from XML structure

    def __init__(self, session, config, parent):
        """ Initialise the base class and compile the saxEvents pattern """
        Parser.__init__(self, session, config, parent)
        self.saxre = re.compile("<c3:saxEvents>(.+)</c3:saxEvents>", re.S)

    def process_document(self, session, doc):
        """ Rebuild a SaxRecord from the XML wrapper held in ``doc``.

        The content of the <c3:saxEvents> element is split on nonTextToken
        into the record's event list; a trailing segment starting with "9"
        is taken as the element hash.  Unless the 'SaxOnly' setting is true,
        the rest of the wrapper is parsed with minidom and its child
        elements (id, baseUri, parent, processHistory, schema, schemaType,
        size, technicalRights, history) are copied onto the record.
        """
        # Take xml wrapper and convert onto object
        # Strip out SAX events first
        data = doc.get_raw()

        # Strip out sax to list
        match = self.saxre.search(data)
        elemHash = {}
        if match:
            sax = unicode(match.group(1), 'utf-8').split(nonTextToken)
            # Now check if last is an element hash.  Slicing (not indexing)
            # keeps an empty final segment from raising IndexError.
            if sax[-1][:1] == "9":
                # SECURITY: eval() executes whatever expression is stored in
                # this segment.  Only safe while the store contents are
                # trusted; consider a literal-only parser instead.
                elemHash = eval(sax[-1][2:])
                sax = sax[:-1]
        else:
            sax = []

        # Build base Record
        rec = SaxRecord(sax)
        rec.elementHash = elemHash

        # Maybe quit
        if self.get_setting(session, 'SaxOnly'):
            return rec

        # Otherwise parse the metadata
        data = self.saxre.sub("", data)
        dom = domParseString(data)
        # documentElement is the root element even when a comment or
        # processing instruction precedes it (childNodes[0] would not be).
        for c in dom.documentElement.childNodes:
            if c.nodeType != elementType:
                continue
            name = c.localName
            if name == "id":
                rec.id = flattenTexts(c)
                if rec.id.isdigit():
                    rec.id = long(rec.id)
            elif name == "baseUri":
                rec.baseUri = flattenTexts(c)
            elif name == "parent":
                # triple: type, store, id
                # store must be string here, as we don't necessarily have
                # access to all objects
                parentType = parentStore = ""
                parentId = -1
                for c2 in c.childNodes:
                    if c2.nodeType == elementType:
                        if c2.localName == "type":
                            parentType = flattenTexts(c2)
                        elif c2.localName == "store":
                            parentStore = flattenTexts(c2)
                        elif c2.localName == "id":
                            parentId = long(flattenTexts(c2))
                rec.parent = (parentType, parentStore, parentId)
            elif name == "processHistory":
                rec.processHistory = [
                    flattenTexts(c2) for c2 in c.childNodes
                    if c2.nodeType == elementType and c2.localName == 'object'
                ]
            elif name == "schema":
                rec.schema = flattenTexts(c)
            elif name == "schemaType":
                rec.schemaType = flattenTexts(c)
            elif name == "size":
                rec.size = long(flattenTexts(c))
            elif name == "technicalRights":
                for c2 in c.childNodes:
                    if c2.nodeType == elementType:
                        entry = (flattenTexts(c2), c2.localName,
                                 c2.getAttribute('role'))
                        rec.rights.append(entry)
            elif name == "history":
                for c2 in c.childNodes:
                    if c2.nodeType == elementType:
                        # A modification: [agent, date, type]
                        entry = ['', '', c2.getAttribute('type')]
                        for c3 in c2.childNodes:
                            if c3.nodeType == elementType:
                                if c3.localName == "agent":
                                    entry[0] = flattenTexts(c3)
                                elif c3.localName == "date":
                                    entry[1] = flattenTexts(c3)
                        rec.history.append(entry)

        return rec
299