Package textmining :: Module tmTransformer
[hide private]
[frames | no frames]

Source Code for Module textmining.tmTransformer

 1   
 2  import os, re 
 3  from utils import getFirstData, elementType, verifyXPaths 
 4   
 5  from TsujiiC3 import TsujiiObject 
 6  from baseObjects import Transformer 
 7   
class PosTransformer(Transformer):
    """Common base class for the tagging transformers in this module.

    Adds no behaviour of its own to Transformer; TsujiiTextPosTransformer
    and TsujiiXPathTransformer both derive from it.
    """
10
11 -class TsujiiTextPosTransformer(PosTransformer, TsujiiObject):
12
13 - def __init__(self, session, node, parent):
16
def process_record(self, session, rec):
    """Placeholder -- tagging of individual text nodes is not implemented.

    The idea noted by the original author was to step through all text
    nodes of the record and tag them. For now both arguments are ignored
    and None is returned.
    """
    return None
20 21
class TsujiiXPathTransformer(PosTransformer, TsujiiObject):
    """Build a new XML document from XPath-selected parts of a record.

    Configured through <xpath> elements:

        <xpath type="copy"> ...   each match is copied into the output as XML
        <xpath type="tag"> ...    the text of each match is passed to self.tag()

    Every ``xmlns:<prefix>`` attribute on an <xpath> element is collected
    into a prefix -> URI map that is handed to rec.process_xpath() together
    with the verified path.
    """

    # Class-level defaults only: __init__ rebinds both names to fresh lists
    # on every instance, so these shared lists are never appended to by an
    # instance created through __init__.
    copyElems = []
    tagElems = []

    def _handleConfigNode(self, session, node):
        """Register one <xpath> configuration element; ignore other nodes.

        session -- unused here.
        node    -- configuration node; read through localName, attributes
                   and getAttribute().

        Appends ``[verified_xpath, prefix_map]`` to self.copyElems when the
        type attribute is 'copy', otherwise to self.tagElems.
        """
        if node.localName != "xpath":
            return

        xpath = getFirstData(node)
        maps = {}
        # Without this default a missing type attribute left `tc` unbound
        # and raised UnboundLocalError below. Any value other than 'copy'
        # already selected the tag list, so a missing type now does too.
        tc = ""
        for a in node.attributes.keys():
            if a.startswith("xmlns:"):
                # xmlns:prefix="uri"  ->  maps[prefix] = uri
                maps[a[6:]] = node.getAttribute(a)
            elif a == "type":
                tc = node.getAttribute(a)

        cxp = verifyXPaths([xpath])
        if tc == 'copy':
            self.copyElems.append([cxp[0], maps])
        else:
            self.tagElems.append([cxp[0], maps])

    def __init__(self, session, node, parent):
        """Give the instance its own xpath lists, then run both base inits."""
        # The lists are created before the base constructors are called.
        # NOTE(review): presumably a base __init__ walks the configuration
        # and calls _handleConfigNode -- confirm against baseObjects.
        self.copyElems = []
        self.tagElems = []
        PosTransformer.__init__(self, session, node, parent)
        TsujiiObject.__init__(self, session, node, parent)

    def process_record(self, session, rec):
        """Return a document holding the copied and the tagged matches.

        session -- passed through to self.tag().
        rec     -- record; used through process_xpath(), get_xml(),
                   _convert_elem() and processHistory.

        All 'copy' matches come first, followed by all 'tag' matches; the
        pieces are joined with newlines inside a single <record> element
        and returned as a StringDocument of type 'text/xml'.
        """
        doc = []

        for entry in self.copyElems:
            for match in rec.process_xpath(entry[0], entry[1]):
                doc.append(rec.get_xml(match))

        for entry in self.tagElems:
            for match in rec.process_xpath(entry[0], entry[1]):
                # Process all text nodes together: gather every event whose
                # first character is '3', minus that character.
                # NOTE(review): presumably '3' is the record's text-event
                # code -- confirm against the record implementation.
                totag = [event[1:] for event in match if event[0] == '3']
                tagged = ''.join(self.tag(session, ''.join(totag)))

                if match[0][0] != '3':
                    # The match starts with a non-text event: rebuild that
                    # element around the tagged text.
                    (name, attrhash) = rec._convert_elem(match[0])
                    # Fixed: the values were read from the undefined name
                    # `attribs` (NameError); the mapping is `attrhash`.
                    attrs = ['%s="%s"' % (a, attrhash[a]) for a in attrhash]
                    attribtxt = ' '.join(attrs)
                    if attribtxt:
                        attribtxt = " " + attribtxt
                    txt = "<%s%s>%s</%s>" % (name, attribtxt, tagged, name)
                else:
                    txt = "<text>%s</text>" % (tagged)
                doc.append(txt)

        doctxt = "<record>%s</record>" % '\n'.join(doc)
        # NOTE(review): StringDocument is not among the imports visible at
        # the top of this file -- confirm it is imported, otherwise this
        # line raises NameError.
        strdoc = StringDocument(doctxt, self.id, rec.processHistory, 'text/xml')
        return strdoc
86