Module extractor
[hide private]
[frames] | [no frames]

Source Code for Module extractor

  1   
  2  import re 
  3  from configParser import C3Object 
  4  from baseObjects import Extractor 
  5  from utils import flattenTexts 
  6  from dateutil import parser 
  7  import types 
  8   
class SimpleExtractor(Extractor):
    """Base extractor.  Extracts exact text.

    Each process_* method returns a hash of the form:
        {term: {'text': term, 'occurences': count}}
    (NB: 'occurences' spelling is part of the established data format.)
    """

    def __init__(self, session, config, parent):
        Extractor.__init__(self, session, config, parent)
        # Raw string: '\s' in a plain literal is an invalid escape.
        self.spaceRe = re.compile(r'\s+')

    def process_string(self, session, data):
        # Accept just text and extract bits from it.
        return {data: {'text': data, 'occurences': 1}}

    def process_node(self, session, data):
        # Walk a DOM structure and extract its flattened text content.
        txt = flattenTexts(data)
        txt = txt.replace('\n', ' ')
        txt = txt.strip()
        return {txt: {'text': txt, 'occurences': 1}}

    def process_eventList(self, session, data):
        # Step through a SAX event list and extract text ("3") events.
        txt = []
        for e in data:
            if (e[0] == "3"):
                # Insert a separating space between adjacent text events
                # when the previous chunk doesn't already end with one.
                if (len(txt) and txt[-1][-1] != ' ' and e[2].isalnum()):
                    txt.append(' ')
                txt.append(e[2:])
        txt = ''.join(txt)
        txt = self.spaceRe.sub(' ', txt)
        return {txt: {'text': txt, 'occurences': 1}}

    def _mergeHash(self, a, b):
        # Merge occurrence hash b into a: sum counts and extend proximity
        # position lists where present.  Returns the merged hash.
        if not a:
            return b
        if not b:
            return a
        for k in b.keys():
            try:
                a[k]['occurences'] += b[k]['occurences']
                try:
                    a[k]['positions'].extend(b[k]['positions'])
                except KeyError:
                    # Non-proximity hash: no 'positions' key to merge.
                    pass
            except KeyError:
                # Term only present in b; copy it across.
                a[k] = b[k]
        return a

    def process_xpathResult(self, session, data):
        # Dispatch each XPath match to the appropriate process_* method
        # based on its type, merging all results into one hash.
        new = {}
        for xp in data:
            for d in xp:
                if isinstance(d, types.ListType):
                    # SAX event list
                    new = self._mergeHash(new, self.process_eventList(session, d))
                elif isinstance(d, types.StringTypes):
                    # Attribute content
                    new = self._mergeHash(new, self.process_string(session, d))
                else:
                    # DOM nodes
                    new = self._mergeHash(new, self.process_node(session, d))
        return new
class KeywordExtractor(SimpleExtractor):
    """Extracts keywords from the text.

    Word constituents: a-zA-Z0-9 $%
    NOTE(review): parameter order (session, parser, config) differs from
    the sibling classes' (session, config, parent); values are passed
    through positionally so behaviour matches, but renaming would break
    keyword-argument callers, so it is left as-is.
    """

    def __init__(self, session, parser, config):
        SimpleExtractor.__init__(self, session, parser, config)
        # Compiled regex is MUCH faster than an interpreted loop.
        pre = self.get_setting(session, 'regexp', "((?<!\s)'|[-.,]((?=\s)|$)|(^|(?<=\s))[-.,']|[.,'-][.,'-]|[~`!@+=\#\&\^*()\[\]{}\\\|\":;<>?/])")
        self.punctuationRe = re.compile(pre)

    def _keywordify(self, session, data):
        # Strip punctuation, split on whitespace and count each keyword.
        kw = {}
        reSub = self.punctuationRe.sub
        for d in data.keys():
            if d:
                s = reSub(' ', d)
                for t in s.split():
                    # 'in' test instead of py2-only dict.has_key()
                    if t in kw:
                        kw[t]['occurences'] += 1
                    else:
                        kw[t] = {'text': t, 'occurences': 1}
        return kw

    def process_string(self, session, data):
        data = SimpleExtractor.process_string(self, session, data)
        return self._keywordify(session, data)

    def process_node(self, session, data):
        data = SimpleExtractor.process_node(self, session, data)
        return self._keywordify(session, data)

    def process_eventList(self, session, data):
        data = SimpleExtractor.process_eventList(self, session, data)
        return self._keywordify(session, data)
class DateExtractor(SimpleExtractor):
    """Extracts a single date.  Multiple dates, ranges not yet implemented."""

    def __init__(self, session, config, parent):
        SimpleExtractor.__init__(self, session, config, parent)
        default = self.get_default(None, 'datetime')
        self.fuzzy = self.get_setting(None, 'fuzzy')
        if default:
            self.default = parser.parse(default.encode('utf-8'), fuzzy=self.fuzzy)
        else:
            # NOTE(review): relies on dateutil fuzzy-parsing 'now' to the
            # current date -- verify against the installed dateutil version.
            self.default = parser.parse('now', fuzzy=True)

    def _datify(self, session, data):
        # data is the single-entry hash produced by SimpleExtractor;
        # take its one key as the text to parse.
        # list(...) so this works on both py2 lists and py3 dict views.
        data = list(data.keys())[0]
        try:
            # This will only find 1 date.
            d = str(parser.parse(data.encode('utf-8'), fuzzy=self.fuzzy, default=self.default))
            return {d: {'text': d, 'occurences': 1}}
        except (ValueError, TypeError, OverflowError):
            # Not parseable as a date.
            # TODO: reconstruct data word by word and feed to parser?
            # Must be a better way to do this.
            return {}

    def process_string(self, session, data):
        data = SimpleExtractor.process_string(self, session, data)
        return self._datify(session, data)

    def process_node(self, session, data):
        data = SimpleExtractor.process_node(self, session, data)
        return self._datify(session, data)

    def process_eventList(self, session, data):
        data = SimpleExtractor.process_eventList(self, session, data)
        return self._datify(session, data)
class ProximityExtractor(KeywordExtractor):
    """Extract keywords and maintain information for proximity searches.

    Terms carry a 'positions' list of alternating (elementId, wordIndex)
    pairs in addition to the usual text/occurences fields.
    """

    def process_string(self, session, data):
        kw = {}
        w = 0
        # 'srcHash' instead of 'hash': don't shadow the builtin.
        srcHash = {data: {'text': data, 'occurences': 1}}
        # Now keywordify with proximity positions.
        reSub = self.punctuationRe.sub
        for d in srcHash.keys():
            if d:
                s = reSub(' ', d)
                for wd in s.split():
                    if wd in kw:
                        kw[wd]['occurences'] += 1
                        # -1: no element id is available for a raw string.
                        kw[wd]['positions'].extend([-1, w])
                    else:
                        kw[wd] = {'text': wd, 'occurences': 1, 'positions': [-1, w]}
                    w += 1
        return kw

    def process_node(self, session, data):
        # Need a unique integer for this node: generate the full
        # child-index path to the node and hash() it.  UGLY!
        path = []
        node = data
        while True:
            parent = node.getparent()
            # Identity test: lxml elements are falsy when they have no
            # children, so a truthiness test is the wrong idiom here.
            if parent is None:
                break
            kids = parent.getchildren()
            idx = kids.index(node)
            path.append(idx)
            node = parent
        pstr = '/'.join(map(str, path))
        lno = abs(hash(pstr))
        w = 0

        kwhash = SimpleExtractor.process_node(self, session, data)
        # Now keywordify with proximity positions.
        kw = {}
        reSub = self.punctuationRe.sub
        for d in kwhash.keys():
            if d:
                s = reSub(' ', d)
                for wd in s.split():
                    try:
                        kw[wd]['occurences'] += 1
                        kw[wd]['positions'].extend([lno, w])
                    except KeyError:
                        kw[wd] = {'text': wd, 'occurences': 1, 'positions': [lno, w]}
                    w += 1
        return kw

    def process_eventList(self, session, data):
        # Treat parent element as line, not indexed element
        # EG map in <map><attrType>1</attrType><attrVal>4</attrVal></map>
        # For attrType or attrVal
        if (self.get_setting(session, 'parent')):
            lno = int(data[0].split()[-3])
        else:
            lno = int(data[-1].split()[-1])
        w = 0

        evHash = SimpleExtractor.process_eventList(self, session, data)
        # Now keywordify with proximity positions.
        kw = {}
        reSub = self.punctuationRe.sub
        for d in evHash.keys():
            if d:
                s = reSub(' ', d)
                for wd in s.split():
                    try:
                        kw[wd]['occurences'] += 1
                        kw[wd]['positions'].extend([lno, w])
                    except KeyError:
                        kw[wd] = {'text': wd, 'occurences': 1, 'positions': [lno, w]}
                    w += 1
        return kw
# Useful for element proximity, or for pre-keywording normalisation

class ExactProximityExtractor(ProximityExtractor):
    """Extract exact text with proximity information.

    For example, to check nestedness/adjacency of elements.
    """
    # XXX Not sure if this is actually useful?

    def process_string(self, session, data):
        return {data: {'text': data,
                       'positions': [0, 0],
                       'occurences': 1
                       }
                }

    def process_node(self, session, data):
        raise NotImplementedError

    def process_eventList(self, session, data):
        # Treat parent element as line, not indexed element
        # EG map in <map><attrType>1</attrType><attrVal>4</attrVal></map>
        # For attrType or attrVal
        parent = self.get_setting(session, 'parent')
        # 'is not None' replaces the py2-only '<>' comparison operator.
        if parent is not None:
            lno = int(data[0].split()[-3])
        else:
            lno = int(data[-1].split()[-1])

        txtList = []
        for e in data:
            if (e[0] == "3"):
                txtList.append(e[2:])
        txt = ''.join(txtList)
        return {txt: {'text': txt,
                      'positions': [lno, 0],
                      'occurences': 1
                      }
                }