
Source Code for Module textmining.tmNormalizer

from configParser import C3Object
from c3errors import ConfigFileException
import types, re, os

from normalizer import SimpleNormalizer, KeywordNormalizer
from textmining.TsujiiC3 import TsujiiObject, GeniaObject, EnjuObject


class PosNormalizer(SimpleNormalizer):
    """ Base class for deriving Part of Speech Normalizers """
    pass


class TsujiiPosNormalizer(PosNormalizer, TsujiiObject):

    def __init__(self, session, node, parent):
        # body hidden in the original listing; presumably it chains both
        # parent constructors, as the sibling classes below do
        PosNormalizer.__init__(self, session, node, parent)
        TsujiiObject.__init__(self, session, node, parent)

    def process_string(self, session, data):
        tl = self.tag(session, data)
        return ' '.join(tl)

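# Illustrative sketch (not part of the original module): assuming the Tsujii
# tagger yields one 'word/TAG' token per word, process_string would turn
# "The cat sat" into something like "The/DT cat/NN sat/VBD".
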

class EnjuNormalizer(PosNormalizer, EnjuObject):

    def __init__(self, session, node, parent):
        # body hidden in the original listing; presumably it chains both
        # parent constructors, as the sibling classes below do
        PosNormalizer.__init__(self, session, node, parent)
        EnjuObject.__init__(self, session, node, parent)

    def process_string(self, session, data):
        tl = self.tag(session, data)
        return ' '.join(tl)


class GeniaNormalizer(PosNormalizer, GeniaObject):

    def __init__(self, session, node, parent):
        # body hidden in the original listing; presumably it chains both
        # parent constructors, as the sibling classes below do
        PosNormalizer.__init__(self, session, node, parent)
        GeniaObject.__init__(self, session, node, parent)

    def process_string(self, session, data):
        # tag the exact input string
        tl = self.tag(session, data)
        # count occurrences of each word/POS pair
        results = {}
        for t in tl:
            txt = t['text'] + "/" + t['pos']
            try:
                results[txt]['occurences'] += 1
            except KeyError:
                results[txt] = {'text': txt,
                                'occurences': 1,
                                'word': t['text'],
                                'pos': t['pos'],
                                'stem': t['stem'],
                                'clause': t['phr']}
        return results

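# Illustrative sketch (not part of the original module): for an input such as
# "p53 binds DNA", the hash returned above holds one entry per distinct
# word/POS pair; the tag and chunk values here are hypothetical:
#
#   {'p53/NN': {'text': 'p53/NN', 'occurences': 1, 'word': 'p53',
#               'pos': 'NN', 'stem': 'p53', 'clause': 'B-NP'}, ...}
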

class UnparsedGeniaNormalizer(PosNormalizer, GeniaObject):
    def __init__(self, session, node, parent):
        PosNormalizer.__init__(self, session, node, parent)
        GeniaObject.__init__(self, session, node, parent)
        self.unparsedOutput = 1

    def process_string(self, session, data):
        tl = self.tag(session, data)
        return ''.join(tl)

69
class ExactGeniaNormalizer(PosNormalizer, GeniaObject):

    def __init__(self, session, node, parent):
        PosNormalizer.__init__(self, session, node, parent)
        GeniaObject.__init__(self, session, node, parent)
        self.stem = self.get_setting(session, 'useStem')

    def process_string(self, session, data):
        # tag the exact input string
        tl = self.tag(session, data)
        # emit each token as "word/POS", or "stem/POS" if 'useStem' is set
        txts = []
        if self.stem:
            fld = 'stem'
        else:
            fld = 'text'
        for t in tl:
            txts.append("%s/%s" % (t[fld], t['pos']))
        return ' '.join(txts)

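# Illustrative sketch (not part of the original module): with 'useStem' unset,
# "cells were treated" would come back as something like
# "cells/NNS were/VBD treated/VBN"; with 'useStem' set, the stemmed forms are
# used instead, e.g. "cell/NNS be/VBD treat/VBN" (tags hypothetical).
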

class GeniaStemNormalizer(SimpleNormalizer):
    """ Take output from GeniaNormalizer and return stems as terms """

    def process_hash(self, session, data):
        results = {}
        # data is the hash produced by GeniaNormalizer, keyed on "word/POS";
        # iterate its values and merge entries that share a stem
        for d in data.values():
            try:
                results[d['stem']]['occurences'] += d['occurences']
            except KeyError:
                results[d['stem']] = {'text': d['stem'],
                                      'occurences': d['occurences']}
        return results

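# Illustrative sketch (not part of the original module): two input entries
# whose 'stem' fields agree, say 'binds/VBZ' and 'bind/VB' both stemmed to
# 'bind' with one occurrence each, would merge into:
#
#   {'bind': {'text': 'bind', 'occurences': 2}}
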

class GeniaTextNormalizer(SimpleNormalizer):
    """ Take the full output from Genia and reconstruct the document, maybe
    with stems ('useStem') and/or PoS tags ('pos') """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        self.stem = self.get_setting(session, 'useStem', 0)
        self.pos = self.get_setting(session, 'pos', 0)
        self.puncRe = re.compile('[ ]([.,;:?!][ \n])')

    def process_string(self, session, data):
        lines = data.split('\n')
        words = []
        for l in lines:
            try:
                (word, stem, pos, other) = l[:-1].split('\t')
            except ValueError:
                # empty line
                words.append(l)
                continue
            if self.stem:
                w = stem
            else:
                w = word
            if self.pos:
                w = "%s/%s" % (w, pos)
            words.append(w)
        txt = ' '.join(words)
        # re-attach punctuation to the preceding word
        txt = self.puncRe.sub('\\1', txt)
        return txt

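# Illustrative sketch (not part of the original module): the tagger output
# consumed above is one token per line, with tab-separated word, stem, PoS
# and one trailing field; with 'pos' set, a line carrying word "binds", stem
# "bind" and tag "VBZ" would contribute "binds/VBZ" (or "bind/VBZ" with
# 'useStem') to the rebuilt text.
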

class PosPhraseNormalizer(SimpleNormalizer):
    """ Extract statistical multi-word noun phrases. The default phrase is
    one or more nouns preceded by zero or more adjectives """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        match = self.get_setting(session, 'regexp')
        if not match:
            match = self.get_setting(session, 'pattern')
            if not match:
                match = "(([ ][^\\s]+/JJ[SR]?)*)(([ ][^\\s]+/NN[SP]?)+)"
            else:
                # expand the JJ/NN shorthand into full tagged-token groups
                match = match.replace('*', '*)')
                match = match.replace('+', '+)')
                match = match.replace('?', '?)')
                match = match.replace('JJ', '(([ ][^\\s]+/JJ[SR]?)')
                match = match.replace('NN', '(([ ][^\\s]+/NN[SP]?)')
        self.pattern = re.compile(match)
        self.strip = re.compile('/(JJ[SR]?|NN[SP]?)')
        m = self.get_setting(session, 'minimumWords')
        if m:
            self.minimum = int(m)
        else:
            self.minimum = 0

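# Illustrative sketch (not part of the original module): the 'pattern'
# shorthand is expanded tag by tag, so a config value of "JJ*NN+" becomes
# exactly the default expression:
#
#   "(([ ][^\\s]+/JJ[SR]?)*)(([ ][^\\s]+/NN[SP]?)+)"
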
    def process_string(self, session, data):
        # input: tagged string, pre keywording
        # output: hash of phrases
        kw = {}
        has = kw.has_key
        srch = self.pattern.search
        strp = self.strip.sub
        minm = self.minimum
        m = srch(data)
        while m:
            phrase = m.group(1) + m.group(3)
            # strip the PoS tags
            phrase = strp('', phrase)
            phrase = phrase.strip()
            if not minm or phrase.count(' ') >= minm - 1:
                if has(phrase):
                    kw[phrase]['occurences'] += 1
                else:
                    kw[phrase] = {'text': phrase, 'occurences': 1}
            data = data[m.end():]
            m = srch(data)
        return kw

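# Illustrative sketch (not part of the original module): given the tagged
# input " the/DT old/JJ wine/NN cellars/NNS remain/VBP", the default pattern
# matches " old/JJ" followed by " wine/NN cellars/NNS"; after tag stripping
# the result is {'old wine cellars': {'text': 'old wine cellars',
# 'occurences': 1}}.
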

class PosKeywordNormalizer(KeywordNormalizer):
    # Needs to respect keywording rules from POS Tagger
    """ Turn string into keywords, but respecting Part of Speech tags """

    def __init__(self, session, config, parent):
        # body hidden in the original listing; presumably it just chains the
        # parent constructor
        KeywordNormalizer.__init__(self, session, config, parent)

    def process_string(self, session, data):
        kw = {}
        has = kw.has_key
        # force proximity information if the 'prox' setting is on
        if (self.get_setting(session, 'prox')):
            prox = 1
        else:
            prox = 0
        w = 0
        for t in data.split():
            if has(t):
                kw[t]['occurences'] += 1
                if prox:
                    kw[t]['positions'].extend([-1, w])
                    w += 1
            elif prox:
                kw[t] = {'text': t, 'occurences': 1,
                         'positions': [-1, w]}
                w += 1
            else:
                kw[t] = {'text': t, 'occurences': 1}
        return kw

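# Illustrative sketch (not part of the original module): with 'prox' set, the
# input "go stop go" records (element, word) position pairs with a dummy
# element number of -1:
#
#   {'go': {'text': 'go', 'occurences': 2, 'positions': [-1, 0, -1, 2]},
#    'stop': {'text': 'stop', 'occurences': 1, 'positions': [-1, 1]}}
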

    def process_hash(self, session, data):
        kw = {}
        vals = data.values()
        has = kw.has_key
        # keep proximity information if the incoming hash already carries
        # positions, or if the 'prox' setting is switched on
        if (vals and vals[0].has_key('positions')) or \
           self.get_setting(session, 'prox'):
            prox = 1
        else:
            prox = 0
        for d in vals:
            t = d['text']
            w = 0
            if prox:
                try:
                    lno = d['positions'][0]
                except (KeyError, IndexError):
                    lno = -1
            s = t
            for t in s.split():
                if has(t):
                    kw[t]['occurences'] += 1
                    if prox:
                        kw[t]['positions'].extend([lno, w])
                        w += 1
                elif prox:
                    kw[t] = {'text': t, 'occurences': 1,
                             'positions': [lno, w]}
                    w += 1
                else:
                    kw[t] = {'text': t, 'occurences': 1}
        return kw

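# Illustrative sketch (not part of the original module): fed the phrase hash
# from PosPhraseNormalizer, process_hash splits each multi-word phrase back
# into single keywords, so {'old wine': {'text': 'old wine', 'occurences': 1}}
# becomes {'old': {'text': 'old', 'occurences': 1},
#          'wine': {'text': 'wine', 'occurences': 1}}.
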

class PosTypeNormalizer(SimpleNormalizer):
    """ Filter by part of speech tags. Defaults to keeping only nouns """

    types = []

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        # load the tag types to keep from the config
        types = self.get_setting(session, 'posTypes')
        if types:
            self.types = types.split()
        else:
            # default to nouns
            self.types = ['NN', 'NNP', 'NNS']

    def process_string(self, session, data):
        try:
            (w, t) = data.rsplit('/', 1)
        except ValueError:
            # no tag separator found; report the token and drop it
            print data
            return ""
        if t in self.types:
            return w
        else:
            return ""

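# Illustrative sketch (not part of the original module): with the default
# types, process_string keeps nouns and drops everything else, so "cat/NN"
# returns "cat" while "ran/VBD" returns "".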