from configParser import C3Object
from baseObjects import Normalizer
from c3errors import ConfigFileException
import types, re, os
6
7
8
class SimpleNormalizer(Normalizer):
    """ Base normalizer. Simply returns the data (shouldn't be used directly) """

    def __init__(self, session, config, parent):
        Normalizer.__init__(self, session, config, parent)

    def process_string(self, session, data):
        return data

    def process_hash(self, session, data):
        kw = {}
        has = kw.has_key
        vals = data.values()
        if not vals:
            return kw
        prox = vals[0].has_key('positions')
        process = self.process_string
        for d in vals:
            new = process(session, d['text'])
            if type(new) == types.DictType:
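                # process_string returned a hash of terms (a single term was
                # expanded into several); merge each returned term into kw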
                for k in new.values():
                    txt = k['text']
                    if has(txt):
                        kw[txt]['occurences'] += k['occurences']
                        if prox:
                            kw[txt]['positions'].extend(k['positions'])
                    else:
                        kw[txt] = k
            else:
                if new != None:
                    try:
                        kw[new]['occurences'] += d['occurences']
                        if prox:
                            kw[new]['positions'].extend(d['positions'])
                    except KeyError:
                        d = d.copy()
                        d['text'] = new
                        kw[new] = d
        return kw
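    # Note (inferred from the keys used above): each value in the incoming
    # hash describes one term, e.g.
    #     {'text': 'The', 'occurences': 2, 'positions': [0, 14]}
    # ('positions' is only present when proximity information is stored).
    # Terms whose normalized text collides are merged: 'occurences' are
    # summed and 'positions' extended.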

class DataExistsNormalizer(SimpleNormalizer):
    """ Return '1' if any data exists, otherwise '0' """

    def process_string(self, session, data):
        if data:
            return "1"
        else:
            return "0"

class TermExistsNormalizer(SimpleNormalizer):
    """ Un-stoplist anonymizing normalizer. E.g. for use with data mining """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        # 'termlist' setting name assumed: whitespace separated terms to keep
        self.termlist = self.get_setting(session, 'termlist', '').split()

    def process_string(self, session, data):
        if data in self.termlist:
            return "1"
        else:
            return "0"

class CaseNormalizer(SimpleNormalizer):
    """ Reduce text to lower case """

    def process_string(self, session, data):
        return data.lower()

class ReverseNormalizer(SimpleNormalizer):
    """ Reverse string (e.g. for left truncation) """

    def process_string(self, session, data):
        return data[::-1]

class SpaceNormalizer(SimpleNormalizer):
    """ Reduce multiple whitespace to single space character """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        self.whitespace = re.compile('\s+')

    def process_string(self, session, data):
        data = data.strip()
        data = self.whitespace.sub(' ', data)
        return data

class ArticleNormalizer(SimpleNormalizer):
    """ Remove leading English articles (the, a, an) """

    def process_string(self, session, data):
        d = data.lower()
        if (d[:4] == "the "):
            return data[4:]
        elif (d[:2] == "a "):
            return data[2:]
        elif (d[:3] == "an "):
            return data[3:]
        else:
            return data
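    # For example: "The Lord of the Rings" -> "Lord of the Rings",
    # "A Tale of Two Cities" -> "Tale of Two Cities" (only a single leading
    # article is removed, and the case of the remainder is kept).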

class NumericEntityNormalizer(SimpleNormalizer):
    """ Replace characters matching regular expression with the equivalent numeric character entity """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        # default pattern and replacement function are assumed reconstructions
        self.regexp = re.compile(self.get_setting(session, 'regexp', '[\x7f-\xff]'))
        self.function = lambda m: "&#%d;" % ord(m.group(0))

    def process_string(self, session, data):
        return self.regexp.sub(self.function, data)

class RegexpNormalizer(SimpleNormalizer):
    """ Either strip, replace or keep data which matches a given regular expression """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        # setting names ('regexp', 'char', 'keep') are assumed reconstructions
        self.char = self.get_setting(session, 'char', '')
        self.keep = self.get_setting(session, 'keep', 0)
        self.regexp = re.compile(self.get_setting(session, 'regexp', ''))

    def process_string(self, session, data):
        if self.keep:
            l = self.regexp.findall(data)
            return self.char.join(l)
        else:
            return self.regexp.sub(self.char, data)
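    # With keep set, only the substrings matching the expression are returned,
    # joined by the configured string; otherwise every match is replaced by
    # that string (an empty string simply strips the matches).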

class PossessiveNormalizer(SimpleNormalizer):
    """ Remove trailing 's or s' from words """

    def process_string(self, session, data):
        if (data[-2:] == "s'"):
            return data[:-1]
        elif (data[-2:] == "'s"):
            return data[:-2]
        else:
            return data
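    # For example: "dogs'" -> "dogs" and "dog's" -> "dog".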

class IntNormalizer(SimpleNormalizer):
    """ Turn a string into an integer """

    def process_string(self, session, data):
        try:
            return long(data)
        except:
            return None

class StringIntNormalizer(SimpleNormalizer):
    """ Turn an integer into a 0-padded string, 12 characters long """

    def process_string(self, session, data):
        try:
            d = long(data)
            return "%012d" % (d)
        except:
            return None
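    # For example: "42" -> "000000000042"; non-numeric input yields None.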

class StoplistNormalizer(SimpleNormalizer):
    """ Remove words that match a stopword list """
    stoplist = {}

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        # assumed: one stopword per line in the file at the object's 'stoplist' path
        self.stoplist = {}
        fh = open(self.get_path(session, 'stoplist'))
        for line in fh:
            self.stoplist[line.strip()] = 1
        fh.close()

    def process_string(self, session, data):
        if (self.stoplist.has_key(data)):
            return None
        else:
            return data
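    # Returning None removes the term: SimpleNormalizer.process_hash skips
    # entries whose normalized text is None, so stopwords are dropped.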

try:
    import txngstemmer as Stemmer

    class StemNormalizer(SimpleNormalizer):
        """ Use a Snowball stemmer to stem the terms """
        stemmer = None

        def __init__(self, session, config, parent):
            SimpleNormalizer.__init__(self, session, config, parent)
            lang = self.get_setting(session, 'language', 'english')
            try:
                self.stemmer = Stemmer.Stemmer(lang)
            except:
                raise ConfigFileException("Unknown stemmer language: %s" % (lang))