from configParser import C3Object
from c3errors import ConfigFileException
import types, re, os

from normalizer import SimpleNormalizer, KeywordNormalizer
from textmining.TsujiiC3 import TsujiiObject, GeniaObject, EnjuObject
8
9
11 """ Base class for deriving Part of Speech Normalizers """
12 pass
13
14
class TsujiiNormalizer(PosNormalizer, TsujiiObject):
    # NB: class name, mixin and __init__ body are assumptions; the normalizer
    # tags the text and returns the tagged tokens as a space separated string.

    def __init__(self, session, node, parent):
        PosNormalizer.__init__(self, session, node, parent)
        TsujiiObject.__init__(self, session, node, parent)

    def process_string(self, session, data):
        tl = self.tag(session, data)
        return ' '.join(tl)
24
class EnjuNormalizer(PosNormalizer, EnjuObject):
    # NB: class name, mixin and __init__ body are assumptions; the normalizer
    # tags the text and returns the tagged tokens as a space separated string.

    def __init__(self, session, node, parent):
        PosNormalizer.__init__(self, session, node, parent)
        EnjuObject.__init__(self, session, node, parent)

    def process_string(self, session, data):
        tl = self.tag(session, data)
        return ' '.join(tl)
34
class GeniaNormalizer(PosNormalizer, GeniaObject):
    # NB: mixin and __init__ body are assumptions; the normalizer tags the text
    # with Genia and returns a hash of term dictionaries keyed by "word/POS".

    def __init__(self, session, node, parent):
        PosNormalizer.__init__(self, session, node, parent)
        GeniaObject.__init__(self, session, node, parent)

    def process_string(self, session, data):
        tl = self.tag(session, data)
        results = {}
        for t in tl:
            txt = t['text'] + "/" + t['pos']
            try:
                results[txt]['occurences'] += 1
            except KeyError:
                results[txt] = {'text' : txt,
                                'occurences' : 1,
                                'word' : t['text'],
                                'pos' : t['pos'],
                                'stem' : t['stem'],
                                'clause' : t['phr']}
        return results
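    # Illustrative example (field values assumed): a token tagged as
    #   {'text': 'binding', 'pos': 'NN', 'stem': 'bind', 'phr': 'I-NP'}
    # becomes results['binding/NN'] = {'text': 'binding/NN', 'occurences': 1,
    #   'word': 'binding', 'pos': 'NN', 'stem': 'bind', 'clause': 'I-NP'}.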
58
class GeniaRawNormalizer(PosNormalizer, GeniaObject):
    # NB: class name, mixin and __init__ body are assumptions; the normalizer
    # appears to pass the tagger's raw output through unchanged
    # (cf. GeniaTextNormalizer below, which consumes that raw format).

    def __init__(self, session, node, parent):
        PosNormalizer.__init__(self, session, node, parent)
        GeniaObject.__init__(self, session, node, parent)

    def process_string(self, session, data):
        tl = self.tag(session, data)
        return ''.join(tl)
69
class GeniaPosNormalizer(PosNormalizer, GeniaObject):
    # NB: class name, mixin and __init__ body are assumptions; the normalizer
    # rebuilds the text as space separated "word/POS" tokens, or "stem/POS"
    # when the 'useStem' setting is given.

    def __init__(self, session, node, parent):
        PosNormalizer.__init__(self, session, node, parent)
        GeniaObject.__init__(self, session, node, parent)
        self.stem = self.get_setting(session, 'useStem', 0)

    def process_string(self, session, data):
        tl = self.tag(session, data)
        txts = []
        if self.stem:
            fld = 'stem'
        else:
            fld = 'text'
        for t in tl:
            txts.append("%s/%s" % (t[fld], t['pos']))
        return ' '.join(txts)
89
90
92 """ Take output from GeniaNormalizer and return stems as terms """
94 results = {}
95 for d in data:
96 try:
97 results[d['stem']]['occurences'] += d['occurences']
98 except:
99 results[d['stem']] = {'text': d['stem'], 'occurences' : 1}
100 return results
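    # Illustrative only: two GeniaNormalizer terms sharing the stem 'bind', eg
    #   {'text': 'binding/NN', 'stem': 'bind', 'occurences': 2} and
    #   {'text': 'binds/VBZ', 'stem': 'bind', 'occurences': 1},
    # merge into a single term {'text': 'bind', 'occurences': 3}.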
101
class GeniaTextNormalizer(SimpleNormalizer):
    """ Take the full output from Genia and reconstruct the document, optionally with stems ('useStem') and/or PoS tags ('pos') """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        self.stem = self.get_setting(session, 'useStem', 0)
        self.pos = self.get_setting(session, 'pos', 0)
        self.puncRe = re.compile('[ ]([.,;:?!][ \n])')

    def process_string(self, session, data):
        lines = data.split('\n')
        words = []
        for l in lines:
            try:
                (word, stem, pos, other) = l[:-1].split('\t')
            except ValueError:
                # Not a tagged line (eg a blank line between sentences); keep it verbatim
                words.append(l)
                continue
            if self.stem:
                w = stem
            else:
                w = word
            if self.pos:
                w = "%s/%s" % (w, pos)
            words.append(w)
        txt = ' '.join(words)
        # Remove the space that the join introduced before sentence punctuation
        txt = self.puncRe.sub('\\1', txt)
        return txt
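    # Illustrative only (line format assumed): each tagger line looks like
    #   word<TAB>stem<TAB>POS<TAB>chunk
    # so a line "binding\tbind\tVBG\tB-VP" contributes "binding" to the rebuilt
    # text, "bind" with 'useStem' set, or "binding/VBG" with 'pos' set.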
131
132
134 """ Extract statistical multi-word noun phrases. Default phrase is one or more nouns preceded by zero or more adjectives """
135
136 - def __init__(self, session, config, parent):
137 SimpleNormalizer.__init__(self, session, config, parent)
138 match = self.get_setting(session, 'regexp')
139 if not match:
140 match = self.get_setting(session, 'pattern')
141 if not match:
142 match = "(([ ][^\\s]+/JJ[SR]?)*)(([ ][^\\s]+/NN[SP]?)+)"
143 else:
144 match = match.replace('*', '*)')
145 match = match.replace('+', '+)')
146 match = match.replace('?', '?)')
147 match = match.replace('JJ', '(([ ][^\\s]+/JJ[SR]?)')
148 match = match.replace('NN', '(([ ][^\\s]+/NN[SP]?)')
149 self.pattern = re.compile(match)
150 self.strip = re.compile('/(JJ[SR]?|NN[SP]?)')
151 m = self.get_setting(session, 'minimumWords')
152 if m:
153 self.minimum = int(m)
154 else:
155 self.minimum = 0
156
    def process_string(self, session, data):
        # Find each adjective/noun phrase, strip the PoS tags and count occurrences
        kw = {}
        has = kw.has_key
        srch = self.pattern.search
        strp = self.strip.sub
        minm = self.minimum
        m = srch(data)
        while m:
            phrase = m.group(1) + m.group(3)
            # Remove the /JJ and /NN tags, then surrounding whitespace
            phrase = strp('', phrase)
            phrase = phrase.strip()
            if not minm or phrase.count(' ') >= minm - 1:
                if has(phrase):
                    kw[phrase]['occurences'] += 1
                else:
                    kw[phrase] = {'text' : phrase, 'occurences' : 1}
            data = data[m.end():]
            m = srch(data)
        return kw
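    # Illustrative only: with the default pattern, tagged text such as
    #   "the/DT large/JJ binding/NN sites/NNS are/VBP"
    # yields {'large binding sites': {'text': 'large binding sites', 'occurences': 1}}.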
179
180
class PosKeywordNormalizer(KeywordNormalizer):
    """ Turn a string into keywords, but respecting Part of Speech tags """
    # NB: class name, parent and __init__ body are assumptions.

    def __init__(self, session, config, parent):
        KeywordNormalizer.__init__(self, session, config, parent)

    def process_string(self, session, data):
        kw = {}
        has = kw.has_key
        # Record proximity information as [location, wordOffset] pairs when the
        # 'prox' setting is given; -1 means the location is unknown here
        if (self.get_setting(session, 'prox')):
            prox = 1
        else:
            prox = 0
        w = 0
        for t in data.split():
            if has(t):
                kw[t]['occurences'] += 1
                if prox:
                    kw[t]['positions'].extend([-1, w])
                    w += 1
            elif prox:
                kw[t] = {'text' : t, 'occurences' : 1,
                         'positions' : [-1, w]}
                w += 1
            else:
                kw[t] = {'text' : t, 'occurences' : 1}
        return kw
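    # Illustrative only: "brown/JJ dog/NN brown/JJ" (with 'prox' unset) yields
    #   {'brown/JJ': {'text': 'brown/JJ', 'occurences': 2},
    #    'dog/NN': {'text': 'dog/NN', 'occurences': 1}}.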
210
    def process_hash(self, session, data):
        kw = {}
        vals = data.values()
        # Track positions if the incoming terms already carry them, or if the
        # 'prox' setting is given
        if (vals and vals[0].has_key('positions')) or self.get_setting(session, 'prox'):
            prox = 1
        else:
            prox = 0
        has = kw.has_key
        for d in vals:
            t = d['text']
            w = 0
            if prox:
                try:
                    lno = d['positions'][0]
                except (KeyError, IndexError):
                    lno = -1
            # Split multi-word terms into individual keywords
            s = t
            for t in s.split():
                if has(t):
                    kw[t]['occurences'] += 1
                    if prox:
                        kw[t]['positions'].extend([lno, w])
                        w += 1
                elif prox:
                    kw[t] = {'text' : t, 'occurences' : 1,
                             'positions' : [lno, w]}
                    w += 1
                else:
                    kw[t] = {'text' : t, 'occurences' : 1}
        return kw
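    # Illustrative only: with proximity on, a two word term such as
    #   {'text': 'binding site', 'occurences': 1, 'positions': [12, 3]}
    # is split into 'binding' and 'site' keywords with positions [12, 0] and
    # [12, 1] (positions[0] carried over from the source term, then the word offset).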
243
244
245
class PosTypeNormalizer(SimpleNormalizer):
    """ Filter terms by part of speech tag. Defaults to keeping only nouns """

    types = []

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        # Space separated list of tags to keep, from the 'posTypes' setting
        types = self.get_setting(session, 'posTypes')
        if types:
            self.types = types.split()
        else:
            # Default to nouns
            self.types = ['NN', 'NNP', 'NNS']

    def process_string(self, session, data):
        # Expect a single "word/TAG" token
        try:
            (w, t) = data.rsplit('/', 1)
        except ValueError:
            print data
            return ""
        if t in self.types:
            return w
        else:
            return ""
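
# Illustrative only: with the default types, PosTypeNormalizer turns "protein/NN"
# into "protein" and filters "quickly/RB" down to the empty string.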
271