1
2 import re
3 from configParser import C3Object
4 from baseObjects import Extractor
5 from utils import flattenTexts
6 from dateutil import parser
7 import types
8
10 """ Base extractor. Extracts exact text """
11
15
17
# NOTE(review): the enclosing `def` line is not visible in this view; this is
# presumably the body of a string-processing method -- TODO confirm.
# Wraps the input in a one-entry term hash: the string itself is both the key
# and the 'text' value, with an occurrence count of 1.
18 return {data: {'text' : data, 'occurences' : 1}}
19
26
28
# NOTE(review): the enclosing `def` line and the original indentation are not
# visible in this view; presumably the body of an event-list method -- TODO confirm.
# Concatenates the text payload of every event in `data` whose first character
# is "3", normalises it with self.spaceRe, and returns a one-entry term hash.
29 txt = []
30 for e in data:
# Only events starting with "3" contribute; the payload is taken from index 2 on.
31 if (e[0] == "3"):
# Add a separating space when the previous chunk does not end in a space and
# this payload starts with an alphanumeric character.
# NOTE(review): e[2] raises IndexError for an event shorter than 3 characters,
# and txt[-1][-1] raises IndexError if the previous chunk was empty -- verify
# that such events cannot occur.
32 if (len(txt) and txt[-1][-1] != ' ' and e[2].isalnum()):
33 txt.append(' ')
34 txt.append(e[2:])
35 txt = ''.join(txt)
# self.spaceRe is defined outside this view; every match is replaced with a
# single space (presumably collapsing runs of whitespace -- TODO confirm).
36 txt = self.spaceRe.sub(' ', txt)
37 return {txt:{'text' : txt, 'occurences' : 1}}
38
# NOTE(review): the enclosing `def` line is not visible in this view; the body
# reads two names, `a` and `b`, presumably its parameters -- TODO confirm.
# Merges term hash `b` into term hash `a` and returns the result.
# If either one is falsy the other is returned unchanged; otherwise `a` is
# mutated in place.
40 if not a:
41 return b
42 if not b:
43 return a
44 for k in b.keys():
45 try:
# Key already in `a`: add the occurrence counts together.
46 a[k]['occurences'] += b[k]['occurences']
47 try:
# Append b's positions when both entries carry a 'positions' list.
48 a[k]['positions'].extend(b[k]['positions'])
# NOTE(review): bare except -- any error here (not only a missing 'positions'
# key) is silently ignored.
49 except:
50
51 pass
# NOTE(review): bare except -- reached when `k` is missing from `a`, but it also
# catches every other error. b's entry is stored by reference, not copied.
52 except:
53 a[k] = b[k]
54 return a
55
70
71
73
74 """ Extracts keywords from the text """
75
81
# NOTE(review): the enclosing `def` line is not visible in this view -- TODO
# confirm which method this body belongs to.
# Splits every non-empty key of the dict `data` into whitespace-separated
# tokens, after replacing self.punctuationRe matches with a space, and returns
# a term hash counting how often each token occurs.
83 kw = {}
# Bound methods are cached in locals for use inside the loops.
84 reSub = self.punctuationRe.sub
# NOTE(review): dict.has_key exists only on Python 2.
85 has = kw.has_key
86 for d in data.keys():
# Empty keys are skipped.
87 if d:
88 s = reSub(' ', d)
89 for t in s.split():
90 if has(t):
91 kw[t]['occurences'] += 1
92 else:
93 kw[t] = {'text' : t, 'occurences' : 1}
94 return kw
95
105
106
108 """ Extracts a single date. Multiple dates, ranges not yet implemented """
109
118
119
# NOTE(review): the enclosing `def` line is not visible in this view -- TODO
# confirm which method this body belongs to.
# Parses a date out of the first key of the dict `data` with dateutil's parser
# and returns a one-entry term hash keyed by str() of the parsed value, or an
# empty dict when parsing fails.
# NOTE(review): indexing keys() requires Python 2 (keys() returns a list there);
# only one key is examined, the rest of `data` is ignored.
121 data = data.keys()[0]
122 try:
123
# self.fuzzy and self.default are defined outside this view and are passed
# straight through to parser.parse.
124 d = str(parser.parse(data.encode('utf-8'), fuzzy=self.fuzzy, default = self.default))
125 return {d:{'text' : d, 'occurences' : 1}}
# NOTE(review): bare except -- any failure, not only an unparseable date, falls
# through to the empty result below.
126 except:
# NOTE(review): `wds` is not used anywhere in the visible code.
127 wds = data.split()
128
129
130 return {}
131
141
142
144 """ Extract keywords and maintain information for proximity searches """
145
146
# NOTE(review): the enclosing `def` line is not visible in this view; presumably
# the proximity variant of a string-processing method -- TODO confirm.
# Tokenises the string `data` and returns a term hash in which every token
# carries an occurrence count and a flat 'positions' list of [-1, word_index]
# pairs.
148 kw = {}
# `w` is the running word index across the whole input.
149 w = 0
# NOTE(review): this local name shadows the builtin hash() inside this body.
150 hash = {data: {'text' : data, 'occurences' : 1}}
151
152 reSub = self.punctuationRe.sub
# NOTE(review): dict.has_key exists only on Python 2.
153 has = kw.has_key
154
155 for d in hash.keys():
# Empty input is skipped, giving an empty result.
156 if d:
157 s = reSub(' ', d)
158 for wd in s.split():
159 if has(wd):
160 kw[wd]['occurences'] += 1
# -1 fills the slot that the node and event-list fragments fill with `lno`.
161 kw[wd]['positions'].extend([-1, w])
162 else:
163 kw[wd] = {'text' : wd, 'occurences' : 1, 'positions' : [-1, w]}
164 w += 1
165 return kw
166
167
169
170
171
# NOTE(review): the enclosing `def` line is not visible in this view; the call
# to SimpleExtractor.process_node(self, session, data) below suggests this is a
# process_node override -- TODO confirm.
# Tokenises the text extracted from node `data` and returns a term hash whose
# 'positions' lists hold [lno, word_index] pairs, where `lno` is derived from
# the node's position in its tree.
172 path = []
173 node = data
# Climb to the root, recording each node's index among its parent's children.
174 while True:
175 parent = node.getparent()
# NOTE(review): this is a truthiness test, not `is None`; it relies on a parent
# object being truthy -- verify against the node type in use.
176 if not parent:
177 break
178 kids = parent.getchildren()
179 idx = kids.index(node)
180 path.append(idx)
181 node = parent;
# The indexes were collected leaf-to-root, so the joined string is in that order.
182 pstr= '/'.join(map(str,path))
# NOTE(review): `lno` is only as stable as the builtin hash() of a string, and
# distinct paths may collide -- confirm that is acceptable for proximity data.
183 lno = abs(hash(pstr))
184 w = 0
185
186 kwhash = SimpleExtractor.process_node(self, session, data)
187
188 kw = {}
189 reSub = self.punctuationRe.sub
190 for d in kwhash.keys():
191 if d:
192 s = reSub(' ', d)
193 for wd in s.split():
194 try:
195 kw[wd]['occurences'] += 1
196 kw[wd]['positions'].extend([lno, w])
# NOTE(review): bare except -- intended for the first sighting of `wd`
# (missing key), but it catches every other error too.
197 except:
198 kw[wd] = {'text' : wd, 'occurences' : 1, 'positions' : [lno, w]}
199 w += 1
200 return kw
201
203
204
205
206
# NOTE(review): the enclosing `def` line is not visible in this view; the call
# to SimpleExtractor.process_eventList(self, session, data) below suggests this
# is a process_eventList override -- TODO confirm.
# Tokenises the text of event list `data` and returns a term hash whose
# 'positions' lists hold [lno, word_index] pairs.
# `lno` is read from the events themselves: the third-from-last
# whitespace-separated field of the first event when the 'parent' setting is
# truthy, otherwise the last field of the last event.
# NOTE(review): the meaning of those fields is not shown here -- verify against
# the event format.
207 if (self.get_setting(session, 'parent')):
208 lno = int(data[0].split()[-3])
209 else:
210 lno = int(data[-1].split()[-1])
211 w = 0
212
# NOTE(review): this local name shadows the builtin hash() inside this body.
213 hash = SimpleExtractor.process_eventList(self, session, data)
214
215 kw = {}
216 reSub = self.punctuationRe.sub
# NOTE(review): `has` is not used in the visible code (the loop uses try/except
# instead); dict.has_key exists only on Python 2.
217 has = kw.has_key
218 for d in hash.keys():
219 if d:
220 s = reSub(' ', d)
221 for wd in s.split():
222 try:
223 kw[wd]['occurences'] += 1
224 kw[wd]['positions'].extend([lno, w])
# NOTE(review): bare except -- intended for the first sighting of `wd`
# (missing key), but it catches every other error too.
225 except:
226 kw[wd] = {'text' : wd, 'occurences' : 1, 'positions' : [lno, w]}
227 w += 1
228 return kw
229
230
231
233 """ Extract exact text with proximity information. For example, to check nestedness/adjacency of elements """
234
235
# NOTE(review): the enclosing `def` line is not visible in this view; presumably
# a string-processing method of the exact-text proximity extractor -- TODO confirm.
# Returns a one-entry term hash for the whole input string, with a fixed
# position pair of [0, 0] and an occurrence count of 1.
237 return {data : {'text' : data,
238 'positions' : [0, 0],
239 'occurences' : 1
240 }
241 }
242
# NOTE(review): the enclosing `def` line is not visible in this view -- TODO
# confirm which method this is. The operation is unsupported here: the body
# unconditionally raises NotImplementedError.
244 raise NotImplementedError
245
247
248
249
# NOTE(review): the enclosing `def` line is not visible in this view; presumably
# an event-list method of the exact-text proximity extractor -- TODO confirm.
# Joins the text payload of every event in `data` that starts with "3" and
# returns a one-entry term hash for it with positions [lno, 0].
250 parent = self.get_setting(session, 'parent')
# NOTE(review): `<>` is Python 2-only syntax. This compares against None,
# whereas another fragment in this file tests the same setting by truthiness;
# the two disagree for falsy non-None values.
251 if (parent <> None):
# Third-from-last whitespace-separated field of the first event.
252 lno = int(data[0].split()[-3])
253 else:
# Last whitespace-separated field of the last event.
254 lno = int(data[-1].split()[-1])
255
# NOTE(review): `kw` is not used in the visible code.
256 kw = {}
257 txtList = []
258 for e in data:
259 if (e[0] == "3"):
# The payload starts at index 2; chunks are joined with no separator or
# whitespace normalisation.
260 txtList.append(e[2:])
261 txt = ''.join(txtList)
262 return {txt : {'text' : txt,
263 'positions' : [lno, 0],
264 'occurences' : 1
265 }
266 }
267