
from baseObjects import Index, Document, Record
from configParser import C3Object
# NOTE: Record, StringDocument and PermissionException are referenced below but were
# not imported in this extract; module paths for these imports are assumed.
from document import StringDocument
from utils import elementType, getFirstData, verifyXPaths, flattenTexts
from c3errors import ConfigFileException, PermissionException
import re, types, sys, os, struct, time
from record import SaxRecord, DomRecord
from resultSet import SimpleResultSet, SimpleResultSetItem
from PyZ3950 import CQLParser, SRWDiagnostics
import codecs

# Termine support is optional
try:
    import termine
except:
    pass

from XPathProcessor import XPathProcessor

# NOTE: the class statement below is reconstructed (the original line is elided);
# ArrayIndex(SimpleIndex) at the end of the module refers to it.
class SimpleIndex(Index):

    sources = []
    xPathAllAbsolute = 1
    xPathAttributesRequired = []
    xPathsNormalized = {}
    currentFullPath = []
    currentPath = []
    storeOrig = 0
    debug = 0

    indexingTerm = ""
    indexingData = []

    def _handleConfigNode(self, session, node):
        if (node.localName == "source"):
            process = []
            preprocess = []
            xp = None
            for child in node.childNodes:
                if child.nodeType == elementType:
                    if child.localName == "xpath":
                        if xp is None:
                            ref = child.getAttributeNS(None, 'ref')
                            if ref:
                                xp = self.get_object(session, ref)
                            else:
                                xp = XPathProcessor(session, node, self)
                                xp._handleConfigNode(session, node)
                    elif child.localName == "preprocess":
                        # Build the preprocess chain as [type, ref] pairs
                        for child2 in child.childNodes:
                            if child2.nodeType == elementType and child2.localName == "object":
                                preprocess.append([child2.getAttributeNS(None, 'type'), child2.getAttributeNS(None, 'ref')])
                    elif child.localName == "process":
                        # Build the process chain as [type, ref] pairs
                        for child2 in child.childNodes:
                            if child2.nodeType == elementType and child2.localName == "object":
                                process.append([child2.getAttributeNS(None, 'type'), child2.getAttributeNS(None, 'ref')])
            if xp is None:
                raise ConfigFileException("No XPath given for index %s" % self.id)
            self.sources.append((xp, process, preprocess))
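
    # Illustrative <source> configuration fragment handled above (element and attribute
    # names follow the parsing code; the xpath value and object refs are hypothetical):
    #
    #   <source>
    #     <xpath>dc:title</xpath>
    #     <process>
    #       <object type="extractor" ref="SimpleExtractor"/>
    #       <object type="normalizer" ref="CaseNormalizer"/>
    #     </process>
    #   </source>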

    def __init__(self, session, node, parent):
        self.sources = []
        self.xPathAttributesRequired = []
        self.xPathsNormalized = {}
        self.xPathAllAbsolute = 1
        self.indexingTerm = ""
        self.indexingData = []
        lss = self.get_setting(session, 'longSize')
        if lss:
            self.longStructSize = int(lss)
        else:
            self.longStructSize = len(struct.pack('L', 1))

        Index.__init__(self, session, node, parent)
        self.debug = self.get_setting(session, 'debug')

        iStore = self.get_path(session, 'indexStore')
        self.indexStore = iStore

        if iStore is None:
            raise ConfigFileException("Index (%s) does not have an indexStore." % (self.id))
        elif not iStore.contains_index(session, self):
            iStore.create_index(session, self)

        for s in range(len(self.sources)):
            if self.sources[s][1][0][0] != 'extractor':
                raise ConfigFileException("First link in process chain must be an Extractor.")
            for t in range(len(self.sources[s][1])):
                o = self.get_object(session, self.sources[s][1][t][1])
                if o is not None:
                    self.sources[s][1][t][1] = o
                else:
                    raise ConfigFileException("[%s] Unknown object %s" % (self.id, self.sources[s][1][t][1]))
            for t in range(len(self.sources[s][2])):
                o = self.get_object(session, self.sources[s][2][t][1])
                if o is not None:
                    self.sources[s][2][t][1] = o
                else:
                    raise ConfigFileException("Unknown object %s" % (self.sources[s][2][t][1]))

    def _mergeHash(self, a, b):
        # Merge term hash b into a: sum occurrence counts and concatenate position lists
        if not a:
            return b
        if not b:
            return a
        for k in b.keys():
            try:
                a[k]['occurences'] += b[k]['occurences']
                try:
                    a[k]['positions'].extend(b[k]['positions'])
                except:
                    # No positions recorded for this term
                    pass
            except:
                a[k] = b[k]
        return a
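
    # _processChain walks a configured chain of (type, object) pairs, transforming the
    # working hash at each step and recursing on the remainder of the chain.  An
    # illustrative chain (object names hypothetical):
    #   [['extractor', SimpleExtractor], ['normalizer', CaseNormalizer]]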
    def _processChain(self, session, data, process):
        (otype, obj) = process[0]
        new = {}
        if (otype == 'extractor'):
            # Extract terms from SAX event lists, strings or DOM nodes
            for d in data:
                if (type(d) == types.ListType):
                    new = self._mergeHash(new, obj.process_eventList(session, d))
                elif (type(d) in types.StringTypes):
                    new = self._mergeHash(new, obj.process_string(session, d))
                else:
                    new = self._mergeHash(new, obj.process_node(session, d))

        elif (otype == 'normalizer'):
            new = obj.process_hash(session, data)

        elif (otype == 'preParser'):
            fn = obj.process_document
            for d in data.keys():
                if (not isinstance(d, Document)):
                    doc = StringDocument(d, self)
                else:
                    doc = d
                new[fn(session, doc)] = data[d]

        elif (otype == 'parser'):
            fn = obj.process_document
            for d in data.keys():
                try:
                    new[fn(session, d)] = data[d]
                except Exception, err:
                    f = d.get_raw()
                    err.text = f
                    raise err

        elif (otype == 'index'):
            if obj == self:
                raise ConfigFileException("Infinitely recursive process chain!")
            fn = obj.index_record
            for d in data.keys():
                fn(session, d)

        elif (otype == 'transformer'):
            fn = obj.process_record
            for d in data.keys():
                new[fn(session, d)] = data[d]

        elif (otype == 'indexStore'):
            raise NotImplementedError

        elif (otype == 'recordStore'):
            fn = obj.store_record
            for d in data.keys():
                if (not isinstance(d, Record)):
                    raise ValueError
                fn(session, d)
            new = data
        else:
            raise ConfigFileException("Unknown object type: %s" % (otype))

        # Recurse down the rest of the chain
        if (len(process) == 1):
            return new
        else:
            return self._processChain(session, new, process[1:])
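
    # Illustrative search call (hypothetical setup; assumes a simple CQL clause parsed
    # with PyZ3950's CQLParser and an already configured session and database):
    #   clause = CQLParser.parse('dc.title any "fish"')
    #   rs = index.search(session, clause, db)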
    def search(self, session, clause, db):
        p = self.permissionHandlers.get('info:srw/operation/2/search', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to search index %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to search index %s" % self.id)
        pn = self.get_setting(session, 'termProcess')
        if pn is None:
            pn = 0
        else:
            pn = int(pn)
        process = self.sources[pn][1]
        res = self._processChain(session, [clause.term.value], process)
        store = self.get_path(session, 'indexStore')
        matches = []
        rel = clause.relation

        # Relation-specific term lookup
        if (clause.relation.value in ['any', 'all', '=', 'exact']):
            for k in res:
                term = store.fetch_term(session, self, k)
                s = self.construct_resultSet(session, term, res[k])
                matches.append(s)
        elif (clause.relation.value in ['>=', '>', '<', '<=']):
            if (len(res) != 1):
                d = SRWDiagnostics.Diagnostic24()
                d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                raise d
            else:
                termList = store.fetch_termList(session, self, res.keys()[0], 0, clause.relation.value)
                for t in termList:
                    matches.append(self.construct_resultSet(session, t[1]))
        elif (clause.relation.value == "within"):
            if (len(res) != 2):
                d = SRWDiagnostics.Diagnostic24()
                d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                raise d
            else:
                termList = store.fetch_termList(session, self, res.keys()[0], end=res.keys()[1])
                for t in termList:
                    matches.append(self.construct_resultSet(session, t[1]))
        else:
            d = SRWDiagnostics.Diagnostic24()
            d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
            raise d
        base = SimpleResultSet(session)
        if not matches:
            return base
        else:
            rs = base.combine(session, matches, clause, db)
            # Optional Termine-based term weighting
            tdb = self.get_path(session, 'termineDb')
            if tdb:
                rs.termWeight = 0
                for k in res:
                    w = termine.fetch_weight(session, k, tdb)
                    rs.termWeight += w
            return rs

    def scan(self, session, value, numReq, direction=">="):
        p = self.permissionHandlers.get('info:srw/operation/2/scan', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to scan index %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to scan index %s" % self.id)
        pn = self.get_setting(session, 'termProcess')
        if pn is None:
            pn = 0
        else:
            pn = int(pn)
        process = self.sources[pn][1]
        res = self._processChain(session, [value], process)
        if (len(res) != 1):
            d = SRWDiagnostics.Diagnostic24()
            d.details = "%s" % (value)
            raise d
        store = self.get_path(session, 'indexStore')
        tList = store.fetch_termList(session, self, res.keys()[0], numReq=numReq, relation=direction, summary=1)
        return tList
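
    # On-disk term structure used below (as implied by the (de)serialisation code):
    # a flat sequence of C longs -- [termid, nRecs, nOccs, docid, storeid, occs, ...] --
    # i.e. a three-long header followed by one (docid, storeid, occs) triple per record.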

    # NOTE: the serialise/deserialise method signatures below are assumed; the original
    # definition lines are elided in this extract.
    def serialise_terms(self, termid, terms, recs=0, occs=0):
        if not recs:
            recs = len(terms) / 3
            occs = sum(terms[2::3])
        fmt = 'lll' * (recs + 1)
        params = [fmt, termid, recs, occs] + terms
        return struct.pack(*params)

    def deserialise_terms(self, data):
        fmt = 'lll' * (len(data) / (3 * self.longStructSize))
        return struct.unpack(fmt, data)
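
    # Worked example for merge_terms below (values hypothetical): adding one posting to
    # an entry for termid 42 that already holds record (10, 0) with 2 occurrences:
    #   merge_terms([42, 1, 2, 10, 0, 2], [11, 0, 1], op="add", recs=1, occs=1)
    #   --> [42, 2, 3, 10, 0, 2, 11, 0, 1]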

    def merge_terms(self, structTerms, newTerms, op="replace", recs=0, occs=0):
        (termid, oldTotalRecs, oldTotalOccs) = structTerms[0:3]
        structTerms = list(structTerms[3:])

        if op == 'add':
            structTerms.extend(newTerms)
            if recs:
                trecs = oldTotalRecs + recs
                toccs = oldTotalOccs + occs
            else:
                trecs = oldTotalRecs + len(newTerms) / 3
                toccs = oldTotalOccs + sum(newTerms[2::3])
        elif op == 'replace':
            for n in range(0, len(newTerms), 3):
                docid = newTerms[n]
                storeid = newTerms[n+1]
                replaced = 0
                for x in range(3, len(structTerms), 3):
                    if structTerms[x] == docid and structTerms[x+1] == storeid:
                        structTerms[x+2] = newTerms[n+2]
                        replaced = 1
                        break
                if not replaced:
                    structTerms.extend([docid, storeid, newTerms[n+2]])
            trecs = len(structTerms) / 3
            toccs = sum(structTerms[2::3])
        elif op == 'delete':
            for n in range(0, len(newTerms), 3):
                docid = newTerms[n]
                storeid = newTerms[n+1]
                for x in range(0, len(structTerms), 3):
                    if structTerms[x] == docid and structTerms[x+1] == storeid:
                        del structTerms[x:x+3]
                        break
            trecs = len(structTerms) / 3
            toccs = sum(structTerms[2::3])

        merged = [termid, trecs, toccs] + structTerms
        return merged

    def construct_item(self, session, term, rsitype="SimpleResultSetItem"):
        # (body elided)
        pass


# NOTE: the class statement below and the serialise/deserialise signatures in this
# class are reconstructed; the original definition lines are elided in this extract.
class ProximityIndex(SimpleIndex):
    """ Need to use prox extractor """
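
    # Proximity postings (as implied by deserialise_terms below): each record entry is
    # (docid, storeid, nOccs) followed by nOccs * 2 position longs, so entries are
    # variable length rather than fixed three-long triples.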

    def serialise_terms(self, termid, terms, recs=0, occs=0):
        fmt = 'l' * (len(terms) + 3)
        params = [fmt, termid, recs, occs] + terms
        try:
            val = struct.pack(*params)
        except:
            print params
            raise
        return val

    def deserialise_terms(self, data):
        fmt = 'L' * (len(data) / self.longStructSize)
        flat = struct.unpack(fmt, data)
        (termid, totalRecs, totalOccs) = flat[:3]
        idx = 3
        docs = [termid, totalRecs, totalOccs]
        while idx < len(flat):
            doc = list(flat[idx:idx+3])
            nidx = idx + 3 + (doc[2]*2)
            doc.extend(flat[idx+3:nidx])
            idx = nidx
            docs.append(doc)
        return docs

    def merge_terms(self, structTerms, newTerms, op="replace", recs=0, occs=0):
        (termid, oldTotalRecs, oldTotalOccs) = structTerms[0:3]
        structTerms = list(structTerms[3:])

        if op == 'add':
            # Flatten the existing per-record lists before appending the new postings
            terms = []
            for t in structTerms:
                terms.extend(t)
            terms.extend(newTerms)
            structTerms = terms
            if recs != 0:
                trecs = oldTotalRecs + recs
                toccs = oldTotalOccs + occs
            else:
                trecs = oldTotalRecs + len(newTerms)
                toccs = oldTotalOccs
                for t in newTerms:
                    toccs = toccs + t[2]
                raise ValueError("FIXME: mergeTerms needs recs/occs params")
        elif op == 'replace':
            raise NotImplementedError()
            newOccs = 0
            for new in newTerms:
                docid = new[0]
                storeid = new[1]
                replaced = 0
                for x in range(3, len(structTerms)):
                    old = structTerms[x]
                    if old[0] == docid and old[1] == storeid:
                        structTerms[x][2] = new[2]
                        structTerms[x][3:] = new[3:]
                        newOccs = newOccs + new[2]
                        replaced = 1
                        break
                if not replaced:
                    structTerms.append(new)
                    newOccs = newOccs + new[2]
            trecs = len(structTerms)
            toccs = oldTotalOccs + newOccs
        elif op == 'delete':
            delOccs = 0
            idx = 0
            while idx < len(newTerms):
                doc = list(newTerms[idx:idx+3])
                idx = idx + 3 + (doc[2]*2)
                for x in range(len(structTerms)):
                    old = structTerms[x]
                    if old[0] == doc[0] and old[1] == doc[1]:
                        delOccs = delOccs + old[2]
                        del structTerms[x]
                        break
            trecs = len(structTerms) - 3
            toccs = oldTotalOccs - delOccs

        # Flatten back to a single list of longs
        terms = []
        for t in structTerms:
            terms.extend(t)
        structTerms = terms

        merged = [termid, trecs, toccs]
        merged.extend(structTerms)
        return merged

    # (intervening code elided)

    def search(self, session, clause, db):
        p = self.permissionHandlers.get('info:srw/operation/2/search', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to search index %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to search index %s" % self.id)

        pn = self.get_setting(session, 'termProcess')
        if pn is None:
            pn = 0
        else:
            pn = int(pn)

        # Note: drops the final object from the process chain when normalising the term
        process = self.sources[pn][1][:-1]
        res = self._processChain(session, [clause.term.value], process)

        store = self.get_path(session, 'indexStore')
        matches = []
        rel = clause.relation

        if clause.relation.value == 'encloses':
            pass

        if (clause.relation.value in ['any', 'all', '=', 'exact']):
            for k in res:
                term = store.fetch_term(session, self, k)
                s = self.construct_resultSet(session, term, res[k])
                matches.append(s)
        elif (clause.relation.value in ['>=', '>', '<', '<=']):
            if (len(res) != 1):
                d = SRWDiagnostics.Diagnostic24()
                d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                raise d
            else:
                termList = store.fetch_termList(session, self, res.keys()[0], 0, clause.relation.value)
                for t in termList:
                    matches.append(self.construct_resultSet(session, t[1]))
        elif (clause.relation.value == "within"):
            if (len(res) != 2):
                d = SRWDiagnostics.Diagnostic24()
                d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                raise d
            else:
                termList = store.fetch_termList(session, self, res.keys()[0], end=res.keys()[1])
                for t in termList:
                    matches.append(self.construct_resultSet(session, t[1]))
        else:
            d = SRWDiagnostics.Diagnostic24()
            d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
            raise d
        base = SimpleResultSet(session)
        if not matches:
            return base
        else:
            return base.combine(session, matches, clause, db)


# NOTE: the class statement below is assumed; the original class definition and the
# method definition lines inside it are elided in this extract.
class ClusterExtractionIndex(SimpleIndex):

    def _handleConfigNode(self, session, node):
        if (node.localName == "cluster"):
            maps = []
            for child in node.childNodes:
                if (child.nodeType == elementType and child.localName == "map"):
                    t = child.getAttributeNS(None, 'type')
                    map = []
                    for xpchild in child.childNodes:
                        if (xpchild.nodeType == elementType and xpchild.localName == "xpath"):
                            map.append(flattenTexts(xpchild))
                        elif (xpchild.nodeType == elementType and xpchild.localName == "process"):
                            p = []
                            for child2 in xpchild.childNodes:
                                if child2.nodeType == elementType and child2.localName == "object":
                                    p.append([child2.getAttributeNS(None, 'type'), child2.getAttributeNS(None, 'ref')])
                            map.append(p)
                    vxp = verifyXPaths([map[0]])
                    if (len(map) < 3):
                        # Default process chain
                        map.append([['extractor', 'ExactExtractor']])
                    if (t == u'key'):
                        self.keyMap = [vxp[0], map[1], map[2]]
                    else:
                        maps.append([vxp[0], map[1], map[2]])
            self.maps = maps

    def __init__(self, session, config, parent):
        self.keyMap = []
        self.maps = []
        Index.__init__(self, session, config, parent)

        for m in range(len(self.maps)):
            for t in range(len(self.maps[m][2])):
                o = self.get_object(None, self.maps[m][2][t][1])
                if o is not None:
                    self.maps[m][2][t][1] = o
                else:
                    raise ConfigFileException("Unknown object %s" % (self.maps[m][2][t][1]))
        for t in range(len(self.keyMap[2])):
            o = self.get_object(None, self.keyMap[2][t][1])
            if o is not None:
                self.keyMap[2][t][1] = o
            else:
                raise ConfigFileException("Unknown object %s" % (self.keyMap[2][t][1]))

    # NOTE: the two method names below are assumed; the original definition lines are elided.
    def begin_indexing(self, session):
        path = self.get_path(session, "tempPath")
        if (not os.path.isabs(path)):
            dfp = self.get_path(session, "defaultPath")
            path = os.path.join(dfp, path)
        self.fileHandle = codecs.open(path, "w", 'utf-8')

    def commit_indexing(self, session):
        self.fileHandle.close()


from utils import SimpleBitfield
from resultSet import BitmapResultSet


# NOTE: the class statement below and the serialise/deserialise signatures in this
# class are assumed; the original definition lines (and __init__) are elided.
class BitmapIndex(SimpleIndex):
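
    # Bitmap term structure (as implied by the code below): a packed 'lll' header of
    # (termid, nRecs, nOccs) followed by the string form of a SimpleBitfield in which
    # each set bit marks a record that contains the term.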
    def serialise_terms(self, termid, terms, recs=0, occs=0):
        if len(terms) == 1:
            # Already a bitfield
            bf = terms[0]
        else:
            bf = SimpleBitfield()
            for item in terms[::3]:
                bf[item] = 1
        pack = struct.pack('lll', termid, recs, occs)
        val = pack + str(bf)
        return val

    def deserialise_terms(self, data):
        lsize = 3 * self.longStructSize
        longs = data[:lsize]
        terms = list(struct.unpack('lll', longs))
        if len(data) > lsize:
            bf = SimpleBitfield(data[lsize:])
            terms.append(bf)
        return terms

    def merge_terms(self, structTerms, newTerms, op="replace", recs=0, occs=0):
        (termid, oldTotalRecs, oldTotalOccs, oldBf) = structTerms
        if op in ['add', 'replace']:
            for t in newTerms[1::3]:
                oldBf[t] = 1
        elif op == 'delete':
            for t in newTerms[1::3]:
                oldBf[t] = 0
        trecs = oldBf.lenTrueItems()
        toccs = trecs
        merged = [termid, trecs, toccs, oldBf]
        return merged


try:
    from resultSet import ArrayResultSet
except:
    raise


# (a further Index subclass is defined here; its class statement and several one-line
#  stub methods are elided, leaving only the scan and search methods that follow)
    def scan(self, session, clause, db):
        raise NotImplementedError()

    def search(self, session, clause, db):
        # (body elided)
        pass


try:
    import numarray as na

    class ArrayIndex(SimpleIndex):

        def __init__(self, session, node, parent):
            SimpleIndex.__init__(self, session, node, parent)
            self.indexStore = self.get_path(session, 'indexStore')
            self.recordStore = self.get_path(session, 'recordStore')