1
2 from baseObjects import IndexStore, Database
3 from configParser import C3Object
4 from c3errors import ConfigFileException, FileDoesNotExistException
5 from resultSet import SimpleResultSetItem
6 from index import *
7 from utils import parseSrbUrl
8 import os, types, struct, sys, commands, time
9 try:
10
11 import bsddb as bdb
12 except:
13 import bsddb3 as bdb
14
15 nonTextToken = "\x00\t"
16
18
19 indexing = 0
20 outFiles = {}
21 outSortFiles = {}
22 storeHash = {}
23 storeHashReverse = {}
24 sortStoreCxn = {}
25 identifierMapCxn = {}
26 indexCxn = {}
27 reservedLongs = 3
28
29 - def __init__(self, session, parent, config):
68
73
89
91 if x[-6:] <> ".index":
92 return 0
93 elif x[:len(self.id)] <> self.id:
94 return 0
95 else:
96 return 1
97
99 stuff = [self.id, "--", index.id, ".index"]
100 return ''.join(stuff)
101
102
104 if self.identifierMapCxn.has_key(rec.recordStore):
105 cxn = self.identifierMapCxn[rec.recordStore]
106 else:
107 fn = "recordIdentifiers_" + self.id + "_" + rec.recordStore + ".bdb"
108 dfp = self.get_path(session, "defaultPath")
109 dbp = os.path.join(dfp, fn)
110 if not os.path.exists(dbp):
111
112 cxn = bdb.db.DB()
113
114 cxn.open(dbp, dbtype=bdb.db.DB_BTREE, flags = bdb.db.DB_CREATE, mode=0660)
115 cxn.close()
116
117 cxn = bdb.db.DB()
118
119 if session.environment == "apache":
120 cxn.open(dbp, flags=bdb.db.DB_NOMMAP)
121 else:
122 cxn.open(dbp)
123 self.identifierMapCxn[rec.recordStore] = cxn
124
125 try:
126 recid = rec.id.encode('utf-8')
127 except:
128 pass
129 try:
130 data = cxn.get(recid)
131 if data:
132 return long(data)
133 except:
134 pass
135
136 c = cxn.cursor()
137 c.set_range("__i2s_999999999999999")
138 data = c.prev()
139 if (data and data[0][:6] == "__i2s_"):
140 max = long(data[0][6:])
141 intid = "%015d" % (max + 1)
142 else:
143 intid = "000000000000000"
144 cxn.put(recid, intid)
145 cxn.put("__i2s_%s" % intid, recid)
146 return long(intid)
147
170
171
172
188
189
191
192 p = self.permissionHandlers.get('info:srw/operation/2/index', None)
193 if p:
194 if not session.user:
195 raise PermissionException("Authenticated user required to add to indexStore %s" % self.id)
196 okay = p.hasPermission(session, session.user)
197 if not okay:
198 raise PermissionException("Permission required to add to indexStore %s" % self.id)
199 temp = self.get_path(session, 'tempPath')
200 if not os.path.isabs(temp):
201 temp = os.path.join(self.get_path(session, 'defaultPath'), temp)
202 if (not os.path.exists(temp)):
203 try:
204 os.mkdir(temp)
205 except:
206 raise(ConfigFileException('TempPath does not exist and is not creatable.'))
207 elif (not os.path.isdir(temp)):
208 raise(ConfigFileException('TempPath is not a directory.'))
209 basename = os.path.join(temp, self._generateFilename(index))
210 if (hasattr(session, 'task')):
211 basename += str(session.task)
212
213
214 if (not self.outFiles.has_key(index)):
215 self.outFiles[index] = codecs.open(basename + "_TEMP", 'a', 'utf-8')
216
217 if (index.get_setting(session, "sortStore")):
218
219 if (session.task):
220
221 raise NotImplementedError
222 dfp = self.get_path(session, "defaultPath")
223 name = self._generateFilename(index) + "_VALUES"
224 fullname = os.path.join(dfp, name)
225 if (not os.path.exists(fullname)):
226 raise FileDoesNotExistException(fullname)
227 cxn = bdb.db.DB()
228
229 if session.environment == "apache":
230 cxn.open(fullname, flags=bdb.db.DB_NOMMAP)
231 else:
232 cxn.open(fullname)
233 self.outSortFiles[index] = cxn
234
235
237
238 temp = self.get_path(session, 'tempPath')
239 if not os.path.isabs(temp):
240 temp = os.path.join(self.get_path(session, 'defaultPath'), temp)
241 files = os.listdir(temp)
242 recids = []
243 for f in files:
244
245 fh = file(os.path.join(temp, f))
246 fh.seek(-1024, 2)
247 data = fh.read(1024)
248 lines = data.split('\n')
249 l= lines[-2]
250 bits = l.split('\x00\t')
251 recids.append(long(bits[1]))
252 currRec = max(recids)
253
254
255
256
257
258 return "%012d" % currRec
259
260
262
263
264 p = self.permissionHandlers.get('info:srw/operation/2/index', None)
265 if p:
266 if not session.user:
267 raise PermissionException("Authenticated user required to add to indexStore %s" % self.id)
268 okay = p.hasPermission(session, session.user)
269 if not okay:
270 raise PermissionException("Permission required to add to indexStore %s" % self.id)
271
272 temp = self.get_path(session, 'tempPath')
273 dfp = self.get_path(session, 'defaultPath')
274 if not os.path.isabs(temp):
275 temp = os.path.join(dfp, temp)
276
277 for k in self.outSortFiles:
278 self.outSortFiles[k].close()
279 if (not self.outFiles.has_key(index)):
280 raise FileDoesNotExistException(index.id)
281 sort = self.get_path(session, 'sortPath')
282 if (not os.path.exists(sort)):
283 raise ConfigFileException("Sort executable for %s does not exist" % self.id)
284
285 fh = self.outFiles[index]
286 fh.flush()
287 fh.close()
288 del self.outFiles[index]
289
290 for db in self.identifierMapCxn.values():
291 db.sync()
292
293 basename = self._generateFilename(index)
294 if (hasattr(session, 'task')):
295 basename += str(session.task)
296
297 basename = os.path.join(temp, basename)
298 tempfile = basename + "_TEMP"
299 sorted = basename + "_SORT"
300 cmd = "%s %s -o %s" % (sort, tempfile, sorted)
301 commands.getoutput(cmd)
302
303 if (not os.path.exists(sorted)):
304 raise ValueError("Failed to sort %s" % tempfile)
305 os.remove(tempfile)
306 if hasattr(session, 'phase') and session.phase == 'commit_indexing1':
307 return sorted
308
309
310 self.commit_indexing2(session, index, sorted)
311
313 p = self.permissionHandlers.get('info:srw/operation/2/index', None)
314 if p:
315 if not session.user:
316 raise PermissionException("Authenticated user required to add to indexStore %s" % self.id)
317 okay = p.hasPermission(session, session.user)
318 if not okay:
319 raise PermissionException("Permission required to add to indexStore %s" % self.id)
320
321 cxn = self._openIndex(session, index)
322 cursor = cxn.cursor()
323 nonEmpty = cursor.first()
324
325
326
327 f = file(sorted)
328
329 termid = long(0)
330 currTerm = None
331 currData = []
332 l = 1
333
334 s2t = index.deserialise_terms
335 mt = index.merge_terms
336 t2s = index.serialise_terms
337 minTerms = index.get_setting(session, 'minimumSupport')
338 if not minTerms:
339 minTerms = 0
340
341 while(l):
342 l = f.readline()[:-1]
343 data = l.split(nonTextToken)
344 term = data[0]
345 fullinfo = map(long, data[1:])
346 if term == currTerm:
347
348 totalRecs += 1
349 totalOccs += fullinfo[2]
350 currData.extend(fullinfo)
351 else:
352
353
354 if currData:
355 if (nonEmpty):
356 val = cxn.get(currTerm)
357 if (val != None):
358 unpacked = s2t(val)
359 unpacked = mt(unpacked, currData, 'add', recs=totalRecs, occs=totalOccs)
360 totalRecs = unpacked[1]
361 totalOccs = unpacked[2]
362 unpacked = unpacked[3:]
363 else:
364 unpacked = currData
365 packed = t2s(termid, unpacked, recs=totalRecs, occs=totalOccs)
366 else:
367 packed = t2s(termid, currData, recs=totalRecs, occs=totalOccs)
368 cxn.put(currTerm, packed)
369
370 try:
371 totalOccs = fullinfo[2]
372 termid += 1
373 currTerm = term
374 currData = fullinfo
375 totalRecs = 1
376 except:
377
378 pass
379
380 self._closeIndex(session, index)
381
382 return None
383
384 - def create_term(self, session, index, termid, resultSet):
397
398
400
401 dfp = self.get_path(session, "defaultPath")
402 files = os.listdir(dfp)
403 files = filter(self._fileFilter, files)
404 ids = []
405 start = len(self.id) + 1
406 for f in files:
407 ids.append(f[start:-6])
408 return ids
409
411 raise(NotImplementedError)
412
418
420
421 p = self.permissionHandlers.get('info:srw/operation/1/create', None)
422 if p:
423 if not session.user:
424 raise PermissionException("Authenticated user required to create index in %s" % self.id)
425 okay = p.hasPermission(session, session.user)
426 if not okay:
427 raise PermissionException("Permission required to create index in %s" % self.id)
428
429 dfp = self.get_path(session, "defaultPath")
430 name = self._generateFilename(index)
431 fullname = os.path.join(dfp, name)
432 if os.path.exists(fullname):
433 raise FileDoesNotExistException(fullname)
434 cxn = bdb.db.DB()
435
436 try:
437 cxn.open(fullname, dbtype=bdb.db.DB_BTREE, flags=bdb.db.DB_CREATE, mode=0660)
438 cxn.close()
439 except:
440 raise ConfigFileException(fullname)
441
442 if (index.get_setting(session, "sortStore")):
443 try:
444 oxn = bdb.db.DB()
445
446 oxn.open(fullname + "_VALUES", dbtype=bdb.db.DB_BTREE, flags=bdb.db.DB_CREATE, mode=0660)
447 oxn.close()
448 except:
449 raise(ValueError)
450 return 1
451
452
457
474
475
493
def store_terms(self, session, index, termhash, record):
    """Add the terms extracted from one record to ``index``.

    ``termhash`` maps to entries that each carry a ``'text'`` and an
    ``'occurences'`` value and, optionally, a ``'positions'`` sequence.

    Two modes, chosen by whether ``self.outFiles`` has an entry for
    ``index``:

    * batch mode -- one line per term is appended to the registered
      temporary file (term text, then the zero-padded record id, the
      store id, the occurrence count and any positions, separated by
      ``nonTextToken``); if a sort store is registered in
      ``self.outSortFiles`` the first term's text is stored there too.
    * direct mode -- each term is merged straight into the index
      obtained from ``self._openIndex``.

    Returns None.  Does nothing when ``termhash`` is empty.

    Raises PermissionException when a permission handler is configured
    and the session's user is missing or refused, ConfigFileException
    when the record's store name is not yet known, and ValueError when
    the record id is the integer -1.
    """
    handler = self.permissionHandlers.get('info:srw/operation/2/index', None)
    if handler:
        if not session.user:
            raise PermissionException("Authenticated user required to add to indexStore %s" % self.id)
        if not handler.hasPermission(session, session.user):
            raise PermissionException("Permission required to add to indexStore %s" % self.id)

    if not termhash:
        # Nothing was extracted from this record for this index.
        return

    storeid = record.recordStore
    if not isinstance(storeid, int):
        # Store given by name: translate it to its small integer id.
        if storeid in self.storeHashReverse:
            storeid = self.storeHashReverse[storeid]
        else:
            # NOTE(review): the unknown store is registered *and* the call
            # fails, so a retry with the same store will succeed.  Kept
            # as found -- confirm whether this is intended.
            self.storeHashReverse[storeid] = len(self.storeHash.keys())
            self.storeHash[self.storeHashReverse[storeid]] = storeid
            raise ConfigFileException("indexStore %s does not recognise recordStore: %s" % (self.id, storeid))

    docid = record.id
    if isinstance(docid, int):
        if docid == -1:
            # -1 is rejected as a record id.
            raise ValueError(str(record))
    elif isinstance(docid, str) and docid.isdigit():
        docid = long(docid)
    else:
        # Non-numeric identifier: map it to an internal integer id.
        docid = self._get_internalId(session, record)

    entries = list(termhash.values())

    if index in self.outFiles:
        # Batch mode: append to the temporary file for later sort/merge.
        first = entries[0]
        value = first['text']
        if index in self.outSortFiles and value:
            if isinstance(value, unicode):
                sortVal = value.encode('utf-8')
            else:
                sortVal = value
            self.outSortFiles[index].put("%s/%s" % (str(record.recordStore), docid), sortVal)

        # Whether positions are written is decided once, from the first
        # entry, and applied to every line of this record.
        prox = 'positions' in first
        out = self.outFiles[index]
        for entry in entries:
            kw = entry['text']
            if not isinstance(kw, unicode):
                kw = kw.decode('utf-8')
            out.write(kw)
            # Leading empty field makes the join start with the separator.
            lineList = ["", "%012d" % docid, str(storeid), str(entry['occurences'])]
            if prox:
                lineList.extend(map(str, entry['positions']))
            out.write(nonTextToken.join(lineList) + "\n")
    else:
        # Direct mode: merge each term into the index immediately.
        cxn = self._openIndex(session, index)
        try:
            for entry in entries:
                key = entry['text'].encode('utf-8')
                occs = entry['occurences']
                stuff = [docid, storeid, occs]
                if 'positions' in entry:
                    stuff.extend(entry['positions'])
                val = cxn.get(key)
                if val is not None:
                    current = index.deserialise_terms(val)
                    merged = index.merge_terms(current, stuff, recs=1, occs=occs)
                    (termid, totalRecs, totalOccs) = merged[:3]
                    unpacked = merged[3:]
                else:
                    # New term: next free id, seen in exactly this record.
                    termid = cxn.stat()['nkeys']
                    unpacked = stuff
                    totalRecs = 1
                    totalOccs = occs
                packed = index.serialise_terms(termid, unpacked, recs=totalRecs, occs=totalOccs)
                cxn.put(key, packed)
        finally:
            self._closeIndex(session, index)
582
584 p = self.permissionHandlers.get('info:srw/operation/2/unindex', None)
585 if p:
586 if not session.user:
587 raise PermissionException("Authenticated user required to delete from indexStore %s" % self.id)
588 okay = p.hasPermission(session, session.user)
589 if not okay:
590 raise PermissionException("Permission required to delete from indexStore %s" % self.id)
591 if not terms:
592 return
593
594 docid = record.id
595
596 if (type(docid) == types.StringType and docid.isdigit()):
597 docid = long(docid)
598 elif (type(docid) in [types.IntType, types.LongType]):
599 pass
600 else:
601
602 docid = self._get_internalId(session, record)
603
604 storeid = record.recordStore
605 if (type(storeid) <> types.IntType):
606
607 if (self.storeHashReverse.has_key(storeid)):
608 storeid = self.storeHashReverse[storeid]
609 else:
610
611 self.storeHashReverse[storeid] = len(self.storeHash.keys())
612 self.storeHash[self.storeHashReverse[storeid]] = storeid
613 storeid = self.storeHashReverse[storeid]
614 raise ConfigFileException("indexStore %s does not recognise recordStore: %s" % (self.id, storeid))
615
616
617 cxn = self._openIndex(session, index)
618
619 for k in terms.keys():
620 val = cxn.get(k.encode('utf-8'))
621 if (val <> None):
622 current = index.deserialise_terms(val)
623 gone = [docid, storeid, terms[k]['occurences']]
624 unpacked = index.merge_terms(current, gone, 'delete')
625 if not unpacked[1]:
626
627 cxn.delete(k.encode('utf-8'))
628 else:
629 packed = index.serialise_terms(current[0], unpacked[3:])
630 cxn.put(k.encode('utf-8'), packed)
631 self._closeIndex(session, index)
632
633
634
635
636
637
def fetch_termList(self, session, index, term, numReq=0, relation="", end="", summary=0, reverse=0):
    """Scan ``index`` starting at ``term`` and return the terms found.

    Parameters:
      term     -- position in the index to start scanning from.
      numReq   -- maximum number of entries to return; 0 means no
                  limit (20 is used when neither ``relation`` nor
                  ``end`` is given either).
      relation -- ">", ">=", "<" or "<="; its first character selects
                  the scan direction.  Defaults to ">=" when ``end`` is
                  empty, otherwise it is derived by comparing ``term``
                  with ``end``.
      end      -- stop scanning when this key is reached (exclusive).
      summary  -- when true, only the leading
                  ``index.longStructSize * self.reservedLongs`` bytes of
                  each value are read from the database.
      reverse  -- when true, scan the separate "_REVERSE" database file
                  with ``term`` and ``end`` reversed; returned keys are
                  reversed back.

    Returns a list of ``[key, deserialised_value]`` lists.  When the
    cursor runs off the end of the index, "last" (forward scan) or
    "first" (backward scan) is appended to the final entry.  Returns an
    empty list when the index is empty or a forward scan was requested
    from beyond the highest key.

    Raises PermissionException when a permission handler is configured
    and the session's user is missing or refused.
    """
    handler = self.permissionHandlers.get('info:srw/operation/2/scan', None)
    if handler:
        if not session.user:
            raise PermissionException("Authenticated user required to scan indexStore %s" % self.id)
        if not handler.hasPermission(session, session.user):
            raise PermissionException("Permission required to scan indexStore %s" % self.id)

    if not (numReq or relation or end):
        # Nothing constrains the scan: fall back to a default page size.
        numReq = 20
    if not relation and not end:
        relation = ">="
    if not relation:
        # Only an end point was given: scan towards it.
        if term > end:
            relation = "<="
        else:
            relation = ">"

    if reverse:
        # The reversed-key database is opened privately for this call.
        dfp = self.get_path(session, "defaultPath")
        fullname = os.path.join(dfp, self._generateFilename(index)) + "_REVERSE"
        term = term[::-1]
        end = end[::-1]
        cxn = bdb.db.DB()
        if session.environment == "apache":
            cxn.open(fullname, flags=bdb.db.DB_NOMMAP)
        else:
            cxn.open(fullname)
    else:
        cxn = self._openIndex(session, index)

    def release():
        # Only the privately opened reverse connection belongs to this
        # call.  A handle from self._openIndex is left open, exactly as
        # on the normal return path -- presumably the store manages it
        # (TODO confirm against _openIndex/_closeIndex).
        if reverse:
            cxn.close()

    # Partial-read arguments shared by every cursor call in summary mode.
    partial = {}
    if summary:
        partial = {'dlen': index.longStructSize * self.reservedLongs, 'doff': 0}

    c = cxn.cursor()
    term = term.encode('utf-8')
    try:
        (key, data) = c.set_range(term, **partial)
    except Exception:
        # No key at or after term: fall back to the final entry.
        try:
            (key, data) = c.last(**partial)
        except TypeError:
            # last() gave nothing to unpack: the index is empty.
            release()
            return []
        if relation in (">", ">=") and term > key:
            # Forward scan requested from beyond the highest key.
            release()
            return []

    tlist = []
    fetching = 1

    if not (key == term and relation in ('>', '<')):
        # Strict relations skip an exact match on the start term.
        unpacked = index.deserialise_terms(data)
        if reverse:
            key = key[::-1]
        tlist.append([key, unpacked])
        if numReq == 1:
            fetching = 0

    direction = relation[0]
    while fetching:
        if direction == ">":
            tup = c.next(**partial)
        else:
            tup = c.prev(**partial)
        if tup:
            (key, rec) = tup
            if end and direction == '>' and key >= end:
                fetching = 0
            elif end and direction == "<" and key <= end:
                fetching = 0
            else:
                unpacked = index.deserialise_terms(rec)
                if reverse:
                    key = key[::-1]
                tlist.append([key, unpacked])
                if numReq and len(tlist) == numReq:
                    fetching = 0
        else:
            # Ran off the end of the index: flag the boundary entry.
            if tlist:
                if direction == ">":
                    tlist[-1].append("last")
                else:
                    tlist[-1].append("first")
            key = None
            fetching = 0

    release()
    return tlist
748
def create_item(self, session, tid, rst, occs, rsitype="SimpleResultSetItem"):
    """Build a result item for one record hit.

    ``rst`` is looked up in ``self.storeHash`` to obtain the record
    store.  When an identifier map connection exists for that store,
    ``tid`` is translated with ``self._get_externalId`` and the original
    value is kept as the numeric id.

    Returns a SimpleResultSetItem for ``rsitype == "SimpleResultSetItem"``
    or a ``(key, dict)`` pair for ``rsitype == "Hash"``; any other
    ``rsitype`` raises NotImplementedError.
    """
    recStore = self.storeHash[rst]

    numericTid = None
    if self.identifierMapCxn and recStore in self.identifierMapCxn:
        # Keep the internal number and expose the external identifier.
        numericTid = tid
        tid = self._get_externalId(session, recStore, tid)

    if rsitype == "SimpleResultSetItem":
        return SimpleResultSetItem(session, tid, recStore, occs, session.database, numeric=numericTid)

    if rsitype == "Hash":
        details = {
            "recordStore" : recStore,
            "recordId" : tid,
            "occurences" : occs,
            "database" : session.database,
        }
        return ("%s/%s" % (recStore, tid), details)

    raise NotImplementedError(rsitype)
762
763
764 - def fetch_term(self, session, index, term, prox=True):
777
779 try:
780 term = term.encode('utf-8')
781 except:
782 pass
783 cxn = self._openIndex(session, index)
784 val = cxn.get(term)
785 return val
786
787
788
789
791 """Use C2 style indexes, only one recordStore"""
792 pass
793
794
795 try:
796 from srboo import *
797
798
799
800
801
802
803
804
805
806
807
808
809 class SrbBdbIndexStore(BdbIndexStore):
810
811 host = ""
812 port = ""
813 user = ""
814 passwd = ""
815 dn = ""
816 domain = ""
817 resource = ""
818 subcollection = ""
819
820 connection = None
821 tempChunks = 0
822
823 def _connect(self):
824 try:
825 self.connection = SrbConnection(self.host, self.port, self.domain, user = self.user, passwd = self.passwd, dn = self.dn)
826 self.connection.resource = self.resource
827 except SrbException:
828
829 raise
830 scs = self.subcollection.split('/')
831 orig = self.connection.collection
832 for c in scs:
833 try:
834 self.connection.create_collection(c)
835 except SrbException, e:
836
837
838 pass
839 self.connection.open_collection(c)
840 self.connection.open_collection(orig)
841 self.connection.open_collection(self.subcollection)