Module indexStore

Source Code for Module indexStore

from baseObjects import IndexStore, Database
from configParser import C3Object
from c3errors import ConfigFileException, FileDoesNotExistException
from resultSet import SimpleResultSetItem
from index import *
from utils import parseSrbUrl
import os, types, struct, sys, commands, time
import codecs   # needed by begin_indexing(); missing from the original imports
try:
    # Python 2.3 vs 2.2
    import bsddb as bdb
except:
    import bsddb3 as bdb

nonTextToken = "\x00\t"
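
# Illustrative sketch (not in the original module): nonTextToken delimits the
# fields of each line in the "_TEMP" batch files written by store_terms()
# below, one term entry per line:
#   term + "\x00\t" + ("%012d" % docid) + "\x00\t" + str(storeid)
#        + "\x00\t" + str(occurences) [+ "\x00\t"-joined positions]
# e.g. u"cat" + nonTextToken + u"000000000042" + nonTextToken + u"0"
#      + nonTextToken + u"3" + u"\n"
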
  16   
class BdbIndexStore(IndexStore):

    indexing = 0
    outFiles = {}
    outSortFiles = {}
    storeHash = {}
    storeHashReverse = {}
    sortStoreCxn = {}
    identifierMapCxn = {}
    indexCxn = {}
    reservedLongs = 3

    def __init__(self, session, parent, config):
        IndexStore.__init__(self, session, parent, config)
        self.outFiles = {}
        self.storeHash = {}
        self.outSortFiles = {}
        self.sortStoreCxn = {}
        self.identifierMapCxn = {}
        self.indexCxn = {}
        self.reservedLongs = 3
        rsh = self.get_path(session, 'recordStoreHash')
        if rsh:
            wds = rsh.split()
            for w in range(len(wds)):
                self.storeHash[long(w)] = wds[w]
                self.storeHashReverse[wds[w]] = long(w)

        # Record Hash (in bdb)
        fnbase = "recordIdentifiers_" + self.id + "_"
        fnlen = len(fnbase)
        dfp = self.get_path(session, "defaultPath")
        try:
            files = os.listdir(dfp)
        except OSError:
            try:
                os.mkdir(dfp, 0755)
                files = os.listdir(dfp)
            except:
                raise ConfigFileException("Cannot create default path for %s: %s" % (self.id, dfp))
        for f in files:
            if (f[:fnlen] == fnbase):
                recstore = f[fnlen:-4]
                dbp = os.path.join(dfp, f)
                cxn = bdb.db.DB()
                #cxn.set_flags(bdb.db.DB_RECNUM)
                if session.environment == "apache":
                    cxn.open(dbp, flags=bdb.db.DB_NOMMAP)
                else:
                    cxn.open(dbp)
                self.identifierMapCxn[recstore] = cxn

    def _closeIndex(self, session, index):
        if self.indexCxn.has_key(index):
            self.indexCxn[index].close()
            del self.indexCxn[index]

    def _openIndex(self, session, index):
        if self.indexCxn.has_key(index):
            return self.indexCxn[index]
        else:
            dfp = self.get_path(session, 'defaultPath')
            basename = self._generateFilename(index)
            dbname = os.path.join(dfp, basename)
            cxn = bdb.db.DB()
            #cxn.set_flags(bdb.db.DB_RECNUM)
            if session.environment == "apache":
                cxn.open(dbname, flags=bdb.db.DB_NOMMAP)
            else:
                cxn.open(dbname)
            self.indexCxn[index] = cxn
            return cxn

    def _fileFilter(self, x):
        if x[-6:] != ".index":
            return 0
        elif x[:len(self.id)] != self.id:
            return 0
        else:
            return 1

    def _generateFilename(self, index):
        stuff = [self.id, "--", index.id, ".index"]
        return ''.join(stuff)

    def _get_internalId(self, session, rec):
        if self.identifierMapCxn.has_key(rec.recordStore):
            cxn = self.identifierMapCxn[rec.recordStore]
        else:
            fn = "recordIdentifiers_" + self.id + "_" + rec.recordStore + ".bdb"
            dfp = self.get_path(session, "defaultPath")
            dbp = os.path.join(dfp, fn)
            if not os.path.exists(dbp):
                # Create
                cxn = bdb.db.DB()
                #cxn.set_flags(bdb.db.DB_RECNUM)
                cxn.open(dbp, dbtype=bdb.db.DB_BTREE, flags=bdb.db.DB_CREATE, mode=0660)
                cxn.close()
            # Open
            cxn = bdb.db.DB()
            #cxn.set_flags(bdb.db.DB_RECNUM)
            if session.environment == "apache":
                cxn.open(dbp, flags=bdb.db.DB_NOMMAP)
            else:
                cxn.open(dbp)
            self.identifierMapCxn[rec.recordStore] = cxn
        # Now we have cxn, check if rec exists
        try:
            recid = rec.id.encode('utf-8')
        except:
            # non-string id; use its string form so recid is always bound
            recid = str(rec.id)
        try:
            data = cxn.get(recid)
            if data:
                return long(data)
        except:
            pass
        # Doesn't exist, write
        c = cxn.cursor()
        c.set_range("__i2s_999999999999999")
        data = c.prev()
        if (data and data[0][:6] == "__i2s_"):
            max = long(data[0][6:])
            intid = "%015d" % (max + 1)
        else:
            intid = "000000000000000"
        cxn.put(recid, intid)
        cxn.put("__i2s_%s" % intid, recid)
        return long(intid)

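    # Layout sketch (reconstructed from the code above, not a documented
    # contract): each per-recordStore identifier map holds two kinds of key,
    #   "someStringId"          -> "000000000000042"   (string id -> int)
    #   "__i2s_000000000000042" -> "someStringId"      (int -> string id)
    # so set_range("__i2s_999999999999999") followed by prev() lands on the
    # highest "__i2s_" key, giving the next number to allocate.
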
    def _get_externalId(self, session, recordStore, identifier):
        if self.identifierMapCxn.has_key(recordStore):
            cxn = self.identifierMapCxn[recordStore]
        else:
            fn = "recordIdentifiers_" + self.id + "_" + recordStore + ".bdb"
            dfp = self.get_path(session, "defaultPath")
            dbp = os.path.join(dfp, fn)
            if not os.path.exists(dbp):
                raise FileDoesNotExistException(dbp)
            cxn = bdb.db.DB()
            #cxn.set_flags(bdb.db.DB_RECNUM)
            if session.environment == "apache":
                cxn.open(dbp, flags=bdb.db.DB_NOMMAP)
            else:
                cxn.open(dbp)
            self.identifierMapCxn[recordStore] = cxn
        identifier = "%015d" % identifier
        data = cxn.get("__i2s_%s" % identifier)
        if (data):
            return data
        else:
            raise FileDoesNotExistException("%s/%s" % (recordStore, identifier))

    # Not API, but called for grid file xfer
    def fetch_temp_file(self, session, idxid):
        index = self.get_object(session, idxid)
        temp = self.get_path(session, 'tempPath')
        if not os.path.isabs(temp):
            temp = os.path.join(self.get_path(session, 'defaultPath'), temp)
        basename = os.path.join(temp, self._generateFilename(index))
        if (hasattr(session, 'task')):
            basename += str(session.task)

        tempfile = basename + "_SORT"
        fh = file(tempfile)
        data = fh.read()
        fh.close()
        return data

    def begin_indexing(self, session, index):
        p = self.permissionHandlers.get('info:srw/operation/2/index', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to add to indexStore %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to add to indexStore %s" % self.id)
        temp = self.get_path(session, 'tempPath')
        if not os.path.isabs(temp):
            temp = os.path.join(self.get_path(session, 'defaultPath'), temp)
        if (not os.path.exists(temp)):
            try:
                os.mkdir(temp)
            except:
                raise ConfigFileException('TempPath does not exist and is not creatable.')
        elif (not os.path.isdir(temp)):
            raise ConfigFileException('TempPath is not a directory.')
        basename = os.path.join(temp, self._generateFilename(index))
        if (hasattr(session, 'task')):
            basename += str(session.task)

        # In case we're called twice
        if (not self.outFiles.has_key(index)):
            self.outFiles[index] = codecs.open(basename + "_TEMP", 'a', 'utf-8')

        if (index.get_setting(session, "sortStore")):
            # Store in db for faster sorting
            if (session.task):
                # Need to tempify
                raise NotImplementedError
            dfp = self.get_path(session, "defaultPath")
            name = self._generateFilename(index) + "_VALUES"
            fullname = os.path.join(dfp, name)
            if (not os.path.exists(fullname)):
                raise FileDoesNotExistException(fullname)
            cxn = bdb.db.DB()
            #cxn.set_flags(bdb.db.DB_RECNUM)
            if session.environment == "apache":
                cxn.open(fullname, flags=bdb.db.DB_NOMMAP)
            else:
                cxn.open(fullname)
            self.outSortFiles[index] = cxn

    def get_indexingPosition(self, session):
        # Useful if the indexing breaks for some reason and want to restart
        temp = self.get_path(session, 'tempPath')
        if not os.path.isabs(temp):
            temp = os.path.join(self.get_path(session, 'defaultPath'), temp)
        files = os.listdir(temp)
        recids = []
        for f in files:
            # Open, find last line, find recid
            fh = file(os.path.join(temp, f))
            fh.seek(-1024, 2)
            data = fh.read(1024)
            lines = data.split('\n')
            l = lines[-2]
            bits = l.split('\x00\t')
            recids.append(long(bits[1]))
        currRec = max(recids)
        # Note that this is the representation regardless of the actual record id
        # eg string ids are mapped before this point
        # Then:  myIter = recStore.__iter__()
        #        myIter.jump(currRec)
        # to restart at this position. Remember to call begin_indexing() again
        return "%012d" % currRec

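    # A hedged restart sketch expanding the comment above (recStore and the
    # re-indexing step are assumptions, not defined in this module):
    #   currRec = long(idxStore.get_indexingPosition(session))
    #   myIter = recStore.__iter__()
    #   myIter.jump(currRec)              # skip records already indexed
    #   idxStore.begin_indexing(session, index)
    #   # ... re-index each remaining record from myIter, then commit
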
    def commit_indexing(self, session, index):
        # Need to do this per index so one store can be doing multiple things at once
        # Should only be called if begin_indexing() has been
        p = self.permissionHandlers.get('info:srw/operation/2/index', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to add to indexStore %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to add to indexStore %s" % self.id)

        temp = self.get_path(session, 'tempPath')
        dfp = self.get_path(session, 'defaultPath')
        if not os.path.isabs(temp):
            temp = os.path.join(dfp, temp)

        for k in self.outSortFiles:
            self.outSortFiles[k].close()
        if (not self.outFiles.has_key(index)):
            raise FileDoesNotExistException(index.id)
        sort = self.get_path(session, 'sortPath')
        if (not os.path.exists(sort)):
            raise ConfigFileException("Sort executable for %s does not exist" % self.id)

        fh = self.outFiles[index]
        fh.flush()
        fh.close()
        del self.outFiles[index]

        for db in self.identifierMapCxn.values():
            db.sync()

        basename = self._generateFilename(index)
        if (hasattr(session, 'task')):
            basename += str(session.task)

        basename = os.path.join(temp, basename)
        tempfile = basename + "_TEMP"
        sorted = basename + "_SORT"
        cmd = "%s %s -o %s" % (sort, tempfile, sorted)
        commands.getoutput(cmd)
        # Sorting might fail.
        if (not os.path.exists(sorted)):
            raise ValueError("Failed to sort %s" % tempfile)
        os.remove(tempfile)
        if hasattr(session, 'phase') and session.phase == 'commit_indexing1':
            return sorted

        # Original terms from data
        self.commit_indexing2(session, index, sorted)

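    # Usage sketch (assumed driver code, not from this module): commit is
    # two-phase when session.phase == 'commit_indexing1' -- the sorted file
    # path comes back for a later load instead of being merged immediately:
    #   sortedPath = idxStore.commit_indexing(session, index)
    #   # ... possibly on another task/host ...
    #   idxStore.commit_indexing2(session, index, sortedPath)
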
    def commit_indexing2(self, session, index, sorted):
        p = self.permissionHandlers.get('info:srw/operation/2/index', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to add to indexStore %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to add to indexStore %s" % self.id)

        cxn = self._openIndex(session, index)
        cursor = cxn.cursor()
        nonEmpty = cursor.first()

        # Should this be:
        #   f = codecs.open(sorted, 'r', 'utf-8')  ?
        f = file(sorted)

        termid = long(0)
        currTerm = None
        currData = []
        l = 1

        s2t = index.deserialise_terms
        mt = index.merge_terms
        t2s = index.serialise_terms
        minTerms = index.get_setting(session, 'minimumSupport')
        if not minTerms:
            minTerms = 0

        while (l):
            l = f.readline()[:-1]
            data = l.split(nonTextToken)
            term = data[0]
            fullinfo = map(long, data[1:])
            if term == currTerm:
                # accumulate
                totalRecs += 1
                totalOccs += fullinfo[2]
                currData.extend(fullinfo)
            else:
                # Store
                #if currData and totalRecs >= minTerms:
                if currData:
                    if (nonEmpty):
                        val = cxn.get(currTerm)
                        if (val != None):
                            unpacked = s2t(val)
                            unpacked = mt(unpacked, currData, 'add', recs=totalRecs, occs=totalOccs)
                            totalRecs = unpacked[1]
                            totalOccs = unpacked[2]
                            unpacked = unpacked[3:]
                        else:
                            unpacked = currData
                        packed = t2s(termid, unpacked, recs=totalRecs, occs=totalOccs)
                    else:
                        packed = t2s(termid, currData, recs=totalRecs, occs=totalOccs)
                    cxn.put(currTerm, packed)
                # assign new line
                try:
                    totalOccs = fullinfo[2]
                    termid += 1
                    currTerm = term
                    currData = fullinfo
                    totalRecs = 1
                except:
                    # end of file: fullinfo is empty on the final blank line
                    pass

        self._closeIndex(session, index)
        # os.remove(sorted)
        return None

    def create_term(self, session, index, termid, resultSet):
        # Take resultset and munge to index format, serialise, store
        term = resultSet.queryTerm
        unpacked = []
        # only way to be sure
        totalRecs = resultSet.totalRecs
        totalOccs = resultSet.totalOccs
        for item in resultSet:
            unpacked.extend([item.docid, self.storeHashReverse[item.recordStore], item.occurences])
        packed = index.serialise_terms(termid, unpacked, recs=totalRecs, occs=totalOccs)
        cxn = self._openIndex(session, index)
        cxn.put(term, packed)
        # NB: need to remember to close index manually

    def fetch_indexList(self, session):
        # Return IDs not object pointers
        dfp = self.get_path(session, "defaultPath")
        files = os.listdir(dfp)
        files = filter(self._fileFilter, files)
        ids = []
        # filenames are id + "--" + index.id + ".index", so skip the
        # two-character separator
        start = len(self.id) + 2
        for f in files:
            ids.append(f[start:-6])
        return ids

    def fetch_indexStats(self, session, index):
        raise NotImplementedError

    def contains_index(self, session, index):
        # Send Index object, check exists, return boolean
        dfp = self.get_path(session, "defaultPath")
        name = self._generateFilename(index)
        return os.path.exists(os.path.join(dfp, name))

    def create_index(self, session, index):
        # Send Index object to create, null return
        p = self.permissionHandlers.get('info:srw/operation/1/create', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to create index in %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to create index in %s" % self.id)

        dfp = self.get_path(session, "defaultPath")
        name = self._generateFilename(index)
        fullname = os.path.join(dfp, name)
        if os.path.exists(fullname):
            # index file already exists; refuse to overwrite it
            raise FileDoesNotExistException(fullname)
        cxn = bdb.db.DB()
        #cxn.set_flags(bdb.db.DB_RECNUM)
        try:
            cxn.open(fullname, dbtype=bdb.db.DB_BTREE, flags=bdb.db.DB_CREATE, mode=0660)
            cxn.close()
        except:
            raise ConfigFileException(fullname)

        if (index.get_setting(session, "sortStore")):
            try:
                oxn = bdb.db.DB()
                #oxn.set_flags(bdb.db.DB_RECNUM)
                oxn.open(fullname + "_VALUES", dbtype=bdb.db.DB_BTREE, flags=bdb.db.DB_CREATE, mode=0660)
                oxn.close()
            except:
                raise ValueError
        return 1

    def clean_index(self, session, index):
        cxn = self._openIndex(session, index)
        cxn.truncate()
        self._closeIndex(session, index)

    def delete_index(self, session, index):
        # Send Index object to delete, null return
        p = self.permissionHandlers.get('info:srw/operation/1/delete', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to delete index from %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to delete index from %s" % self.id)
        dfp = self.get_path(session, "defaultPath")
        name = self._generateFilename(index)
        try:
            os.remove(os.path.join(dfp, name))
        except:
            raise FileDoesNotExistException(dfp)
        return 1

    def fetch_sortValue(self, session, index, item):
        if (self.sortStoreCxn.has_key(index)):
            cxn = self.sortStoreCxn[index]
        else:
            if (not index.get_setting(session, 'sortStore')):
                raise FileDoesNotExistException()
            dfp = self.get_path(session, "defaultPath")
            name = self._generateFilename(index) + "_VALUES"
            fullname = os.path.join(dfp, name)
            cxn = bdb.db.DB()
            #cxn.set_flags(bdb.db.DB_RECNUM)
            if session.environment == "apache":
                cxn.open(fullname, flags=bdb.db.DB_NOMMAP)
            else:
                cxn.open(fullname)
            self.sortStoreCxn[index] = cxn
        return cxn.get(repr(item))

    def store_terms(self, session, index, termhash, record):
        # Store terms from hash
        # Need to store: term, totalOccs, totalRecs, (record id, recordStore id, number of occs in record)
        # hash is now {tokenA: {...}, ...}

        p = self.permissionHandlers.get('info:srw/operation/2/index', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to add to indexStore %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to add to indexStore %s" % self.id)

        if (not termhash):
            # No terms to index
            return

        storeid = record.recordStore
        if (type(storeid) != types.IntType):
            # Map
            if (self.storeHashReverse.has_key(storeid)):
                storeid = self.storeHashReverse[storeid]
            else:
                # YYY: Error or metadata store?
                self.storeHashReverse[storeid] = len(self.storeHash.keys())
                self.storeHash[self.storeHashReverse[storeid]] = storeid
                raise ConfigFileException("indexStore %s does not recognise recordStore: %s" % (self.id, storeid))

        docid = record.id
        if (type(docid) != types.IntType):
            if (type(docid) == types.StringType and docid.isdigit()):
                docid = long(docid)
            else:
                # Look up identifier in local bdb
                docid = self._get_internalId(session, record)
        elif (docid == -1):
            # Unstored record
            raise ValueError(str(record))

        if self.outFiles.has_key(index):
            # Batch loading
            value = termhash.values()[0]['text']
            if (self.outSortFiles.has_key(index) and value):
                if type(value) == unicode:
                    sortVal = value.encode('utf-8')
                else:
                    sortVal = value
                self.outSortFiles[index].put("%s/%s" % (str(record.recordStore), docid), sortVal)

            start = time.time()
            prox = termhash[value].has_key('positions')
            for k in termhash.values():
                kw = k['text']
                if type(kw) != unicode:
                    kw = kw.decode('utf-8')
                self.outFiles[index].write(kw)
                # ensure that docids are sorted to numeric order
                lineList = ["", "%012d" % docid, str(storeid), str(k['occurences'])]
                if prox:
                    lineList.extend(map(str, k['positions']))
                self.outFiles[index].write(nonTextToken.join(lineList) + "\n")
        else:
            # Directly insert into index
            cxn = self._openIndex(session, index)

            # This is going to be ... slow ... with lots of i/o
            # Use commit method unless only doing very small amounts of work.
            for k in termhash.values():
                key = k['text']
                stuff = [docid, storeid, k['occurences']]
                try:
                    stuff.extend(k['positions'])
                except:
                    pass
                val = cxn.get(key.encode('utf-8'))
                if (val != None):
                    current = index.deserialise_terms(val)
                    unpacked = index.merge_terms(current, stuff, recs=1, occs=k['occurences'])
                    # 3 longs at beginning are reserved
                    (termid, totalRecs, totalOccs) = unpacked[:3]
                    unpacked = unpacked[3:]
                else:
                    termid = cxn.stat()['nkeys']
                    unpacked = stuff
                    # new term: this record is the only posting so far
                    totalRecs = 1
                    totalOccs = k['occurences']

                packed = index.serialise_terms(termid, unpacked, recs=totalRecs, occs=totalOccs)
                cxn.put(key.encode('utf-8'), packed)
            self._closeIndex(session, index)

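    # Shape of the termhash argument, reconstructed from the accesses above
    # (illustrative, not a documented contract); 'positions' is optional, and
    # its presence on the first entry switches on proximity output:
    #   {u'cat': {'text': u'cat', 'occurences': 3, 'positions': [1, 9, 14]},
    #    u'dog': {'text': u'dog', 'occurences': 1, 'positions': [4]}}
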
    def delete_terms(self, session, index, terms, record):
        p = self.permissionHandlers.get('info:srw/operation/2/unindex', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to delete from indexStore %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to delete from indexStore %s" % self.id)
        if not terms:
            return

        docid = record.id
        # Hash
        if (type(docid) == types.StringType and docid.isdigit()):
            docid = long(docid)
        elif (type(docid) in [types.IntType, types.LongType]):
            pass
        else:
            # Look up identifier in local bdb
            docid = self._get_internalId(session, record)

        storeid = record.recordStore
        if (type(storeid) != types.IntType):
            # Map
            if (self.storeHashReverse.has_key(storeid)):
                storeid = self.storeHashReverse[storeid]
            else:
                # YYY: Error or metadata store?
                self.storeHashReverse[storeid] = len(self.storeHash.keys())
                self.storeHash[self.storeHashReverse[storeid]] = storeid
                storeid = self.storeHashReverse[storeid]
                raise ConfigFileException("indexStore %s does not recognise recordStore: %s" % (self.id, storeid))

        # Directly insert into index
        cxn = self._openIndex(session, index)

        for k in terms.keys():
            val = cxn.get(k.encode('utf-8'))
            if (val != None):
                current = index.deserialise_terms(val)
                gone = [docid, storeid, terms[k]['occurences']]
                unpacked = index.merge_terms(current, gone, 'delete')
                if not unpacked[1]:
                    # all terms deleted
                    cxn.delete(k.encode('utf-8'))
                else:
                    packed = index.serialise_terms(current[0], unpacked[3:])
                    cxn.put(k.encode('utf-8'), packed)
        self._closeIndex(session, index)

    # NB: c.set_range('a', dlen=12, doff=0)
    #     --> (key, 12bytestring)
    #     --> unpack for termid, docs, occs

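    # Expanding that NB (illustrative, derived from the code below): with
    # summary=1 in fetch_termList(), dataLen = index.longStructSize *
    # self.reservedLongs, so set_range(term, dlen=dataLen, doff=0) returns
    # only the 3 reserved longs per entry, and deserialise_terms() then
    # yields (termid, totalRecs, totalOccs) without reading the postings.
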
    def fetch_termList(self, session, index, term, numReq=0, relation="", end="", summary=0, reverse=0):
        p = self.permissionHandlers.get('info:srw/operation/2/scan', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to scan indexStore %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to scan indexStore %s" % self.id)

        if (not (numReq or relation or end)):
            # XXX Default from config
            numReq = 20
        if (not relation and not end):
            relation = ">="
        if (not relation):
            if (term > end):
                relation = "<="
            else:
                relation = ">"

        if reverse:
            dfp = self.get_path(session, "defaultPath")
            name = self._generateFilename(index)
            fullname = os.path.join(dfp, name)
            fullname += "_REVERSE"
            term = term[::-1]
            end = end[::-1]
            cxn = bdb.db.DB()
            #cxn.set_flags(bdb.db.DB_RECNUM)
            if session.environment == "apache":
                cxn.open(fullname, flags=bdb.db.DB_NOMMAP)
            else:
                cxn.open(fullname)
        else:
            cxn = self._openIndex(session, index)

        if summary:
            dataLen = index.longStructSize * self.reservedLongs

        c = cxn.cursor()
        term = term.encode('utf-8')
        try:
            if summary:
                (key, data) = c.set_range(term, dlen=dataLen, doff=0)
            else:
                (key, data) = c.set_range(term)
        except Exception, e:
            try:
                if summary:
                    (key, data) = c.last(dlen=dataLen, doff=0)
                else:
                    (key, data) = c.last()
            except TypeError:
                # Index is empty
                cxn.close()
                return []
        if (relation in [">", ">="] and term > key):
            # Asked for > than maximum key
            cxn.close()
            return []

        tlist = []
        fetching = 1

        if (not (key == term and relation in ['>', '<'])):
            # We want this one
            unpacked = index.deserialise_terms(data)
            if reverse:
                key = key[::-1]
            tlist.append([key, unpacked])
            if numReq == 1:
                fetching = 0

        while fetching:
            dir = relation[0]
            if (dir == ">"):
                if summary:
                    tup = c.next(dlen=dataLen, doff=0)
                else:
                    tup = c.next()
            else:
                if summary:
                    tup = c.prev(dlen=dataLen, doff=0)
                else:
                    tup = c.prev()
            if tup:
                (key, rec) = tup
                if (end and dir == '>' and key >= end):
                    fetching = 0
                elif (end and dir == "<" and key <= end):
                    fetching = 0
                else:
                    unpacked = index.deserialise_terms(rec)
                    if reverse:
                        key = key[::-1]
                    tlist.append([key, unpacked])
                    if (numReq and len(tlist) == numReq):
                        fetching = 0
            else:
                if tlist:
                    if (dir == ">"):
                        tlist[-1].append("last")
                    else:
                        tlist[-1].append("first")
                key = None
                fetching = 0

        if reverse:
            cxn.close()
        return tlist

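    # Usage sketch (assumed calling convention, derived from the code above):
    # scan ten terms forward from "cat", fetching only the reserved longs:
    #   tlist = idxStore.fetch_termList(session, idx, "cat", numReq=10,
    #                                   relation=">=", summary=1)
    # Each entry is [term, unpacked]; the final entry gains a trailing
    # "last"/"first" marker when the cursor runs off the end of the index.
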
    def create_item(self, session, tid, rst, occs, rsitype="SimpleResultSetItem"):
        recStore = self.storeHash[rst]
        if self.identifierMapCxn and self.identifierMapCxn.has_key(recStore):
            numericTid = tid
            tid = self._get_externalId(session, recStore, tid)
        else:
            numericTid = None
        if rsitype == "SimpleResultSetItem":
            return SimpleResultSetItem(session, tid, recStore, occs, session.database, numeric=numericTid)
        elif rsitype == "Hash":
            return ("%s/%s" % (recStore, tid),
                    {"recordStore": recStore, "recordId": tid, "occurences": occs, "database": session.database})
        else:
            raise NotImplementedError(rsitype)

    def fetch_term(self, session, index, term, prox=True):
        p = self.permissionHandlers.get('info:srw/operation/2/search', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to search indexStore %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to search indexStore %s" % self.id)
        unpacked = []
        val = self.fetch_packed(session, index, term)
        if (val != None):
            unpacked = index.deserialise_terms(val, prox)
        return unpacked

    def fetch_packed(self, session, index, term):
        try:
            term = term.encode('utf-8')
        except:
            pass
        cxn = self._openIndex(session, index)
        val = cxn.get(term)
        return val

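    # Usage sketch (assumed driver code): fetch_term() returns the
    # deserialised posting list for one term, fetch_packed() the raw bytes:
    #   postings = idxStore.fetch_term(session, idx, u"cat")   # [] if absent
    #   raw = idxStore.fetch_packed(session, idx, u"cat")      # None if absent
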

# XXX Implement?
class C2IndexStore(BdbIndexStore):
    """Use C2 style indexes, only one recordStore"""
    pass

try:
    from srboo import *

    # Split index files into chunks based on first two letters,
    # one for each number, one for non letter/non number.
    # Then construct across the grid per initial chr
    # Then store index chunks in SRB
    # SRB layout:
    #   $HOME/cheshire3/databaseName/indexStoreName/indexName/chunk.index

    # To search, pull down appropriate chunk on demand if necessary,
    # write to disk, then search it.
    # This relies on SRB null chr fix of 2005/11/01

    class SrbBdbIndexStore(BdbIndexStore):

        host = ""
        port = ""
        user = ""
        passwd = ""
        dn = ""
        domain = ""
        resource = ""
        subcollection = ""

        connection = None
        tempChunks = 0

        def _connect(self):
            try:
                self.connection = SrbConnection(self.host, self.port, self.domain,
                                                user=self.user, passwd=self.passwd, dn=self.dn)
                self.connection.resource = self.resource
            except SrbException:
                # Couldn't connect :/
                raise
            scs = self.subcollection.split('/')
            orig = self.connection.collection
            for c in scs:
                try:
                    self.connection.create_collection(c)
                except SrbException, e:
                    # Err, at some point it should fail
                    # trying to create an existing collection...
                    pass
                self.connection.open_collection(c)
            self.connection.open_collection(orig)
            self.connection.open_collection(self.subcollection)

        def __init__(self, session, config, parent):
            # NB: parameter order here differs from
            # BdbIndexStore.__init__(session, parent, config) above
            BdbIndexStore.__init__(self, session, config, parent)

            self.tempChunks = self.get_setting(session, 'tempChunks')
            uri = self.get_path(session, 'srbServer')
            uri = uri.encode('utf-8')
            uri = uri.strip()
            if not uri:
                raise ConfigFileException("No srbServer to connect to.")
            else:
                info = parseSrbUrl(uri)
                for (a, v) in info.items():
                    setattr(self, a, v)

            if (isinstance(parent, Database)):
                sc = parent.id + "/" + self.id
            else:
                sc = self.id
            self.subcollection = "cheshire3/" + sc
            self.connection = None
            self._connect()

        def _openIndexChunk(self, session, index, chunk):
            dfp = self.get_path(session, 'defaultPath')
            dbname = os.path.join(dfp, index.id, "%s.index" % chunk)
            cxn = bdb.db.DB()
            if session.environment == "apache":
                cxn.open(dbname, flags=bdb.db.DB_NOMMAP)
            else:
                cxn.open(dbname)
            return cxn

        def _createIndexChunk(self, session, index, chunk):
            dfp = self.get_path(session, 'defaultPath')
            dbname = os.path.join(dfp, index.id, "%s.index" % chunk)
            cxn = bdb.db.DB()
            cxn.open(dbname, dbtype=bdb.db.DB_BTREE, flags=bdb.db.DB_CREATE, mode=0660)
            return cxn

        def _storeIndexChunk(self, session, index, chunk):
            start = time.time()
            dfp = self.get_path(session, 'defaultPath')
            fname = "%s.index" % chunk
            dbname = os.path.join(dfp, index.id, fname)
            # read file, store in srb
            try:
                self.connection.open_collection(index.id)
            except:
                self.connection = None
                while not self.connection:
                    self._connect()
                self.connection.open_collection(index.id)
            inh = file(dbname)
            outh = self.connection.create(fname)
            data = inh.read(102400)
            while data:
                outh.write(data)
                data = inh.read(102400)
            inh.close()
            outh.close()
            self.connection.up_collection()

        def _whichChunk(self, term):
            # buckets based on first chrs
            if not term:
                return "other"
            elif term[0].isalnum():
                return term[0].lower()
            elif term[0] > 'z':
                return "other2"
            else:
                return "other"

            # -------------------------
            # Split on first two
            # (NB: unreachable -- every branch above already returned)
            if not term:
                return "other"
            elif term[0].isdigit():
                return term[0]
            elif term[0].isalpha():
                if len(term) == 1:
                    return term + "0"
                elif not term[1].isalnum():
                    # recursively strip non alnum chars
                    return self._whichChunk(term[0] + term[2:])
                else:
                    return term[:2].lower()
            else:
                return "other"
            # --------------------------

        def _maybeFetchChunk(self, session, index, term):
            # Check if we exist, otherwise fetch
            fn = self._whichChunk(term)
            fname = "%s.index" % fn
            dfp = self.get_path(session, "defaultPath")
            path = os.path.join(dfp, index.id, fname)
            if not os.path.exists(path):
                okay = self._fetchChunk(session, index, fn)
                if not okay:
                    return None
            return fn

        def _fetchChunk(self, session, index, chunk):
            try:
                self.connection.open_collection(index.id)
            except:
                self.connection = None
                while not self.connection:
                    self._connect()
                self.connection.open_collection(index.id)
            dfp = self.get_path(session, "defaultPath")
            fname = "%s.index" % chunk
            path = os.path.join(dfp, index.id, fname)
            if not fname in self.connection.walk_names()[1]:
                self.connection.up_collection()
                return 0
            try:
                inh = self.connection.open(fname)
            except:
                # Can't open :(
                self.connection.up_collection()
                return 0
            outh = file(path, 'w')
            data = inh.read(10240)
            while data:
                outh.write(data)
                data = inh.read(10240)
            inh.close()
            outh.close()
            self.connection.up_collection()
            return 1

        def begin_indexing(self, session, index):
            if not self.tempChunks:
                return BdbIndexStore.begin_indexing(self, session, index)
            temp = self.get_path(session, 'tempPath')
            if not os.path.isabs(temp):
                temp = os.path.join(self.get_path(session, 'defaultPath'), temp)
            self.tempPath = temp
            if (not os.path.exists(temp)):
                try:
                    os.mkdir(temp)
                except:
                    raise ConfigFileException('TempPath does not exist and is not creatable.')
            elif (not os.path.isdir(temp)):
                raise ConfigFileException('TempPath is not a directory.')

            # Make temp files on demand, in hash
            self.outFiles[index] = {}

        def commit_indexing(self, session, index):
            if self.tempChunks:
                temp = self.tempPath
                keys = self.outFiles[index].keys()
                for f in self.outFiles[index].values():
                    f.flush()
                    f.close()
                del self.outFiles[index]
                sort = self.get_path(session, 'sortPath')
                if hasattr(session, 'task'):
                    task = session.task
                else:
                    task = None
                if hasattr(session, 'phase'):
                    load = 0
                else:
                    load = 1

                sfiles = []
                for k in keys:
                    if task:
                        fn = '_'.join([self.id, index.id, k, task])
                    else:
                        fn = '_'.join([self.id, index.id, k])
                    tf = os.path.join(temp, fn + "_TEMP")
                    sf = os.path.join(temp, fn + "_SORT")
                    cmd = "%s -f %s -o %s" % (sort, tf, sf)
                    f = commands.getoutput(cmd)
                    os.remove(tf)
                    if load:
                        self.commit_indexing2(session, index, sf)
                    else:
                        sfiles.append(sf)
                return sfiles
            else:
                BdbIndexStore.commit_indexing(self, session, index)

        def commit_indexing2(self, session, index, sorted):
            # Look on session for chunk to process
            # otherwise process all

            f = file(sorted)

            # load all chunks from this file
            termid = long(0)
            done = 0
            prevTerm = None
            cxn = None
            currChunk = None
            currFirst = None
            l = f.readline()

            t2s = index.serialise_terms
            whichChunk = self._whichChunk
            storeChunk = self._storeIndexChunk
            createChunk = self._createIndexChunk
            while (l):
                data = l.split(nonTextToken)
                term = data[0]
                if not done and term[:2] != currFirst:
                    which = whichChunk(term)
                    if currChunk != which:
                        if cxn:
                            cxn.close()
                            err = storeChunk(session, index, currChunk)
                        cxn = createChunk(session, index, which)
                        currChunk = which
                    currFirst = term[:2]
                if prevTerm == None:
                    prevTerm = term
                fullinfo = map(long, data[1:])
                # NB: acc() is not defined in this module; presumably an
                # accumulator provided by 'from index import *'
                occs = acc(term, fullinfo)
                if occs and occs[0] != []:
                    termid += 1
                    packed = t2s(termid, occs)
                    cxn.put(prevTerm, packed)
                    prevTerm = data[0]
                l = f.readline()
                l = l[:-1]
                if not done and not l:
                    l = " "
                    done = 1
            f.close()
            os.remove(sorted)
            cxn.close()
            storeChunk(session, index, currChunk)

        def create_index(self, session, index):
            p = self.permissionHandlers.get('info:srw/operation/1/create', None)
            if p:
                if not session.user:
                    raise PermissionException("Authenticated user required to create index in %s" % self.id)
                okay = p.hasPermission(session, session.user)
                if not okay:
                    raise PermissionException("Permission required to create index in %s" % self.id)
            # Create local temp space
            dfp = self.get_path(session, "defaultPath")
            dirname = os.path.join(dfp, index.id)
            if not os.path.exists(dirname):
                os.mkdir(dirname)
            # Don't create any bdb files
            if (index.get_setting(session, "sortStore")):
                raise NotImplementedError("sortStore")
            if (index.get_setting(session, "reverseIndex")):
                raise NotImplementedError("reverseIndex")

            # Create permanent SRB space
            try:
                dirs = self.connection.walk_names()[0]
            except:
                self.connection = None
                while not self.connection:
                    self._connect()
                dirs = self.connection.walk_names()[0]
            if not index.id in dirs:
                self.connection.create_collection(index.id)

        def clean_index(self, session, index):
            # XXX Delete all SRB files
            raise NotImplementedError()

        def delete_index(self, session, index):
            self.clean_index(session, index)

        def fetch_sortValue(self, session, index, item):
            raise NotImplementedError("sortStore")

        def delete_terms(self, session, index, terms, record):
            raise NotImplementedError()

        def store_terms(self, session, index, hash, record):
            if self.tempChunks:
                if not hash:
                    return

                # Make sure you know what you're doing
                storeid = record.recordStore
                if (type(storeid) != types.IntType):
                    storeid = self.storeHashReverse[storeid]
                docid = long(record.id)

                for k in hash.values():
                    try:
                        text = k['text'].encode('utf-8')
                    except:
                        # unencodable term; skip it
                        text = ""
                    if not text:
                        continue
                    lineList = [text, str(docid), str(storeid), str(k['occurences'])]
                    try:
                        lineList.append(nonTextToken.join(map(str, k['positions'])))
                    except KeyError:
                        # non prox
                        pass
                    if not text or not text[0].isalnum():
                        tf = "other"
                    else:
                        tf = text[0].lower()

                    try:
                        outh = self.outFiles[index][tf]
                    except:
                        if session.task:
                            fname = '_'.join([self.id, index.id, tf, session.task, 'TEMP'])
                        else:
                            fname = '_'.join([self.id, index.id, tf, 'TEMP'])
                        fname = os.path.join(self.tempPath, fname)
                        outh = file(fname, 'w')
                        self.outFiles[index][tf] = outh
                    outh.write(nonTextToken.join(lineList) + "\n")
                return
            if self.outFiles.has_key(index):
                BdbIndexStore.store_terms(self, session, index, hash, record)
            else:
                raise NotImplementedError()

        def fetch_termList(self, session, index, term, numReq=0, relation="", end="", summary=0, reverse=0):
            if reverse:
                raise NotImplementedError("reverseIndex")
            if (not (numReq or relation or end)):
                # XXX Default from config
                numReq = 20
            if (not relation and not end):
                relation = ">="
            if (not relation):
                if (term > end):
                    relation = "<="
                else:
                    relation = ">"

            # Only return to end of current index?

            chunk = self._maybeFetchChunk(session, index, term)
            if chunk == None:
                # no data
                return []
            cxn = self._openIndexChunk(session, index, chunk)

            if summary:
                dataLen = index.longStructSize * self.reservedLongs

            c = cxn.cursor()
            term = term.encode('utf-8')
            try:
                if summary:
                    (key, data) = c.set_range(term, dlen=dataLen, doff=0)
                else:
                    (key, data) = c.set_range(term)
            except Exception, e:
                if summary:
                    (key, data) = c.last(dlen=dataLen, doff=0)
                else:
                    (key, data) = c.last()
            if (relation in [">", ">="] and term > key):
                # Asked for > than maximum key
                cxn.close()
                return []

            tlist = []
            fetching = 1

            if (not (key == term and relation in ['>', '<'])):
                # We want this one
                unpacked = index.deserialise_terms(data)
                tlist.append([key, unpacked])
                if numReq == 1:
                    fetching = 0
            while fetching:
                dir = relation[0]
                if (dir == ">"):
                    if summary:
                        tup = c.next(dlen=dataLen, doff=0)
                    else:
                        tup = c.next()
                else:
                    if summary:
                        tup = c.prev(dlen=dataLen, doff=0)
                    else:
                        tup = c.prev()
                if tup:
                    (key, rec) = tup
                    if (end and dir == '>' and key >= end):
                        fetching = 0
                    elif (end and dir == "<" and key <= end):
                        fetching = 0
                    else:
                        unpacked = index.deserialise_terms(rec)
                        if reverse:
                            key = key[::-1]
                        tlist.append([key, unpacked])
                        if (numReq and len(tlist) == numReq):
                            fetching = 0
                else:
                    key = None
                    fetching = 0
            cxn.close()
            return tlist

        def fetch_term(self, session, index, term):
            val = self.fetch_packed(session, index, term)
            if val != None:
                return index.deserialise_terms(val)
            else:
                return []

        def fetch_packed(self, session, index, term):
            try:
                term = term.encode('utf-8')
            except:
                pass
            chunk = self._maybeFetchChunk(session, index, term)
            if chunk == None:
                return None
            cxn = self._openIndexChunk(session, index, chunk)
            val = cxn.get(term)
            return val

except:
    pass

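# Illustrative expectations for SrbBdbIndexStore._whichChunk, derived from
# the first (live) branch of the code above, not from test output:
#   _whichChunk("apple")  --> "a"       (alphanumeric first char)
#   _whichChunk("42nd")   --> "4"
#   _whichChunk("~vague") --> "other2"  (first char sorts above 'z')
#   _whichChunk("!bang")  --> "other"
#   _whichChunk("")       --> "other"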