Module index

Source Code for Module index

from baseObjects import Index, Document, Record
from configParser import C3Object
from utils import elementType, getFirstData, verifyXPaths, flattenTexts
from c3errors import ConfigFileException, PermissionException
import re, types, sys, os, struct, time
from record import SaxRecord, DomRecord
from document import StringDocument
from resultSet import SimpleResultSet, SimpleResultSetItem
from PyZ3950 import CQLParser, SRWDiagnostics
import codecs

try:
    import termine
except ImportError:
    pass

from XPathProcessor import XPathProcessor

class SimpleIndex(Index):
    sources = []
    xPathAllAbsolute = 1
    xPathAttributesRequired = []
    xPathsNormalized = {}
    currentFullPath = []
    currentPath = []
    storeOrig = 0
    debug = 0

    indexingTerm = ""
    indexingData = []
    def _handleConfigNode(self, session, node):
        # Source
        if (node.localName == "source"):
            process = []
            preprocess = []
            xp = None
            for child in node.childNodes:
                if child.nodeType == elementType:
                    if child.localName == "xpath":
                        if xp is None:
                            ref = child.getAttributeNS(None, 'ref')
                            if ref:
                                xp = self.get_object(session, ref)
                            else:
                                xp = XPathProcessor(session, node, self)
                                xp._handleConfigNode(session, node)
                    elif child.localName == "preprocess":
                        # Record process list
                        for child2 in child.childNodes:
                            if child2.nodeType == elementType and child2.localName == "object":
                                preprocess.append([child2.getAttributeNS(None, 'type'), child2.getAttributeNS(None, 'ref')])
                    elif child.localName == "process":
                        # Record process list
                        for child2 in child.childNodes:
                            if child2.nodeType == elementType and child2.localName == "object":
                                process.append([child2.getAttributeNS(None, 'type'), child2.getAttributeNS(None, 'ref')])
            if xp is None:
                raise ConfigFileException("No XPath given for index %s" % self.id)
            self.sources.append((xp, process, preprocess))
    def __init__(self, session, node, parent):
        self.sources = []
        self.xPathAttributesRequired = []
        self.xPathsNormalized = {}
        self.xPathAllAbsolute = 1
        self.indexingTerm = ""
        self.indexingData = []
        lss = self.get_setting(session, 'longSize')
        if lss:
            self.longStructSize = int(lss)
        else:
            self.longStructSize = len(struct.pack('L', 1))

        Index.__init__(self, session, node, parent)
        self.debug = self.get_setting(session, 'debug')

        # We need a Store object
        iStore = self.get_path(session, 'indexStore')
        self.indexStore = iStore

        if (iStore is None):
            raise ConfigFileException("Index (%s) does not have an indexStore." % (self.id))
        elif not iStore.contains_index(session, self):
            iStore.create_index(session, self)

        for s in range(len(self.sources)):
            if self.sources[s][1][0][0] != 'extractor':
                raise ConfigFileException("First link in process chain must be an Extractor.")
            for t in range(len(self.sources[s][1])):
                o = self.get_object(session, self.sources[s][1][t][1])
                if (o is not None):
                    self.sources[s][1][t][1] = o
                else:
                    raise ConfigFileException("[%s] Unknown object %s" % (self.id, self.sources[s][1][t][1]))
            for t in range(len(self.sources[s][2])):
                o = self.get_object(session, self.sources[s][2][t][1])
                if (o is not None):
                    self.sources[s][2][t][1] = o
                else:
                    raise ConfigFileException("Unknown object %s" % (self.sources[s][2][t][1]))
    def _mergeHash(self, a, b):
        if not a:
            return b
        if not b:
            return a
        for k in b.keys():
            try:
                a[k]['occurences'] += b[k]['occurences']
                try:
                    a[k]['positions'].extend(b[k]['positions'])
                except:
                    # Non prox
                    pass
            except:
                a[k] = b[k]
        return a
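
    # A minimal sketch of what _mergeHash does, with hypothetical data:
    # occurrence counts for a shared term are summed and, for proximity
    # indexes, the position lists are concatenated.
    #
    #     a = {'cat': {'text': 'cat', 'occurences': 1, 'positions': [0, 2]}}
    #     b = {'cat': {'text': 'cat', 'occurences': 2, 'positions': [1, 3]}}
    #     merged = index._mergeHash(a, b)
    #     # merged['cat']['occurences'] == 3
    #     # merged['cat']['positions'] == [0, 2, 1, 3]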
    def _processChain(self, session, data, process):
        (otype, obj) = process[0]
        new = {}
        if (otype == 'extractor'):
            # list input of extracted data
            # Might be different, eg from difft xpaths
            for d in data:
                if (type(d) == types.ListType):
                    # SAX event
                    new = self._mergeHash(new, obj.process_eventList(session, d))
                elif (type(d) in types.StringTypes):
                    # Attribute content
                    new = self._mergeHash(new, obj.process_string(session, d))
                else:
                    # DOM nodes
                    new = self._mergeHash(new, obj.process_node(session, d))

        elif (otype == 'normalizer'):
            # Woo, down to one line :)
            new = obj.process_hash(session, data)

        elif (otype == 'preParser'):
            # Give current keys to the preParser to do something with
            # Will return documents...
            fn = obj.process_document
            for d in data.keys():
                if (not isinstance(d, Document)):
                    doc = StringDocument(d, self)
                else:
                    doc = d
                new[fn(session, doc)] = data[d]

        elif (otype == 'parser'):
            # Give current keys (which had better be documents!) to parser
            fn = obj.process_document
            for d in data.keys():
                try:
                    new[fn(session, d)] = data[d]
                except Exception, err:
                    f = d.get_raw()
                    err.text = f
                    raise err

        elif (otype == 'index'):
            # Had better be a record!
            # Stomp infinite recursion:
            # Don't assign anything different to new ?
            if obj == self:
                raise ConfigFileException("Infinitely recursive process chain!")
            fn = obj.index_record
            for d in data.keys():
                fn(session, d)

        elif (otype == 'transformer'):
            # Had better be a record!
            fn = obj.process_record
            for d in data.keys():
                new[fn(session, d)] = data[d]

        elif (otype == 'indexStore'):
            # XXX Store terms in multiple stores?
            raise NotImplementedError

        elif (otype == 'recordStore'):
            # Had better be a record!
            fn = obj.store_record
            for d in data.keys():
                if (not isinstance(d, Record)):
                    raise ValueError
                fn(session, d)
            new = data
        else:
            raise ConfigFileException("Unknown object type: %s" % (otype))

        if (len(process) == 1):
            return new
        else:
            return self._processChain(session, new, process[1:])
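
    # The hash handed from one chain link to the next maps each extracted
    # term to its metadata.  Judging from the keys read elsewhere in this
    # module ('text', 'occurences', 'positions'), an entry looks roughly
    # like this (hypothetical term):
    #
    #     {'feline': {'text': 'feline', 'occurences': 2,
    #                 'positions': [0, 4, 0, 9]}}
    #
    # The first link must be an 'extractor', which builds this hash from
    # the raw xpath results; every later link consumes and returns a whole
    # hash, so chains compose left to right.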
    def _processRecord(self, session, record, source):
        (xpath, process, preprocess) = source
        if preprocess:
            r2hash = self._processChain(session, {record: ''}, preprocess)
            record2 = r2hash.keys()[0]
        else:
            record2 = record

        hashes = []
        rawlist = xpath.process_record(session, record2)
        for raw in rawlist:
            hashes.append(self._processChain(session, raw, process))

        # Merge values from the different xpath matches
        processed = hashes[0]
        for i in range(len(hashes) - 1):
            processed = self._mergeHash(processed, hashes[i + 1])
        return processed

    # XXX API badness
    def extract_data(self, session, record):
        processed = self._processRecord(session, record, self.sources[0])
        if processed:
            keys = processed.keys()
            keys.sort()
            return keys[0]
        else:
            return None
    def index_record(self, session, record):
        # First extract simple paths, the majority of cases
        p = self.permissionHandlers.get('info:srw/operation/2/index', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to add to index %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to add to index %s" % self.id)
        for src in self.sources:
            processed = self._processRecord(session, record, src)
            self.indexStore.store_terms(session, self, processed, record)
        return processed
    def store_terms(self, session, data, record):
        self.indexStore.store_terms(session, self, data, record)
    def delete_record(self, session, record):
        # Extract terms, and remove from store
        p = self.permissionHandlers.get('info:srw/operation/2/unindex', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to remove from index %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to remove from index %s" % self.id)
        istore = self.get_path(session, 'indexStore')
        for src in self.sources:
            processed = self._processRecord(session, record, src)
            if (istore is not None):
                istore.delete_terms(session, self, processed, record)
    def begin_indexing(self, session):
        # Find all indexStores
        p = self.permissionHandlers.get('info:srw/operation/2/index', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to add to index %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to add to index %s" % self.id)
        stores = []
        for src in self.sources:
            if src[1][-1][0] == 'indexStore':
                stores.append(src[1][-1][1])
        istore = self.get_path(session, 'indexStore')
        if (istore is not None and istore not in stores):
            stores.append(istore)
        for s in stores:
            s.begin_indexing(session, self)
    def commit_indexing(self, session):
        p = self.permissionHandlers.get('info:srw/operation/2/index', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to add to index %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to add to index %s" % self.id)
        stores = []
        for src in self.sources:
            if src[1][-1][0] == 'indexStore':
                stores.append(src[1][-1][1])
        istore = self.get_path(session, 'indexStore')
        if (istore is not None and istore not in stores):
            stores.append(istore)
        for s in stores:
            s.commit_indexing(session, self)
    def search(self, session, clause, db):
        # Final destination. Process Term.
        p = self.permissionHandlers.get('info:srw/operation/2/search', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to search index %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to search index %s" % self.id)
        pn = self.get_setting(session, 'termProcess')
        if (pn is None):
            pn = 0
        else:
            pn = int(pn)
        process = self.sources[pn][1]
        res = self._processChain(session, [clause.term.value], process)
        store = self.get_path(session, 'indexStore')
        matches = []

        # XXX check that relation.contextSet is CQL
        if (clause.relation.value in ['any', 'all', '=', 'exact']):
            for k in res:
                term = store.fetch_term(session, self, k)
                s = self.construct_resultSet(session, term, res[k])
                matches.append(s)
        elif (clause.relation.value in ['>=', '>', '<', '<=']):
            if (len(res) != 1):
                d = SRWDiagnostics.Diagnostic24()
                d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                raise d
            else:
                termList = store.fetch_termList(session, self, res.keys()[0], 0, clause.relation.value)
                for t in termList:
                    matches.append(self.construct_resultSet(session, t[1]))
        elif (clause.relation.value == "within"):
            if (len(res) != 2):
                d = SRWDiagnostics.Diagnostic24()
                d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                raise d
            else:
                termList = store.fetch_termList(session, self, res.keys()[0], end=res.keys()[1])
                for t in termList:
                    matches.append(self.construct_resultSet(session, t[1]))
        else:
            d = SRWDiagnostics.Diagnostic24()
            d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
            raise d
        base = SimpleResultSet(session)
        if not matches:
            return base
        else:
            rs = base.combine(session, matches, clause, db)
            # maybe insert termine stuff
            tdb = self.get_path(session, 'termineDb')
            if tdb:
                rs.termWeight = 0
                for k in res:
                    rs.termWeight += termine.fetch_weight(session, k, tdb)
            return rs
    def scan(self, session, value, numReq, direction=">="):
        # Process term.
        p = self.permissionHandlers.get('info:srw/operation/2/scan', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to scan index %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to scan index %s" % self.id)
        pn = self.get_setting(session, 'termProcess')
        if (pn is None):
            pn = 0
        else:
            pn = int(pn)
        process = self.sources[pn][1]
        res = self._processChain(session, [value], process)
        if (len(res) != 1):
            d = SRWDiagnostics.Diagnostic24()
            d.details = "%s" % (value)
            raise d
        store = self.get_path(session, 'indexStore')
        tList = store.fetch_termList(session, self, res.keys()[0], numReq=numReq, relation=direction, summary=1)
        # list of (term, occs) pairs
        return tList
    def serialise_terms(self, termid, terms, recs=0, occs=0):
        # in: list of longs
        if not recs:
            recs = len(terms) / 3
            occs = sum(terms[2::3])
        fmt = 'lll' * (recs + 1)
        params = [fmt, termid, recs, occs] + terms
        return struct.pack(*params)
    def deserialise_terms(self, data, prox=1):
        fmt = 'lll' * (len(data) / (3 * self.longStructSize))
        return struct.unpack(fmt, data)
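
    # Worked example of the packed layout above, assuming 4-byte 'l' longs.
    # A term with id 42 occurring 3 times in one record (docid 7, store 0)
    # packs as six longs: a (termid, recs, occs) header triple followed by
    # one (docid, storeid, occs) triple.
    #
    #     import struct
    #     data = struct.pack('llllll', 42, 1, 3, 7, 0, 3)
    #     # deserialise_terms builds 'lll' * (24 / (3 * 4)) == 'llllll'
    #     struct.unpack('llllll', data)   # -> (42, 1, 3, 7, 0, 3)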
    def merge_terms(self, structTerms, newTerms, op="replace", recs=0, occs=0):
        # structTerms = output of deserialise_terms
        # newTerms = flat list
        # op = replace, add, delete
        # recs, occs = total recs/occs in newTerms

        (termid, oldTotalRecs, oldTotalOccs) = structTerms[0:3]
        structTerms = list(structTerms[3:])

        if op == 'add':
            structTerms.extend(newTerms)
            if recs:
                trecs = oldTotalRecs + recs
                toccs = oldTotalOccs + occs
            else:
                trecs = oldTotalRecs + len(newTerms) / 3
                toccs = oldTotalOccs + sum(newTerms[2::3])
        elif op == 'replace':
            for n in range(0, len(newTerms), 3):
                docid = newTerms[n]
                storeid = newTerms[n + 1]
                replaced = 0
                for x in range(0, len(structTerms), 3):
                    if structTerms[x] == docid and structTerms[x + 1] == storeid:
                        structTerms[x + 2] = newTerms[n + 2]
                        replaced = 1
                        break
                if not replaced:
                    structTerms.extend([docid, storeid, newTerms[n + 2]])
            trecs = len(structTerms) / 3
            toccs = sum(structTerms[2::3])
        elif op == 'delete':
            for n in range(0, len(newTerms), 3):
                docid = newTerms[n]
                storeid = newTerms[n + 1]
                for x in range(0, len(structTerms), 3):
                    if structTerms[x] == docid and structTerms[x + 1] == storeid:
                        del structTerms[x:x + 3]
                        break
            trecs = len(structTerms) / 3
            toccs = sum(structTerms[2::3])

        merged = [termid, trecs, toccs] + structTerms
        return merged
    def construct_item(self, session, term, rsitype="SimpleResultSetItem"):
        # in: single triple
        # out: resultSetItem
        # Need to map recordStore and docid at indexStore
        return self.indexStore.create_item(session, term[0], term[1], term[2], rsitype)
    def construct_resultSet(self, session, terms, queryHash={}):
        # in: unpacked
        # out: resultSet
        ci = self.indexStore.create_item

        s = SimpleResultSet(session, [])
        rsilist = []
        for t in range(3, len(terms), 3):
            item = ci(session, terms[t], terms[t + 1], terms[t + 2])
            item.resultSet = s
            rsilist.append(item)
        s.fromList(rsilist)
        s.index = self
        if queryHash:
            s.queryTerm = queryHash['text']
            s.queryFreq = queryHash['occurences']
        if (terms):
            s.termid = terms[0]
            s.totalRecs = terms[1]
            s.totalOccs = terms[2]
        else:
            s.totalRecs = 0
            s.totalOccs = 0
        return s

class ProximityIndex(SimpleIndex):
    """ Need to use prox extractor """
    def serialise_terms(self, termid, terms, recs=0, occs=0):
        # in: list of longs
        fmt = 'l' * (len(terms) + 3)
        params = [fmt, termid, recs, occs] + terms
        try:
            val = struct.pack(*params)
        except:
            print params
            raise
        return val
    def deserialise_terms(self, data, prox=1):
        fmt = 'L' * (len(data) / self.longStructSize)
        flat = struct.unpack(fmt, data)
        (termid, totalRecs, totalOccs) = flat[:3]
        idx = 3
        docs = [termid, totalRecs, totalOccs]
        while idx < len(flat):
            doc = list(flat[idx:idx + 3])
            nidx = idx + 3 + (doc[2] * 2)
            doc.extend(flat[idx + 3:nidx])
            idx = nidx
            docs.append(doc)
        return docs
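
    # Sketch of the proximity layout handled above, with hypothetical
    # values.  Each record entry is (docid, storeid, freq) followed by
    # freq * 2 longs of position pairs, so deserialise_terms walks the
    # flat tuple in variable-length steps:
    #
    #     flat = (42, 1, 2,             # termid, totalRecs, totalOccs
    #             7, 0, 2, 1, 5, 1, 9)  # docid 7, store 0, freq 2, 2 pairs
    #     # -> [42, 1, 2, [7, 0, 2, 1, 5, 1, 9]]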
    def merge_terms(self, structTerms, newTerms, op="replace", recs=0, occs=0):
        # in: structTerms deserialised, newTerms flat
        # out: flat

        (termid, oldTotalRecs, oldTotalOccs) = structTerms[0:3]
        structTerms = list(structTerms[3:])

        if op == 'add':
            # flatten
            terms = []
            for t in structTerms:
                terms.extend(t)
            terms.extend(newTerms)
            structTerms = terms
            if recs != 0:
                trecs = oldTotalRecs + recs
                toccs = oldTotalOccs + occs
            else:
                # Can't recover recs/occs from a flat prox list
                trecs = oldTotalRecs + len(newTerms)
                toccs = oldTotalOccs
                for t in newTerms:
                    toccs = toccs + t[2]
                raise ValueError("FIXME: mergeTerms needs recs/occs params")
        elif op == 'replace':
            raise NotImplementedError()
            newOccs = 0
            for new in newTerms:
                docid = new[0]
                storeid = new[1]
                replaced = 0
                for x in range(3, len(structTerms)):
                    old = structTerms[x]
                    if old[0] == docid and old[1] == storeid:
                        structTerms[x][2] = new[2]
                        structTerms[x][3:] = new[3:]
                        # Assumes that the same record doesn't get replaced
                        # multiple times, which is a pretty reasonable
                        # assumption.
                        newOccs = newOccs + new[2]
                        replaced = 1
                        break
                if not replaced:
                    structTerms.append(new)
                    newOccs = newOccs + new[2]
            trecs = len(structTerms)
            toccs = oldTotalOccs + newOccs
        elif op == 'delete':
            delOccs = 0
            idx = 0
            while idx < len(newTerms):
                doc = list(newTerms[idx:idx + 3])
                idx = idx + 3 + (doc[2] * 2)
                for x in range(len(structTerms)):
                    old = structTerms[x]
                    if old[0] == doc[0] and old[1] == doc[1]:
                        delOccs = delOccs + old[2]
                        del structTerms[x]
                        break
            trecs = len(structTerms)
            toccs = oldTotalOccs - delOccs
            # now flatten
            terms = []
            for t in structTerms:
                terms.extend(t)
            structTerms = terms

        merged = [termid, trecs, toccs]
        merged.extend(structTerms)
        return merged
    def construct_item(self, session, term):
        # in: single doc list (docid, storeid, occs, prox...)
        # out: resultSetItem
        # Need to map recordStore and docid at indexStore
        item = self.indexStore.create_item(session, term[0], term[1], term[2])
        item.proxInfo = term[3:]
        return item
    def construct_resultSet(self, session, terms, queryHash={}):
        # in: unpacked
        # out: resultSet

        rsilist = []
        ci = self.indexStore.create_item
        s = SimpleResultSet(session, [])
        for t in terms[3:]:
            item = ci(session, t[0], t[1], t[2])
            item.proxInfo = t[3:]
            item.resultSet = s
            rsilist.append(item)
        s.fromList(rsilist)
        s.index = self
        if queryHash:
            s.queryTerm = queryHash['text']
            s.queryFreq = queryHash['occurences']
            s.queryPositions = []
            for x in queryHash['positions'][1::2]:
                s.queryPositions.append(x)
        if (terms):
            s.termid = terms[0]
            s.totalRecs = terms[1]
            s.totalOccs = terms[2]
        else:
            s.totalRecs = 0
            s.totalOccs = 0
        return s

class RangeIndex(SimpleIndex):
    # Searching a range like "1 3" should match 1, 2 and 3;
    # "a c" should match a*, b* and c.
    def search(self, session, clause, db):
        # Final destination. Process Term.
        p = self.permissionHandlers.get('info:srw/operation/2/search', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to search index %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to search index %s" % self.id)

        # Need to process term based on query. Shouldn't turn into a range
        pn = self.get_setting(session, 'termProcess')
        if (pn is None):
            pn = 0
        else:
            pn = int(pn)
        # Strip off RangeNormalizer
        process = self.sources[pn][1][:-1]
        res = self._processChain(session, [clause.term.value], process)

        store = self.get_path(session, 'indexStore')
        matches = []

        if clause.relation.value == 'encloses':
            # XXX not yet handled
            pass

        # XXX check that relation.contextSet is CQL
        if (clause.relation.value in ['any', 'all', '=', 'exact']):
            for k in res:
                term = store.fetch_term(session, self, k)
                s = self.construct_resultSet(session, term, res[k])
                matches.append(s)
        elif (clause.relation.value in ['>=', '>', '<', '<=']):
            if (len(res) != 1):
                d = SRWDiagnostics.Diagnostic24()
                d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                raise d
            else:
                termList = store.fetch_termList(session, self, res.keys()[0], 0, clause.relation.value)
                for t in termList:
                    matches.append(self.construct_resultSet(session, t[1]))
        elif (clause.relation.value == "within"):
            if (len(res) != 2):
                d = SRWDiagnostics.Diagnostic24()
                d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                raise d
            else:
                # XXX Not necessarily pulled out in the right order.
                termList = store.fetch_termList(session, self, res.keys()[0], end=res.keys()[1])
                for t in termList:
                    matches.append(self.construct_resultSet(session, t[1]))
        else:
            d = SRWDiagnostics.Diagnostic24()
            d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
            raise d
        base = SimpleResultSet(session)
        if not matches:
            return base
        else:
            return base.combine(session, matches, clause, db)

class ClusterExtractionIndex(SimpleIndex):

    # Same processing tricks as indexing, just store terms differently

    def _handleConfigNode(self, session, node):
        if (node.localName == "cluster"):
            maps = []
            for child in node.childNodes:
                if (child.nodeType == elementType and child.localName == "map"):
                    t = child.getAttributeNS(None, 'type')
                    map = []
                    for xpchild in child.childNodes:
                        if (xpchild.nodeType == elementType and xpchild.localName == "xpath"):
                            map.append(flattenTexts(xpchild))
                        elif (xpchild.nodeType == elementType and xpchild.localName == "process"):
                            p = []
                            for child2 in xpchild.childNodes:
                                if child2.nodeType == elementType and child2.localName == "object":
                                    p.append([child2.getAttributeNS(None, 'type'), child2.getAttributeNS(None, 'ref')])
                            map.append(p)
                    vxp = verifyXPaths([map[0]])
                    if (len(map) < 3):
                        # default ExactExtractor
                        map.append([['extractor', 'ExactExtractor']])
                    if (t == u'key'):
                        self.keyMap = [vxp[0], map[1], map[2]]
                    else:
                        maps.append([vxp[0], map[1], map[2]])
            self.maps = maps
    def __init__(self, session, config, parent):
        self.keyMap = []
        self.maps = []
        Index.__init__(self, session, config, parent)

        for m in range(len(self.maps)):
            for t in range(len(self.maps[m][2])):
                o = self.get_object(None, self.maps[m][2][t][1])
                if (o is not None):
                    self.maps[m][2][t][1] = o
                else:
                    raise ConfigFileException("Unknown object %s" % (self.maps[m][2][t][1]))
        for t in range(len(self.keyMap[2])):
            o = self.get_object(None, self.keyMap[2][t][1])
            if (o is not None):
                self.keyMap[2][t][1] = o
            else:
                raise ConfigFileException("Unknown object %s" % (self.keyMap[2][t][1]))

    def begin_indexing(self, session):
        path = self.get_path(session, "tempPath")
        if (not os.path.isabs(path)):
            dfp = self.get_path(session, "defaultPath")
            path = os.path.join(dfp, path)
        self.fileHandle = codecs.open(path, "w", 'utf-8')
    def commit_indexing(self, session):
        self.fileHandle.close()

    def index_record(self, session, rec):
        # Extract cluster information, append to temp file
        # Step through .maps keys
        p = self.permissionHandlers.get('info:srw/operation/2/cluster', None)
        if p:
            if not session.user:
                raise PermissionException("Authenticated user required to cluster using %s" % self.id)
            okay = p.hasPermission(session, session.user)
            if not okay:
                raise PermissionException("Permission required to cluster using %s" % self.id)

        raw = rec.process_xpath(self.keyMap[0])
        keyData = self._processChain(session, raw, self.keyMap[2])
        fieldData = []
        for map in self.maps:
            raw = rec.process_xpath(map[0])
            fd = self._processChain(session, raw, map[2])
            for f in fd.keys():
                fieldData.append("%s\x00%s\x00" % (map[1], f))
        d = "".join(fieldData)
        for k in keyData.keys():
            try:
                self.fileHandle.write(u"%s\x00%s\n" % (k, d))
                self.fileHandle.flush()
            except:
                print k
                raise
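
    # Each key extracted from a record becomes one line in the temp file,
    # with null-separated fields: the key, then a (map label, value) pair
    # for every cluster map.  A hypothetical line:
    #
    #     smith, john\x00topic\x00history\x00\n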

from utils import SimpleBitfield
from resultSet import BitmapResultSet

class BitmapIndex(SimpleIndex):
    # store as hex -- fast to generate, 1 byte per 4 bits.
    # eval to go from hex to long for bit manipulation

    def __init__(self, session, a, b):
        SimpleIndex.__init__(self, session, a, b)
        self.indexingData = SimpleBitfield()
        self.indexingTerm = ""
        self.recordStore = self.get_setting(session, 'recordStore')
    def serialise_terms(self, termid, terms, recs=0, occs=0):
        # in: list of longs
        if len(terms) == 1:
            # HACK. Accept bitfield from mergeTerms
            bf = terms[0]
        else:
            bf = SimpleBitfield()
            for item in terms[::3]:
                bf[item] = 1
        pack = struct.pack('lll', termid, recs, occs)
        val = pack + str(bf)
        return val
    def deserialise_terms(self, data, prox=1):
        lsize = 3 * self.longStructSize
        longs = data[:lsize]
        terms = list(struct.unpack('lll', longs))
        if len(data) > lsize:
            bf = SimpleBitfield(data[lsize:])
            terms.append(bf)
        return terms
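
    # Sketch of the bitmap layout, with hypothetical values and 4-byte
    # longs: a 12-byte (termid, recs, occs) header, then the bitfield's
    # string form, one bit per docid.
    #
    #     bf = SimpleBitfield()
    #     bf[7] = 1
    #     data = struct.pack('lll', 42, 1, 1) + str(bf)
    #     # deserialise_terms(data) -> [42, 1, 1, <SimpleBitfield>]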
    def merge_terms(self, structTerms, newTerms, op="replace", recs=0, occs=0):
        (termid, oldTotalRecs, oldTotalOccs, oldBf) = structTerms
        if op in ['add', 'replace']:
            for t in newTerms[::3]:
                oldBf[t] = 1
        elif op == 'delete':
            for t in newTerms[::3]:
                oldBf[t] = 0
        trecs = oldBf.lenTrueItems()
        toccs = trecs
        merged = [termid, trecs, toccs, oldBf]
        return merged
    def construct_item(self, session, term):
        # in: single triple
        # out: resultSetItem
        # Need to map recordStore and docid at indexStore
        return self.indexStore.create_item(session, term[0], term[1], term[2])
    def construct_resultSet(self, session, terms, queryHash={}):
        # in: unpacked
        # out: resultSet
        if len(terms) > 3:
            data = terms[3]
            s = BitmapResultSet(session, data, recordStore=self.recordStore)
        else:
            bmp = SimpleBitfield(0)
            s = BitmapResultSet(session, bmp, recordStore=self.recordStore)
        s.index = self
        if queryHash:
            s.queryTerm = queryHash['text']
            s.queryFreq = queryHash['occurences']
        if (terms):
            s.termid = terms[0]
            s.totalRecs = terms[1]
            s.totalOccs = terms[2]
        else:
            s.totalRecs = 0
            s.totalOccs = 0
        return s


try:
    from resultSet import ArrayResultSet
except:
    raise

class RecordIdentifierIndex(Index):

    def begin_indexing(self, session):
        pass

    def commit_indexing(self, session):
        pass

    def index_record(self, session, record):
        pass

    def delete_record(self, session, record):
        pass
    def scan(self, session, clause, db):
        raise NotImplementedError()
    def search(self, session, clause, db):
        # Copy data from clause to resultSetItem after checking it exists
        recordStore = self.get_path(session, 'recordStore')
        base = SimpleResultSet(session)
        items = []
        if clause.relation.value in ['=', 'exact']:
            t = clause.term.value
            if t.isdigit():
                t = long(t)
            if recordStore.fetch_size(session, t) > -1:
                item = SimpleResultSetItem(session)
                item.docid = t
                item.recordStore = recordStore.id
                item.database = db.id
                items = [item]
        elif clause.relation.value == 'any':
            # split on whitespace
            terms = clause.term.value.split()
            for t in terms:
                if t.isdigit():
                    t = long(t)
                if recordStore.fetch_size(session, t) > -1:
                    item = SimpleResultSetItem(session)
                    item.docid = t
                    item.database = db.id
                    item.recordStore = recordStore.id
                    items.append(item)
        base.fromList(items)
        return base

try:
    import numarray as na

    class ArrayIndex(SimpleIndex):
        # Store tuples of (docid, occurences) only

        def __init__(self, session, node, parent):
            SimpleIndex.__init__(self, session, node, parent)
            self.indexStore = self.get_path(session, 'indexStore')
            self.recordStore = self.get_path(session, 'recordStore')

        def search(self, session, clause, db):
            # Final destination. Process Term.
            p = self.permissionHandlers.get('info:srw/operation/2/search', None)
            if p:
                if not session.user:
                    raise PermissionException("Authenticated user required to search index %s" % self.id)
                okay = p.hasPermission(session, session.user)
                if not okay:
                    raise PermissionException("Permission required to search index %s" % self.id)
            pn = self.get_setting(session, 'termProcess')
            if (pn is None):
                pn = 0
            else:
                pn = int(pn)
            process = self.sources[pn][1]
            res = self._processChain(session, [clause.term.value], process)
            matches = []

            store = self.indexStore
            # XXX check that relation.contextSet is CQL
            if (clause.relation.value in ['any', 'all', '=', 'exact', 'adj']):
                for k in res:
                    # add prox boolean to save deserialisation time
                    term = store.fetch_term(session, self, k, clause.relation.value == '=')
                    s = self.construct_resultSet(session, term, res[k])
                    matches.append(s)
            elif (clause.relation.value in ['>=', '>', '<', '<=']):
                if (len(res) != 1):
                    d = SRWDiagnostics.Diagnostic24()
                    d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                    raise d
                else:
                    termList = store.fetch_termList(session, self, res.keys()[0], 0, clause.relation.value)
                    for t in termList:
                        matches.append(self.construct_resultSet(session, t[1]))
            elif (clause.relation.value == "within"):
                if (len(res) != 2):
                    d = SRWDiagnostics.Diagnostic24()
                    d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                    raise d
                else:
                    # XXX Not necessarily pulled out in the right order.
                    termList = store.fetch_termList(session, self, res.keys()[0], end=res.keys()[1])
                    for t in termList:
                        matches.append(self.construct_resultSet(session, t[1]))
            else:
                d = SRWDiagnostics.Diagnostic24()
                d.details = "%s %s" % (clause.relation.toCQL(), clause.term.value)
                raise d
            base = ArrayResultSet(session, [], recordStore=self.recordStore)
            if not matches:
                return base
            else:
                rs = base.combine(session, matches, clause, db)
                tdb = self.get_path(session, 'termineDb')
                if tdb:
                    rs.termWeight = 0
                    for k in res:
                        rs.termWeight += termine.fetch_weight(session, k, tdb)
                return rs

        def serialise_terms(self, termid, terms, recs=0, occs=0):
            # in: list
            if type(terms) == types.ListType:
                # Will actually be list of triples as we haven't thrown out recStore
                nterms = len(terms) / 3
                terms = na.array(terms, 'u4', shape=(nterms, 3))
                # now throw out recStore by array mashing
                arr2 = na.transpose(terms)
                terms = na.transpose(
                    na.reshape(
                        na.concatenate([arr2[0], arr2[2]]),
                        (2, nterms)
                    ))

            tval = terms.tostring()
            fmt = 'lll'
            totalRecs = len(terms)
            totalOccs = sum(terms[:, 1])
            pack = struct.pack(fmt, termid, totalRecs, totalOccs)
            return pack + tval

        def deserialise_terms(self, data, prox=1):
            # in: tostring()ified array, with metadata header
            lsize = 3 * self.longStructSize
            (termid, totalRecs, totalOccs) = struct.unpack('lll', data[:lsize])
            shape = ((len(data) - lsize) / 8, 2)
            if shape[0]:
                return [termid, totalRecs, totalOccs, na.fromstring(data[lsize:], 'u4', shape)]
            else:
                return [termid, totalRecs, totalOccs]

        def merge_terms(self, structTerms, newTerms, op="replace", recs=0, occs=0):
            pass

        def construct_resultSet(self, session, terms, queryHash={}):
            # terms should be a formatted array
            if len(terms) > 3:
                rs = ArrayResultSet(session, terms[3], self.recordStore)
            else:
                rs = ArrayResultSet(session, [], self.recordStore)
            rs.index = self
            if queryHash:
                rs.queryTerm = queryHash['text']
                rs.queryFreq = queryHash['occurences']
            if (len(terms)):
                rs.termid = terms[0]
                rs.totalRecs = terms[1]
                rs.totalOccs = terms[2]
            else:
                rs.totalRecs = 0
                rs.totalOccs = 0
            return rs

    class ProximityArrayIndex(ArrayIndex):

        def serialise_terms(self, termid, terms, recs=0, occs=0):
            # in: list of longs
            # out: 'lll' header + (docid, freq) 'u4' array + packed prox longs
            flat = []
            prox = []
            t = 0
            lt = len(terms)
            while t < lt:
                # rec, store, freq, [elem, idx]+
                (id, freq) = terms[t], terms[t + 2]
                end = t + 3 + (freq * 2)
                itemprox = terms[t + 3:end]
                flat.extend([id, freq])
                prox.extend(itemprox)
                t = end
            a = na.array(flat, 'u4')
            arraystr = a.tostring()
            fmt = 'l' * (len(prox))
            params = [fmt]
            params.extend(prox)
            proxstr = struct.pack(*params)
            head = struct.pack('lll', termid, recs, occs)
            return head + arraystr + proxstr

        def deserialise_terms(self, data, prox=1):
            lss = self.longStructSize * 3
            (termid, totalRecs, totalOccs) = struct.unpack('lll', data[:lss])
            if len(data) > lss:
                arrlen = totalRecs * 8
                shape = (totalRecs, 2)
                arr = na.fromstring(data[lss:arrlen + lss], 'u4', shape)

                if prox:
                    proxData = data[arrlen + lss:]
                    fmt = 'l' * (len(proxData) / 4)
                    prox = struct.unpack(fmt, proxData)
                    # Now associate prox with item
                    itemhash = {}
                    c = 0
                    for item in arr:
                        end = c + (item[1] * 2)
                        itemhash[item[0]] = na.array(prox[c:end], 'u4', shape=((end - c) / 2, 2))
                        c = end
                else:
                    itemhash = {}
                return [termid, totalRecs, totalOccs, arr, itemhash]
            else:
                return [termid, totalRecs, totalOccs]

        def construct_resultSet(self, session, terms, queryHash={}):
            # in: unpacked
            # out: resultSet
            if len(terms) > 3:
                rs = ArrayResultSet(session, terms[3], self.recordStore)
                rs.proxInfo = terms[4]
            else:
                rs = ArrayResultSet(session, [], self.recordStore)
            rs.index = self
            if queryHash:
                rs.queryTerm = queryHash['text']
                rs.queryFreq = queryHash['occurences']
                rs.queryPositions = []
                for x in queryHash['positions'][1::2]:
                    rs.queryPositions.append(x)
            if (len(terms)):
                rs.termid = terms[0]
                rs.totalRecs = terms[1]
                rs.totalOccs = terms[2]
            else:
                rs.totalRecs = 0
                rs.totalOccs = 0
            return rs

except:
    raise
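
# Sketch of the ArrayIndex packed layout, assuming numarray is available
# and 4-byte 'l' longs: an 'lll' header followed by a (docid, occs) 'u4'
# array, e.g. for termid 42 with docid 7 occurring 3 times:
#
#     import struct
#     import numarray as na
#     head = struct.pack('lll', 42, 1, 3)
#     body = na.array([7, 3], 'u4', shape=(1, 2)).tostring()
#     data = head + body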