Module resultSet
[hide private]
[frames] | [no frames]

Source Code for Module resultSet

   1  from baseObjects import ResultSet, ResultSetItem, Index 
   2  from PyZ3950 import CQLParser 
   3  import math, types 
   4   
   5  import sys 
   6   
   7  from xml.sax import ContentHandler, make_parser, parseString as saxParseString, ErrorHandler, InputSource as SaxInput 
   8  import cStringIO as StringIO 
   9  from xml.sax.saxutils import escape, unescape 
  10  import utils 
  11  import cPickle 
  12   
class DeserializationHandler(ContentHandler):
    """SAX handler that rebuilds a SimpleResultSet from its serialised XML.

    A single module-level instance is reused across deserialisations;
    reinit() must be called before each parse to reset its state.
    """

    items = []
    item = None
    set = None
    session = None
    currContent = ""

    def reinit(self, session, set):
        """Reset all accumulated state ready for a fresh parse."""
        self.currContent = ""
        self.session = session
        self.set = set
        self.items = []
        self.item = None

    def startElement(self, name, attrs):
        # Only <item> opens a new object; all other elements are leaf values.
        if name == "item":
            self.item = SimpleResultSetItem(self.session)

    def endElement(self, name):
        """Assign the accumulated text to the matching set/item attribute."""
        c = self.currContent
        if name == "item":
            self.set.append(self.item)
        elif name == "queryFreq":
            self.set.queryFreq = long(c)
        elif name == "queryTerm":
            self.set.queryTerm = c
        elif name == "termWeight":
            self.set.termWeight = float(c)
        elif name == "queryPositions":
            # Pickled payload; ignore empty/whitespace-only content
            if c and len(c) > 1:
                self.set.queryPositions = cPickle.loads(str(c))
        elif name == "proxInfo":
            if c and len(c) > 1:
                self.item.proxInfo = cPickle.loads(str(c))
        elif name == "recStore":
            self.item.recordStore = c
        elif name == "id":
            # Numeric docids round-trip as longs, others stay strings
            self.item.docid = long(c) if c.isdigit() else c
        elif name == "weight":
            self.item.weight = float(c)
        elif name == "scaledWeight":
            self.item.scaledWeight = float(c)
        elif name == "occs":
            self.item.occurences = long(c)
        elif name == "database":
            self.item.database = c
        self.currContent = ""

    def characters(self, text):
        # Content may arrive in several chunks; accumulate until endElement
        self.currContent += unescape(text)


# Shared module-level parser machinery, reused via DeserializationHandler.reinit()
localParser = make_parser()
localParser.setErrorHandler(ErrorHandler())
localInput = SaxInput()
localHandler = DeserializationHandler()
localParser.setContentHandler(localHandler)
class RankedResultSet(ResultSet):
    """Abstract base providing strategies for merging item weights.

    The _*Weights methods are looked up by name ("_%sWeights" % combine)
    from SimpleResultSet.combine; each reduces a group of matched items
    to a single representative item carrying a combined weight.
    """

    def _sumWeights(self, items, n):
        """Representative item gets the sum of all matched items' weights."""
        item = items[0]
        item.weight = sum([x.weight for x in items])
        return item

    def _meanWeights(self, items, n):
        """Representative item gets the mean weight over n source sets."""
        item = items[0]
        item.weight = sum([x.weight for x in items])
        item.weight = item.weight / n
        return item

    def _normWeights(self, items, n):
        """Normalise each weight by its source set's min/max ratio, then mean."""
        for i in items:
            i.weight = i.weight * (i.resultSet.minWeight / i.resultSet.maxWeight)
        return self._meanWeights(items, n)

    def _cmbzWeights(self, a, b):
        """CombMNZ-style merge of two items: boost agreement, dampen singles."""
        a.weight = a.weight * (self.minWeight / self.maxWeight)
        if b:
            b.weight = b.weight * (self.minWeight / self.maxWeight)
            a.weight = (a.weight + b.weight) * 2.0
        else:
            a.weight = a.weight / 2.0

    def _nprvWeights(self, a, b):
        """Like _cmbzWeights, but leaves highly-ranked single matches undamped."""
        a.weight = a.weight * (self.minWeight / self.maxWeight)
        if b:
            b.weight = b.weight * (self.minWeight / self.maxWeight)
            a.weight = (a.weight + b.weight) * 2.0
        else:
            # Leave high ranking ones high.
            # BUG FIX: original referenced undefined name 'item' here
            # (NameError); the item being weighted is parameter 'a'.
            rlen = len(a.resultSet._list)
            if ((rlen > 150 and a.resultSetPosition > 100)
                    or (rlen < 150 and a.resultSetPosition > rlen / 2)):
                a.weight = a.weight / 2.0

    def _pivotWeights(self, a, b):
        # Determine which item is component set, and which item is from
        # document set. If the component's parent document's id is the same
        # as the one in the full document list, then adjust.
        #
        # Normalise min/max as above
        # Pivot default is 0.7, but allow override
        # (Pivot * documentScore) + ((1-pivot) * componentScore)
        # If not in the list then just ((1-pivot) * componentScore)
        #
        # NOTE(review): unimplemented stub in the original; left as a no-op.
        pass
class SimpleResultSet(RankedResultSet):
    """List-backed result set of SimpleResultSetItems, mergeable via CQL.

    Supports XML (de)serialisation, boolean/proximity combination of
    multiple sets, relevance ranking (logistic regression, CORI, TF-IDF)
    and sorting.
    """

    _list = []

    id = ""
    termid = -1
    totalOccs = 0
    totalRecs = 0
    expires = 0
    index = None
    queryTerm = ""
    queryFreq = 0
    queryFragment = None
    queryPositions = []
    relevancy = 0
    maxWeight = 0
    minWeight = 0
    termWeight = 0.0
    recordStore = ""

    def __init__(self, session, data=None, id="", recordStore=""):
        # BUG FIX: original used a mutable default argument (data=[]),
        # so all sets constructed without data shared one list object.
        if data is None:
            data = []
        self._list = data
        self.id = id
        self.recordStore = recordStore

    def __getitem__(self, k):
        return self._list[k]

    def __len__(self):
        return len(self._list)

    def fromList(self, data):
        """Replace the contents of this set with the given item list."""
        self._list = data

    def serialise(self, session):
        """Serialise set metadata and items to an XML string."""
        xml = ['<resultSet>']
        xml.append('<queryTerm>%s</queryTerm><queryFreq>%s</queryFreq><queryPositions>%s</queryPositions><termWeight>%s</termWeight>' % (self.queryTerm, self.queryFreq, escape(cPickle.dumps(self.queryPositions)), self.termWeight))
        xml.append('<items>')
        for item in self:
            # docids may be strings or longs; escape only strings
            if type(item.docid) in types.StringTypes:
                docid = escape(item.docid)
            else:
                docid = str(item.docid)
            xml.append("<item><recStore>%s</recStore><id>%s</id><occs>%s</occs><weight>%s</weight><scaledWeight>%s</scaledWeight><proxInfo>%s</proxInfo><database>%s</database></item>" % (item.recordStore,
                                                                                                                                                                                           docid,
                                                                                                                                                                                           item.occurences,
                                                                                                                                                                                           item.weight,
                                                                                                                                                                                           item.scaledWeight,
                                                                                                                                                                                           escape(cPickle.dumps(item.proxInfo)),
                                                                                                                                                                                           item.database))
        xml.append('</items>')
        xml.append('</resultSet>')
        return ''.join(xml)

    def deserialise(self, session, data):
        """Populate this set from XML produced by serialise()."""
        self._list = []
        # Reuses the module-level SAX parser/handler pair
        localHandler.reinit(session, self)
        localInput.setByteStream(StringIO.StringIO(data))
        localParser.parse(localInput)
        return None

    def append(self, item):
        """Add an item, recording its owning set and position."""
        item.resultSet = self
        item.resultSetPosition = len(self._list)
        self._list.append(item)

    def extend(self, itemList):
        for i in itemList:
            self.append(i)

    def _lrAssign(self, session, others, clause, cql, db):
        """Assign weights by logistic regression (Cooper/Larson model).

        Merges the per-term result sets in `others` into self._list with a
        weight per record. Returns 1 (merge complete; combine() should
        return self immediately).
        """
        if (db):
            totalDocs = db.totalRecords
            if totalDocs == 0:
                # BUG FIX: original raised misspelled name 'ValueErorr'
                raise ValueError("No documents in database?")
        else:
            # Uhoh. Can't do it. (XXX Better Error)
            raise ValueError("Don't know database for determining relevancy")

        # William S Cooper proposes:
        constants = [-3.7, 1.269, -0.31, 0.679, -0.0674, 0.223, 2.01]
        # Ray R Larson proposes (overrides Cooper's):
        constants = [-3.7, 1.269, -0.31, 0.679, -0.021, 0.223, 4.01]

        # Index Configuration proposes:
        idx = db.protocolMaps['http://www.loc.gov/zing/srw/'].resolveIndex(session, clause)
        if (idx):
            for x in range(7):
                temp = idx.get_setting(session, 'lr_constant%d' % x)
                if (temp):
                    constants[x] = float(temp)

        # Query proposes:
        relSetUri = "info:srw/cql-context-set/2/relevance-1.0"
        for m in cql.modifiers:
            # Already been pinged for resolve()
            if (m.type.prefixURI == relSetUri):
                if m.type.value[:5] == "const":
                    try:
                        constants[int(m.type.value[5])] = float(m.value)
                    except ValueError:
                        # Invalid literal for float()
                        pass
                    except IndexError:
                        # list index out of range
                        pass

        sumLogQueryFreq = 0.0
        sumQueryFreq = 0

        # Each rs represents one unique word in query
        for rs in others:
            sumLogQueryFreq += math.log(rs.queryFreq)
            sumQueryFreq += rs.queryFreq
            n = len(rs)
            if n:
                rs.idf = math.log(totalDocs / float(n))
        x2 = math.sqrt(sumQueryFreq)

        # resultSets will be sorted by item already; step through all
        # concurrently, grouping equal items across sets.
        tmplist = []
        # BUG FIX: recStores cache was used below without initialisation
        recStores = {}
        cont = 1
        oidxs = range(1, len(others))
        nors = len(others)
        positions = [0] * nors
        all = cql.value in ['all', 'and', '=', 'prox', 'adj']
        maxWeight = -1
        minWeight = 9999999999

        while cont:
            items = [others[0][positions[0]]]
            rspos = [0]
            for o in oidxs:
                nitem = others[o][positions[o]]
                if nitem == items[0]:
                    items.append(nitem)
                    rspos.append(o)
                elif nitem < items[0]:
                    if all:
                        # skip until equal or greater
                        positions[o] += 1
                        while others[o][positions[o]] < items[0]:
                            positions[o] += 1
                    else:
                        items = [nitem]
                        rspos = [o]
            for r in rspos:
                positions[r] += 1

            # NOTE(review): '== len-1' (vs '>' in combine()) is preserved
            # from the original — confirm the intended exhaustion test.
            while others and positions[0] == len(others[0]) - 1:
                others.pop(0)
                positions.pop(0)
            if not others:
                cont = 0
            if all and len(items) < nors:
                continue

            # BUG FIX: 'item' was never assigned in the original loop body;
            # use the first matched item as the representative record.
            item = items[0]
            # BUG FIX: 'n' previously leaked from the loop above (length of
            # the last result set); the model wants the number of sets
            # containing this record.
            n = len(items)
            sumLogDAF = sum(map(math.log, [x.occurences for x in items]))
            sumIdf = sum([x.resultSet.idf for x in items])

            x1 = sumLogQueryFreq / float(n)
            x3 = sumLogDAF / float(n)
            # BUG FIX: original divided 'sumIDF', which was never
            # accumulated and stayed 0.0; the computed per-item sum is used.
            x5 = sumIdf / float(n)
            x6 = math.log(float(n))
            try:
                recStore = recStores[item.recordStore]
            except KeyError:
                # Narrowed from a bare except: only a cache miss is expected
                db2 = session.server.get_object(session, session.database)
                recStore = db2.get_object(session, item.recordStore)
                recStores[item.recordStore] = recStore
            doclen = recStore.fetch_recordSize(session, item.docid)
            x4 = math.sqrt(doclen)
            logodds = constants[0] + (constants[1] * x1) + (constants[2] * x2) + \
                      (constants[3] * x3) + (constants[4] * x4) + (constants[5] * x5) + \
                      (constants[6] * x6)
            item.weight = 0.75 * (math.exp(logodds) / (1 + math.exp(logodds)))
            # BUG FIX: original appended to 'tmplist' but initialised
            # 'tmpList' (NameError); names unified.
            tmplist.append(item)
            # BUG FIX: was if/elif, so the first item could never set both
            # bounds; two independent checks as in combine().
            if item.weight > maxWeight:
                maxWeight = item.weight
            if item.weight < minWeight:
                minWeight = item.weight

        self._list = tmplist
        self.minWeight = minWeight
        self.maxWeight = maxWeight
        self.relevancy = 1
        return 1

    def _coriAssign(self, session, others, clause, cql, db):
        """Assign CORI weights in place on each source set. Returns 0
        (combine() should continue and merge the sets itself)."""
        if (db):
            totalDocs = float(db.totalRecords)
            avgSize = float(db.meanRecordSize)
            if not totalDocs or not avgSize:
                raise ValueError("0 documents in database")
        else:
            # Uhoh. Can't do it. (XXX Better Error)
            raise ValueError("Don't know database for determining relevancy")

        recStores = {}
        for rs in others:
            matches = float(len(rs))
            if not matches:
                rs.minWeight = 1.0
                rs.maxWeight = -1.0
                continue
            I = math.log((totalDocs + 0.5) / matches) / math.log(totalDocs + 1.0)
            rs.minWeight = 1000000.0
            rs.maxWeight = -1.0
            for item in rs:
                df = float(item.occurences)
                recStore = recStores.get(item.recordStore, None)
                if not recStore:
                    recStore = db.get_object(session, item.recordStore)
                    recStores[item.recordStore] = recStore
                size = recStore.fetch_recordSize(session, item.docid)
                T = df / (df + 50.0 + ((150.0 * size) / avgSize))
                item.weight = 0.4 + (0.6 * T * I)
        return 0

    def _tfidfAssign(self, session, others, clause, cql, db):
        """Assign TF-IDF weights: w(i,j) = tf(i,j) * log(N / df(i)).
        Returns 0 (combine() continues)."""
        if (db):
            totalDocs = float(db.totalRecords)
            if not totalDocs:
                raise ValueError("0 documents in database")
        else:
            # Uhoh. Can't do it. (XXX Better Error)
            raise ValueError("Don't know database for determining relevancy")

        for rs in others:
            matches = float(len(rs))
            rs.minWeight = 10000000.0
            rs.maxWeight = -1.0
            for item in rs:
                weight = item.occurences * math.log(totalDocs / matches)
                item.weight = weight
                if rs.maxWeight < weight:
                    rs.maxWeight = weight
                if rs.minWeight > weight:
                    rs.minWeight = weight
        return 0

    def combine(self, session, others, clause, db=None):
        """Merge `others` according to the CQL clause (boolean or relation).

        Handles and/or/not/prox/adj semantics, optional relevance ranking
        (algorithm chosen via the relevance context set), proximity
        filtering via proxInfo, and returns self (or the single surviving
        set) populated with the merged items.
        """
        if (isinstance(clause, CQLParser.Triple)):
            cql = clause.boolean
        else:
            cql = clause.relation

        # XXX To Configuration
        relSetUri = "info:srw/cql-context-set/2/relevance-1.0"
        cqlSet = "info:srw/cql-context-set/1/cql-v1.1"

        relevancy = 0
        algorithm = "cori"
        combine = "mean"
        for m in cql.modifiers:
            m.type.parent = clause
            m.type.resolvePrefix()
            if (m.type.prefixURI == relSetUri):
                # Relevancy info
                relevancy = 1
                if m.type.value == "algorithm":
                    algorithm = m.value
                elif m.type.value == "combine":
                    combine = m.value
            elif (m.type.prefixURI == cqlSet and m.type.value == "relevant"):
                # Generic 'relevancy please' request
                relevancy = 1

        # Check if any others are relevance ranked already and preserve
        if (not relevancy):
            for x in others:
                if (x.relevancy):
                    relevancy = 1
                    break
        pi = 0
        for m in cql.modifiers:
            if m.type.value == 'proxinfo':
                pi = 1
                break

        # sort result sets by length
        all = cql.value in ['all', 'and', '=', 'prox', 'adj']
        if cql.value != "not":
            keys = [(len(x), x) for x in others]
            keys.sort(reverse=not all)
            others = [x for (key, x) in keys]

        if (relevancy):
            self.relevancy = 1
            if (isinstance(cql, CQLParser.Relation)):
                fname = "_%sAssign" % algorithm
                if (hasattr(self, fname)):
                    fn = getattr(self, fname)
                else:
                    raise NotImplementedError
                finish = fn(session, others, clause, cql, db)
                if finish:
                    # Assign algorithm already merged (e.g. _lrAssign)
                    return self

        if len(others) == 1:
            # Just adding relevance to items?
            if relevancy:
                others[0].relevancy = 1
            return others[0]
        else:
            # Merge
            if relevancy:
                maxWeight = -1
                minWeight = 9999999999
                fname = "_%sWeights" % combine
                if (hasattr(self, fname)):
                    fn = getattr(self, fname)
                else:
                    raise NotImplementedError

            tmplist = []
            cont = 1
            oidxs = range(1, len(others))
            lens = [len(x) for x in others]
            if all and 0 in lens:
                # no point, just return empty result set
                return self
            nors = len(others)
            positions = [0] * nors
            cmpHash = {'<': [-1],
                       '<=': [-1, 0],
                       '=': [0],
                       '>=': [0, 1],
                       '>': [1]}
            distance = 1
            unit = "word"
            comparison = "="
            if (cql.value == 'prox' and cql.modifiers):
                if (cql['unit']):
                    unit = cql['unit'].value
                if (cql['distance']):
                    distance = int(cql['distance'].value)
                    comparison = cql['distance'].comparison
            chitem = cmpHash[comparison]
            if unit == "word":
                proxtype = 1
            elif unit == "element" and distance == 0 and comparison == "=":
                proxtype = 2
            else:
                raise NotImplementedError()
            hasGetItemList = [hasattr(o, 'get_item') for o in others]
            while cont:
                items = [others[0][positions[0]]]
                rspos = [0]
                for o in oidxs:
                    if o != -1:
                        if hasGetItemList[o]:
                            # e.g. BitmapResultSet: direct membership lookup
                            nitem = others[o].get_item(items[0])
                            if not nitem:
                                continue
                        else:
                            try:
                                nitem = others[o][positions[o]]
                            except IndexError:
                                oidxs[o - 1] = -1
                                continue
                        if nitem < items[0]:
                            if all or cql.value == 'not':
                                # skip until equal or greater
                                while True:
                                    positions[o] += 1
                                    if positions[o] >= lens[o] or others[o][positions[o]] >= items[0]:
                                        break
                                if positions[o] != lens[o]:
                                    nitem = others[o][positions[o]]
                            else:
                                items = [nitem]
                                rspos = [o]
                                continue
                        if nitem == items[0]:
                            items.append(nitem)
                            rspos.append(o)

                for r in rspos:
                    positions[r] += 1

                while others and positions[0] > len(others[0]) - 1:
                    others.pop(0)
                    positions.pop(0)
                    lens.pop(0)
                if not others or ((cql.value == 'not' or all) and len(others) != nors):
                    cont = 0
                if (all and len(items) < nors):
                    continue
                elif cql.value == 'not' and len(items) != 1:
                    continue
                elif cql.value in ["prox", 'adj', '=']:
                    # proxInfo is hash of (docid, recStore) to list of
                    # locations in record. Sort items by query position;
                    # repeat set at each posn.
                    newitems = []
                    mqp = -1
                    for i in items:
                        i.queryTerm = i.resultSet.queryTerm
                        i.queryPositions = i.resultSet.queryPositions
                        for qp in i.queryPositions:
                            mqp = max(mqp, qp)
                    for idx in range(mqp + 1):
                        for i in items:
                            if idx in i.queryPositions:
                                newitems.append(i)
                                break

                    items = newitems[:]
                    litem = items.pop(0)
                    nomatch = 0
                    while len(items):
                        ritem = items.pop(0)
                        matchlocs = []
                        for r in range(0, len(ritem.proxInfo), 2):
                            relem = ritem.proxInfo[r]
                            rwpos = ritem.proxInfo[r + 1]
                            for l in range(0, len(litem.proxInfo), 2):
                                if (proxtype == 1 and litem.proxInfo[l] == relem and (cmp(litem.proxInfo[l + 1] + distance, rwpos) in chitem)):
                                    matchlocs.extend([relem, rwpos])
                                elif proxtype == 2 and litem.proxInfo[l][0] == relem:
                                    matchlocs.extend([relem, rwpos])
                        if matchlocs:
                            # Can't reduce proxInfo to matchlocs here, as
                            # might have more later: "a b c a" would fail.
                            # ritem.proxInfo = matchlocs
                            litem = ritem
                        else:
                            # no match, break to next set of items
                            nomatch = 1
                            break
                    if nomatch:
                        continue
                    items = newitems
                # do stuff on items to reduce to single representative
                if relevancy:
                    item = fn(items, nors)
                    if item.weight > maxWeight:
                        maxWeight = item.weight
                    if item.weight < minWeight:
                        minWeight = item.weight
                else:
                    item = items[0]
                if pi:
                    # copy proxInfo around
                    for o in items[1:]:
                        item.proxInfo.extend(o.proxInfo)
                tmplist.append(item)

            self._list = tmplist
            if relevancy:
                self.relevancy = 1
                self.minWeight = minWeight
                self.maxWeight = maxWeight
            return self

    def order(self, session, spec):
        """Sort in place. spec may be an Index (pre-computed sort store or
        extraction), an item attribute name (e.g. 'docid', 'weight'), or —
        unimplemented — an XPath.
        """
        # XXX Need secondary sort specs
        l = self._list

        if not l:
            # don't try to sort empty set
            return

        if (isinstance(spec, Index) and spec.get_setting(session, 'sortStore')):
            # check pre-processed db
            istore = spec.get_path(session, 'indexStore')
            tmplist = [(istore.fetch_sortValue(session, spec, x), x) for x in l]
            tmplist.sort()
            self._list = [x for (key, x) in tmplist]
        elif isinstance(spec, Index):
            # Extract data as per indexing, MUCH slower
            recs = []
            storeHash = {}
            for r in l:
                store = r.recordStore
                o = storeHash.get(store, spec.get_object(session, store))
                storeHash[store] = o
                recs.append(o.fetch_record(session, r.docid))
            tmplist = [(spec.extract_data(session, recs[x]), l[x]) for x in range(len(l))]
            tmplist.sort()
            self._list = [x for (key, x) in tmplist]
        elif (type(spec) == str and hasattr(self[0], spec)):
            # Sort by attribute of item
            tmplist = [(getattr(x, spec), x) for x in l]
            if spec == 'docid':
                tmplist.sort()
            else:
                # weights etc. sort best-first
                tmplist.sort(reverse=True)
            self._list = [x for (key, x) in tmplist]
        elif isinstance(spec, str):
            # XPath?
            raise NotImplementedError
        else:
            raise NotImplementedError

    def reverse(self, session):
        self._list.reverse()

    def retrieve(self, session, start, numReq, cache=0):
        """Fetch records for items [start, start+numReq] (inclusive bound
        preserved from the original — NOTE(review): confirm the +1 is
        intentional and not an off-by-one)."""
        end = min(start + numReq + 1, len(self))
        recs = []
        # XXX This should cache server, db and resultSet
        for r in range(start, end):
            recs.append(self[r].fetch_record(session))
        return recs

    def scale_weights(self):
        """Rescale item weights linearly into [0, 1] as scaledWeight."""
        mw = self.minWeight
        span = self.maxWeight - mw
        # ROBUSTNESS: all-equal weights previously raised ZeroDivisionError
        r = 1 / span if span else 0.0
        # faster than equivalent list comprehension!
        for rsi in self._list:
            rsi.scaledWeight = (rsi.weight - mw) * r
class SimpleResultSetItem(ResultSetItem):
    """One hit in a result set: a pointer to a record plus ranking data.

    Identity is (recordStore, docid); ordering prefers numericId when
    both sides have one.
    """

    docid = 0
    numericId = None
    recordStore = ""
    database = ""
    occurences = 0
    weight = 0.5
    scaledWeight = 0.5
    diagnostic = None
    proxInfo = []

    def __init__(self, session, docid=0, recStore="", occs=0, database="", diagnostic=None, weight=0.5, resultSet=None, numeric=None):
        self.docid = docid
        self.recordStore = recStore
        self.occurences = occs
        self.weight = weight
        self.scaledWeight = 0.5
        self.database = database
        self.resultSet = resultSet
        self.proxInfo = []
        self.numericId = numeric

    def fetch_record(self, session):
        """Fetch the referenced record from its store (requires a server)."""
        if (session.server):
            # XXX
            # db = session.server.get_object(session, self.database)
            db = session.server.get_object(session, session.database)
            store = db.get_object(session, self.recordStore)
            rec = store.fetch_record(session, self.docid)
            rec.resultSetItem = self
            return rec

    def __eq__(self, other):
        # Same record pointer: store and docid both match
        return (self.docid, self.recordStore) == (other.docid, other.recordStore)

    def __str__(self):
        return "%s/%s" % (self.recordStore, self.docid)

    def __repr__(self):
        return "Ptr:%s/%s" % (self.recordStore, self.docid)

    def __cmp__(self, other):
        """Order by numericId (falling back to docid), then recordStore."""
        if self.numericId is None:
            c = cmp(self.docid, other.docid)
        else:
            oid = other.numericId if other.numericId is not None else other.docid
            c = cmp(self.numericId, oid)
        return c or cmp(self.recordStore, other.recordStore)

    def __hash__(self):
        # Hash of "recordStore/docid", matching __str__
        return hash(str(self))
713 714 715 from utils import SimpleBitfield 716
class BitmapResultSet(ResultSet):
    """Result set backed by a SimpleBitfield: bit N set means docid N
    (in self.recordStore) is a member. Supports fast set algebra."""

    bitfield = None
    currItems = None
    recordStore = None

    relevancy = 0
    termid = -1
    totalOccs = 0
    totalRecs = 0
    id = ""
    index = None
    queryTerm = ""
    queryFreq = 0
    queryFragment = None
    queryPositions = []
    # CONSISTENCY: duplicate 'relevancy = 0' attribute removed
    maxWeight = 0
    minWeight = 0

    def __init__(self, session, data=0, recordStore=None):
        if isinstance(data, SimpleBitfield):
            self.bitfield = data
        else:
            self.bitfield = SimpleBitfield(data)
        self.currItems = None
        self.recordStore = recordStore
        self.relevancy = 0

    def __getitem__(self, k):
        # Lazily materialise the list of set bit positions
        if self.currItems is None:
            self.currItems = self.bitfield.trueItems()
        return SimpleResultSetItem(None, self.currItems[k], self.recordStore, 1)

    def __len__(self):
        return self.bitfield.lenTrueItems()

    def serialise(self):
        return str(self.bitfield)

    def deserialise(self, data):
        self.bitfield = SimpleBitfield(data)

    def get_item(self, item):
        """Return item if its docid's bit is set, else None."""
        if self.bitfield[item.docid]:
            return item
        else:
            return None

    def combine(self, session, others, clause, db=None):
        """Combine sets with bitwise operations per the CQL boolean/relation."""
        if (isinstance(clause, CQLParser.Triple)):
            cql = clause.boolean
        else:
            cql = clause.relation
        v = cql.value
        if v in ['=', 'exact', 'prox']:
            if len(others) == 1:
                return others[0]
            else:
                raise NotImplementedError()
        elif (v in ['all', 'and']):
            s = others[0]
            for o in others[1:]:
                s = s.intersection(o)
        elif (v in ['any', 'or', '>', '>=', '<', '<=']):
            s = others[0]
            for o in others[1:]:
                s = s.union(o)
        elif (v == 'not'):
            s = others[0]
            for o in others[1:]:
                s = s.difference(o)
        else:
            raise NotImplementedError()
        return s

    def order(self, spec):
        # Reorder a bitmap?!
        raise NotImplementedError()

    def retrieve(self, session, start, numReq, cache=0):
        """Fetch records for items [start, start+numReq].

        BUG FIX: the original signature was (numReq, start) with no
        session, and the body referenced undefined names 'numrecs' and
        'session', so every call raised NameError. Signature and body are
        now aligned with SimpleResultSet.retrieve.
        """
        end = min(start + numReq + 1, len(self))
        recs = []
        # XXX This should cache server, db and resultSet
        for r in range(start, end):
            recs.append(self[r].fetch_record(session))
        return recs
803 804 805 try: 806 import numarray as na 807 808 class ArrayResultSet(SimpleResultSet): 809 810 _array = None 811 recordStore = None 812 proxInfo = {} 813 814 def __init__(self, session, data, recordStore = None): 815 # data is (docid, freq) array 816 self.recordStore = recordStore 817 self.proxInfo = {} 818 if len(data) > 0: 819 z = na.zeros(len(data), 'f4')[:,na.NewAxis] 820 d2 = na.transpose(data) 821 z2 = na.transpose(z) 822 final = na.transpose(na.concatenate([d2,z2])) 823 self._array = final 824 else: 825 self._array = na.array([])
826 827 def __getitem__(self, k): 828 item = SimpleResultSetItem(None, int(self._array[k][0]), self.recordStore.id, int(self._array[k][1])) 829 item.weight = self._array[k][2] 830 item.proxInfo = self.proxInfo.get(item.docid, []) 831 return item 832 833 def __len__(self): 834 return len(self._array) 835 836 def _toBitmap(self, session): 837 bf = SimpleBitfield(0) 838 for x in self._array: 839 bf[long(x[0])] = 1 840 return BitmapResultSet(session, bf) 841 842 # Relevance Rank Algorithms 843 844 def _lrAssign(self, session, others, clause, cql, db): 845 if (db): 846 totalDocs = db.totalRecords 847 if totalDocs == 0: 848 raise ValueErorr("No documents in database?") 849 else: 850 # Uhoh. Can't do it. (XXX Better Error) 851 raise(ValueError("Don't know database for determining relevancy")) 852 853 # William S Cooper proposes: 854 constants = [-3.7, 1.269, -0.31, 0.679, -0.0674, 0.223, 2.01] 855 856 # Ray R Larson proposes: 857 constants = [-3.7, 1.269, -0.31, 0.679, -0.021, 0.223, 4.01] 858 859 # Index Configuration proposes: 860 idx = db.protocolMaps['http://www.loc.gov/zing/srw/'].resolveIndex(session, clause) 861 if (idx): 862 for x in range(7): 863 temp = idx.get_setting(session, 'lr_constant%d' % x) 864 if (temp): 865 constants[x] = float(temp) 866 867 # Query proposes: 868 relSetUri = "info:srw/cql-context-set/2/relevance-1.0" 869 for m in cql.modifiers: 870 # Already been pinged for resolve() 871 if (m.type.prefixURI == relSetUri): 872 if m.type.value[:5] == "const": 873 try: 874 constants[int(m.type.value[5])] = float(m.value) 875 except ValueError: 876 # Invalid literal for float() 877 pass 878 except IndexError: 879 # list index out of range 880 pass 881 882 nors = len(others) 883 all = cql.value in ['all', 'and', '=', 'prox', 'adj'] 884 alst = [] 885 886 sumLogQueryFreq = 0.0 887 sumQueryFreq = 0 888 sumIDF = 0.0 889 890 # Each rs represents one unique word in query 891 for rs in others: 892 sumLogQueryFreq += math.log(float(rs.queryFreq)) 893 sumQueryFreq 
+= rs.queryFreq 894 if len(rs): 895 n = len(rs) 896 idf = math.log(totalDocs / float(n)) 897 # Now stick idf in weight slot for the mean time 898 # Will be replaced with real weight 899 a = rs._array 900 a.transpose() 901 l = [idf] * n 902 b = na.array([l]) 903 c = na.concatenate([a,b]) 904 c.transpose() 905 alst.append(c) 906 907 x2 = math.sqrt(sumQueryFreq) 908 909 merged = na.concatenate(alst) 910 idx = merged.argsort(0)[:,0] 911 srtd = na.take(merged, idx) 912 lsrtd = len(srtd) 913 914 getSize = self.recordStoreObj.fetch_recordSize 915 item = None 916 i = 0 917 tmplist = [] 918 while i < lsrtd: 919 item = srtd[i] 920 n = 1 921 sumLogDAF = math.log(item[1]) 922 sumIdf = item[2] 923 i += 1 924 if i < lsrtd: 925 nitem = srtd[i] 926 while nitem[0] == item[0]: 927 sumLogDAF += math.log(nitem[1]) 928 sumIdf += nitem[2] 929 i += 1 930 n += 1 931 if i < lsrtd: 932 nitem = srtd[i] 933 else: 934 break 935 if all and n < nors: 936 continue 937 x1 = sumLogQueryFreq / float(n) 938 x3 = sumLogDAF / float(n) 939 x5 = sumIDF / float(n) 940 x6 = math.log(float(n)) 941 doclen = getSize(session, int(item[0])) 942 x4 = math.sqrt(doclen) 943 logodds = constants[0] + (constants[1] * x1) + (constants[2] * x2) + \ 944 (constants[3] * x3) + (constants[4] * x4) + (constants[5] * x5) + \ 945 (constants[6] * x6) 946 item[2]= 0.75 * (math.exp(logodds) / (1 + math.exp(logodds))) 947 tmplist.append(item) 948 return tmplist 949 950 951 def _coriAssign(self, session, others, clause, cql, db): 952 if (db): 953 totalDocs = float(db.totalRecords) 954 avgSize = float(db.meanRecordSize) 955 if not totalDocs or not avgSize: 956 raise ValueError("0 documents in database") 957 else: 958 # Uhoh. Can't do it. 
(XXX Better Error) 959 raise(ValueError("Don't know database for determining relevancy")) 960 961 # CORI proposes: 962 constants = [0.5, 50.0, 150.0, 0.4, 0.6] 963 964 # Index Configuration proposes: 965 idx = db.protocolMaps['http://www.loc.gov/zing/srw/'].resolveIndex(session, clause) 966 if (idx): 967 for x in range(7): 968 temp = idx.get_setting(session, 'cori_constant%d' % x) 969 if (temp): 970 constants[x] = float(temp) 971 972 # Query proposes: 973 relSetUri = "info:srw/cql-context-set/2/relevance-1.0" 974 for m in cql.modifiers: 975 # Already been pinged for resolve() 976 if (m.type.prefixURI == relSetUri): 977 if m.type.value[:5] == "const": 978 try: 979 constants[int(m.type.value[5])] = float(m.value) 980 except ValueError: 981 # Invalid literal for float() or int() 982 pass 983 except IndexError: 984 # list index out of range 985 pass 986 987 getSize = self.recordStore.fetch_recordSize 988 for rs in others: 989 matches = float(len(rs)) 990 if not matches: 991 continue 992 I = math.log((totalDocs + 0.5) / matches) / math.log(totalDocs + 1.0) 993 for i in range(len(rs._array)): 994 item = rs._array[i] 995 # array(id, occs, weight) 996 df = float(item[1]) 997 size = getSize(session, int(item[0])) 998 T = df / ( df + 50.0 + (( 150.0 * size) / avgSize)) 999 rs._array[i][2] = 0.4 + (0.6 * T * I) 1000 return [] 1001 1002 def _tfidfAssign(self, session, others, clause, cql, db): 1003 if (db): 1004 totalDocs = float(db.totalRecords) 1005 if not totalDocs: 1006 raise ValueError("0 documents in database") 1007 else: 1008 # Uhoh. Can't do it. 
# NOTE(review): this chunk was recovered from an epydoc-mangled source listing;
# code tokens are preserved byte-for-byte, but ALL indentation below was
# inferred from the control flow — TODO confirm against the original file.
# The first lines are the tail of a relevance-assignment helper (presumably
# "_<algorithm>Assign") whose def line lies before this chunk.

            # XXX Better Error
            raise(ValueError("Don't know database for determining relevancy"))
        # classic IDF weighting: log(N / df), applied in-place to column 2
        # (weight) of each result set's array
        for rs in others:
            idf = math.log(totalDocs / float(len(rs)))
            for i in range(len(rs)):
                rs._array[i][2] = rs._array[i][1] * idf
        return []

    # *** Combine Algorithms ***

    def _meanWeightsArray(self, items, n):
        # Reduce a group of per-set array rows for one document to a single
        # row whose weight (index 2) is the mean over n result sets.
        # NOTE: mutates items[0] in place and returns it.
        item = items[0]
        for i in items[1:]:
            item[2] += i[2]
        item[2] = item[2] / float(n)
        return item

    def _sumWeightsArray(self, items, n):
        # As _meanWeightsArray, but the combined weight is the plain sum.
        # n is accepted for interface parity with the other _*WeightsArray
        # combiners (dispatched by name) but is unused here.
        item = items[0]
        for i in items[1:]:
            item[2] += i[2]
        return item

    # API


    def SLOW_combine(self, session, others, clause, db=None):
        # Merge the result sets in `others` according to the boolean/relation
        # in `clause` (CQL), optionally applying relevance ranking, and store
        # the merged result in self._array.  Returns self (or a member of
        # `others` for the single-set shortcut cases).
        if (isinstance(clause, CQLParser.Triple)):
            cql = clause.boolean
        else:
            cql = clause.relation

        # XXX To Configuration
        relSetUri = "info:srw/cql-context-set/2/relevance-1.0"
        cqlSet = "info:srw/cql-context-set/1/cql-v1.1"

        # Defaults: CORI ranking algorithm, mean-combining of weights.
        relevancy = 0
        algorithm = "cori"
        # NOTE(review): local `combine` shadows the combine() method name
        combine = "mean"
        modType = ""
        for m in cql.modifiers:
            m.type.parent = clause
            m.type.resolvePrefix()
            if (m.type.prefixURI == relSetUri):
                # Relevancy info
                relevancy = 1
                if m.type.value == "algorithm":
                    algorithm = m.value
                elif m.type.value == "combine":
                    combine = m.value
            elif (m.type.prefixURI == cqlSet and m.type.value == "relevant"):
                # Generic 'relevancy please' request
                relevancy = 1

        # Check if any others are relevance ranked already and preserve
        if (not relevancy):
            for x in others:
                if (x.relevancy):
                    relevancy = 1
                    break
        # pi: caller asked for proximity info to be carried through the merge
        pi = 0
        for m in cql.modifiers:
            if m.type.value == 'proxinfo':
                pi = 1
                break

        # sort result sets by length
        # NOTE(review): `all` here shadows the builtin; it means "conjunctive
        # operator" (every set must contain the doc).  Conjunctive merges sort
        # shortest-first, disjunctive longest-first.
        all = cql.value in ['all', 'and', '=', 'prox', 'adj']
        if cql.value != "not":
            keys = [(len(x), x) for x in others]
            keys.sort(reverse=not all)
            others = [x for (key,x) in keys]

        if (relevancy):
            if (isinstance(cql, CQLParser.Relation)):
                # Dispatch to "_<algorithm>Assign" by name (e.g. _coriAssign)
                fname = "_%sAssign" % algorithm
                if (hasattr(self, fname)):
                    fn = getattr(self, fname)
                else:
                    raise NotImplementedError
                finish = fn(session, others, clause, cql, db)
                if finish:
                    return self

        if len(others) == 1:
            # Just adding relevance to items?
            if relevancy:
                self.relevancy = 1
            return others[0]
        else:
            # Merge
            if relevancy:
                maxWeight = -1
                minWeight = 9999999999

            tmplist = []
            cont = 1
            oidxs = range(1,len(others))
            lens = [len(x) for x in others]
            nors = len(others)
            positions = [0] * nors
            # Map a prox comparison operator to the cmp() results that satisfy it
            cmpHash = {'<' : [-1],
                       '<=' : [-1, 0],
                       '=' : [0],
                       '>=' : [0, 1],
                       '>' : [1]}
            distance = 1
            unit = "word"
            comparison = "="
            if (cql.value == 'prox' and cql.modifiers):
                if (cql['unit']):
                    unit = cql['unit'].value
                if (cql['distance']):
                    distance = int(cql['distance'].value)
                    comparison = cql['distance'].comparison
            chitem = cmpHash[comparison]
            if unit == "word":
                proxtype = 1
            elif unit == "element" and distance == 0 and comparison == "=":
                proxtype = 2
            else:
                raise NotImplementedError()
            hasGetItemList = [hasattr(o, 'get_item') for o in others]
            isArrayRs = [isinstance(o, ArrayResultSet) for o in others]
            if sum(isArrayRs) == len(isArrayRs):

                if relevancy:
                    # Dispatch to "_<combine>WeightsArray" (e.g. _meanWeightsArray)
                    fname = "_%sWeightsArray" % combine
                    if (hasattr(self, fname)):
                        fn = getattr(self, fname)
                    else:
                        raise NotImplementedError

                # All arrays, don't create RSI unnecessarily
                # Merge-join over the (docid-sorted) arrays: gather all rows
                # sharing the smallest current docid, advance those cursors.
                while cont:
                    # item is: array(recid, occs, weight)
                    items = [others[0]._array[positions[0]]]
                    rspos = [0]
                    for o in oidxs:
                        if o != -1:
                            try:
                                nitem = others[o]._array[positions[o]]
                            except IndexError:
                                # this set is exhausted; -1 marks it inactive
                                # NOTE(review): writes oidxs[o-1], i.e. relies
                                # on oidxs == range(1, n) — verify
                                oidxs[o-1] = -1
                                continue
                            if nitem[0] < items[0][0]:
                                if all or cql.value == 'not':
                                    # skip until equal or greater
                                    positions[o] += 1
                                    while others[o]._array[positions[o]][0] < items[0][0]:
                                        positions[o] += 1
                                        if positions[o] == lens[o]:
                                            break
                                    if positions[o] != lens[o]:
                                        nitem = others[o]._array[positions[o]]
                                else:
                                    # disjunctive: restart the group at the
                                    # new, smaller docid
                                    items = [nitem]
                                    rspos = [o]
                                    continue
                            if nitem[0] == items[0][0]:
                                items.append(nitem)
                                rspos.append(o)

                    for r in rspos:
                        positions[r] += 1

                    # Drop exhausted leading sets; stop when the operator can
                    # no longer be satisfied
                    while others and positions[0] > len(others[0])-1:
                        others.pop(0)
                        positions.pop(0)
                        lens.pop(0)
                    if not others or ((cql.value == 'not' or all) and len(others) != nors):
                        cont = 0
                    if (all and len(items) < nors):
                        continue
                    elif cql.value == 'not' and len(items) != 1:
                        continue
                    elif cql.value in ["prox", 'adj', '=']:

                        # Re-order the group's rows by query term position
                        newitems = []
                        mqp = -1
                        qts = []
                        qps = []
                        for i in range(len(items)):
                            rs = others[rspos[i]]
                            qts.append(rs.queryTerm)
                            qps.append(rs.queryPositions)
                            for qp in rs.queryPositions:
                                mqp = max(mqp, qp)
                        for idx in range(mqp+1):
                            for i in range(len(items)):
                                # NOTE(review): qps[i] is a list, so
                                # .queryPositions here looks wrong; and
                                # `newrspos` is never initialised -> NameError
                                # if this branch runs.  Known-broken per the
                                # author's marker below.
                                if idx in qps[i].queryPositions:
                                    newitems.append(items[i])
                                    newrspos.append(rspos[i])
                                    break

                        # XXX: Fix from here
                        items = newitems[:]
                        rspos = newrspos[:]
                        litem = items.pop(0)
                        nomatch = 0
                        # Pairwise left-to-right proximity filtering over
                        # flat proxInfo [elem, wordpos, elem, wordpos, ...]
                        while len(items):
                            ritem = items.pop(0)
                            matchlocs = []
                            for r in range(0,len(ritem.proxInfo),2):
                                relem = ritem.proxInfo[r]
                                rwpos = ritem.proxInfo[r+1]
                                for l in range(0, len(litem.proxInfo), 2):
                                    if (proxtype == 1 and litem.proxInfo[l] == relem and (cmp(litem.proxInfo[l+1]+distance,rwpos) in chitem)):
                                        matchlocs.extend([relem, rwpos])
                                    elif proxtype == 2 and litem.proxInfo[l][0] == relem:
                                        matchlocs.extend([relem, rwpos])
                            if matchlocs:
                                ritem.proxInfo = matchlocs
                                litem = ritem
                            else:
                                # no match, break to next set of items
                                nomatch = 1
                                break
                        if nomatch:
                            continue
                        items = newitems
                    # do stuff on items to reduce to single representative
                    if relevancy:
                        item = fn(items, nors)
                        if item[2] > maxWeight:
                            maxWeight = item[2]
                        if item[2] < minWeight:
                            minWeight = item[2]
                    else:
                        item = items[0]
                        if pi:
                            # copy proxInfo around
                            # NOTE(review): the loop below is unreachable
                            # after this unconditional raise — verify intent
                            raise NotImplementedError
                            for o in items[1:]:
                                item.proxInfo.extend(o.proxInfo)
                    tmplist.append(item)

            else:
                # not all array based, use slower RSI creation

                if relevancy:
                    # Dispatch to "_<combine>Weights" for ResultSetItem objects
                    fname = "_%sWeights" % combine
                    if (hasattr(self, fname)):
                        fn = getattr(self, fname)
                    else:
                        raise NotImplementedError

                # Same merge-join as above, but over ResultSetItem objects
                # compared/equated directly instead of array rows
                while cont:
                    items = [others[0][positions[0]]]
                    rspos = [0]
                    for o in oidxs:
                        if o != -1:
                            if hasGetItemList[o]:
                                nitem = others[o].get_item(items[0])
                                if not nitem:
                                    continue
                            else:
                                try:
                                    nitem = others[o][positions[o]]
                                except IndexError:
                                    oidxs[o-1] = -1
                                    continue
                            if nitem < items[0]:
                                if all or cql.value == 'not':
                                    # skip until equal or greater
                                    positions[o] += 1
                                    while others[o][positions[o]] < items[0]:
                                        positions[o] += 1
                                        if positions[o] == lens[o]:
                                            break
                                    if positions[o] != lens[o]:
                                        nitem = others[o][positions[o]]
                                else:
                                    items = [nitem]
                                    rspos = [o]
                                    continue
                            if nitem == items[0]:
                                items.append(nitem)
                                rspos.append(o)

                    for r in rspos:
                        positions[r] += 1

                    while others and positions[0] > len(others[0])-1:
                        others.pop(0)
                        positions.pop(0)
                        lens.pop(0)
                    if not others or ((cql.value == 'not' or all) and len(others) != nors):
                        cont = 0
                    if (all and len(items) < nors):
                        continue
                    elif cql.value == 'not' and len(items) != 1:
                        continue
                    elif cql.value in ["prox", 'adj', '=']:
                        # proxInfo is hash of (docid, recStore) to list of locations in record
                        # sort items by query position. Repeat set at each posn

                        newitems = []
                        mqp = -1
                        for i in items:
                            i.queryTerm = i.resultSet.queryTerm
                            i.queryPositions = i.resultSet.queryPositions
                            for qp in i.queryPositions:
                                mqp = max(mqp, qp)
                        for idx in range(mqp+1):
                            for i in items:
                                if idx in i.queryPositions:
                                    newitems.append(i)
                                    break
                        items = newitems[:]
                        litem = items.pop(0)
                        nomatch = 0
                        while len(items):
                            ritem = items.pop(0)
                            matchlocs = []
                            for r in range(0,len(ritem.proxInfo),2):
                                relem = ritem.proxInfo[r]
                                rwpos = ritem.proxInfo[r+1]
                                for l in range(0, len(litem.proxInfo), 2):
                                    if (proxtype == 1 and litem.proxInfo[l] == relem and (cmp(litem.proxInfo[l+1]+distance,rwpos) in chitem)):
                                        matchlocs.extend([relem, rwpos])
                                    elif proxtype == 2 and litem.proxInfo[l][0] == relem:
                                        matchlocs.extend([relem, rwpos])
                            if matchlocs:
                                ritem.proxInfo = matchlocs
                                litem = ritem
                            else:
                                # no match, break to next set of items
                                nomatch = 1
                                break
                        if nomatch:
                            continue
                        items = newitems
                    # do stuff on items to reduce to single representative
                    if relevancy:
                        item = fn(items, nors)
                        if item.weight > maxWeight:
                            maxWeight = item.weight
                        if item.weight < minWeight:
                            minWeight = item.weight
                    else:
                        item = items[0]
                        if pi:
                            # copy proxInfo around
                            for o in items[1:]:
                                item.proxInfo.extend(o.proxInfo)
                    # store as a (docid, occurences, weight) float row
                    tmplist.append(na.array([float(item.docid), float(item.occurences), item.weight]))

            self._array = na.array(tmplist)
            if relevancy:
                self.relevancy = 1
                self.minWeight = minWeight
                self.maxWeight = maxWeight
            return self



    def combine(self, session, others, clause, db=None):
        # Array-native merge of `others` per the CQL clause; stores the result
        # in self._array and returns self (or one of `others` in the
        # single-set / 'exact' shortcut cases).
        if (isinstance(clause, CQLParser.Triple)):
            cql = clause.boolean
        else:
            cql = clause.relation
        # XXX To Configuration
        relSetUri = "info:srw/cql-context-set/2/relevance-1.0"
        cqlSet = "info:srw/cql-context-set/1/cql-v1.1"

        relevancy = 0
        algorithm = "cori"
        combine = "mean"
        modType = ""
        for m in cql.modifiers:
            m.type.parent = clause
            m.type.resolvePrefix()
            if (m.type.prefixURI == relSetUri):
                # Relevancy info
                relevancy = 1
                if m.type.value == "algorithm":
                    algorithm = m.value
                elif m.type.value == "combine":
                    combine = m.value
            elif (m.type.prefixURI == cqlSet and m.type.value == "relevant"):
                # Generic 'relevancy please' request
                relevancy = 1

        # Check if any others are relevance ranked already
        if (not relevancy):
            for x in others:
                if (x.relevancy):
                    relevancy = 1
                    break

        tmplist = []
        if (relevancy):
            if (isinstance(cql, CQLParser.Relation)):
                fname = "_%sAssign" % algorithm
                if (hasattr(self, fname)):
                    fn = getattr(self, fname)
                else:
                    raise NotImplementedError
                tmplist = fn(session, others, clause, cql, db)

            if (not tmplist):
                if len(others) == 1:
                    return others[0]
                # Merge
                fname = "_%sWeights" % combine
                if (hasattr(self, fname)):
                    fn = getattr(self, fname)
                else:
                    raise NotImplementedError

                # Concatenate all arrays and sort by docid (column 0), then
                # walk runs of equal docids
                alst = []
                for o in others:
                    alst.append(o._array)
                merged = na.concatenate(alst)
                idx = merged.argsort(0)[:,0]
                srtd = na.take(merged, idx)
                lsrtd = len(srtd)
                nors = len(others)

                if (cql.value in ["or", 'any', 'within', '>', '>=', '<', '<=']):
                    # union: every docid run becomes one combined row
                    item = None
                    i = 0
                    while i < lsrtd:
                        item = srtd[i]
                        i += 1
                        items = [item]
                        if i < lsrtd:
                            nitem = srtd[i]
                            while nitem[0] == item[0]:
                                items.append(nitem)
                                i += 1
                                if i < lsrtd:
                                    nitem = srtd[i]
                                else:
                                    break
                        tmplist.append(fn(items, nors))
                elif (cql.value in ['and', 'all']):
                    # intersection: keep runs present in every input set
                    item = None
                    i = 0
                    while i < lsrtd:
                        item = srtd[i]
                        items = [item]
                        i += 1
                        if i < lsrtd:
                            nitem = srtd[i]
                            while nitem[0] == item[0]:
                                items.append(nitem)
                                i += 1
                                if i < lsrtd:
                                    nitem = srtd[i]
                                else:
                                    break
                        if (len(items) == nors):
                            tmplist.append(fn(items, nors))

            if tmplist:
                newarray = na.array(tmplist)
                # Now sort on relv
                idx = newarray.argsort(0)[::-1,2]
                final = na.take(newarray, idx)
                self._array = final
            else:
                self._array = na.array([])
        else:
            # Not ranked
            if len(others) == 1:
                return others[0]

            alst = []
            empty = []
            if (len(others) == 2 and others[1].__class__ != self.__class__):
                # Handle different result set objects
                base = others[0]
                other = others[1]
                l = []
                # try blocks in case past end of bitmap numbering
                if other.__class__ == BitmapResultSet:
                    bf = other.bitfield
                    if cql.value == 'and':
                        for item in base._array:
                            try:
                                if bf[int(item[0])]:
                                    l.append(item)
                            except:
                                pass
                        self._array = na.array(l)
                        return self
                    elif cql.value == 'not':
                        for item in base._array:
                            try:
                                if not bf[int(item[0])]:
                                    l.append(item)
                            except:
                                pass
                        self._array = na.array(l)
                        return self
                    elif cql.value == 'or':
                        # ORing a bitmap? :/
                        # Convert bitmap to array :(
                        alst.append(base._array)
                        items = bf.trueItems()
                        il = len(items)
                        items.extend([0] * (2 * il))
                        arr = na.array(items, 'f4', shape=(3,il))
                        arr.transpose()
                        alst.append(arr)
                    else:
                        # Prox
                        raise NotImplementedError()
                else:
                    raise NotImplementedError()
            else:
                # Merging all same class
                for o in others:
                    if len(o):
                        alst.append(o._array)
                    else:
                        empty.append(o._array)
            merged = na.concatenate(alst)
            idx = merged.argsort(0)[:,0]
            srtd = na.take(merged, idx)
            lsrtd = len(srtd)
            nors = len(others)
            if (cql.value in ["or", 'any']):
                # union without weights: one row per docid run
                item = None
                i = 0
                while i < lsrtd:
                    item = srtd[i]
                    i += 1
                    if i < lsrtd:
                        nitem = srtd[i]
                        while nitem[0] == item[0]:
                            i += 1
                            if i < lsrtd:
                                nitem = srtd[i]
                            else:
                                break
                    tmplist.append(item)
            elif (cql.value in ['and', 'all']):
                if empty:
                    # An empty resultSet will nerf
                    return self
                item = None
                i = 0
                while i < lsrtd:
                    item = srtd[i]
                    n = 1
                    i += 1
                    if i < lsrtd:
                        nitem = srtd[i]
                        while nitem[0] == item[0]:
                            n += 1
                            i += 1
                            if i < lsrtd:
                                nitem = srtd[i]
                            else:
                                break
                    if (n == nors):
                        tmplist.append(item)
            elif cql.value in ['=', 'prox', 'adj']:
                # proxInfo stored on resultSet
                if empty:
                    return self
                distance = 1
                unit = "word"
                comparison = "="
                cmpHash = {'<' : [-1],
                           '<=' : [-1, 0],
                           '=' : [0],
                           '>=' : [0, 1],
                           '>' : [1]}

                if (cql.value == 'prox' and cql.modifiers):
                    if (cql['unit']):
                        unit = cql['unit'].value
                    if (cql['distance']):
                        distance = int(cql['distance'].value)
                        comparison = cql['distance'].comparison

                # proxInfo is hash of docid to list of locations in record

                # Re-order `others` into query-position order
                # NOTE(review): if no set matches `idx` on a pass, neither
                # `others` nor `idx` changes — possible infinite loop; verify
                newothers = []
                idx = 0
                cont = 1
                while others:
                    for o in others:
                        if idx in o.queryPositions:
                            newothers.append(o)
                            o.queryPositions.remove(idx)
                            if not o.queryPositions:
                                others.remove(o)
                            idx += 1
                            break
                others = newothers
                chitem = cmpHash[comparison]
                first = others[0]
                leftProx = first.proxInfo
                if (unit == 'word'):
                    # Left-fold proximity match: keep only locations in the
                    # right set within `distance` of a left-set location
                    for o in others[1:]:
                        rightProx = o.proxInfo
                        newProx = {}
                        for doc in leftProx:
                            matchLocs = []
                            if (rightProx.has_key(doc)):
                                leftLocs = leftProx[doc]
                                rightLocs = rightProx[doc]
                                for ll in leftLocs:
                                    for rl in rightLocs:
                                        # [0] == element, [1] == word position
                                        if (ll[0] == rl[0] and (cmp(ll[1]+distance,rl[1]) in chitem)):
                                            matchLocs.append(rl)
                            if (matchLocs):
                                newProx[doc] = matchLocs
                        leftProx = newProx
                    self.proxInfo = leftProx
                    # Now update ResultSetItems
                else:
                    raise NotImplementedError()

                # This destroys all freq and weight info
                docids = leftProx.keys()
                dl = len(docids)
                zeros = [0] * (dl * 2)
                docids.extend(zeros)
                switch = na.array(docids, 'f4', shape=(3,dl))
                self._array = na.transpose(switch)
                return self

            elif cql.value == "not":
                # others[0], not (others[1:])

                base = others[0]._array
                docids = base[:,0]
                indexHash = dict(zip(docids, range(len(docids))))
                tmpset = set()

                # Build set of all docid positions we don't want
                for o in others[1:]:
                    for item in o._array:
                        try:
                            tmpset.add(indexHash[item[0]])
                        except:
                            pass
                # Create condition array of positions
                remove = na.array(list(tmpset))
                ones = na.ones(len(docids))
                put(ones, remove, 0)
                # Compress base by condition array (0 == remove)
                # NOTE(review): `ones` (the condition array just built) is
                # never used — compress() is passed `remove`, the index array.
                # Looks like it should be compress(ones, base); verify.
                self._array = compress(remove, base)
                return self
            elif cql.value == "exact" and len(others) == 1:
                # Should only be one rs
                return others[0]
            else:
                raise NotImplementedError()
            # endif cql.value
            newarray = na.array(tmplist)
            self._array = newarray
        # endif relevance
        return self

    def order(self, session, spec):
        # Sort self._array in place by `spec`: an Index (pre-computed sort
        # store or re-extraction), one of the column names "docid" /
        # "occurences" / "relevance" (descending), or an XPath string.
        if (isinstance(spec, Index) and spec.get_setting(session, 'sortStore')):
            # check pre-processed db
            istore = spec.get_path(session, 'indexStore')
            tmplist = [(istore.fetch_sortValue(session, spec, int(x[0])), x) for x in self._array]
            tmplist.sort()
            l = [x for (key,x) in tmplist]
            self._array = na.array(l)
        elif isinstance(spec, Index):
            # Extract data as per indexing, MUCH slower
            tmplist = []
            for r in range(len(self._array)):
                rec = self.recordStore.fetch_record(session, int(self._array[r][0]))
                tmplist.append((spec.extract_data(session, rec), r))
            tmplist.sort()
            l = [x for (key,x) in tmplist]
            self._array = self._array.take(na.array(l))
        elif spec == "docid":
            idx = self._array.argsort(0)[::-1,0]
            self._array = self._array.take(idx)
        elif spec == "occurences":
            idx = self._array.argsort(0)[::-1,1]
            self._array = self._array.take(idx)
        elif spec == "relevance":
            idx = self._array.argsort(0)[::-1,2]
            self._array = self._array.take(idx)
        elif isinstance(spec, str):
            # Treat spec as an XPath; sort by the extracted value per record
            tmplist = []
            xp = utils.verifyXPaths([spec])[0]
            for r in range(len(self._array)):
                rec = self.recordStore.fetch_record(session, int(self._array[r][0]))
                tmplist.append((rec.process_xpath(xp), r))
            tmplist.sort()
            l = [x for (key,x) in tmplist]
            self._array = self._array.take(na.array(l))
        else:
            raise NotImplementedError

    def reverse(self, session):
        # Reverse the current ordering of the result array in place.
        self._array = self._array[::-1]


# NOTE(review): tail of a try: block whose opening lies before this chunk;
# kept verbatim.
except:
    raise