1 from baseObjects import ResultSet, ResultSetItem, Index
2 from PyZ3950 import CQLParser
3 import math, types
4
5 import sys
6
7 from xml.sax import ContentHandler, make_parser, parseString as saxParseString, ErrorHandler, InputSource as SaxInput
8 import cStringIO as StringIO
9 from xml.sax.saxutils import escape, unescape
10 import utils
11 import cPickle
12
67
68
# Shared SAX objects, created once when the module is loaded.
localParser = make_parser()
localInput = SaxInput()
localHandler = DeserializationHandler()

# Attach the error and content handlers to the shared parser.
localParser.setErrorHandler(ErrorHandler())
localParser.setContentHandler(localHandler)
74
75
77
82
88
93
101
113
115
116
117
118
119
120
121
122
123
124
125 pass
126
127
# Default attribute values for the result set.
# NOTE(review): the enclosing class header is not visible in this excerpt;
# these read like class-level defaults -- confirm.

# Backing list of items; __len__-style code returns len(self._list) and
# combine()/order() rebind self._list.
# NOTE(review): as a class-level default this [] would be shared by every
# instance that never rebinds it -- confirm __init__ always assigns it.
_list = []

id = ""
termid = -1
totalOccs = 0
totalRecs = 0
expires = 0
index = None
# queryTerm / queryPositions are copied onto items by combine() during
# proximity matching; queryFreq is read by _lrAssign().
queryTerm = ""
queryFreq = 0
queryFragment = None
# NOTE(review): mutable default, same sharing caveat as _list.
queryPositions = []
# relevancy is a 0/1 flag; combine() sets it to 1 together with
# minWeight / maxWeight when relevance ranking is applied.
relevancy = 0
maxWeight = 0
minWeight = 0
termWeight = 0.0
recordStore = ""
146
147 - def __init__(self, session, data=[], id="", recordStore=""):
151
154
156 return len(self._list)
157
160
181
188
189
194
196 for i in itemList:
197 self.append(i)
198
def _lrAssign(self, session, others, clause, cql, db):
    """Weight items by logistic regression and merge them into this set.

    Each entry of ``others`` is the result set for one query term.  The
    sets are stepped through in parallel (they must already be sorted by
    item), items that compare equal are grouped, and the first item of
    each group is given a weight computed from six statistics and seven
    coefficients.

    Coefficients come from, in increasing priority: the built-in defaults,
    the ``lr_constant0`` .. ``lr_constant6`` settings of the index resolved
    for ``clause``, and ``constN`` modifiers from the relevance context set
    on ``cql``.

    Parameters:
        session: passed through to the index, database and record store.
        others:  list of per-term result sets; each needs ``queryFreq``,
                 ``len()`` and integer indexing, and receives an ``idf``
                 attribute when non-empty.  The list is not modified.
        clause:  the CQL clause, used to resolve the index.
        cql:     the relation/boolean; ``value`` and ``modifiers`` are read.
        db:      database object; ``totalRecords``, ``protocolMaps`` and
                 ``get_object`` are used.

    Returns 1 after storing the weighted items in ``self._list`` and setting
    ``self.minWeight``, ``self.maxWeight`` and ``self.relevancy``.

    Raises ValueError when ``db`` is missing or reports no records;
    ``math.log`` also raises ValueError for a non-positive ``queryFreq`` or
    ``occurences``.
    """
    if not db:
        raise ValueError("Don't know database for determining relevancy")
    totalDocs = db.totalRecords
    if totalDocs == 0:
        raise ValueError("No documents in database?")

    # Default coefficients.  An earlier set
    # [-3.7, 1.269, -0.31, 0.679, -0.0674, 0.223, 2.01] used to be assigned
    # first and was immediately overwritten, so only these ever applied.
    constants = [-3.7, 1.269, -0.31, 0.679, -0.021, 0.223, 4.01]

    # Per-index overrides from configuration.
    idx = db.protocolMaps['http://www.loc.gov/zing/srw/'].resolveIndex(session, clause)
    if idx:
        for x in range(7):
            temp = idx.get_setting(session, 'lr_constant%d' % x)
            if temp:
                constants[x] = float(temp)

    # Per-query overrides: relevance-set modifiers named const0 .. const6.
    relSetUri = "info:srw/cql-context-set/2/relevance-1.0"
    for m in cql.modifiers:
        if m.type.prefixURI == relSetUri and m.type.value[:5] == "const":
            try:
                constants[int(m.type.value[5])] = float(m.value)
            except (ValueError, IndexError):
                # Malformed modifier (no digit, bad number, or an index
                # outside 0..6): deliberately ignored, default kept.
                pass

    # Query-wide statistics, plus an idf value on every non-empty set.
    sumLogQueryFreq = 0.0
    sumQueryFreq = 0
    for rs in others:
        sumLogQueryFreq += math.log(rs.queryFreq)
        sumQueryFreq += rs.queryFreq
        matches = len(rs)
        if matches:
            rs.idf = math.log(totalDocs / float(matches))
    x2 = math.sqrt(sumQueryFreq)

    nors = len(others)
    sizes = [len(rs) for rs in others]
    positions = [0] * nors
    requireAll = cql.value in ['all', 'and', '=', 'prox', 'adj']
    maxWeight = -1
    minWeight = 9999999999
    recStores = {}
    tmplist = []

    while True:
        # Current head of every result set that still has items left.
        heads = [(others[i][positions[i]], i)
                 for i in range(nors) if positions[i] < sizes[i]]
        if not heads or (requireAll and len(heads) < nors):
            # Nothing left, or an exhausted set makes a full match impossible.
            break

        lowest = heads[0][0]
        for (candidate, i) in heads[1:]:
            if candidate < lowest:
                lowest = candidate

        # Collect every head equal to the lowest one and step past it.
        items = []
        for (candidate, i) in heads:
            if candidate == lowest:
                items.append(candidate)
                positions[i] += 1

        if requireAll and len(items) < nors:
            continue

        item = items[0]
        # Number of query terms matched by this item.
        # NOTE(review): the previous code divided by a stale ``n`` (size of
        # the last result set) and used an always-zero idf sum; confirm the
        # per-item count against the intended regression formula.
        matched = float(len(items))
        sumLogDAF = sum([math.log(x.occurences) for x in items])
        sumIDF = sum([x.resultSet.idf for x in items])

        x1 = sumLogQueryFreq / matched
        x3 = sumLogDAF / matched
        x5 = sumIDF / matched
        x6 = math.log(matched)

        # Record stores are looked up once each and then reused.
        recStore = recStores.get(item.recordStore)
        if recStore is None:
            recStore = db.get_object(session, item.recordStore)
            recStores[item.recordStore] = recStore
        doclen = recStore.fetch_recordSize(session, item.docid)
        x4 = math.sqrt(doclen)

        logodds = constants[0] + (constants[1] * x1) + (constants[2] * x2) + \
                  (constants[3] * x3) + (constants[4] * x4) + (constants[5] * x5) + \
                  (constants[6] * x6)
        item.weight = 0.75 * (math.exp(logodds) / (1 + math.exp(logodds)))
        tmplist.append(item)

        # Independent checks so a single item updates both bounds.
        if item.weight > maxWeight:
            maxWeight = item.weight
        if item.weight < minWeight:
            minWeight = item.weight

    self._list = tmplist
    self.minWeight = minWeight
    self.maxWeight = maxWeight
    self.relevancy = 1
    return 1
322
def _coriAssign(self, session, others, clause, cql, db):
    """Assign a CORI-style weight to every item of every set in ``others``.

    For each non-empty result set an inverse-frequency factor ``I`` is
    derived from the database size and the number of matches; each item
    then gets ``0.4 + 0.6 * T * I`` where ``T`` depends on the item's
    occurrence count and its record size relative to the mean record size.
    The smallest and largest weight seen are stored on the result set as
    ``minWeight`` / ``maxWeight``; an empty set keeps the sentinel pair
    (1.0, -1.0).

    Parameters:
        session: passed through to the database and record stores.
        others:  iterable of result sets to weight in place.
        clause, cql: unused here; accepted so that every ``_*Assign``
                 method can be called the same way.
        db:      database object; ``totalRecords``, ``meanRecordSize`` and
                 ``get_object`` are used.

    Returns 0, which tells the caller that merging still has to be done.

    Raises ValueError when ``db`` is missing or reports zero records or a
    zero mean record size.
    """
    if not db:
        raise ValueError("Don't know database for determining relevancy")
    totalDocs = float(db.totalRecords)
    avgSize = float(db.meanRecordSize)
    if not totalDocs or not avgSize:
        raise ValueError("0 documents in database")

    recStores = {}
    for rs in others:
        matches = float(len(rs))
        if not matches:
            rs.minWeight = 1.0
            rs.maxWeight = -1.0
            continue
        I = math.log((totalDocs + 0.5) / matches) / math.log(totalDocs + 1.0)
        # Sentinels, tightened below as weights are computed.
        rs.minWeight = 1000000.0
        rs.maxWeight = -1.0
        for item in rs:
            df = float(item.occurences)
            # Fetch each record store once and reuse it afterwards.
            recStore = recStores.get(item.recordStore)
            if recStore is None:
                recStore = db.get_object(session, item.recordStore)
                recStores[item.recordStore] = recStore
            size = recStore.fetch_recordSize(session, item.docid)
            T = df / (df + 50.0 + ((150.0 * size) / avgSize))
            item.weight = 0.4 + (0.6 * T * I)
            # Previously the sentinels were never updated, leaving every
            # non-empty set with minWeight > maxWeight.
            if item.weight > rs.maxWeight:
                rs.maxWeight = item.weight
            if item.weight < rs.minWeight:
                rs.minWeight = item.weight
    return 0
353
376
def combine(self, session, others, clause, db=None):
    """Merge the result sets in ``others`` as directed by ``clause``.

    The operator is ``clause.boolean`` for a ``CQLParser.Triple`` and
    ``clause.relation`` otherwise.  Its modifiers can switch on relevance
    ranking, choose the ``_<algorithm>Assign`` and ``_<combine>Weights``
    methods (defaults "cori" and "mean") and request that proximity
    information be merged into the surviving item.

    Returns ``self`` with ``_list`` replaced by the merged items, except
    that a lone entry in ``others`` is returned as-is.  ``self`` is also
    returned early when the assign method returns a true value, or when an
    all-style operator meets an empty operand.

    Raises NotImplementedError when the named assign/weights method does
    not exist on ``self``, or for an unsupported proximity unit/distance.

    NOTE(review): exhausted sets are popped from the working list; for
    'not' that list is the caller's own ``others``.  The body relies on
    Python 2 behaviour (``cmp``, and ``range`` returning a list that is
    assigned into).
    """

    if (isinstance(clause, CQLParser.Triple)):
        cql = clause.boolean
    else:
        cql = clause.relation

    # Context-set URIs used to recognise relevance-related modifiers.
    relSetUri = "info:srw/cql-context-set/2/relevance-1.0"
    cqlSet = "info:srw/cql-context-set/1/cql-v1.1"

    relevancy = 0
    algorithm = "cori"
    combine = "mean"
    modType = ""
    for m in cql.modifiers:
        m.type.parent = clause
        m.type.resolvePrefix()
        if (m.type.prefixURI == relSetUri):
            # Any modifier from the relevance set turns ranking on.
            relevancy = 1
            if m.type.value == "algorithm":
                algorithm = m.value
            elif m.type.value == "combine":
                combine = m.value
        elif (m.type.prefixURI == cqlSet and m.type.value == "relevant"):
            # cql.relevant also turns ranking on.
            relevancy = 1

    # Ranking is inherited when any operand is already ranked.
    if (not relevancy):
        for x in others:
            if (x.relevancy):
                relevancy = 1
                break
    # pi: whether a 'proxinfo' modifier was given.
    pi = 0
    for m in cql.modifiers:
        if m.type.value == 'proxinfo':
            pi = 1
            break

    # Order operands by size: ascending for all-style operators
    # (reverse=False), descending otherwise.  'not' keeps the given order.
    # NOTE(review): 'all' shadows the builtin of the same name.
    all = cql.value in ['all', 'and', '=', 'prox', 'adj']
    if cql.value != "not":
        keys = [(len(x), x) for x in others]
        keys.sort(reverse=not all)
        others = [x for (key,x) in keys]

    if (relevancy):
        self.relevancy = 1
        if (isinstance(cql, CQLParser.Relation)):
            # Dispatch to self._<algorithm>Assign to weight the operands.
            fname = "_%sAssign" % algorithm
            if (hasattr(self, fname)):
                fn = getattr(self, fname)
            else:
                raise NotImplementedError
            finish = fn(session, others, clause, cql, db)
            if finish:
                # The assign method already built self._list.
                return self

    if len(others) == 1:
        # Nothing to merge; hand back the single operand.
        if relevancy:
            others[0].relevancy = 1
        return others[0]
    else:
        # Merge several operands.
        if relevancy:
            maxWeight = -1
            minWeight = 9999999999
            # fn is rebound to self._<combine>Weights, called per match.
            fname = "_%sWeights" % combine
            if (hasattr(self, fname)):
                fn = getattr(self, fname)
            else:
                raise NotImplementedError

        tmplist = []
        cont = 1
        oidxs = range(1,len(others))
        lens = [len(x) for x in others]
        if all and 0 in lens:
            # An empty operand means an all-style merge can match nothing.
            return self
        nors = len(others)
        positions = [0] * nors
        # Accepted cmp() results for each proximity comparison operator.
        cmpHash = {'<' : [-1],
                   '<=' : [-1, 0],
                   '=' : [0],
                   '>=' : [0, 1],
                   '>' : [1]}
        distance = 1
        unit = "word"
        comparison = "="
        if (cql.value == 'prox' and cql.modifiers):
            if (cql['unit']):
                unit = cql['unit'].value
            if (cql['distance']):
                distance = int(cql['distance'].value)
                comparison = cql['distance'].comparison
        chitem = cmpHash[comparison]
        # proxtype 1: word distance; proxtype 2: same element.
        if unit == "word":
            proxtype = 1
        elif unit == "element" and distance == 0 and comparison == "=":
            proxtype = 2
        else:
            raise NotImplementedError()
        # Operands offering get_item() are probed directly instead of
        # being stepped through by position.
        hasGetItemList = [hasattr(o, 'get_item') for o in others]
        while cont:
            items = [others[0][positions[0]]]
            rspos = [0]
            for o in oidxs:
                # -1 marks an operand found exhausted in an earlier pass.
                if o != -1:
                    if hasGetItemList[o]:
                        nitem = others[o].get_item(items[0])
                        if not nitem:
                            continue
                    else:
                        try:
                            nitem = others[o][positions[o]]
                        except IndexError:
                            oidxs[o-1] = -1
                            continue
                        if nitem < items[0]:
                            if all or cql.value == 'not':
                                # Skip forward until equal or greater, or
                                # until this operand runs out.
                                while True:
                                    positions[o] += 1
                                    if positions[o] >= lens[o] or others[o][positions[o]] >= items[0]:
                                        break
                                if positions[o] != lens[o]:
                                    nitem = others[o][positions[o]]
                            else:
                                # A smaller item becomes the new candidate.
                                items = [nitem]
                                rspos = [o]
                                continue
                    if nitem == items[0]:
                        items.append(nitem)
                        rspos.append(o)

            # Step past the item in every operand that supplied it.
            for r in rspos:
                positions[r] += 1

            # Drop leading operands that are used up.
            # NOTE(review): oidxs and hasGetItemList are not shifted when
            # others is popped -- confirm the index bookkeeping.
            while others and positions[0] > len(others[0])-1:
                others.pop(0)
                positions.pop(0)
                lens.pop(0)
            if not others or ((cql.value == 'not' or all) and len(others) != nors):
                cont = 0
            if (all and len(items) < nors):
                continue
            elif cql.value == 'not' and len(items) != 1:
                continue
            elif cql.value in ["prox", 'adj', '=']:
                # Order the matched items by query position, then require
                # each adjacent pair to satisfy the proximity constraint.
                # proxInfo is read as a flat list of (element, position)
                # pairs.
                newitems = []
                mqp = -1
                for i in items:
                    i.queryTerm = i.resultSet.queryTerm
                    i.queryPositions = i.resultSet.queryPositions
                    for qp in i.queryPositions:
                        mqp = max(mqp, qp)
                for idx in range(mqp+1):
                    for i in items:
                        if idx in i.queryPositions:
                            newitems.append(i)
                            break

                items = newitems[:]
                litem = items.pop(0)
                nomatch = 0
                while len(items):
                    ritem = items.pop(0)
                    matchlocs = []
                    for r in range(0,len(ritem.proxInfo),2):
                        relem = ritem.proxInfo[r]
                        rwpos = ritem.proxInfo[r+1]
                        for l in range(0, len(litem.proxInfo), 2):
                            if (proxtype == 1 and litem.proxInfo[l] == relem and (cmp(litem.proxInfo[l+1]+distance,rwpos) in chitem)):
                                matchlocs.extend([relem, rwpos])
                            elif proxtype == 2 and litem.proxInfo[l][0] == relem:
                                matchlocs.extend([relem, rwpos])
                    if matchlocs:
                        # matchlocs is only tested for emptiness; the
                        # right-hand item becomes the next left-hand item.
                        litem = ritem
                    else:
                        # No location satisfied the constraint.
                        nomatch = 1
                        break
                if nomatch:
                    continue
                items = newitems

            if relevancy:
                # Let the weights method fold the matches into one item.
                item = fn(items, nors)
                if item.weight > maxWeight:
                    maxWeight = item.weight
                if item.weight < minWeight:
                    minWeight = item.weight
            else:
                item = items[0]
            if pi:
                # Gather the other matches' proxInfo onto the kept item.
                for o in items[1:]:
                    item.proxInfo.extend(o.proxInfo)
            tmplist.append(item)

        self._list = tmplist
        if relevancy:
            self.relevancy = 1
            self.minWeight = minWeight
            self.maxWeight = maxWeight
        return self
592
def order(self, session, spec):
    """Sort ``self._list`` in place according to ``spec``.

    ``spec`` may be:
      * an ``Index`` with a 'sortStore' setting -- items are sorted by the
        value its index store returns from ``fetch_sortValue``;
      * any other ``Index`` -- each record is fetched from its record store
        and sorted by ``spec.extract_data``;
      * a string naming an attribute of the items -- ascending for
        'docid', descending for every other attribute.

    Nothing happens when the set is empty.  Returns None.

    Raises NotImplementedError for a string that is not an attribute of
    the first item, and for any other kind of ``spec``.
    """
    l = self._list
    if not l:
        return

    if isinstance(spec, Index) and spec.get_setting(session, 'sortStore'):
        # Pre-computed sort values held by the index store.
        istore = spec.get_path(session, 'indexStore')
        tmplist = [(istore.fetch_sortValue(session, spec, x), x) for x in l]
    elif isinstance(spec, Index):
        # Extract the sort key from each record.
        recs = []
        storeHash = {}
        for r in l:
            store = r.recordStore
            # Resolve each record store once; the old
            # storeHash.get(store, spec.get_object(...)) form called
            # get_object for every record, cached or not.
            storeObj = storeHash.get(store)
            if storeObj is None:
                storeObj = spec.get_object(session, store)
                storeHash[store] = storeObj
            recs.append(storeObj.fetch_record(session, r.docid))
        tmplist = [(spec.extract_data(session, rec), item)
                   for (rec, item) in zip(recs, l)]
    elif isinstance(spec, str):
        if not hasattr(self[0], spec):
            raise NotImplementedError
        tmplist = [(getattr(x, spec), x) for x in l]
        if spec != 'docid':
            # Every attribute other than docid sorts highest first.
            tmplist.sort(reverse=True)
            self._list = [x for (key, x) in tmplist]
            return
    else:
        raise NotImplementedError

    tmplist.sort()
    self._list = [x for (key, x) in tmplist]
635
638
639 - def retrieve(self, session, start, numReq, cache=0):
646
653
713
714
715 from utils import SimpleBitfield
716
# Default attribute values for the bitfield-backed result set.
# NOTE(review): the enclosing class header is not visible in this excerpt;
# these read like class-level defaults -- confirm.
bitfield = None
currItems = None
recordStore = None

relevancy = 0
termid = -1
totalOccs = 0
totalRecs = 0
id = ""
index = None
queryTerm = ""
queryFreq = 0
queryFragment = None
# NOTE(review): mutable default; shared by every instance that never
# rebinds it.
queryPositions = []
# NOTE(review): relevancy is assigned a second time here with the same
# value as above; the repeat has no effect.
relevancy = 0
maxWeight = 0
minWeight = 0
734 minWeight = 0
735
736 - def __init__(self, session, data=0, recordStore=None):
744
749
752
755
758
764
def combine(self, session, others, clause, db=None):
    """Combine ``others`` using set algebra chosen by the CQL operator.

    The operator value is taken from ``clause.boolean`` for a
    ``CQLParser.Triple`` and from ``clause.relation`` otherwise:

      * '=', 'exact', 'prox'          -- only a single operand is supported
                                         and it is returned unchanged;
      * 'all', 'and'                  -- successive ``intersection``;
      * 'any', 'or', '>', '>=', '<', '<=' -- successive ``union``;
      * 'not'                         -- successive ``difference``.

    Returns the combined result set.  Raises NotImplementedError for any
    other operator, and for '=', 'exact' or 'prox' with more than one
    operand.  ``session`` and ``db`` are not used.
    """
    if not isinstance(clause, CQLParser.Triple):
        operator = clause.relation
    else:
        operator = clause.boolean
    v = operator.value

    # Term-level relations: nothing to merge, just pass the operand on.
    if v in ('=', 'exact', 'prox'):
        if len(others) != 1:
            raise NotImplementedError()
        return others[0]

    # Which set method each remaining operator maps to.
    setOps = (
        (('all', 'and'), 'intersection'),
        (('any', 'or', '>', '>=', '<', '<='), 'union'),
        (('not',), 'difference'),
    )
    opName = None
    for (names, candidate) in setOps:
        if v in names:
            opName = candidate
            break
    if opName is None:
        raise NotImplementedError()

    # Fold the operands left to right with the chosen method.
    merged = others[0]
    for other in others[1:]:
        merged = getattr(merged, opName)(other)
    return merged
791
793
794 raise NotImplementedError()
795
796 - def retrieve(self, numReq, start, cache=0):
803
804
805 try:
806 import numarray as na
807
808 class ArrayResultSet(SimpleResultSet):
809
810 _array = None
811 recordStore = None
812 proxInfo = {}
813
814 def __init__(self, session, data, recordStore = None):
815
816 self.recordStore = recordStore
817 self.proxInfo = {}
818 if len(data) > 0:
819 z = na.zeros(len(data), 'f4')[:,na.NewAxis]
820 d2 = na.transpose(data)
821 z2 = na.transpose(z)
822 final = na.transpose(na.concatenate([d2,z2]))
823 self._array = final
824 else:
825 self._array = na.array([])