Module documentFactory

Source Code for Module documentFactory

import socket, time
socket.setdefaulttimeout(30)

from baseObjects import DocumentFactory
from document import StringDocument
from record import SaxRecord
from bootstrap import BSParser
from utils import elementType, getFirstData, flattenTexts, reader, verifyXPaths
import re, os, c3errors, tarfile, zipfile, cStringIO, sys, gzip
import mimetypes, httplib, urllib, urlparse, urllib2
import commands, codecs, types
# NB: zipfile, escape and ZSI are used below but were missing from the
# original import list; a duplicate 'from utils import reader' was dropped
from xml.sax.saxutils import escape
import ZSI
from ZSI.client import Binding
from PyZ3950 import zoom
import SRW
from c3errors import *
from ftplib import FTP
from GoogleSearch_services import *

mimetypes.add_type('application/marc', '.marc')

# NB:
# cache = 0:  yield documents, no caching
# cache = 1:  step through, cache (offset, length) positions in the stream
# cache = 2:  step through, cache full documents
# other cache values are undefined

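# Example (illustrative, not part of the original module): how the cache
# modes look from the caller's side, assuming a configured factory and
# session from the surrounding Cheshire3 setup.
#
#   factory.load(session, data, cache=0)   # documents yielded lazily
#   factory.load(session, data, cache=1)   # (offset, length) pairs recorded;
#                                          # re-read via fetch_document(idx)
#   factory.load(session, data, cache=2)   # full StringDocuments kept in memory
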
class BaseDocumentStream:
    # class statement reconstructed; the original header line was lost
    streamLocation = ""
    format = ""
    schema = ""
    codec = ""
    factory = None
    filterRe = None
    stream = None
    locations = []
    documents = []
    length = 0

    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):
        self.factory = factory
        self.format = format
        self.schema = schema
        self.codec = codec
        self.stream = self.open_stream(stream)

    def open_stream(self, stream):
        if hasattr(stream, 'read') and hasattr(stream, 'seek'):
            # is a stream
            self.streamLocation = "UNKNOWN"
            return stream
        elif os.path.exists(stream):
            # is a file or directory
            self.streamLocation = stream
            if not os.path.isdir(stream):
                if self.codec:
                    return codecs.open(self.streamLocation, 'r', self.codec)
                else:
                    return file(self.streamLocation)
            else:
                return stream
        else:
            # is a string
            self.streamLocation = "STRING"
            return cStringIO.StringIO(stream)

    def fetch_document(self, idx):
        if self.length and idx >= self.length:
            raise StopIteration
        if self.documents:
            return self.documents[idx]
        elif self.locations:
            self.stream.seek(self.locations[idx][0])
            return self.stream.read(self.locations[idx][1])
        else:
            raise StopIteration

class TermHashDocumentStream(BaseDocumentStream):

    def open_stream(self, stream):
        # stream is a hash (dict) of terms
        self.streamLocation = "TERM-STRING"
        return stream.keys()

    def find_documents(self, session, cache=0):
        # step through terms
        if cache == 0:
            for k in self.stream:
                yield StringDocument(k)
            raise StopIteration
        elif cache == 2:
            documents = []
            for k in self.stream:
                documents.append(StringDocument(k))
            self.documents = documents

class XmlDocumentStream(BaseDocumentStream):
    start = None
    endtag = ""

    def __init__(self, session, stream, format, schema="", codec="", factory=None):
        BaseDocumentStream.__init__(self, session, stream, format, schema, codec, factory)
        if not schema:
            self.start = re.compile("<([-a-zA-Z0-9_.]+:)?([-a-zA-Z0-9_.]+)[\s>]")
            self.endtag = ""
        else:
            self.start = re.compile("<%s[\s>]" % schema)
            self.endtag = "</" + schema + ">"

    def find_documents(self, session, cache=0):
        docs = []
        locs = []
        endtag = self.endtag
        let = len(endtag)
        myTell = 0
        xpi = ""
        line = ""
        while True:
            ol = len(line)
            line += self.stream.read(1024)
            pi = line.find("<?xml ")
            if pi > -1:
                # store the XML declaration
                endpi = line.find("?>")
                xpi = line[pi:endpi+2] + "\n"
                xpi = ""  # NB: the declaration is then immediately discarded
            m = self.start.search(line)
            if m:
                if not self.endtag:
                    endtag = "</%s>" % m.group()[1:-1]
                    let = len(endtag)
                s = m.start()
                line = line[s:]
                myTell += s
                start = myTell
                end = -1
                strStart = 0
                while end == -1:
                    if strStart:
                        # allow for the end tag to be broken across reads
                        end = line.find(endtag, strStart - let)
                    else:
                        end = line.find(endtag)
                    if end > 0:
                        tlen = end + len(endtag)
                        txt = line[:tlen]
                        line = line[tlen:]
                        myTell += tlen
                        if cache == 0:
                            yield StringDocument(xpi + txt, mimeType="text/xml", schema=self.schema)
                        elif cache == 1:
                            locs.append((start, tlen))
                        elif cache == 2:
                            docs.append(StringDocument(xpi + txt, mimeType="text/xml", schema=self.schema))
                    else:
                        strStart = len(line)
                        line += self.stream.read(1024)
            if len(line) == ol and not m:
                if cache == 0:
                    self.stream.close()
                    raise StopIteration
                else:
                    break
        self.stream.close()
        self.locations = locs
        self.documents = docs
        self.length = max(len(locs), len(docs))

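# Example (illustrative, not part of the original module): XmlDocumentStream
# splits a stream of concatenated XML records on the first start tag it sees,
# assuming StringDocument exposes get_raw().
#
#   src = cStringIO.StringIO("<rec><a/></rec><rec><b/></rec>")
#   ds = XmlDocumentStream(session, src, 'xml')
#   for doc in ds.find_documents(session, cache=0):
#       print doc.get_raw()    # "<rec><a/></rec>", then "<rec><b/></rec>"
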
class MarcDocumentStream(BaseDocumentStream):

    def find_documents(self, session, cache=0):
        docs = []
        locs = []
        data = self.stream.read(1536)
        myTell = 0
        while data:
            # \x1D is the MARC record terminator
            rt = data.find("\x1D")
            while rt > -1:
                txt = data[:rt+1]
                tlen = len(txt)
                if cache == 0:
                    yield StringDocument(txt, mimeType="application/marc")
                elif cache == 1:
                    locs.append((myTell, tlen))
                elif cache == 2:
                    docs.append(StringDocument(txt, mimeType="application/marc"))
                data = data[rt+1:]
                myTell += tlen
                rt = data.find("\x1D")
            dlen = len(data)
            data += self.stream.read(1536)
            if len(data) == dlen:
                # junk at end of file
                data = ""
        self.stream.close()
        self.locations = locs
        self.documents = docs
        self.length = max(len(locs), len(docs))

# XmlTapeDocStream
# ArcFileDocStream
# MetsDocStream

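# Example (illustrative): MARC records are stored concatenated, each ending
# with the record terminator \x1D, so a file of three records splits as
#   <leader+directory+fields>\x1D<...>\x1D<...>\x1D
# which is exactly what the find_documents loop above scans for.
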
class MultipleDocumentStream(BaseDocumentStream):

    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):
        BaseDocumentStream.__init__(self, session, stream, format, schema, codec, factory)
        filterStr = factory.get_setting(session, 'filterRegexp', "\.([a-zA-Z0-9]+|tar.gz|tar.bz2)$")
        if filterStr:
            self.filterRe = re.compile(filterStr)
        else:
            self.filterRe = None

    def _fetchStream(self, path):
        return self.open_stream(path)

    def _fetchName(self, item):
        return item

    def _processFile(self, session, item):
        name = self._fetchName(item)
        if self.filterRe:
            m = self.filterRe.search(name)
            if not m:
                return None
        mimetype = mimetypes.guess_type(name, 0)

        if mimetype[0] in ['text/sgml', 'text/xml']:
            trip = ('stream', XmlDocumentStream, 'xml')
        elif mimetype[0] == 'application/x-tar':
            # NB: the original referenced an undefined 'ftype' here;
            # derive the tar variant from the guessed encoding instead
            if mimetype[1] == 'gzip':
                trip = ('stream', TarDocumentStream, 'tar.gz')
            elif mimetype[1] == 'bzip2':
                trip = ('stream', TarDocumentStream, 'tar.bz2')
            else:
                trip = ('stream', TarDocumentStream, 'tar')
        elif mimetype[0] == 'application/zip':
            trip = ('stream', ZipDocumentStream, 'zip')
        elif mimetype[0] == 'application/marc':
            trip = ('stream', MarcDocumentStream, 'marc')
        else:
            trip = ('document', None, mimetype)

        s = self._fetchStream(item)
        if trip[0] == 'stream':
            cls = trip[1]
            nstream = cls(session, s, format=trip[2], schema=self.schema, codec=self.codec, factory=self.factory)
            return ('stream', nstream)
        elif trip[0] == 'document':
            data = s.read()
            s.close()
            doc = StringDocument(data, mimeType=trip[2], filename=name)
            return ('document', doc)

    def _processFiles(self, session, items, cache=0):
        docs = []
        locs = []
        for item in items:
            # look for documents in each item
            stuff = self._processFile(session, item)
            if not stuff:
                # None means skip this object
                continue
            (dtype, obj) = stuff
            if dtype == 'stream':
                gen = obj.find_documents(session, cache=cache)
                if cache == 0:
                    # will yield its documents; yield back up
                    for g in gen:
                        yield g
                elif cache == 1:
                    # exhaust the generator so the substream caches its offsets
                    try:
                        gen.next()
                    except StopIteration:
                        pass
                    # NB: the original appended undefined names (fullname,
                    # mimetype, nstream); record the item and its locations
                    locs.append((item, obj.locations))
                elif cache == 2:
                    try:
                        gen.next()
                    except StopIteration:
                        pass
                    docs.extend(obj.documents)
            elif dtype == 'document':
                if cache == 0:
                    yield obj
                elif cache == 1:
                    raise NotImplementedError
                elif cache == 2:
                    docs.append(obj)
        self.documents = docs
        self.locations = locs

class DirectoryDocumentStream(MultipleDocumentStream):

    def find_documents(self, session, cache=0):
        for root, dirs, files in os.walk(self.streamLocation):
            for d in dirs:
                if os.path.islink(os.path.join(root, d)):
                    # os.walk does not follow symlinks; walk them explicitly
                    for root2, dirs2, files2 in os.walk(os.path.join(root, d)):
                        files2.sort()
                        files2 = map(lambda x: os.path.join(root2, x), files2)
                        for f in self._processFiles(session, files2, cache):
                            yield f
            files.sort()
            files = map(lambda x: os.path.join(root, x), files)
            for f in self._processFiles(session, files, cache):
                yield f
        # XXX self.length = max(len(self.locations), len(self.documents))

# XXX: Tar and Zip to use _processFiles for non documents,
#      e.g. a tar of multiple XML document files;
#      also for filtering

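# Example (illustrative, not part of the original module): walking a directory
# tree of mixed files; myFactory is an assumed configured DocumentFactory,
# needed for the filterRegexp setting.
#
#   ds = DirectoryDocumentStream(session, '/data/records', 'dir',
#                                factory=myFactory)
#   for doc in ds.find_documents(session, cache=0):
#       ...   # XML/MARC/zip/tar files are unpacked into individual documents
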
class TarDocumentStream(MultipleDocumentStream):

    def open_stream(self, stream):
        if self.format in ['tar.gz', 'tgz']:
            modeSuf = "gz"
        elif self.format == 'tar.bz2':
            modeSuf = "bz2"
        else:
            modeSuf = ""

        if hasattr(stream, 'read'):
            return tarfile.open(fileobj=stream, mode="r|%s" % modeSuf)
        elif os.path.exists(stream):
            return tarfile.open(stream, mode="r")  # transparent compression
        else:
            s = cStringIO.StringIO(stream)
            return tarfile.open(fileobj=s, mode="r|%s" % modeSuf)

    def _processFile(self, session, item):
        name = self._fetchName(item)
        if name[-1] == "/":
            # directory entry, skip
            return None
        else:
            return MultipleDocumentStream._processFile(self, session, item)

    def _fetchStream(self, item):
        return self.stream.extractfile(item)

    def _fetchName(self, item):
        return item.name

    def find_documents(self, session, cache=0):
        # NB: can't seek backwards in a stream; process each member in turn
        for tarinfo in self.stream:
            for doc in self._processFiles(session, [tarinfo], cache):
                yield doc
        self.stream.close()

class ZipDocumentStream(DirectoryDocumentStream):

    def open_stream(self, stream):
        if hasattr(stream, 'read') or os.path.exists(stream):
            return zipfile.ZipFile(stream, mode="r")
        else:
            s = cStringIO.StringIO(stream)
            return zipfile.ZipFile(s, mode="r")

    def _fetchStream(self, item):
        return cStringIO.StringIO(self.stream.read(item))

    def _fetchName(self, item):
        return item.filename

    def find_documents(self, session, cache=0):
        for info in self.stream.infolist():
            for doc in self._processFiles(session, [info], cache):
                yield doc
        self.stream.close()

# RarDocStream

class LocateDocumentStream(DirectoryDocumentStream):

    def find_documents(self, session, cache=0):
        fl = commands.getoutput("locate %s | grep %s$" % (self.stream, self.stream))
        docs = fl.split('\n')
        while docs and docs[0][:8] == "warning:":
            docs.pop(0)
        # NB: the original called _processFiles without iterating the
        # generator; yield its documents and pass the real session through
        for doc in self._processFiles(session, docs, cache):
            yield doc

class ClusterDocumentStream(BaseDocumentStream):
    # Take a raw cluster file and create documents from it.

    def open_stream(self, stream):
        # stream must be the filename;
        # we don't really care about it until after sorting
        if os.path.exists(stream):
            self.streamLocation = stream
        else:
            # NB: open_stream is not passed a session; None stands in here
            dfp = self.factory.get_path(None, 'defaultPath')
            abspath = os.path.join(dfp, stream)
            if os.path.exists(abspath):
                self.streamLocation = abspath
            else:
                raise FileDoesNotExistException(stream)

    def find_documents(self, session, cache=0):
        if cache == 1:
            # can't store offsets as there's no file to offset into
            raise NotImplementedError

        data = self.streamLocation
        sortx = self.factory.get_path(session, 'sortPath', None)
        if sortx is None:
            sortx = commands.getoutput('which sort')
        sortedPath = data + "_SORT"
        os.spawnl(os.P_WAIT, sortx, sortx, data, '-o', sortedPath)

        # now construct cluster documents
        doc = ["<cluster>"]
        f = file(sortedPath)
        l = f.readline()
        # format: term docid recstore occs (line, posn)*
        while l:
            docdata = {}
            ldata = l.split('\x00')
            key = ldata[0]
            if not key:
                # data from records with no key
                l = f.readline()
                l = l[:-1]
                continue

            doc.append("<key>%s</key>\n" % key)
            ldata = ldata[1:-1]
            for bit in range(len(ldata) / 2):
                d = docdata.get(ldata[bit*2], [])
                d.append(ldata[bit*2+1])
                docdata[ldata[bit*2]] = d
            l = f.readline()
            l = l[:-1]
            ldata2 = l.split('\x00')
            key2 = ldata2[0]
            while key == key2:
                ldata2 = ldata2[1:-1]
                for bit in range(len(ldata2) / 2):
                    d = docdata.get(ldata2[bit*2], [])
                    d.append(ldata2[bit*2+1])
                    docdata[ldata2[bit*2]] = d
                l = f.readline()
                l = l[:-1]
                ldata2 = l.split('\x00')
                key2 = ldata2[0]
            for k in docdata.keys():
                doc.append("<%s>" % k)
                for i in docdata[k]:
                    doc.append("%s" % i)
                doc.append("</%s>" % k)
            doc.append("</cluster>")
            sdoc = StringDocument(" ".join(doc))
            if cache == 0:
                yield sdoc
            else:
                self.documents.append(sdoc)

            doc = ["<cluster>"]
            l = f.readline()
            l = l[:-1]
        f.close()

class ComponentDocumentStream(BaseDocumentStream):
    # Accept a record, and componentize it
    sources = []

    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):
        # body reconstructed: the original lines were lost in extraction;
        # the source XPaths presumably come from the owning factory
        BaseDocumentStream.__init__(self, session, stream, format, schema, codec, factory)
        self.sources = factory.sources

    def open_stream(self, stream):
        return stream

    def find_documents(self, session, cache=0):
        # extract records by xpath or span and store as X/SGML
        if cache == 1:
            # nothing to offset into
            raise NotImplementedError
        rec = self.stream
        for src in self.sources:
            raw = rec.process_xpath(src[0])
            if len(src) == 1:
                # simple XPath component
                for r in raw:
                    if type(r) == types.ListType:
                        tempRec = SaxRecord(r)
                        docstr = tempRec.get_xml()
                        saxid = r[-1][r[-1].rfind(' ')+1:]
                        if r[0][0] == "4":
                            docstr = "<c3:component xmlns:c3=\"http://www.cheshire3.org/\" parent=\"%r\" event=\"%s\">%s</c3:component>" % (rec, saxid, docstr)
                        else:
                            docstr = "<c3component parent=\"%r\" event=\"%s\">%s</c3component>" % (rec, saxid, docstr)
                    elif type(r) == types.StringType:
                        docstr = "<data>%s</data>" % escape(r)
                    else:
                        # XXX minidom specific
                        docstr = r.toxml()
                    doc = StringDocument(docstr)
                    if cache == 0:
                        yield doc
                    else:
                        self.documents.append(doc)

            elif len(src) > 1:
                # span component
                endTag = src[1][-1][0][1]
                for r in raw:
                    start = int(r[-1][r[-1].rfind(' ')+1:])
                    comp = [rec.sax[start]]
                    startTag = rec._convert_elem(comp[0])[0]
                    usingNs = comp[0][0]
                    n = 0
                    okay = 1
                    saxlen = len(rec.sax) - 1
                    openTags = []
                    while okay and start + n < saxlen:
                        n += 1
                        line = rec.sax[start+n]
                        if line[0] in ['1', '4']:
                            # check it
                            if rec._checkSaxXPathLine(src[1][-1], start + n):
                                # matched end
                                okay = 0
                            else:
                                # add tags to close
                                if line[0] == '4':
                                    end = line.rfind("}")
                                    stuff = eval(line[2:end+1])
                                    ns, tag = stuff[0], stuff[1]
                                    openTags.append((ns, tag))
                                else:
                                    openTags.append(rec._convert_elem(line)[0])
                                comp.append(line)
                        elif line[0] in ['2', '5']:
                            # check we're open
                            if line[0] == '2':
                                end = line.rfind(' ')
                                tag = line[2:end]
                            else:
                                tag = eval(line[2:line.rfind(',')])[0:2]
                            if (n == 1 and tag[1] == startTag) or (openTags and openTags[-1] == tag):
                                comp.append(line)
                                if openTags:
                                    openTags.pop(-1)
                        elif line[0] == '3':
                            comp.append(line)
                    if openTags:
                        openTags.reverse()
                        for o in openTags:
                            if usingNs == '1':
                                comp.append("2 %s" % o)
                            else:
                                comp.append("5 u'%s', u'%s', u'', None" % o)
                    tempRec = SaxRecord(comp)
                    docstr = tempRec.get_xml()
                    docstr = "<c3:component xmlns:c3=\"http://www.cheshire3.org/\" parent=\"%r\" event=\"%s\">%s</c3:component>" % (rec, start, docstr)
                    doc = StringDocument(docstr)
                    if cache == 0:
                        yield doc
                    else:
                        self.documents.append(doc)

class RemoteDocumentStream(BaseDocumentStream):
    # Hierarchical class

    def parse_url(self, url):
        bits = urlparse.urlsplit(url)
        transport = bits[0]
        uphp = bits[1].split('@')
        user = ''
        passwd = ''
        if len(uphp) == 2:
            (user, passwd) = uphp[0].split(':')
            uphp.pop(0)
        hp = uphp[0].split(':')
        host = hp[0]
        if len(hp) == 2:
            port = int(hp[1])
        else:
            # require subclass to default
            port = 0
        # now split the path: the last chunk may be a dir or a file
        (dirname, filename) = os.path.split(bits[2])
        params = map(lambda x: x.split('='), bits[3].split('&'))
        params = dict(params)
        anchor = bits[4]
        return (transport, user, passwd, host, port, dirname, filename, params, anchor)
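
# Example (illustrative): what parse_url returns for a typical URL.
#
#   parse_url('http://user:pass@host:8080/path/to/file?a=b#frag')
#   -> ('http', 'user', 'pass', 'host', 8080, '/path/to', 'file',
#       {'a': 'b'}, 'frag')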

# XXX Should go to grid package
class SrbDocumentStream(RemoteDocumentStream, MultipleDocumentStream):
    # SRB://user.domain:pass@host:port/path/to/object?DEFAULTRESOURCE=res
    pass

class FtpDocumentStream(RemoteDocumentStream, MultipleDocumentStream):
    # FTP://user:pass@host:port/path/to/object

    def open_stream(self, path):
        # path is an FTP URL. NB: the original parsed self.streamLocation,
        # which is not yet set when the base __init__ calls open_stream,
        # and assigned self.stream directly instead of returning it
        (transport, user, passwd, host, port, dirname, filename, params, anchor) = self.parse_url(path)
        self.streamLocation = path
        # ftplib.FTP() takes no port argument; connect explicitly
        ftp = FTP()
        ftp.connect(host, port or 21)
        if user:
            ftp.login(user, passwd)
        else:
            ftp.login()
        self.dirname = dirname
        self.file = filename
        return ftp

    def _fetchStream(self, item):
        currItem = []
        # NB: retrbinary needs a full RETR command; the original also
        # joined an undefined self.currItem
        self.stream.retrbinary('RETR %s' % item, lambda x: currItem.append(x))
        return cStringIO.StringIO(''.join(currItem))

    def _fetchName(self, item):
        return item

    def _descend(self, session, dirname, cache=0):
        self.stream.cwd(dirname)
        lines = []
        self.stream.retrlines('LIST', lambda x: lines.append(x))
        filelist = []
        for l in lines:
            # XXX filenames containing spaces won't round-trip cleanly
            name = ' '.join(l.split()[8:])
            if l[0] == 'l':
                # symlink, ignore?
                pass
            elif l[0] == 'd':
                # directory: recurse and yield its documents
                for doc in self._descend(session, name, cache):
                    yield doc
            elif l[0] == '-':
                filelist.append(name)
            else:
                # unknown, ignore
                pass
        for doc in self._processFiles(session, filelist, cache):
            yield doc
        self.stream.cwd('..')

    def find_documents(self, session, cache=0):
        # NB: the original yielded the generators themselves, not documents
        for doc in self._descend(session, self.dirname, cache):
            yield doc
        self.stream.quit()

class Z3950DocumentStream(RemoteDocumentStream):
    # z3950://host:port/database?query=cql&...
    # (NB: ... not an official URL scheme)

    def open_stream(self, stream):
        # swap the scheme so urlsplit will parse the netloc
        server = stream.replace('z3950', 'https')
        (transport, user, passwd, host, port, dirname, filename, args, anchor) = self.parse_url(server)

        conn = zoom.Connection(host, port)
        conn.databaseName = dirname
        q = args['query']
        qo = zoom.Query('CQL', q)

        if args.has_key('preferredRecordSyntax'):
            conn.preferredRecordSyntax = args['preferredRecordSyntax']
        else:
            conn.preferredRecordSyntax = 'USMARC'
        if args.has_key('elementSetName'):
            conn.elementSetName = args['elementSetName']
        else:
            conn.elementSetName = 'F'
        self.conn = conn
        rs = conn.search(qo)
        self.total = len(rs)
        return rs

    def find_documents(self, session, cache=0):
        # stream is a ZOOM resultSet
        docs = []
        for item in self.stream:
            # NB: the original checked self.resultSet, which is undefined;
            # the connection holds the record syntax
            if self.conn.preferredRecordSyntax == 'USMARC':
                mt = "application/marc"
            else:
                mt = mimetypes.guess_type(self.conn.preferredRecordSyntax)
            doc = StringDocument(item.data, mimeType=mt)
            if cache == 0:
                yield doc
            elif cache == 2:
                docs.append(doc)
            else:
                raise NotImplementedError
        self.documents = docs
        raise StopIteration

# XXX Should go to SQL package
class SQLDocumentStream(RemoteDocumentStream):
    # type://host:port/database?QUERY=sql
    # (NB: ... not an official URL scheme)
    pass

class UrllibUnicodeFileThing:
    # class statement reconstructed: wraps a urllib2 response and decodes
    # read() data using the charset advertised in the HTTP headers

    def __init__(self, real):
        self.real = real
        self.charset = real.headers.getparam('charset')
        self.mimetype = real.headers.type

    def __getattr__(self, item):
        return getattr(self.real, item)

    def read(self):
        data = self.real.read()
        if self.charset:
            try:
                data = unicode(data, self.charset)
            except:
                pass
        return data

class HttpDocumentStream(MultipleDocumentStream):

    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):
        # body reconstructed: the original line was lost in extraction
        MultipleDocumentStream.__init__(self, session, stream, format, schema, codec, factory)

    def _processFile(self, session, item):
        if self.filterRe:
            m = self.filterRe.search(item)
            if not m:
                return None

        mimetype = mimetypes.guess_type(item, 0)
        if mimetype[0] == None:
            # get the mimetype from the stream's headers
            s = self._fetchStream(item)
            mimetype = (s.mimetype, None)
        else:
            s = None

        if mimetype[0] in ['text/sgml', 'text/xml']:
            trip = ('stream', XmlDocumentStream, 'xml')
        elif mimetype[0] == 'application/x-tar':
            # NB: the original referenced an undefined 'ftype' here;
            # derive the tar variant from the guessed encoding instead
            if mimetype[1] == 'gzip':
                trip = ('stream', TarDocumentStream, 'tar.gz')
            elif mimetype[1] == 'bzip2':
                trip = ('stream', TarDocumentStream, 'tar.bz2')
            else:
                trip = ('stream', TarDocumentStream, 'tar')
        elif mimetype[0] == 'application/zip':
            trip = ('stream', ZipDocumentStream, 'zip')
        elif mimetype[0] == 'application/marc':
            trip = ('stream', MarcDocumentStream, 'marc')
        else:
            trip = ('document', None, mimetype)

        if not s:
            s = self._fetchStream(item)
        if trip[0] == 'stream':
            cls = trip[1]
            nstream = cls(session, s, format=trip[2], schema=self.schema, codec=self.codec, factory=self.factory)
            return ('stream', nstream)
        elif trip[0] == 'document':
            data = s.read()
            s.close()
            doc = StringDocument(data, mimeType=trip[2], filename=item)
            return ('document', doc)

    def find_documents(self, session, cache=0):
        url = self.stream.read()
        self.stream.close()
        for f in self._processFiles(session, [url], cache):
            yield f

    def _fetchName(self, item):
        # look up a fake name from the mimetype, so we can then re-guess it
        mt = item.mimetype
        ext = mimetypes.guess_extension(mt)
        return "remote%s" % ext

    def _fetchStream(self, url):
        u = urllib2.urlopen(url)
        return UrllibUnicodeFileThing(u)

class SruDocumentStream(RemoteDocumentStream, HttpDocumentStream):
    # NB: RemoteDocumentStream mixed in for parse_url, which the original
    # called but did not inherit

    def open_stream(self, stream):
        # stream is an SRU searchRetrieve URL
        (transport, user, passwd, host, port, dirname, filename, args, anchor) = self.parse_url(stream)
        if not port:
            port = 80
        if user:
            self.server = "%s://%s:%s@%s:%s%s/%s?" % (transport, user, passwd, host, port, dirname, filename)
        else:
            self.server = "%s://%s:%s%s/%s?" % (transport, host, port, dirname, filename)

        if not args.has_key('query'):
            # XXX better error needed
            raise ValueError("SRU URL must include a 'query' parameter")
        if not args.has_key('version'):
            args['version'] = '1.1'
        if not args.has_key('maximumRecords'):
            args['maximumRecords'] = 25
        if not args.has_key('recordPacking'):
            args['recordPacking'] = 'string'
        args['operation'] = 'searchRetrieve'

        self.args = args
        self.xmlver = re.compile("[ ]*<\?xml[^>]+>")
        return None

    def find_documents(self, session, cache=0):
        # construct the SRU URL, fetch, parse
        start = 1
        docs = []
        while True:
            self.args['startRecord'] = start
            params = urllib.urlencode(self.args)
            req = urllib2.Request(url="%s%s" % (self.server, params))
            f = urllib2.urlopen(req)
            data = f.read()
            f.close()
            # substitute out the XML declaration
            data = self.xmlver.sub("", data)
            soapy = '<SOAP:Envelope xmlns:SOAP="http://schemas.xmlsoap.org/soap/envelope/"><SOAP:Body>%s</SOAP:Body></SOAP:Envelope>' % data
            ps = ZSI.ParsedSoap(soapy, readerclass=reader)
            resp = ps.Parse(SRW.types.SearchRetrieveResponse)

            self.total = resp.numberOfRecords
            for d in resp.records:
                doc = StringDocument(d.recordData, mimeType='text/xml')
                if cache == 0:
                    yield doc
                elif cache == 2:
                    docs.append(doc)
                else:
                    raise NotImplementedError
            start += len(resp.records)
            if start > self.total:
                if cache == 0:
                    raise StopIteration
                else:
                    break
        self.documents = docs

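# Example (illustrative): an SRU source URL as consumed by open_stream above;
# missing version/maximumRecords/recordPacking parameters are defaulted, and
# find_documents pages through results by incrementing startRecord.
#
#   http://host:8080/srw/db?query=dc.title%3Dfish&version=1.1
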
class SrwDocumentStream(RemoteDocumentStream, HttpDocumentStream):
    # same as SRU, but use a request object and ZSI to fetch.
    # NB: RemoteDocumentStream mixed in for parse_url, as above
    namespaces = {}  # NB: undefined in the original; placeholder nsdict for ZSI

    def open_stream(self, stream):
        # stream is an SRU-style URL, to be searched via SRW
        (transport, user, passwd, host, port, dirname, filename, args, anchor) = self.parse_url(stream)
        if not port:
            port = 80
        database = os.path.join(dirname, filename)

        self.binding = Binding(host=host, port=port, url=database, nsdict=self.namespaces)
        return SRW.types.SearchRetrieveRequest('searchRetrieveRequest', opts=args)

    def find_documents(self, session, cache=0):
        docs = []
        curr = 1
        while True:
            # self.stream is the request object returned by open_stream;
            # the original referenced an undefined self.request
            self.stream.startRecord = curr
            resp = self.binding.RPC(self.binding.url,
                                    "searchRetrieveRequest",
                                    self.stream,
                                    requestclass=SRW.types.SearchRetrieveRequest,
                                    replytype=SRW.types.SearchRetrieveResponse.typecode,
                                    readerclass=reader)
            total = resp.numberOfRecords
            curr += len(resp.records)
            for d in resp.records:
                doc = StringDocument(d.recordData, mimeType='text/xml')
                doc.recordSchema = d.recordSchema
                if cache == 0:
                    yield doc
                elif cache == 2:
                    docs.append(doc)
                else:
                    raise NotImplementedError
            if curr > total:
                if cache == 0:
                    raise StopIteration
                else:
                    break
        self.documents = docs

class OaiDocumentStream(HttpDocumentStream):

    def open_stream(self, stream):
        # nothing to do yet
        return None

    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):
        # NB: the original omitted session in this call
        BaseDocumentStream.__init__(self, session, stream, format, schema, codec, factory)

        # stream is a URL for ListIdentifiers
        # possible params: metadataPrefix, set, from, until
        bits = urlparse.urlsplit(stream)
        self.params = dict(map(lambda x: x.split('='), bits[3].split('&')))
        self.metadataPrefix = self.params.get('metadataPrefix', 'oai_dc')
        self.server = bits[0] + "://" + bits[1] + bits[2] + '?'
        # NB: the original mixed self.ids and self.idcache and never
        # initialised either; one list is kept here
        self.ids = []
        self.token = None
        self.total = 0

    def find_documents(self, session, cache=0):
        if cache != 0:
            raise NotImplementedError

        self._listIdentifiers()
        while self.ids:
            for rec in self._getRecord():
                yield rec
            if not self.token:
                break
            self._listIdentifiers()
        raise StopIteration

    def _listIdentifiers(self):
        if self.token:
            # resume a partial list (OAI-PMH resumptionToken flow);
            # NB: the original re-fetched the same first page each time
            s = "%sverb=ListIdentifiers&%s" % (self.server, urllib.urlencode({'resumptionToken': self.token}))
            self.token = None
        else:
            s = "%sverb=ListIdentifiers&%s" % (self.server, urllib.urlencode(self.params))
        resp = self._fetchStream(s)
        data = resp.read()

        # now use the existing infrastructure to parse
        self.ids = []
        doc = StringDocument(data, mimeType='text/xml')
        rec = BSParser.process_document(None, doc)
        dom = rec.get_dom()
        for top in dom.childNodes:
            if top.nodeType == elementType:
                break
        for c in top.childNodes:
            if c.nodeType == elementType and c.localName == 'ListIdentifiers':
                for c2 in c.childNodes:
                    if c2.nodeType == elementType and c2.localName == 'header':
                        for c3 in c2.childNodes:
                            if c3.nodeType == elementType and c3.localName == 'identifier':
                                self.ids.append(getFirstData(c3))
                    elif c2.nodeType == elementType and c2.localName == 'resumptionToken':
                        t = getFirstData(c2)
                        if t:
                            self.token = t
                            try:
                                self.total = c2.getAttribute('completeListSize')
                            except:
                                pass

    def _getRecord(self):
        for oaiid in self.ids:
            s = "%sverb=GetRecord&%s" % (self.server, urllib.urlencode({'metadataPrefix': self.metadataPrefix, 'identifier': oaiid}))
            resp = self._fetchStream(s)
            data = resp.read()
            doc = StringDocument(data, mimeType='text/xml')
            rec = BSParser.process_document(None, doc)
            dom = rec.get_dom()
            for top in dom.childNodes:
                if top.nodeType == elementType:
                    break
            for c in top.childNodes:
                if c.nodeType == elementType and c.localName == 'GetRecord':
                    for c2 in c.childNodes:
                        if c2.nodeType == elementType and c2.localName == 'record':
                            for c3 in c2.childNodes:
                                if c3.nodeType == elementType and c3.localName == 'metadata':
                                    for c4 in c3.childNodes:
                                        if c4.nodeType == elementType:
                                            yield StringDocument(c4.toxml(), mimeType='text/xml')
                                            break
                                    break
                            break
        raise StopIteration

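# Example (illustrative): the OAI-PMH requests issued above take the forms
#   http://host/oai?verb=ListIdentifiers&metadataPrefix=oai_dc
#   http://host/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:...
# with resumptionToken used to continue a partial identifier list.
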
class GoogleDocumentStream(HttpDocumentStream):
    portType = None
    total = 0
    current = 0

    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):
        # body reconstructed: the original line was lost in extraction
        HttpDocumentStream.__init__(self, session, stream, format, schema, codec, factory)

    def open_stream(self, stream):
        self.key = self.factory.get_setting(None, 'googleKey')
        loc = GoogleSearchServiceLocator()
        kw = {'readerclass': reader}
        self.portType = loc.getGoogleSearchPort(**kw)
        req = doGoogleSearchWrapper()
        req._key = self.key
        # NB: the original read self.streamLocation, which is not yet set
        # when open_stream is called; stream is the query string
        self.streamLocation = stream
        req._q = stream
        req._filter = 0
        req._start = 1
        # need one result, or totalResultsCount == 0
        req._maxResults = 10
        req._safeSearch = 0
        req._oe = "latin1"
        req._ie = "latin1"
        req._lr = "lang_en"
        req._restrict = ""
        return req

    def find_documents(self, session, cache=0):
        if cache != 0:
            raise NotImplementedError
        # ask for 10 results, step through them, then ask for the next 10
        cont = 1
        current = 0
        while cont:
            self.stream._start = current
            current += 10
            self.response = self.portType.doGoogleSearch(self.stream)

            for i in self.response._return._resultElements:
                try:
                    s = self._fetchStream(i._URL)
                    data = s.read()
                except socket.timeout:
                    data = ""
                yield StringDocument(data)
            if not self.response._return._resultElements:
                cont = 0
        raise StopIteration

try:
    # OpenSearch feed of open OS feeds: http://a9.com/-/opensearch/public/osrss
    # http://a9.com/-/opensearch/public/osd
    from opensearch import Client

    class OpensearchDocumentStream(HttpDocumentStream):
        # Need to know the OSD location and query params.
        # stream should be (osd location, query),
        # or just query if the OSD is set in the factory config.

        def toXml(self, i):
            xml = ['<sdc:dc xmlns:sdc="info:srw/schema/1/dc-schema" xmlns:dc="http://purl.org/dc/elements/1.1/">']
            # title, description, date, link
            keys = i.keys()
            if 'title' in keys:
                xml.append('<dc:title>%s</dc:title>' % i.title)
            if 'link' in keys:
                xml.append('<dc:source>%s</dc:source>' % i.link)
            if 'author' in keys:
                xml.append('<dc:creator>%s</dc:creator>' % i.author)
            if 'updated_parsed' in keys:
                xml.append('<dc:date>%d-%02d-%02d %02d:%02d:%02d</dc:date>' % i.updated_parsed[:6])
            if 'summary' in keys:
                xml.append('<dc:description><![CDATA[%s]]></dc:description>' % i.summary)
            xml.append("</sdc:dc>")
            return '\n'.join(xml)

        def open_stream(self, stream):
            # NB: the original read self.streamLocation (not yet set), omitted
            # self on two references and used an undefined session
            if type(stream) == tuple:
                c = Client(stream[0])
                self.query = stream[1]
            else:
                osd = self.factory.get_setting(None, 'OsdUrl', '')
                if osd:
                    c = Client(osd)
                else:
                    raise ConfigFileException("No OSD URL for OpensearchDocumentStream")
                self.query = stream
            return c

        def find_documents(self, session, cache=0):
            results = self.stream.search(self.query)
            docs = []
            for r in results:
                doc = self.toXml(r)
                if cache == 0:
                    yield StringDocument(doc)
                elif cache == 2:
                    docs.append(StringDocument(doc))
                else:
                    raise NotImplementedError
            self.documents = docs

except ImportError:
    class OpensearchDocumentStream:
        pass


try:
    import feedparser

    class SyndicationDocumentStream(HttpDocumentStream):
        # use the universal feed parser to import RSS, Atom, etc.

        def toXml(self, i):
            xml = ['<sdc:dc xmlns:sdc="info:srw/schema/1/dc-schema" xmlns:dc="http://purl.org/dc/elements/1.1/">']
            # title, description, date, link
            keys = i.keys()
            if 'id' in keys:
                xml.append('<dc:identifier>%s</dc:identifier>' % i.id)
            if 'title' in keys:
                xml.append('<dc:title>%s</dc:title>' % i.title)
            if 'link' in keys:
                xml.append('<dc:source>%s</dc:source>' % i.link)
            if 'author' in keys:
                xml.append('<dc:creator>%s</dc:creator>' % i.author)
            if 'updated_parsed' in keys:
                xml.append('<dc:date>%d-%02d-%02d %02d:%02d:%02d</dc:date>' % i.updated_parsed[:6])
            if 'summary' in keys:
                xml.append('<dc:description><![CDATA[%s]]></dc:description>' % i.summary)
            xml.append("</sdc:dc>")
            return '\n'.join(xml)

        def open_stream(self, stream):
            # stream may be a URL, filename or buffer. Nice.
            return feedparser.parse(stream)

        def find_documents(self, session, cache=0):
            docs = []
            linked = self.factory.get_setting(session, 'linkedItem', 0)
            for e in self.stream.entries:
                if linked == 0:
                    doc = self.toXml(e)
                else:
                    s = self._fetchStream(e.link)
                    doc = s.read()
                if cache == 0:
                    yield StringDocument(doc)
                elif cache == 2:
                    docs.append(StringDocument(doc))
                else:
                    raise NotImplementedError
            self.documents = docs

except ImportError:
    class SyndicationDocumentStream(RemoteDocumentStream):
        pass

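# Example (illustrative, not part of the original module): reading a feed via
# SyndicationDocumentStream; myFactory is an assumed configured factory,
# needed for the linkedItem setting.
#
#   ds = SyndicationDocumentStream(session, 'http://example.org/feed.rss',
#                                  'rss', factory=myFactory)
#   for doc in ds.find_documents(session, cache=0):
#       ...   # one Dublin Core XML document per feed entry
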
class BaseDocumentFactory(DocumentFactory):
    cache = 0
    format = ""
    schema = ""
    codec = ""
    dataPath = ""
    previousIdx = -1
    streamHash = {}
    docStream = None
    generator = None

    def __init__(self, session, config, parent):
        DocumentFactory.__init__(self, session, config, parent)
        self.docStream = None
        self.generator = None
        self.streamHash = {"xml": XmlDocumentStream,
                           "marc": MarcDocumentStream,
                           "dir": DirectoryDocumentStream,
                           "tar": TarDocumentStream,
                           "zip": ZipDocumentStream,
                           "srb": SrbDocumentStream,
                           "cluster": ClusterDocumentStream,
                           "locate": LocateDocumentStream,
                           "component": ComponentDocumentStream,
                           "oai": OaiDocumentStream,
                           "sru": SruDocumentStream,
                           "opensearch": OpensearchDocumentStream,
                           "z3950": Z3950DocumentStream,
                           "ftp": FtpDocumentStream,
                           "http": HttpDocumentStream,
                           "termHash": TermHashDocumentStream,
                           "rss": SyndicationDocumentStream,
                           "google": GoogleDocumentStream
                           }
        self.cache = int(self.get_default(session, 'cache', 0))
        self.format = self.get_default(session, 'format', '')
        self.schema = self.get_default(session, 'schema', '')
        self.codec = self.get_default(session, 'codec', "")
        self.dataPath = self.get_default(session, 'data', '')
        self.previousIdx = -1

    def register_stream(self, format, cls):
        self.streamHash[format] = cls

    def load(self, session, data=None, cache=None, format=None, schema=None, codec=None):
        if data is None:
            data = self.dataPath
        if format is None:
            format = self.format
        if cache is None:
            cache = self.cache
        if schema is None:
            schema = self.schema
        if codec is None:
            codec = self.codec

        # some laziness checking: sniff the format from the data
        if not format:
            if os.path.exists(data):
                if data[-4:] == '.zip':
                    format = 'zip'
                elif data[-4:] == '.tar':
                    format = 'tar'
                elif data[-4:] == '.xml':
                    format = 'xml'
                elif data[-5:] == '.marc':
                    format = 'marc'
                elif os.path.isdir(data):
                    format = 'dir'
            else:
                if data[:6] == "ftp://":
                    format = 'ftp'
                elif data[:6] == "srb://":
                    format = 'srb'
                elif data[:7] == "http://" or data[:8] == "https://":
                    format = "http"
                    if data.find('?') > -1:
                        # parse the URL and extract parameter names
                        bits = urlparse.urlsplit(data)
                        plist = map(lambda x: x.split('=')[0], bits[3].split('&'))
                        if 'verb' in plist and 'metadataPrefix' in plist:
                            format = 'oai'
                        elif 'operation' in plist and 'version' in plist and 'query' in plist:
                            format = 'sru'

        cls = self.streamHash[format]
        ds = cls(session, data, format, schema, codec, self)

        # store, and call the generator on first ping
        self.docStream = ds
        self.generator = ds.find_documents(session, cache=cache)
        self.previousIdx = -1
        self.cache = cache
        # return self for workflows; mostly can be ignored
        return self

    def __iter__(self):
        return self

    def next(self):
        return self.get_document()

    def get_document(self, session=None, idx=-1):
        if idx == -1:
            self.previousIdx += 1
            idx = self.previousIdx
        if self.cache == 0:
            # the generator will yield documents in turn
            return self.generator.next()
        else:
            # NB: the original's cache 1 and 2 branches referenced undefined
            # names; fetch from the stream's cached offsets or documents
            return self.docStream.fetch_document(idx)

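# Example (illustrative, not part of the original module): typical use of a
# configured BaseDocumentFactory; session, factory and parser are assumed to
# come from the surrounding Cheshire3 server setup.
#
#   factory.load(session, '/data/records.xml', cache=0, format='xml')
#   for doc in factory:
#       rec = parser.process_document(session, doc)
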
class ComponentDocumentFactory(BaseDocumentFactory):

    def _handleConfigNode(self, session, node):
        # source
        if node.localName == "source":
            xpaths = []
            for child in node.childNodes:
                if child.nodeType == elementType:
                    if child.localName == "xpath":
                        # add XPath
                        xpaths.append(getFirstData(child))
            cxps = verifyXPaths(xpaths)
            self.sources.append(cxps)

    def __init__(self, session, config, parent):
        self.sources = []
        BaseDocumentFactory.__init__(self, session, config, parent)