import socket, time
socket.setdefaulttimeout(30)

from baseObjects import DocumentFactory
from document import StringDocument
from record import SaxRecord
from bootstrap import BSParser
from utils import elementType, getFirstData, flattenTexts, reader, verifyXPaths
import re, os, c3errors, tarfile, cStringIO, sys, gzip
import mimetypes, httplib, urllib, urlparse, urllib2
import commands, codecs, types
# additional imports needed by code further down in this module
import zipfile
import ZSI
from xml.sax.saxutils import escape
from ZSI.client import Binding
from PyZ3950 import zoom
import SRW
from c3errors import *
from ftplib import FTP
from GoogleSearch_services import *

# Register the MARC mimetype so that .marc files are routed to MarcDocumentStream.
mimetypes.add_type('application/marc', '.marc')


    start = None
    endtag = ""

    def __init__(self, session, stream, format, schema="", codec="", factory=None):

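    # Splits a single XML stream into one document per top-level record element.
    # The stream is read in 1KB chunks; self.start (a compiled regex for the
    # opening tag) locates each record and the matching end tag delimits it.
    # The cache argument follows the convention used throughout this module:
    # 0 = yield documents lazily, 1 = record (offset, length) locations,
    # 2 = accumulate StringDocuments on the stream object.
    # NOTE: the signature below is an assumption, reconstructed from the call
    # obj.find_documents(session, cache=cache) in _processFiles.
    def find_documents(self, session, cache=0):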
        docs = []
        locs = []
        endtag = self.endtag
        let = len(endtag)
        myTell = 0
        xpi = ""
        line = ""
        while True:
            ol = len(line)
            line += self.stream.read(1024)
            pi = line.find("<?xml ")
            if (pi > -1):
                endpi = line.find("?>")
                xpi = line[pi:endpi+2] + "\n"
                # the captured declaration is discarded again immediately, so
                # documents are yielded without an <?xml?> prologue
                xpi = ""
            m = self.start.search(line)
            if m:
                if not self.endtag:
                    endtag = "</%s>" % m.group()[1:-1]
                    let = len(endtag)
                s = m.start()
                line = line[s:]
                myTell += s
                start = myTell
                end = -1
                strStart = 0
                while end == -1:
                    if strStart:
                        end = line.find(endtag, strStart-let)
                    else:
                        end = line.find(endtag)
                    if end > 0:
                        tlen = end+len(endtag)
                        txt = line[:tlen]
                        line = line[tlen:]
                        myTell += tlen
                        if cache == 0:
                            yield StringDocument(xpi + txt, mimeType="text/xml", schema=self.schema)
                        elif cache == 1:
                            locs.append((start, tlen))
                        elif cache == 2:
                            docs.append(StringDocument(xpi + txt, mimeType="text/xml", schema=self.schema))
                    else:
                        strStart = len(line)
                        line += self.stream.read(1024)
            if len(line) == ol and not m:
                if cache == 0:
                    self.stream.close()
                    raise StopIteration
                else:
                    break
        self.stream.close()
        self.locations = locs
        self.documents = docs
        self.length = max(len(locs), len(docs))


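    # Splits a raw MARC stream on the record terminator byte (0x1D), reading
    # 1536-byte chunks.  This belongs to the MARC splitting stream referenced
    # from _processFile below as MarcDocumentStream.
    # Signature assumed, as above.
    def find_documents(self, session, cache=0):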
        docs = []
        locs = []
        data = self.stream.read(1536)
        myTell = 0
        while data:
            rt = data.find("\x1D")
            while (rt > -1):
                txt = data[:rt+1]
                tlen = len(txt)
                if cache == 0:
                    yield StringDocument(txt, mimeType="application/marc")
                elif cache == 1:
                    locs.append((myTell, tlen))
                elif cache == 2:
                    docs.append(StringDocument(txt, mimeType="application/marc"))
                data = data[rt+1:]
                myTell += tlen
                rt = data.find("\x1D")
            dlen = len(data)
            data += self.stream.read(1536)
            if (len(data) == dlen):
                data = ""
        self.stream.close()
        self.locations = locs
        self.documents = docs
        self.length = max(len(locs), len(docs))


    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):

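    # Decides how to handle a single item: the filename's guessed mimetype
    # routes XML, tar, zip and MARC content to a nested document stream, and
    # anything else is wrapped directly in a StringDocument.
    # Signature assumed from the call self._processFile(session, item) below.
    def _processFile(self, session, item):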
        name = self._fetchName(item)
        if self.filterRe:
            m = self.filterRe.search(name)
            if not m:
                return None
        mimetype = mimetypes.guess_type(name, 0)

        if (mimetype[0] in ['text/sgml', 'text/xml']):
            trip = ('stream', XmlDocumentStream, 'xml')
        elif (mimetype[0] == 'application/x-tar'):
            # 'ftype' was not defined in this scope; derive the tar variant
            # from the guessed encoding instead (assumed fix).
            if mimetype[1] == 'gzip':
                tarformat = 'tar.gz'
            elif mimetype[1] == 'bzip2':
                tarformat = 'tar.bz2'
            else:
                tarformat = 'tar'
            trip = ('stream', TarDocumentStream, tarformat)
        elif (mimetype[0] == 'application/zip'):
            trip = ('stream', ZipDocumentStream, 'zip')
        elif (mimetype[0] == 'application/marc'):
            trip = ('stream', MarcDocumentStream, 'marc')
        else:
            trip = ('document', None, mimetype)

        s = self._fetchStream(item)
        if trip[0] == 'stream':
            cls = trip[1]
            nstream = cls(session, s, format=trip[2], schema=self.schema, codec=self.codec, factory=self.factory)
            return ('stream', nstream)
        elif trip[0] == 'document':
            data = s.read()
            s.close()
            doc = StringDocument(data, mimeType=trip[2], filename=name)
            return ('document', doc)

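    # Iterates over a list of items, delegating each to _processFile.  Nested
    # streams are expanded in place; cache=0 yields documents as they are
    # found, cache=1 collects locations, cache=2 collects StringDocuments.
    # Parameter names taken from the body and from the calls
    # self._processFiles(session, files, cache); the signature itself is assumed.
    def _processFiles(self, session, items, cache=0):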
        docs = []
        locs = []
        for item in items:
            stuff = self._processFile(session, item)
            if not stuff:
                # filtered out by filterRe
                continue
            (dtype, obj) = stuff
            if dtype == 'stream':
                gen = obj.find_documents(session, cache=cache)
                if cache == 0:
                    for g in gen:
                        yield g
                elif cache == 1:
                    # Drive the nested generator to completion; when caching it
                    # never yields, it just populates the nested stream.
                    try:
                        gen.next()
                    except:
                        pass
                    # 'fullname', 'mimetype' and 'nstream' were undefined here;
                    # record the item's name, its guessed mimetype and the
                    # nested stream's locations instead (assumed fix).
                    name = self._fetchName(item)
                    locs.append((name, mimetypes.guess_type(name, 0), obj.locations))
                elif cache == 2:
                    try:
                        gen.next()
                    except:
                        pass
                    # was: docs.extend(nstream.docs) -- 'nstream' undefined (assumed fix)
                    docs.extend(obj.documents)
            elif dtype == 'document':
                if cache == 0:
                    yield obj
                elif cache == 1:
                    raise NotImplementedError
                elif cache == 2:
                    docs.append(obj)
        self.documents = docs
        # keep the collected locations as well (assumed fix; the original only kept docs)
        self.locations = locs


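    # Walks self.streamLocation recursively with os.walk; symlinked
    # directories are walked explicitly because os.walk does not follow them
    # by default.
    # Signature assumed to match the other find_documents methods.
    def find_documents(self, session, cache=0):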
        for root, dirs, files in os.walk(self.streamLocation):
            for d in dirs:
                if os.path.islink(os.path.join(root, d)):
                    for root2, dirs2, files2 in os.walk(os.path.join(root, d)):
                        files2.sort()
                        files2 = map(lambda x: os.path.join(root2, x), files2)
                        for f in self._processFiles(session, files2, cache):
                            yield f
            files.sort()
            files = map(lambda x: os.path.join(root, x), files)
            for f in self._processFiles(session, files, cache):
                yield f


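    # TarDocumentStream: pick the tarfile mode suffix from self.format and
    # open the archive from a file-like object, an existing path, or a raw
    # string held in memory.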
        if self.format in ['tar.gz', 'tgz']:
            modeSuf = "gz"
        elif self.format == 'tar.bz2':
            modeSuf = "bz2"
        else:
            modeSuf = ""

        if hasattr(stream, 'read'):
            return tarfile.open(fileobj=stream, mode="r|%s" % modeSuf)
        elif os.path.exists(stream):
            return tarfile.open(stream, mode="r")
        else:
            s = cStringIO.StringIO(stream)
            return tarfile.open(fileobj=s, mode="r|%s" % modeSuf)


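    # ZipDocumentStream: zipfile accepts a file-like object or a path
    # directly; anything else is wrapped in a StringIO first.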
        if hasattr(stream, 'read') or os.path.exists(stream):
            return zipfile.ZipFile(stream, mode="r")
        else:
            s = cStringIO.StringIO(stream)
            return zipfile.ZipFile(s, mode="r")


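    # Uses the locate(1) command to build the file list, then drops any
    # "warning:" lines that locate prints before the results.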
        fl = commands.getoutput("locate %s | grep %s$" % (self.stream, self.stream))
        docs = fl.split('\n')
        while docs and docs[0][:8] == "warning:":
            docs.pop(0)
        # _processFiles is a generator, so it must be iterated for anything to
        # happen (assumed fix; the original discarded its return value).
        for d in self._processFiles("", docs, cache):
            yield d


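    # Builds one <cluster> XML document per key from a file of null-delimited
    # "key\x00field\x00value..." lines.  The file is sorted externally with
    # sort(1) so that all lines sharing a key are adjacent, then merged.
    # Signature assumed to match the other find_documents methods.
    def find_documents(self, session, cache=0):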
        if cache == 1:
            # generated cluster documents have no byte offsets to record
            raise NotImplementedError

        data = self.streamLocation
        sortx = self.factory.get_path(session, 'sortPath', None)
        if sortx == None:
            sortx = commands.getoutput('which sort')
        sorted = data + "_SORT"
        os.spawnl(os.P_WAIT, sortx, sortx, data, '-o', sorted)

        doc = ["<cluster>"]
        f = file(sorted)
        l = f.readline()

        currKey = ""
        while(l):
            docdata = {}
            ldata = l.split('\x00')
            key = ldata[0]
            if (not key):
                # blank key: nothing useful on this line
                l = f.readline()
                l = l[:-1]
                continue

            doc.append("<key>%s</key>\n" % (key))
            ldata = ldata[1:-1]
            for bit in range(len(ldata)/2):
                d = docdata.get(ldata[bit*2], [])
                d.append(ldata[bit*2+1])
                docdata[ldata[bit*2]] = d
            l = f.readline()
            l = l[:-1]
            ldata2 = l.split('\x00')
            key2 = ldata2[0]
            while key == key2:
                ldata2 = ldata2[1:-1]
                for bit in range(len(ldata2)/2):
                    d = docdata.get(ldata2[bit*2], [])
                    d.append(ldata2[bit*2+1])
                    docdata[ldata2[bit*2]] = d
                l = f.readline()
                l = l[:-1]
                ldata2 = l.split('\x00')
                key2 = ldata2[0]
            for k in docdata.keys():
                doc.append("<%s>" % (k))
                for i in docdata[k]:
                    doc.append("%s" % i)
                doc.append("</%s>" % (k))
            doc.append("</cluster>")
            sdoc = StringDocument(" ".join(doc))
            if cache == 0:
                yield sdoc
            else:
                self.documents.append(sdoc)

            doc = ["<cluster>"]
            l = f.readline()
            l = l[:-1]
        f.close()


    sources = []

    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):

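    # Extracts components from the SaxRecord held in self.stream: each
    # configured XPath in self.sources is evaluated and every match is wrapped
    # in a <c3:component> (or <data>) element recording the parent record and
    # the SAX event it came from.
    # Signature assumed to match the other find_documents methods.
    def find_documents(self, session, cache=0):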
        if cache == 1:
            # components are generated, so there are no byte offsets to record
            raise NotImplementedError
        rec = self.stream
        for src in self.sources:
            raw = rec.process_xpath(src[0])
            if (len(src) == 1):
                # simple xpath: wrap each result directly
                for r in raw:
                    if (type(r) == types.ListType):
                        tempRec = SaxRecord(r)
                        docstr = tempRec.get_xml()
                        saxid = r[-1][r[-1].rfind(' ')+1:]
                        if r[0][0] == "4":
                            docstr = "<c3:component xmlns:c3=\"http://www.cheshire3.org/\" parent=\"%r\" event=\"%s\">%s</c3:component>" % (rec, saxid, docstr)
                        else:
                            docstr = "<c3component parent=\"%r\" event=\"%s\">%s</c3component>" % (rec, saxid, docstr)
                    elif (type(r) == types.StringType):
                        docstr = "<data>%s</data>" % (escape(r))
                    else:
                        # DOM node result
                        docstr = r.toxml()
                    doc = StringDocument(docstr)
                    if cache == 0:
                        yield doc
                    else:
                        self.documents.append(doc)

            elif (len(src) > 1):
                # xpath plus an end-point xpath: walk the SAX event list by hand
                endTag = src[1][-1][0][1]
                for r in raw:
                    start = int(r[-1][r[-1].rfind(' ')+1:])
                    comp = [rec.sax[start]]
                    startTag = rec._convert_elem(comp[0])[0]
                    usingNs = comp[0][0]
                    n = 0
                    okay = 1
                    saxlen = len(rec.sax) - 1
                    openTags = []
                    while okay and start + n < saxlen:
                        n += 1
                        line = rec.sax[start+n]
                        if (line[0] in ['1', '4']):
                            # start-element event
                            if (rec._checkSaxXPathLine(src[1][-1], start + n)):
                                # reached the configured end point
                                okay = 0
                            else:
                                if line[0] == '4':
                                    end = line.rfind("}")
                                    stuff = eval(line[2:end+1])
                                    ns, tag = stuff[0], stuff[1]
                                    openTags.append((ns, tag))
                                else:
                                    openTags.append(rec._convert_elem(line)[0])
                                comp.append(line)
                        elif (line[0] in ['2', '5']):
                            # end-element event
                            if (line[0] == '2'):
                                end = line.rfind(' ')
                                tag = line[2:end]
                            else:
                                tag = eval(line[2:line.rfind(',')])[0:2]
                            if ((n == 1 and tag[1] == startTag) or (openTags and openTags[-1] == tag)):
                                comp.append(line)
                                if openTags:
                                    openTags.pop(-1)
                        elif (line[0] == '3'):
                            comp.append(line)
                    if (openTags):
                        openTags.reverse()
                        for o in openTags:
                            if usingNs == '1':
                                comp.append("2 %s" % o)
                            else:
                                comp.append("5 u'%s', u'%s', u'', None" % o)
                    tempRec = SaxRecord(comp)
                    docstr = tempRec.get_xml()
                    docstr = "<c3:component xmlns:c3=\"http://www.cheshire3.org/\" parent=\"%r\" event=\"%s\">%s</c3:component>" % (rec, start, docstr)
                    doc = StringDocument(docstr)
                    if cache == 0:
                        yield doc
                    else:
                        self.documents.append(doc)


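    # Splits a URL into its component parts.  For example,
    # "ftp://user:pass@host:21/dir/file.xml?x=1#frag" yields
    # ('ftp', 'user', 'pass', 'host', 21, '/dir', 'file.xml', {'x': '1'}, 'frag').
    # Signature assumed from the calls self.parse_url(...) later in the module.
    def parse_url(self, url):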
        bits = urlparse.urlsplit(url)
        transport = bits[0]
        uphp = bits[1].split('@')
        user = ''
        passwd = ''
        if len(uphp) == 2:
            (user, passwd) = uphp[0].split(':')
            uphp.pop(0)
        hp = uphp[0].split(':')
        host = hp[0]
        if len(hp) == 2:
            port = int(hp[1])
        else:
            # default port filled in by the caller
            port = 0

        (dirname, filename) = os.path.split(bits[2])
        params = map(lambda x: x.split('='), bits[3].split('&'))
        params = dict(params)
        anchor = bits[4]
        return (transport, user, passwd, host, port, dirname, filename, params, anchor)


        pass


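    # FTP-based stream: connect to the host from the URL, log in (anonymously
    # if no credentials were given) and remember the starting directory and
    # filename.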
        (transport, user, passwd, host, port, dirname, filename, params, anchor) = self.parse_url(self.streamLocation)
        # ftplib's FTP() takes (host, user, passwd); connect explicitly so a
        # non-default port is honoured (assumed fix for FTP(host, port)).
        self.stream = FTP()
        self.stream.connect(host, port)
        if user:
            self.stream.login(user, passwd)
        else:
            self.stream.login()
        self.dirname = dirname
        self.file = filename

        currItem = []
        self.stream.retrbinary(item, lambda x: currItem.append(x))
        # was ''.join(self.currItem); the list is local (assumed fix)
        return cStringIO.StringIO(''.join(currItem))

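    # Recursively walk the remote directory tree: parse each line of a
    # Unix-style LIST response, descend into directories, skip symlinks, and
    # hand complete file lists to _processFiles.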
    def _descend(self, session, dirname, cache=0):
        self.stream.cwd(dirname)
        lines = []
        self.stream.retrlines('LIST', lambda x: lines.append(x))
        filelist = []
        for l in lines:
            # filename is everything after the eighth whitespace-separated field
            name = ' '.join(l.split()[8:])
            if l[0] == 'l':
                # symbolic link: ignore
                pass
            elif l[0] == 'd':
                # directory: recurse.  The generator must be iterated for the
                # recursion to do anything (assumed fix).
                for f in self._descend(session, name, cache):
                    yield f
            elif l[0] == '-':
                filelist.append(name)
            else:
                # devices, sockets etc.: ignore
                pass
        # yield the documents themselves rather than the generator (assumed fix)
        for f in self._processFiles(session, filelist, cache):
            yield f
        self.stream.cwd('..')


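    # Z39.50 source: rewrite the z3950:// URL so that urlparse will split it,
    # open a PyZ3950 connection and run the CQL query given in the URL's
    # query parameters; the resulting result set acts as the stream.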
        server = stream.replace('z3950', 'https')
        (transport, user, passwd, host, port, dirname, filename, args, anchor) = self.parse_url(server)

        conn = zoom.Connection(host, port)
        conn.databaseName = dirname
        q = args['query']
        qo = zoom.Query('CQL', q)

        if args.has_key('preferredRecordSyntax'):
            conn.preferredRecordSyntax = args['preferredRecordSyntax']
        else:
            conn.preferredRecordSyntax = 'USMARC'
        if args.has_key('elementSetName'):
            conn.elementSetName = args['elementSetName']
        else:
            conn.elementSetName = 'F'
        rs = conn.search(qo)
        self.total = len(rs)
        return rs

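    # Iterate the Z39.50 result set, wrapping each record as a StringDocument;
    # MARC records keep the application/marc mimetype.
    # Signature assumed to match the other find_documents methods.
    def find_documents(self, session, cache=0):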
        docs = []
        for item in self.stream:
            if self.resultSet.preferredRecordSyntax == 'USMARC':
                mt = "application/marc"
            else:
                mt = mimetypes.guess_type(self.resultSet.preferredRecordSyntax)
            doc = StringDocument(item.data, mimeType=mt)
            if cache == 0:
                yield doc
            elif cache == 2:
                docs.append(doc)
            else:
                raise NotImplementedError
        self.docs = docs
        raise StopIteration


        pass

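# Helper that wraps a urllib2 response: it records the charset and mimetype
# from the HTTP headers, decodes the body on read(), and delegates any other
# attribute access to the underlying response object.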
        self.real = real
        self.charset = real.headers.getparam('charset')
        self.mimetype = real.headers.type

        return getattr(self.real, item)

        data = self.real.read()
        if self.charset:
            try:
                data = unicode(data, self.charset)
            except:
                pass
        return data

    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):

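    # As in the file-based _processFile above, but for URLs: if the mimetype
    # cannot be guessed from the name, the item is fetched and the mimetype is
    # taken from the HTTP response headers instead.
    # Signature assumed from the equivalent method above.
    def _processFile(self, session, item):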
        if self.filterRe:
            m = self.filterRe.search(item)
            if not m:
                return None

        mimetype = mimetypes.guess_type(item, 0)
        if mimetype[0] == None:
            # no clue from the name; fetch it and trust the response headers
            s = self._fetchStream(item)
            mimetype = (s.mimetype, None)
        else:
            s = None

        if (mimetype[0] in ['text/sgml', 'text/xml']):
            trip = ('stream', XmlDocumentStream, 'xml')
        elif (mimetype[0] == 'application/x-tar'):
            # 'ftype' was not defined in this scope; derive the tar variant
            # from the guessed encoding, as above (assumed fix).
            if mimetype[1] == 'gzip':
                tarformat = 'tar.gz'
            elif mimetype[1] == 'bzip2':
                tarformat = 'tar.bz2'
            else:
                tarformat = 'tar'
            trip = ('stream', TarDocumentStream, tarformat)
        elif (mimetype[0] == 'application/zip'):
            trip = ('stream', ZipDocumentStream, 'zip')
        elif (mimetype[0] == 'application/marc'):
            trip = ('stream', MarcDocumentStream, 'marc')
        else:
            trip = ('document', None, mimetype)

        if not s:
            s = self._fetchStream(item)
        if trip[0] == 'stream':
            cls = trip[1]
            nstream = cls(session, s, format=trip[2], schema=self.schema, codec=self.codec, factory=self.factory)
            return ('stream', nstream)
        elif trip[0] == 'document':
            data = s.read()
            s.close()
            doc = StringDocument(data, mimeType=trip[2], filename=item)
            return ('document', doc)


        # build a local-looking filename for a fetched item from its mimetype
        mt = item.mimetype
        ext = mimetypes.guess_extension(mt)
        return "remote%s" % ext


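    # SRU source: split the request URL into server address and searchRetrieve
    # parameters, fill in protocol defaults (version, maximumRecords,
    # recordPacking) and keep the argument dict for paging in find_documents.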
        (transport, user, passwd, host, port, dirname, filename, args, anchor) = self.parse_url(stream)
        if not port:
            port = 80
        if user:
            self.server = "%s://%s:%s@%s:%s/%s/%s?" % (transport, user, passwd, host, port, dirname, filename)
        else:
            self.server = "%s://%s:%s%s/%s?" % (transport, host, port, dirname, filename)

        if (not args.has_key('query')):
            # a query is required for searchRetrieve
            raise ValueError
        if (not args.has_key('version')):
            args['version'] = '1.1'
        if (not args.has_key('maximumRecords')):
            args['maximumRecords'] = 25
        if (not args.has_key('recordPacking')):
            args['recordPacking'] = 'string'
        args['operation'] = 'searchRetrieve'

        self.args = args
        self.xmlver = re.compile("[ ]*<\?xml[^>]+>")
        return None


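    # Page through the SRU results: issue HTTP GETs, strip the XML declaration
    # from each response, wrap it in a SOAP envelope so the SRW response
    # parser can be reused, and yield one document per returned record.
    # Signature assumed to match the other find_documents methods.
    def find_documents(self, session, cache=0):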
        start = 1
        docs = []
        while True:
            self.args['startRecord'] = start
            params = urllib.urlencode(self.args)
            req = urllib2.Request(url="%s%s" % (self.server, params))
            f = urllib2.urlopen(req)
            data = f.read()
            f.close()

            data = self.xmlver.sub("", data)
            soapy = '<SOAP:Envelope xmlns:SOAP="http://schemas.xmlsoap.org/soap/envelope/"><SOAP:Body>%s</SOAP:Body></SOAP:Envelope>' % data
            ps = ZSI.ParsedSoap(soapy, readerclass=reader)
            resp = ps.Parse(SRW.types.SearchRetrieveResponse)

            self.total = resp.numberOfRecords
            for d in resp.records:
                doc = StringDocument(d.recordData, mimeType='text/xml')
                if cache == 0:
                    yield doc
                elif cache == 2:
                    docs.append(doc)
                else:
                    raise NotImplementedError
            start += len(resp.records)
            if start > self.total:
                if cache == 0:
                    raise StopIteration
                else:
                    break
        self.docs = docs


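    # SRW source: build a ZSI SOAP binding for the endpoint and a typed
    # searchRetrieveRequest from the URL arguments.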
        (transport, user, passwd, host, port, dirname, filename, args, anchor) = self.parse_url(stream)
        if not port:
            port = 80
        database = os.path.join(dirname, filename)

        self.binding = Binding(host=host, port=port, url=database, nsdict=self.namespaces)
        return SRW.types.SearchRetrieveRequest('searchRetrieveRequest', opts=args)

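    # Page through the SRW results over SOAP, incrementing startRecord until
    # all numberOfRecords have been fetched.
    # Signature assumed to match the other find_documents methods.
    def find_documents(self, session, cache=0):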
        docs = []
        curr = 1
        while True:
            self.stream.startRecord = curr
            resp = self.binding.RPC(self.binding.url,
                                    "searchRetrieveRequest",
                                    self.request,
                                    requestclass=SRW.types.SearchRetrieveRequest,
                                    replytype=SRW.types.SearchRetrieveResponse.typecode,
                                    readerclass=reader)
            total = resp.numberOfRecords
            curr += len(resp.records)
            for d in resp.records:
                doc = StringDocument(d.recordData, mimeType='text/xml')
                doc.recordSchema = d.recordSchema
                if cache == 0:
                    yield doc
                elif cache == 2:
                    docs.append(doc)
                else:
                    raise NotImplementedError
            if curr > total:
                if cache == 0:
                    raise StopIteration
                else:
                    break
        self.docs = docs


        return None

    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):

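    # OAI-PMH source: issue a ListIdentifiers request, collect the record
    # identifiers from the response headers and note any resumptionToken so
    # that harvesting can continue from where it left off.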
        s = "%sverb=ListIdentifiers&" % (self.server)
        s += urllib.urlencode(self.params)
        resp = self._fetchStream(s)
        data = resp.read()

        doc = StringDocument(data, self.id, mimeType='text/xml')
        rec = BSParser.process_document(None, doc)
        dom = rec.get_dom()
        for top in dom.childNodes:
            if (top.nodeType == elementType):
                break
        for c in top.childNodes:
            if (c.nodeType == elementType and c.localName == 'ListIdentifiers'):
                for c2 in c.childNodes:
                    if (c2.nodeType == elementType and c2.localName == 'header'):
                        for c3 in c2.childNodes:
                            if (c3.nodeType == elementType and c3.localName == 'identifier'):
                                self.ids.append(getFirstData(c3))
                    elif (c2.nodeType == elementType and c2.localName == 'resumptionToken'):
                        t = getFirstData(c2)
                        if (t):
                            self.token = t
                            try:
                                # was c2.getAttr(...), which is not a DOM method (assumed fix)
                                self.total = c2.getAttribute('completeListSize')
                            except:
                                pass

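    # For each harvested identifier issue a GetRecord request and yield the
    # first element inside the <metadata> wrapper as a document.
    # Signature assumed to match the other find_documents methods.
    def find_documents(self, session, cache=0):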
        for oaiid in self.idcache:
            s = "%sverb=GetRecord&%s" % (self.server, urllib.urlencode({'metadataPrefix': self.metadataPrefix, 'identifier': oaiid}))
            resp = self._fetchStream(s)
            data = resp.read()
            doc = StringDocument(data, self.id, mimeType='text/xml')
            rec = BSParser.process_document(None, doc)
            dom = rec.get_dom()
            for top in dom.childNodes:
                if top.nodeType == elementType:
                    break
            for c in top.childNodes:
                if (c.nodeType == elementType and c.localName == 'GetRecord'):
                    for c2 in c.childNodes:
                        if (c2.nodeType == elementType and c2.localName == 'record'):
                            for c3 in c2.childNodes:
                                if (c3.nodeType == elementType and c3.localName == 'metadata'):
                                    for c4 in c3.childNodes:
                                        if (c4.nodeType == elementType):
                                            data = c4.toxml()
                                            yield StringDocument(data, self.id, mimeType='text/xml')
                                            break
                            break
                    break
        raise StopIteration


    portType = None
    total = 0
    current = 0

    def __init__(self, session, stream, format, schema=None, codec=None, factory=None):

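    # Google SOAP search source: page through doGoogleSearch results ten at a
    # time and fetch each result URL, yielding one (possibly empty) document
    # per hit.  Only cache == 0 is supported.
    # Signature assumed to match the other find_documents methods.
    def find_documents(self, session, cache=0):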
        if cache != 0:
            raise NotImplementedError

        cont = 1
        current = 0
        while cont:
            self.stream._start = current
            current += 10
            self.response = self.portType.doGoogleSearch(self.stream)

            for i in self.response._return._resultElements:
                try:
                    s = self._fetchStream(i._URL)
                    # was 'd = s.read()', which left 'data' undefined (assumed fix)
                    data = s.read()
                except socket.timeout:
                    data = ""
                yield StringDocument(data)
            if not self.response._return._resultElements:
                cont = 0
                raise StopIteration

try:
    # opensearch support is optional; the class is only defined if the
    # opensearch module can be imported
    from opensearch import Client

    class OpensearchDocumentStream(HttpDocumentStream):

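        # Map a feed-style entry to a simple Dublin Core record, taking
        # whichever of title/link/author/updated/summary are present.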
        def toXml(self, i):
            xml = ['<sdc:dc xmlns:sdc="info:srw/schema/1/dc-schema" xmlns:dc="http://purl.org/dc/elements/1.1/">']
            keys = i.keys()
            if 'title' in keys:
                xml.append('<dc:title>%s</dc:title>' % i.title)
            if 'link' in keys:
                xml.append('<dc:source>%s</dc:source>' % i.link)
            if 'author' in keys:
                xml.append('<dc:creator>%s</dc:creator>' % i.author)
            if 'updated_parsed' in keys:
                xml.append('<dc:date>%d-%02d-%02d %02d:%02d:%02d</dc:date>' % i.updated_parsed[:6])
            if 'summary' in keys:
                xml.append('<dc:description><![CDATA[%s]]></dc:description>' % i.summary)
            xml.append("</sdc:dc>")
            return '\n'.join(xml)