1
2 from baseObjects import PreParser
3 from document import StringDocument
4 import re, gzip, string, binascii, cStringIO as StringIO
5 import bz2
6 import httplib, mimetypes, tempfile, os, commands, time
7 from PyZ3950.zmarc import MARC
8 from xml.sax.saxutils import escape
9
10
11
12
13
14
16 """ Calls a named Normalizer to do the conversion """
17
18 - def __init__(self, session, config, parent):
23
28
29
30
31
32
34 """ Attempts to reduce HTML to its raw text """
35
def __init__(self, session, config, parent):
    """ Set up the regular expressions used to pull an HTML page apart.

    The arguments are handed straight to PreParser.__init__.  Afterwards
    the instance carries one compiled pattern per attribute: body, title,
    script, style and comment (all dot-matches-newline and case
    insensitive) plus tagstrip, which matches any single tag.
    """
    PreParser.__init__(self, session, config, parent)
    multiline_nocase = re.S | re.I
    # tagstrip is the only pattern compiled without flags.
    self.tagstrip = re.compile('<[^>]+>')
    for attr_name, pattern in (
        ('body', '<body(.*?)</body>'),
        ('title', '<title[^>]*>(.+?)</title>'),
        ('script', '<script(.*?)</script>'),
        ('style', '<style(.*?)</style>'),
        ('comment', '<!--(.*?)-->'),
    ):
        setattr(self, attr_name, re.compile(pattern, multiline_nocase))
44
def process_document(self, session, doc):
    """ Reduce the HTML held by doc to a title plus whitespace-normalised text.

    Script, style and comment sections are removed first.  The <title>
    element (if any) is kept verbatim for the head; the <body> element
    (or the whole input when there is none) has its tags replaced by
    spaces.  Returns a new StringDocument wrapping
    "<html><head>TITLE</head><body>TEXT</body></html>".

    session is not used here.
    """
    data = self.script.sub('', doc.get_raw())
    data = self.style.sub('', data)
    data = self.comment.sub('', data)

    title_match = self.title.search(data)
    if title_match:
        # Whole matched element, including the <title> tags themselves.
        title = title_match.group(0)
    else:
        title = ""

    body_match = self.body.search(data)
    if body_match:
        body = body_match.group(0)
    else:
        # No <body> element: fall back to everything that is left.
        body = data

    text = self.tagstrip.sub(' ', body)
    # Angle brackets that survive tag stripping are literal characters;
    # escape them so they cannot be mistaken for markup inside the
    # wrapper built below.  (The previous replace() calls had identical
    # search and replacement strings and therefore did nothing.)
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    # Treat non-breaking-space entities, with or without the closing
    # semicolon, as ordinary spaces.
    text = text.replace('&nbsp;', ' ')
    text = text.replace('&nbsp', ' ')

    # Collapse every run of whitespace to a single space.
    text = ' '.join(text.split())
    data = "<html><head>%s</head><body>%s</body></html>" % (title, text)
    return StringDocument(data)
69
70
72 """ Either strip, replace or keep data which matches a given regular expression """
73 - def __init__(self, session, config, parent):
84
def process_document(self, session, doc):
    """ Apply the configured regular expression to the raw data of doc.

    When self.keep is false every match of self.regexp is replaced by
    self.char.  When it is true only the matches are retained, joined
    by self.char; if the expression yields tuples (several groups) the
    first group of each match is used.  Returns a new StringDocument.

    session is not used here.
    """
    raw = doc.get_raw()
    if not self.keep:
        return StringDocument(self.regexp.sub(self.char, raw))

    matches = self.regexp.findall(raw)
    # findall() hands back tuples when the pattern has several groups;
    # in that case only the first group of each match is of interest.
    if matches and matches[0] and isinstance(matches[0], tuple):
        matches = [groups[0] for groups in matches]
    return StringDocument(self.char.join(matches))
98
99 try:
100 import tidy
101
class HtmlTidyPreParser(PreParser):
    """ Uses TidyLib to turn HTML into XHTML for parsing """

    def process_document(self, session, doc):
        """ Run the raw data of doc through tidy and wrap the result.

        Tidy is asked for XHTML output with no XML declaration, no
        tidy mark and no indentation.  Returns a new StringDocument
        holding the string form of the tidied document.
        """
        tidy_options = {
            'output_xhtml': 1,
            'add_xml_decl': 0,
            'tidy_mark': 0,
            'indent': 0,
        }
        tidied = tidy.parseString(doc.get_raw(), **tidy_options)
        return StringDocument(str(tidied))
210 """ Convert SGML into XML """
211 entities = {}
212 emptyTags = []
213 doctype_re = None
214 attr_re = None
215 elem_re = None
216 amp_re = None
217 inMimeType = "text/sgml"
218 outMimeType = "text/xml"
219
def __init__(self, session, server, config):
    """ Compile the patterns used to rewrite SGML and read the empty-tag list.

    The three arguments are passed on, in the same order, to
    PreParser.__init__.  The 'emptyElements' setting, when present, is
    split on whitespace and stored in self.emptyTags.
    """
    PreParser.__init__(self, session, server, config)
    # Raw strings keep the backslashes in these patterns literal instead
    # of relying on Python passing unknown escapes such as \? and \s
    # through unchanged.
    self.doctype_re = re.compile(r'<!DOCTYPE (.+?)"(.+?)">')
    # name = value pairs whose value is not quoted
    self.attr_re = re.compile(r' ([a-zA-Z0-9_]+)[ ]*=[ ]*([-:_.a-zA-Z0-9]+)([ >])')
    # processing instructions
    self.pi_re = re.compile(r"<\?(.*?)\?>")
    # start of an opening or closing tag, with the element name
    self.elem_re = re.compile(r'(<[/]?)([a-zA-Z0-9_]+)')
    # ampersand followed by whitespace
    self.amp_re = re.compile(r'&(\s)')
    taglist = self.get_setting(None, 'emptyElements')
    if taglist:
        self.emptyTags = taglist.split()
231
233 return '&%s' % match.group(1)
235
236 return "%s%s" % (match.group(1), match.group(2).lower())
238
239 return ' %s="%s"%s' % (match.group(1).lower(), match.group(2), match.group(3))
241 return "<%s/>" % (match.group(1))
242
def process_document(self, session, doc):
    """ Rewrite the SGML text of doc so that it is closer to well-formed XML.

    Steps, in order: flatten line breaks and numeric references to
    control whitespace, drop the DOCTYPE declaration, expand the
    entities listed in self.entities, escape ampersands that are
    followed by whitespace or by '<', quote bare attribute values,
    lower-case element names, self-close every element named in
    self.emptyTags and remove processing instructions.

    Returns a new StringDocument built from the rewritten text, this
    object's id, and the processHistory and parent of doc, with the
    mime type 'text/xml'.

    session is not used here.
    """
    txt = doc.get_raw()

    txt = txt.replace('\n', ' ')
    txt = txt.replace('\r', ' ')
    # &#9; through &#13; are character references to tab, line feed,
    # vertical tab, form feed and carriage return.
    for code in range(9, 14):
        txt = txt.replace('&#%d;' % code, ' ')

    txt = self.doctype_re.sub('', txt)
    for entity_name, replacement in self.entities.items():
        txt = txt.replace("&%s;" % entity_name, replacement)

    txt = self.amp_re.sub(self._loneAmpersand, txt)
    # An ampersand directly in front of a tag cannot start an entity
    # reference, so escape it.  (The replacement used to be identical
    # to the search string, which changed nothing.)
    txt = txt.replace('&<', '&amp;<')
    txt = self.attr_re.sub(self._attributeFix, txt)
    txt = self.elem_re.sub(self._lowerElement, txt)
    for tag in self.emptyTags:
        # The attribute part must end on something other than
        # whitespace or '/', so a tag already written as '<br />' or
        # '<img src="x"/>' is not turned into '<br //>'.
        empty_re = re.compile(r'<(%s( [^>]*?[^>\s/])?)[\s/]*>' % tag)
        txt = empty_re.sub(self._emptyElement, txt)

    txt = self.pi_re.sub('', txt)

    return StringDocument(txt, self.id, doc.processHistory, 'text/xml', doc.parent)