Module multivalent
[hide private]
[frames] | [no frames]

Source Code for Module multivalent

  1  # 
  2  # multivalent.py 
  3  # Version: 0.04 
  4  # 
  5  # Description: 
  6  #    Cheshire3 <-> Multivalent support 
  7  # 
  8  # Author:     
  9  #    John Harrison (john.harrison@liv.ac.uk) 
 10  #    Dr Robert Sanderson (azaroth@liv.ac.uk) 
 11  # 
 12  # Copyright: &copy; University of Liverpool 2005 
 13  # 
 14  # Version History: 
 15  # 0.01 - 2005/06/?? - JH - Multivalent protocol client scripts OO-ed and Cheshirized 
 16  # 0.02 - 2005/07/?? - RS - Reviewed, code improvements 
 17  # 0.03 - 2005/08/05 - JH - Support for UrlDocument objects 
 18  #                        - Attempt to reconnect if connection to server lost (in MultivalentPreParser.process_document) 
 19  # 0.04 - 2005/11/03 - JH - Starts and uses a local copy of multivalent server if a path to it is provided 
 20  #                        - Also provides a method close_mvServer to close the external server cleanly 
 21  # 
 22   
 23   
 24  from baseObjects import PreParser 
 25  from document import StringDocument 
 26  import sys, os, re, socket, mimetypes, random 
 27  from c3errors import ExternalSystemException, ConfigFileException 
 28   
 29  import atexit, commands 
 30   
 31  # Az: 
 32  # socket.setdefaulttimeout needs to be globalised at a server level 
 33  # as it affects every socket 
 34  #socket.setdefaulttimeout(1 * 60) 
 35   
 36  # XXX To multivalent.py 
class MvdPdfPreParser(PreParser):
    """Multivalent PreParser that turns a PDF document into XML.

    The raw document is written to a temporary ``.pdf`` file and handed to
    Multivalent's ``tool.doc.ExtractText`` command-line tool; whatever the
    tool writes to standard output becomes the new document.
    """

    # Jar containing tool.doc.ExtractText.
    # XXX site-specific path; override on a subclass or instance as needed.
    mvJarPath = '/users/azaroth/cheshire3/code/mvd/Multivalent20050929.jar'

    def process_document(self, session, doc):
        """Run ExtractText over ``doc`` and return its XML output.

        session -- unused here; part of the PreParser call signature.
        doc     -- object whose ``get_raw()`` returns the PDF data.

        Returns a StringDocument wrapping the tool's standard output.
        Raises ExternalSystemException if the java process cannot be started.
        """
        # Imported locally: the module header does not import these.
        import subprocess
        import tempfile

        (fd, fn) = tempfile.mkstemp('.pdf')
        try:
            # Reuse the descriptor mkstemp opened (so it is not leaked) and
            # write in binary mode, since PDF data is not text.
            fh = os.fdopen(fd, 'wb')
            try:
                fh.write(doc.get_raw())
            finally:
                fh.close()

            # Argument list rather than a shell string: no quoting problems.
            cmd = ['java', '-Djava.awt.headless=true',
                   '-cp', self.mvJarPath,
                   'tool.doc.ExtractText', '-output', 'xml', fn]
            try:
                proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE)
            except OSError:
                raise ExternalSystemException(
                    'Could not run Multivalent ExtractText (is java installed?)')
            # communicate() drains stdout and stderr together, so the child
            # cannot block on a full stderr pipe; stderr is discarded.
            (data, errData) = proc.communicate()
        finally:
            # Always clean up the temporary file, even on failure.
            os.remove(fn)
        return StringDocument(data)
50 51
class MultivalentPreParser(PreParser):
    """PreParser that converts documents by talking to a Multivalent server.

    If a 'mvServerPath' path is configured, a private server is started on
    the loopback interface and stopped again at interpreter exit; otherwise
    the 'host' and 'port' settings name a remote server.  The
    'returnPacking' setting is required in both modes.
    """

    inMimeType = ""
    outMimeType = ""
    mvClient = None
    mvHost = None
    mvPort = None
    returnPacking = ""
    source_re = None
    # for when we need to start the server locally
    mvServerPath = None
    # subprocess.Popen handle of the server started by this object, if any
    mvServerProcess = None

    def __init__(self, session, server, config):
        """Read the configuration, start a local server if asked to, and
        make a first (best-effort) connection attempt.

        Raises ConfigFileException for missing or invalid settings and
        ExternalSystemException if a local server cannot be started.
        """
        PreParser.__init__(self, session, server, config)
        # Pulls the path out of the str() form of a Python 2 file object.
        self.source_re = re.compile("<open file '(.+?)', mode '.' at .*?>")

        # Az:  Check existence of settings and fail consistently rather than
        # die half way through execution
        self.mvServerPath = self.get_path(session, 'mvServerPath')
        if self.mvServerPath:
            # A local path to the server code was given: run a private
            # server on the loopback interface with an automatic port.
            if not os.path.exists(self.mvServerPath):
                raise ConfigFileException('Path type="mvServerPath" does not exist')
            host = '127.0.0.1'
            port = self._find_free_port(host)
        else:
            # get settings for remote mv server
            host = self.get_setting(session, 'host')
            port = self.get_setting(session, 'port')
            # A missing port is reported by the combined check below.
            if port and not str(port).isdigit():
                raise ConfigFileException("'port' setting for Multivalent preParser must be an integer.")

        pack = self.get_setting(session, 'returnPacking')
        if not (host and port and pack):
            raise ConfigFileException("'host', 'port' and 'returnPacking' settings must be set for Multivalent preParser '%s'" % self.id)

        self.mvHost = host
        self.mvPort = int(port)
        self.returnPacking = pack.lower()
        if self.returnPacking == 'xml':
            self.outMimeType = 'text/xml'
        else:
            self.outMimeType = 'text/plain'

        self.mvClient = MultivalentClient()
        # Register cleanup before anything that may need cleaning up exists.
        atexit.register(self.close_mvServer)

        # Only spawn the server once every setting has been validated, so a
        # bad configuration cannot leave an orphaned java process behind.
        if self.mvServerPath:
            self.mvServerProcess = self._start_local_server(self.mvPort)

        try:
            self.mvClient.connect(self.mvHost, self.mvPort)
        except socket.error:
            # Not fatal: process_document() connects again at run time.
            pass

    def _find_free_port(self, host):
        """Return a TCP port number that is currently free on ``host``."""
        # Binding to port 0 lets the OS choose an unused, unprivileged port.
        # (Another process could still grab it before the server binds.)
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            s.bind((host, 0))
            return s.getsockname()[1]
        finally:
            s.close()

    def _start_local_server(self, port):
        """Start the Multivalent server jar on ``port``; return its Popen."""
        import subprocess

        cmd = ['java', '-D64', '-Djava.awt.headless=true',
               '-Xms40m', '-Xmx256m',
               '-jar', self.mvServerPath, str(port),
               '-guess', '-out', 'xml', '-link']
        # The server's stdin/stdout are not used by this class.
        nullIn = open(os.devnull, 'r')
        nullOut = open(os.devnull, 'w')
        try:
            try:
                return subprocess.Popen(cmd, stdin=nullIn, stdout=nullOut)
            except OSError:
                raise ExternalSystemException('Could not start local Multivalent server')
        finally:
            nullIn.close()
            nullOut.close()

    def get_mimetype(self, doc):
        """Return a MIME type for ``doc``.

        Uses ``doc.mimeType`` when set; otherwise guesses from the file name
        found in ``doc.handle`` or ``doc.url``.  Falls back to
        'application/octet-stream' when nothing better is known.
        """
        known = getattr(doc, 'mimeType', None)
        if known:
            return known

        match = self.source_re.search(str(getattr(doc, 'handle', '')))
        if match:
            filepath = match.group(1)
        else:
            # Not a file-backed document; it may carry a URL instead.
            filepath = getattr(doc, 'url', None)

        try:
            filename = filepath.split('/')[-1]
        except AttributeError:
            # Neither a usable file handle nor a URL string.
            return 'application/octet-stream'
        return mimetypes.guess_type(filename)[0] or 'application/octet-stream'

    def process_document(self, session, doc):
        """Send ``doc`` to the Multivalent server; return the converted doc.

        Returns a StringDocument carrying the server output and the process
        history of ``doc``.  Raises ExternalSystemException when the server
        cannot be reached, times out, or reports a status other than 'OK'.
        """
        if not self.mvClient.connectedToServer:
            try:
                self.mvClient.connect(self.mvHost, self.mvPort)
            except Exception:
                raise ExternalSystemException('Could not connect to Multivalent server')

        history = doc.processHistory
        # Az: get_raw(session) maybe expensive, --> local var
        data = doc.get_raw()
        attrs = {'mimetype': self.get_mimetype(doc),
                 'size': str(len(data)),
                 'packaging': self.returnPacking
                 }

        try:
            status, mvStr = self.mvClient.mvProtocol(data, attrs)
        except socket.timeout:
            # reset connection for next time
            self.mvClient.disconnect()
            raise ExternalSystemException('Timeout to remote Multivalent server.')
        except (ExternalSystemException, socket.error):
            # The connection was probably lost: reconnect and retry once.
            try:
                self.mvClient.disconnect()
                self.mvClient.connect(self.mvHost, self.mvPort)
                status, mvStr = self.mvClient.mvProtocol(data, attrs)
            except Exception:
                # Any failure of the retry is reported as one error type.
                raise ExternalSystemException('Could not re-establish connection to Multivalent server')

        if status != 'OK':
            raise ExternalSystemException('Status from Multivalent Server: %s' % status)

        return StringDocument(mvStr, history=history)

    #- end process_document()

    def close_mvServer(self):
        """Disconnect the client and stop the server this object started.

        Registered with atexit by __init__.  Safe to call more than once and
        a no-op for the server part when a remote server is in use.
        """
        import signal
        import subprocess
        import time

        if self.mvClient is not None and self.mvClient.connectedToServer:
            self.mvClient.disconnect()

        proc = self.mvServerProcess
        self.mvServerProcess = None
        if proc is None:
            # Remote server, or the local one has already been stopped.
            return

        # Ask the server to close via its own shutdown procedure first.
        nullOut = open(os.devnull, 'w')
        try:
            try:
                subprocess.call(['java', '-jar', self.mvServerPath,
                                 str(self.mvPort), '-stop'], stdout=nullOut)
            except OSError:
                pass
        finally:
            nullOut.close()

        # Give it a few seconds to exit, then kill that one process only
        # (never every java process on the machine).
        deadline = time.time() + 5
        while proc.poll() is None and time.time() < deadline:
            time.sleep(0.1)
        if proc.poll() is None:
            try:
                os.kill(proc.pid, signal.SIGKILL)
            except OSError:
                pass
            proc.wait()
185 186 #- end close_mvServer() 187 188
class MultivalentClient:
    """Small socket client for the Multivalent server's text protocol.

    The exchange implemented by mvProtocol() is:
      1. send each 'name = value' attribute line, expecting 'ok' after each;
      2. send 'DATA', expect 'send', then send the raw document;
      3. receive 'name = value' attribute lines (answering 'OK' to each)
         until a line starting with 'data' arrives;
      4. send 'SEND' and read 'size' bytes of converted output.
    """

    sock = None
    connectedToServer = False

    def __init__(self):
        self.sock = None
        self.connectedToServer = False

    def connect(self, host, port):
        """Open a TCP connection to (host, port).

        Socket errors propagate to the caller; on failure no half-open
        socket is left behind and the client stays disconnected.
        """
        if self.sock is not None:
            # Do not leak a previous connection.
            self.disconnect()
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        connected = False
        try:
            sock.connect((host, port))
            connected = True
        finally:
            if not connected:
                sock.close()
        self.sock = sock
        self.connectedToServer = True

    def disconnect(self):
        """Close the connection; harmless when not connected."""
        sock = self.sock
        self.sock = None
        self.connectedToServer = False
        if sock is not None:
            sock.close()

    def mvProtocol(self, data, attrs):
        """Send ``data`` described by ``attrs``; return (status, text).

        data  -- raw document contents.
        attrs -- dict of attribute names to values announced before the data.

        Returns ('OK', text) on success, or (status, None) with the rest of
        the server's status line when the status is not OK.
        Raises ExternalSystemException when the server refuses a step, sends
        a malformed response, or the connection fails while the document is
        sent or the result is read.  Timeouts at other steps propagate as
        socket.timeout.
        """
        sock = self.sock
        if sock is None:
            raise ExternalSystemException('Not connected to Multivalent server')

        for attrName, attrVal in attrs.items():
            sock.sendall('%s = %s\n' % (attrName, attrVal))
            recvd = sock.recv(1024)
            if recvd.lower()[:2] != 'ok':
                raise ExternalSystemException('KO - server would not accept attributes')

        # send command that we are ready to send data
        sock.sendall('DATA\n')
        recvd = sock.recv(1024)
        if recvd.lower()[:4] != 'send':
            raise ExternalSystemException('KO - server not prepared to accept data')

        # ok, server is ready for data
        try:
            sock.sendall(data)
        except socket.timeout:
            raise ExternalSystemException('Timeout to Multivalent server')

        # Collect the server's response attributes.
        inAttrs = {}
        attr = sock.recv(1024)
        while attr.lower()[:4] != 'data':
            # Az: XXX Risky to expect non garbage over the wire
            parts = attr.split(" = ", 1)
            if len(parts) != 2:
                # Garbage, or '' because the server closed the connection.
                raise ExternalSystemException(
                    'Malformed response from Multivalent server: %r' % attr)
            inAttrs[parts[0].lower()] = parts[1]
            sock.sendall('OK\n')
            attr = sock.recv(1024)

        status = inAttrs.get('status')
        if status is None:
            raise ExternalSystemException('No status received from Multivalent server')
        if status[:2] != 'OK':
            return (status[2:], None)

        try:
            expLen = int(inAttrs['size'])
        except (KeyError, ValueError):
            raise ExternalSystemException('No valid size received from Multivalent server')

        # OK we're ready to receive formatted data
        sock.sendall('SEND\n')
        txtPacks = []
        remaining = expLen
        while remaining > 0:
            try:
                # Never ask for more than is still owed to this response.
                recvd = sock.recv(remaining)
            except socket.timeout:
                raise ExternalSystemException('Timeout during receive from Multivalent server.')
            except socket.error:
                raise ExternalSystemException('Socket error during receive from Multivalent server.')
            if not recvd:
                # recv() returns '' once the peer has closed; without this
                # check the loop would never terminate.
                raise ExternalSystemException('Connection closed by Multivalent server during receive.')
            txtPacks.append(recvd)
            remaining -= len(recvd)

        return ('OK', ''.join(txtPacks))
263