1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 from baseObjects import PreParser
25 from document import StringDocument
26 import sys, os, re, socket, mimetypes, random
27 from c3errors import ExternalSystemException, ConfigFileException
28
29 import atexit, commands
30
31
32
33
34
35
36
38 """ Multivalent Pre Parser to turn PDF into XML """
39
# Body of a document-conversion method (its def line is not visible in this
# chunk).  It writes the document's raw data to a temporary '.pdf' file, runs
# the Multivalent ExtractText tool on that file through a shell command, and
# returns whatever the tool wrote to stdout wrapped in a StringDocument.
# NOTE(review): tempfile is not among the imports visible in this chunk --
# confirm it is imported elsewhere in the file.
# NOTE(review): the OS-level descriptor returned by mkstemp (qqq) is never
# closed in the visible code; the path is re-opened with file() instead.
41 (qqq, fn) = tempfile.mkstemp('.pdf')
# file() is the Python 2 builtin; mode 'w' is text mode.
# NOTE(review): PDF data is binary -- 'wb' may be intended; confirm.
42 fh = file(fn, 'w')
43 fh.write(doc.get_raw())
44 fh.close()
# NOTE(review): the jar location is a hard-coded absolute path.
45 cmd = "java -Djava.awt.headless=true -cp /users/azaroth/cheshire3/code/mvd/Multivalent20050929.jar tool.doc.ExtractText -output xml %s" % fn
# Only the child's stdout (o) is read; i and err are not read or closed here.
46 (i, o, err) = os.popen3(cmd)
47 data = o.read()
# The temporary file is removed once the tool's output has been read.
48 os.remove(fn)
49 return StringDocument(data)
50
51
53
# Class-level defaults (the class header is not visible in this chunk).
# Most of these are overwritten per instance by __init__.
# inMimeType is not assigned anywhere in the visible code.
54 inMimeType = ""
# Set by __init__ to 'text/xml' when returnPacking is 'xml', else 'text/plain'.
55 outMimeType = ""
# MultivalentClient instance created by __init__.
56 mvClient = None
# Host and integer port of the Multivalent server the client connects to.
57 mvHost = None
58 mvPort = None
# Lower-cased value of the 'returnPacking' setting.
59 returnPacking = ""
# Compiled regex matching text of the form "<open file 'NAME', mode 'M' at ...>".
60 source_re = None
61
# Value of the 'mvServerPath' path; when set, __init__ launches a local server.
62 mvServerPath = None
63
def __init__(self, session, server, config):
    """Configure the pre-parser and try to connect to a Multivalent server.

    When the 'mvServerPath' path is configured, a server is launched
    locally with ``java -jar <mvServerPath> <port> ...`` on 127.0.0.1,
    using a free port obtained from the operating system.  Otherwise the
    'host' and 'port' settings identify the server to connect to.  In
    both cases the 'returnPacking' setting is required; it selects
    outMimeType ('text/xml' for 'xml', 'text/plain' for anything else).

    Raises:
        ConfigFileException: if 'mvServerPath' is set but does not exist
            on disk, if 'port' is supplied but is not all digits, or if
            any of host, port or returnPacking is missing.
    """
    PreParser.__init__(self, session, server, config)
    # Matches text of the form "<open file 'NAME', mode 'M' at ...>" and
    # captures NAME.
    self.source_re = re.compile("<open file '(.+?)', mode '.' at .*?>")

    self.mvServerPath = self.get_path(session, 'mvServerPath')
    if self.mvServerPath:
        if not os.path.exists(self.mvServerPath):
            raise ConfigFileException('Path type="mvServerPath" does not exist')

        host = '127.0.0.1'
        # Bind to port 0 so the OS picks a free, unprivileged port and
        # read the chosen number back.  Guessing random ports could select
        # port 0 itself or a privileged port, and retried forever whenever
        # bind failed for any other reason.
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            s.bind((host, 0))
            port = s.getsockname()[1]
        finally:
            # Release the port again so the server process can bind it.
            s.close()
        mvStdin, mvStdout = os.popen2('java -D64 -Djava.awt.headless=true -Xms40m -Xmx256m -jar %s %d -guess -out xml -link' % (self.mvServerPath, port), 't')
    else:
        host = self.get_setting(session, 'host')
        port = self.get_setting(session, 'port')
        # Only validate a port that was actually supplied; a missing one
        # is reported by the combined check below rather than failing
        # here with an AttributeError.
        if port and not port.isdigit():
            raise ConfigFileException("'port' setting for Multivalent preParser must be an integer.")

    pack = self.get_setting(session, 'returnPacking')
    if not (host and port and pack):
        raise ConfigFileException("'host', 'port' and 'returnPacking' settings must be set for Multivalent preParser '%s'" % self.id)

    self.mvHost = host
    self.mvPort = int(port)
    self.returnPacking = pack.lower()
    if self.returnPacking == 'xml':
        self.outMimeType = 'text/xml'
    else:
        self.outMimeType = 'text/plain'

    self.mvClient = MultivalentClient()
    try:
        self.mvClient.connect(self.mvHost, self.mvPort)
    except Exception:
        # Deliberately best-effort: a failed initial connection does not
        # abort construction.  Catching Exception (not a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        pass
    # Make sure the server clean-up hook runs at interpreter exit.
    atexit.register(self.close_mvServer)
118
# Body of a mime-type lookup method (its def line is not visible in this
# chunk).  Returns doc.mimeType when it is set; otherwise guesses the type
# from a file name and falls back to 'application/octet-stream'.
120 if doc.mimeType: return doc.mimeType
121 try:
# Pull the path out of the textual form of doc.handle using source_re.
# When the regex does not match, search() returns None and .group raises
# AttributeError, which triggers the doc.url fallback below.
122 filepath = self.source_re.search(str(doc.handle)).group(1)
123 except AttributeError:
124 try:
125 filepath = doc.url
126 except AttributeError:
127
# Neither source gave a path, so filepath stays unassigned.
# NOTE(review): the lookup below then presumably fails with a NameError that
# the bare except turns into the default type -- confirm against the
# original indentation.
128 pass
129
130 try:
# Only the last '/'-separated component is passed to mimetypes.
131 filename = filepath.split('/')[-1]
132 return mimetypes.guess_type(filename)[0] or 'application/octet-stream'
# NOTE(review): bare except -- any failure above yields the default type.
133 except:
134 return 'application/octet-stream'
135
136
175
176
177
185
186
187
188
201
206
# Tail of a client method that talks to the server over self.sock (its def
# line and opening statements are not visible in this chunk).  Visible flow:
#   1. send the attrs as 'name = value' lines and expect a reply starting 'ok'
#   2. send 'DATA' and expect a reply starting 'send'
#   3. send the payload held in data
#   4. read 'name = value' lines from the server, answering each with 'OK',
#      until a message starting 'data' arrives
#   5. if the received 'status' starts with 'OK', send 'SEND' and read
#      'size' bytes of result
# Returns ('OK', text) on success, or (status minus its first two
# characters, None) when the status is not OK.
# NOTE(review): indentation is lost in this chunk, so it is unclear whether
# the 'ok' check runs once per attribute or once after the loop -- confirm.
# NOTE(review): every recv(1024) is treated as exactly one protocol message;
# TCP does not guarantee that message boundaries are preserved.
208 for attrName, attrVal in attrs.iteritems():
209 self.sock.sendall('%s = %s\n' % (attrName, attrVal))
210 recvd = self.sock.recv(1024)
211 if (recvd.lower()[:2] != 'ok'):
212 raise ExternalSystemException('KO - server would not accept attributes')
213
214
215 self.sock.sendall('DATA\n')
216 recvd = self.sock.recv(1024)
217
218 if (recvd.lower()[:4] != 'send'):
219 raise ExternalSystemException('KO - server not prepared to accept data')
220
221
222 try:
223 self.sock.sendall(data)
224 except socket.timeout:
225 raise ExternalSystemException('Timeout to Multivalent server')
226
# Attributes reported back by the server, keyed by lower-cased name.
227 inAttrs = {}
228 attr = self.sock.recv(1024)
229
230 while (attr.lower()[:4] != 'data'):
231
232 try:
# A message without ' = ' makes the [1] index fail, which lands in the
# except branch below.
233 attrName = attr.split(" = ", 1)[0].lower()
234 attrVal = attr.split(" = ", 1)[1]
235 inAttrs[attrName] = attrVal
236 self.sock.sendall('OK\n')
237 attr = self.sock.recv(1024)
# NOTE(review): bare except plus Python 2 print statements; the loop is
# abandoned and execution continues with whatever attributes were collected.
238 except:
239 print "BUSTED PROTOCOL:"
240 print attr
241 break
242
# NOTE(review): raises KeyError if no 'status' attribute was received, for
# example after the break above.
243 if inAttrs['status'][:2] != 'OK':
244 return (inAttrs['status'][2:], None)
245
246
247 self.sock.sendall('SEND\n')
# NOTE(review): likewise assumes a 'size' attribute holding an integer.
248 expLen = int(inAttrs['size'])
249 recvd = self.sock.recv(expLen)
250 txtPacks = []
251 txtPacks.append(recvd)
252 cumLen = len(recvd)
# Keep reading until the announced number of bytes has arrived.
# NOTE(review): if recv returns an empty string, cumLen stops growing and
# this loop would not terminate -- confirm whether a socket timeout guards it.
253 while cumLen < expLen:
254 try:
255 recvd = self.sock.recv(expLen)
# NOTE(review): bare except -- every error is reported as a timeout.
256 except:
257 raise ExternalSystemException('Timeout during receive from Multivalent server.')
258 txtPacks.append(recvd)
259 cumLen += len(recvd)
260
261 txt = ''.join(txtPacks)
262 return ('OK', txt)