1
2 from baseObjects import DocumentFactory
3 from document import StringDocument
4
5 import os, re
6 from utils import getFirstData, elementType, verifyXPaths
7
class SimpleTokenizer(object):
    """Regex-based splitter of plain text into paragraphs and sentences.

    Paragraphs are separated by one or more blank lines.  Sentences end at
    ``.``, ``!``, ``?`` or ``:`` (optionally followed by a closing quote or
    bracket) when followed by whitespace or the end of the text.
    """

    # Stand-in for the full stop of an abbreviation while sentences are being
    # split.  It is an ASCII control character, so it is not sentence
    # punctuation and is safe for both byte and unicode strings.  A literal
    # \x1f already present in the input would come back as '.'.
    _ABBR_MASK = '\x1f'

    def __init__(self):
        # Raw strings: the patterns are unchanged, but '\.', '\s' and '\)' are
        # invalid escape sequences in ordinary string literals.
        self.paraRe = re.compile(r'\n\n+')
        self.sentenceRe = re.compile(r'.+?(?<!\.\.)[\.!?:]["\'\)]?(?=\s+|$)(?![a-z])')
        self.abbrMashRe = re.compile(r'(^|\s)([^\s]+?\.[a-zA-Z]+|Prof|Dr|Sr|Mr|Mrs|Ms|Jr|Capt|Gen|Col|Sgt|[A-Z])\.(\s|$)')

    def split_paragraphs(self, data):
        """Return the list of paragraphs in *data* (split on blank lines)."""
        return self.paraRe.split(data)

    def split_sentences(self, data):
        """Return the list of sentences found in *data*.

        The full stop that closes an abbreviation ("Dr.", "e.g.", a single
        capital initial, ...) is hidden from the sentence pattern so that it
        does not end a sentence, and is put back in the returned strings.
        Previously the substitution re-emitted exactly what it had matched,
        so abbreviations were never protected.

        If no sentence terminator is found at all, a '.' is appended and the
        text is tried again.  Leading whitespace of each sentence is kept.

        NOTE: text that follows the last terminated sentence and has no
        terminator of its own is not returned ("One. Two" gives ["One."]).
        """
        mask = self._ABBR_MASK

        def hide_dot(match):
            # An abbreviation that closes the text keeps its full stop: there
            # it really is the end of the sentence.
            if not data[match.end(2) + 1:].strip():
                return match.group(0)
            return match.group(1) + match.group(2) + mask + match.group(3)

        masked = self.abbrMashRe.sub(hide_dot, data)
        sents = self.sentenceRe.findall(masked)
        if not sents:
            sents = self.sentenceRe.findall(masked + '.')
        return [s.replace(mask, '.') for s in sents]


# Module-level tokenizer instance used by the tag() methods in this file.
myTokenizer = SimpleTokenizer()
26
28 inh = None
29 outh = None
30 tokenizer = None
31
def __init__(self, session, node, parent):
    """Start the external ``tagger`` program and keep pipes to it.

    The program is run from the directory configured as the ``taggerPath``
    path, or from the current working directory when that path is not set.
    ``self.inh`` is the child's stdin and ``self.outh`` its stdout.

    The child is given its working directory through ``cwd=`` instead of a
    chdir/restore pair in this process, so this process's working directory
    is never changed, even if the spawn fails.

    Raises:
        OSError: if the tagger executable cannot be started.
    """
    import subprocess

    tp = self.get_path(session, 'taggerPath')
    if tp:
        workdir = os.path.abspath(tp)
    else:
        workdir = os.getcwd()
    # Absolute executable path: whether a relative program path is resolved
    # against ``cwd`` is platform dependent.
    proc = subprocess.Popen(
        [os.path.join(workdir, 'tagger')],
        cwd=workdir,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        bufsize=-1,
        universal_newlines=True,
    )
    # Keep the Popen object alive for as long as the pipes are in use.
    self._process = proc
    self.inh = proc.stdin
    self.outh = proc.stdout
41
42 - def tag(self, session, data, xml=0):
59
61 wds = data.split()
62 xml = []
63 for w in wds:
64 t = w.split('/')
65 xml.append('<t p="%s">%s</t>' % (t[1], t[0]))
66
67 return " ".join(xml)
68
69
71 inh = None
72 outh = None
73 tokenizer = None
74
def __init__(self, session, node, parent):
    """Start ``enju -xml`` and block until it reports that it is ready.

    The executable is taken from the ``enjuPath`` path; when that is not
    configured it is looked up on ``PATH`` with ``which``.  ``self.inh``,
    ``self.outh`` and ``self.errh`` are the child's stdin, stdout and
    stderr.

    Raises:
        ConfigFileException: if no enju executable can be located.
        IOError: if enju exits before printing ``Ready`` on stderr.
        OSError: if the executable cannot be started.
    """
    import subprocess

    tp = self.get_path(session, 'enjuPath')
    if not tp:
        # Only stdout is used, so an error message from ``which`` can never
        # be mistaken for a path.
        finder = subprocess.Popen(
            ['which', 'enju'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        tp = finder.communicate()[0].strip()
    if not tp:
        # NOTE(review): ConfigFileException is not among the imports visible
        # at the top of this file -- confirm it is in scope.
        raise ConfigFileException("%s requires the path: enjuPath" % self.id)

    # Argument list instead of a shell string, so a path containing spaces
    # or shell metacharacters is passed through untouched.
    proc = subprocess.Popen(
        [tp, '-xml'],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        bufsize=-1,
        universal_newlines=True,
    )
    # Keep the Popen object alive for as long as the pipes are in use.
    self._process = proc
    self.inh = proc.stdin
    self.outh = proc.stdout
    self.errh = proc.stderr

    # Startup chatter arrives on stderr; 'Ready' marks the end of it.
    while True:
        line = self.errh.readline()
        if line == 'Ready\n':
            break
        if not line:
            # EOF: the child has gone away.  Without this check the loop
            # would spin forever on empty reads.
            raise IOError("enju (%s) exited before reporting Ready" % tp)
88
89
90 - def tag(self, session, data, xml=0):
108
110 inh = None
111 outh = None
112 tokenizer = None
113
def __init__(self, session, node, parent):
    """Start ``geniatagger`` and block until its models are loaded.

    ``filePath`` names the directory that holds the ``geniatagger``
    executable; the program is run with that directory as its working
    directory.  When the path is not configured, the executable is looked
    up on ``PATH`` with ``which`` and its containing directory is used.
    ``self.inh``, ``self.outh`` and ``self.errh`` are the child's stdin,
    stdout and stderr.

    The child gets its working directory through ``cwd=``; this process's
    own working directory is left alone.

    Raises:
        ConfigFileException: if no geniatagger directory can be located.
        IOError: if geniatagger exits before it finishes loading.
        OSError: if the executable cannot be started.
    """
    import subprocess

    # NOTE(review): the setting is named 'parseOutput' but is stored as
    # ``unparsedOutput``; tag() returns raw tagger lines when it is truthy.
    self.unparsedOutput = self.get_setting(session, 'parseOutput', 0)
    tp = self.get_path(session, 'filePath')
    if not tp:
        finder = subprocess.Popen(
            ['which', 'geniatagger'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        found = finder.communicate()[0].strip()
        # ``which`` reports the executable itself, but a directory is what
        # is needed here.
        tp = os.path.dirname(found)
    if not tp:
        # NOTE(review): ConfigFileException is not among the imports visible
        # at the top of this file -- confirm it is in scope.
        raise ConfigFileException("%s requires the path: filePath" % self.id)

    workdir = os.path.abspath(tp)
    proc = subprocess.Popen(
        [os.path.join(workdir, 'geniatagger')],
        cwd=workdir,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        bufsize=-1,
        universal_newlines=True,
    )
    # Keep the Popen object alive for as long as the pipes are in use.
    self._process = proc
    self.inh = proc.stdin
    self.outh = proc.stdout
    self.errh = proc.stderr

    # Progress messages arrive on stderr; this line is the last of them.
    while True:
        line = self.errh.readline()
        if line == 'loading chunk_models....done.\n':
            break
        if not line:
            # EOF: the child has gone away.  Without this check the loop
            # would spin forever on empty reads.
            raise IOError("geniatagger in %s exited while loading" % workdir)
129
130
def tag(self, session, data, xml=0):
    """Run *data* through the tagger one sentence at a time.

    *data* is split into paragraphs and sentences with ``myTokenizer``.
    Each non-empty sentence is written to the tagger on its own line, and
    the tagger's output is read up to the blank line that ends the
    sentence.

    Args:
        session: not used here.
        data: the text to tag.
        xml: not used here; kept for interface compatibility.

    Returns:
        A list.  When ``self.unparsedOutput`` is true it holds the raw
        output lines, including the blank separator lines.  Otherwise it
        holds one dict per token with the keys ``text``, ``stem``, ``pos``
        and ``phr``, taken from the first four tab-separated columns.

    Raises:
        IOError: if the tagger closes its output in the middle of a
            sentence.
        ValueError: if an output line has fewer than four columns.
    """
    words = []
    for para in myTokenizer.split_paragraphs(data):
        for sent in myTokenizer.split_sentences(para):
            sent = sent.strip()
            if not sent:
                continue
            try:
                self.inh.write(sent)
            except UnicodeEncodeError:
                self.inh.write(sent.encode('utf-8'))
            self.inh.write("\n")
            self.inh.flush()

            while True:
                tagline = self.outh.readline()
                if not tagline:
                    # EOF.  Without this check raw mode would append empty
                    # strings forever and parsed mode would fail on an
                    # opaque unpacking error.
                    raise IOError("tagger closed its output before the "
                                  "end of a sentence")
                if self.unparsedOutput:
                    words.append(tagline)
                if tagline == "\n":
                    # A blank line ends the current sentence.
                    break
                if not self.unparsedOutput:
                    # Any columns beyond the fourth are ignored.
                    fields = tagline.rstrip('\n').split('\t')
                    word, stem, pos, phr = fields[:4]
                    words.append({'text': word, 'stem': stem,
                                  'pos': pos, 'phr': phr})
    return words
161
162
163
165
167
168
169
170 vs = rec.process_xpath('phrase[@cat="VP"]/word')
171 dg = StringDocumentGroup("")
172 processed = []
173 for v in vs:
174
175 (name, attrs) = rec._convert_elem(v[0])
176 prepstr = ""
177 iobjstr = ""
178 arg1 = attrs[u'arg1']
179 vtxt = v[1][2:]
180 vid = attrs['id']
181 if vid in processed:
182 continue
183 verb = ['<verb>', '<w pos="%s" base="%s">%s</w>' % (attrs['pos'], attrs['base'], vtxt)]
184 el1 = rec.process_xpath('phrase[@id="%s"]/descendant::word' % arg1)
185 txt = ['<subject>']
186 for w in el1:
187 (name, nattrs) = rec._convert_elem(w[0])
188 txt.append('<w pos="%s">%s</w>' % (nattrs['pos'], w[1][2:]))
189 txt.append("</subject>")
190 subj = ' '.join(txt)
191
192 try:
193 arg2 = attrs[u'arg2']
194
195
196 el2 = rec.process_xpath('phrase[@id="%s"]' % arg2)
197 (name, nattrs) = rec._convert_elem(el2[0][0])
198 nid = nattrs['id']
199 while nattrs[u'cat'] == "VP":
200 allv = rec.process_xpath('phrase[@id="%s"]/descendant::word' % nid)
201 (name, avattrs) = rec._convert_elem(allv[0][0])
202 verb.append('<w pos="%s" base="%s">%s</w>' % (avattrs['pos'], avattrs['base'], allv[0][1][2:]))
203 processed.append(avattrs['id'])
204 avarg2 = avattrs['arg2']
205 if avarg2 == arg1:
206 avarg2 = avattrs['arg1']
207 if avarg2 == '-1':
208
209 break
210 el2 = rec.process_xpath('phrase[@id="%s"]' % avarg2 )
211 (name, nattrs) = rec._convert_elem(el2[0][0])
212 nid = nattrs['id']
213
214 el2 = rec.process_xpath('phrase[@id="%s"]/descendant::word' % nid)
215 txt = ['<object>']
216 for w in el2:
217 (name, nattrs) = rec._convert_elem(w[0])
218 txt.append('<w pos="%s">%s</w>' % (nattrs['pos'], w[1][2:]))
219 txt.append("</object>")
220 obj = ' '.join(txt)
221 except KeyError:
222 obj = "<object/>"
223
224 ppxp = rec.process_xpath("word[@arg1='%s']" % (int(vid) -1))
225 if ppxp:
226 (name, attrs) = rec._convert_elem(ppxp[0][0])
227 ptag = '<w pos="%s">%s</w>' % (attrs['pos'], ppxp[0][1][2:])
228 prepstr = "<prep>%s</prep>\n" % ptag
229 try:
230 iobjxp = rec.process_xpath("phrase[@id='%s']/descendant::word" % attrs['arg2'])
231 iobjlist = ['<iobject>']
232 for w in iobjxp:
233 (name, nattrs) = rec._convert_elem(w[0])
234 iobjlist.append('<w pos="%s">%s</w>' % (nattrs['pos'], w[1][2:]))
235 iobjlist.append('</iobject>')
236 iobjstr = ' '.join(iobjlist) + "\n"
237 except:
238 prepstr = ""
239 iobjstr = ""
240
241 verb.append('</verb>')
242 verb = ' '.join(verb)
243 dg.doctexts.append("<svopi>\n %s\n %s\n %s\n%s%s</svopi>" % (subj, verb, obj, prepstr, iobjstr))
244
245 return dg
246