Package textmining :: Module TsujiiC3
[hide private]
[frames] | [no frames]

Source Code for Module textmining.TsujiiC3


# commands (Python 2 stdlib) is required by EnjuObject/GeniaObject to locate
# their binaries via 'which'; it was previously used without being imported.
import commands
import os
import re

from baseObjects import DocumentFactory
from document import StringDocument
from utils import getFirstData, elementType, verifyXPaths

class SimpleTokenizer:
    """Regex-based splitter for paragraphs and sentences.

    Shared by the tagger wrapper classes in this module through the
    module-level ``myTokenizer`` instance.
    """

    def __init__(self):
        # Paragraphs are separated by one or more blank lines.
        # All patterns are raw strings: the originals relied on escape
        # sequences like \s and \. inside plain strings, which are invalid
        # string escapes (DeprecationWarning in modern Python).  The compiled
        # patterns are identical.
        self.paraRe = re.compile(r'\n\n+')
        # A sentence: the shortest run ending in . ! ? or : (optionally
        # followed by a closing quote/paren), not preceded by '..' and not
        # followed by a lowercase letter.
        self.sentenceRe = re.compile(r'.+?(?<!\.\.)[\.!?:]["\'\)]?(?=\s+|$)(?![a-z])')
        # Abbreviations (Dr., Mr., single initials, dotted tokens like e.g)
        # whose trailing period should not be treated as a sentence break.
        self.abbrMashRe = re.compile(r'(^|\s)([^\s]+?\.[a-zA-Z]+|Prof|Dr|Sr|Mr|Mrs|Ms|Jr|Capt|Gen|Col|Sgt|[A-Z])\.(\s|$)')

    def split_paragraphs(self, data):
        """Split data on blank lines and return the list of paragraphs."""
        return self.paraRe.split(data)

    def split_sentences(self, data):
        """Split one paragraph into a list of sentences.

        Abbreviation periods are first masked as the entity ``&#46;`` so
        they do not trigger a sentence break.  If no sentence terminator is
        found at all, a final '.' is appended so the whole input comes back
        as a single sentence.
        """
        data = self.abbrMashRe.sub('\\1\\2&#46;\\3', data)
        sents = self.sentenceRe.findall(data)
        if not sents:
            data += '.'
            sents = self.sentenceRe.findall(data)
        return sents
# Single shared tokenizer instance used by all the tagger wrappers below.
myTokenizer = SimpleTokenizer()
class TsujiiObject:
    """Wrapper around the Tsujii lab part-of-speech ``tagger`` binary.

    Spawns the external tagger as a child process and talks to it one line
    at a time over its stdin/stdout pipes.
    """

    inh = None        # child's stdin (written to)
    outh = None       # child's stdout (read from)
    tokenizer = None

    def __init__(self, session, node, parent):
        # NOTE(review): get_path is expected to come from a subclass or
        # mix-in elsewhere in the project -- it is not defined here.
        o = os.getcwd()
        tp = self.get_path(session, 'taggerPath')
        if tp:
            # The tagger is launched from its own directory; the previous
            # working directory is restored afterwards.
            os.chdir(tp)
            (a, b) = os.popen2('./tagger')
            self.inh = a
            self.outh = b
            os.chdir(o)

    def tag(self, session, data, xml=0):
        """Tag data one sentence at a time; return the list of tagged lines.

        If xml is true, each tagged line is converted to <t> elements via
        toxml().
        """
        all = []
        paras = myTokenizer.split_paragraphs(data)
        for p in paras:
            sents = myTokenizer.split_sentences(p)
            for s in sents:
                try:
                    self.inh.write(s)
                except UnicodeEncodeError:
                    # Pipe rejected the unicode string; retry as UTF-8 bytes.
                    self.inh.write(s.encode('utf-8'))
                self.inh.write("\n")
                self.inh.flush()
                tagd = self.outh.readline()
                if xml:
                    tagd = self.toxml(tagd)
                all.append(tagd)
        return all

    def toxml(self, data):
        """Convert one 'word/POS word/POS ...' line into <t> elements."""
        wds = data.split()
        xml = []
        for w in wds:
            # Split on the LAST slash only: the word itself may contain
            # slashes (e.g. '1/2/CD'), but the POS tag never does.  The old
            # split('/') took the wrong fields for such tokens.
            (word, pos) = w.rsplit('/', 1)
            xml.append('<t p="%s">%s</t>' % (pos, word))
        return " ".join(xml)
class EnjuObject:
    """Wrapper around the Enju deep parser, run as a child process."""

    inh = None        # child's stdin (written to)
    outh = None       # child's stdout (read from)
    tokenizer = None

    def __init__(self, session, node, parent):
        # NOTE(review): get_path, self.id and ConfigFileException come from
        # elsewhere in the project; 'commands' is the Python 2 stdlib module.
        tp = self.get_path(session, 'enjuPath')
        if not tp:
            tp = commands.getoutput('which enju')
        if not tp:
            raise ConfigFileException("%s requires the path: enjuPath" % self.id)
        (a, b, c) = os.popen3("%s -xml" % tp)
        self.inh = a
        self.outh = b
        self.errh = c
        # Enju announces readiness with 'Ready' on stderr.  Stop at EOF
        # (readline() returns '') as well, so a child that dies during
        # startup cannot leave us spinning forever -- the old loop compared
        # only against 'Ready\n' and would never terminate in that case.
        l = c.readline()
        while l and l != 'Ready\n':
            l = c.readline()

    def tag(self, session, data, xml=0):
        """Parse data sentence by sentence.

        Returns a list of '<sentence>...</sentence>' strings, one per
        non-empty sentence found by the shared tokenizer.
        """
        paras = myTokenizer.split_paragraphs(data)
        all = []
        for p in paras:
            sents = myTokenizer.split_sentences(p)
            for s in sents:
                s = s.strip()
                if not s:
                    continue
                try:
                    self.inh.write(s)
                except UnicodeEncodeError:
                    # Pipe rejected the unicode string; retry as UTF-8 bytes.
                    self.inh.write(s.encode('utf-8'))
                self.inh.write("\n")
                self.inh.flush()
                tagd = self.outh.readline()
                all.append("<sentence>%s</sentence>" % tagd)
        return all
class GeniaObject:
    """Wrapper around the GENIA tagger, run as a child process."""

    inh = None        # child's stdin (written to)
    outh = None       # child's stdout (read from)
    tokenizer = None

    def __init__(self, session, node, parent):
        # When the parseOutput setting is non-zero, tag() returns the raw
        # tagger lines instead of parsed dictionaries.
        self.unparsedOutput = self.get_setting(session, 'parseOutput', 0)
        tp = self.get_path(session, 'filePath')
        if not tp:
            tp = commands.getoutput('which geniatagger')
        if not tp:
            raise ConfigFileException("%s requires the path: filePath" % self.id)
        # The tagger is launched from its own directory.  Unlike the old
        # code (and matching TsujiiObject) we restore the previous working
        # directory so constructing this object no longer has a lasting
        # side effect on the process cwd.
        o = os.getcwd()
        os.chdir(tp)
        try:
            (a, b, c) = os.popen3("./geniatagger")
            self.inh = a
            self.outh = b
            self.errh = c
            # The tagger reports readiness on stderr.  Also stop at EOF
            # ('' from readline) so a failed launch cannot hang us forever.
            l = c.readline()
            while l and l != 'loading chunk_models....done.\n':
                l = c.readline()
        finally:
            os.chdir(o)

    def tag(self, session, data, xml=0):
        """Tag data; return token dicts (text/stem/pos/phr) or raw lines.

        The tagger emits one tab-separated line per token and a blank line
        at the end of each sentence.
        """
        paras = myTokenizer.split_paragraphs(data)
        words = []
        for p in paras:
            sents = myTokenizer.split_sentences(p)
            for s in sents:
                s = s.strip()
                if not s:
                    continue
                try:
                    self.inh.write(s)
                except UnicodeEncodeError:
                    # Pipe rejected the unicode string; retry as UTF-8 bytes.
                    self.inh.write(s.encode('utf-8'))
                self.inh.write("\n")
                self.inh.flush()
                while 1:
                    tagline = self.outh.readline()
                    if tagline == "\n":
                        # Blank line terminates the sentence.
                        if self.unparsedOutput:
                            words.append(tagline)
                        break
                    else:
                        if self.unparsedOutput:
                            words.append(tagline)
                        else:
                            # Chomp the trailing newline, then split the four
                            # tab-separated fields: token, stem, POS, chunk.
                            # ('pos'/'chunk' renamed from 'type'/'type2',
                            # which shadowed the builtin.)
                            (word, stem, pos, chunk) = tagline[:-1].split('\t')
                            words.append({'text': word, 'stem': stem,
                                          'pos': pos, 'phr': chunk})
        return words
class EnjuGroupDocumentStream(DocumentFactory):
    # Builds subject/verb/object(/prep/iobject) "svopi" documents from a
    # record containing Enju parser output (phrase/word elements with
    # id/arg1/arg2/cat/pos/base attributes).

    def process_record(self, session, rec):
        """Extract one <svopi> document per verb phrase found in rec.

        Returns a StringDocumentGroup whose doctexts each describe a verb,
        its subject (arg1), object (arg2), and any preposition + indirect
        object attached to the verb.
        """
        # Find verb words
        # XXX: XPaths are actually very slow using leaf->top approach :(

        vs = rec.process_xpath('phrase[@cat="VP"]/word')
        # NOTE(review): StringDocumentGroup is not in this module's visible
        # imports -- presumably imported elsewhere; verify.
        dg = StringDocumentGroup("")
        # Word ids already folded into a compound verb group, so they are
        # not emitted again as top-level verbs.
        processed = []
        for v in vs:
            # Find word, and arg1, arg2
            # v appears to be (element, text) pairs; [2:] strips a leading
            # two-character prefix from the text -- TODO confirm format.
            (name, attrs) = rec._convert_elem(v[0])
            prepstr = ""
            iobjstr = ""
            arg1 = attrs[u'arg1']
            vtxt = v[1][2:]
            vid = attrs['id']
            if vid in processed:
                continue
            verb = ['<verb>', '<w pos="%s" base="%s">%s</w>' % (attrs['pos'], attrs['base'], vtxt)]
            # Subject = all words under the phrase referenced by arg1.
            el1 = rec.process_xpath('phrase[@id="%s"]/descendant::word' % arg1)
            txt = ['<subject>']
            for w in el1:
                (name, nattrs) = rec._convert_elem(w[0])
                txt.append('<w pos="%s">%s</w>' % (nattrs['pos'], w[1][2:]))
            txt.append("</subject>")
            subj = ' '.join(txt)

            try:
                # KeyError here (no arg2 on the verb) falls through to the
                # empty <object/> below.
                arg2 = attrs[u'arg2']
                # arg2 might be more verb
                # eg 'will -> be -> treating'
                el2 = rec.process_xpath('phrase[@id="%s"]' % arg2)
                (name, nattrs) = rec._convert_elem(el2[0][0])
                nid = nattrs['id']
                # Follow the chain of nested VPs, appending each auxiliary
                # verb word to the verb group, until a non-VP phrase (the
                # real object) is reached.
                while nattrs[u'cat'] == "VP":
                    allv = rec.process_xpath('phrase[@id="%s"]/descendant::word' % nid)
                    (name, avattrs) = rec._convert_elem(allv[0][0])
                    verb.append('<w pos="%s" base="%s">%s</w>' % (avattrs['pos'], avattrs['base'], allv[0][1][2:]))
                    processed.append(avattrs['id'])
                    avarg2 = avattrs['arg2']
                    # If arg2 points back at the subject, try arg1 instead.
                    if avarg2 == arg1:
                        avarg2 = avattrs['arg1']
                    if avarg2 == '-1':
                        # no arg2, fall back
                        break
                    el2 = rec.process_xpath('phrase[@id="%s"]' % avarg2 )
                    (name, nattrs) = rec._convert_elem(el2[0][0])
                    nid = nattrs['id']

                # Object = all words under the phrase the chain ended on.
                el2 = rec.process_xpath('phrase[@id="%s"]/descendant::word' % nid)
                txt = ['<object>']
                for w in el2:
                    (name, nattrs) = rec._convert_elem(w[0])
                    txt.append('<w pos="%s">%s</w>' % (nattrs['pos'], w[1][2:]))
                txt.append("</object>")
                obj = ' '.join(txt)
            except KeyError:
                obj = "<object/>"
            # Try for Prep + Iobjstr
            # A preposition is a word whose arg1 is the id just before the
            # verb's -- assumes ids are sequential integers; TODO confirm.
            ppxp = rec.process_xpath("word[@arg1='%s']" % (int(vid) -1))
            if ppxp:
                (name, attrs) = rec._convert_elem(ppxp[0][0])
                ptag = '<w pos="%s">%s</w>' % (attrs['pos'], ppxp[0][1][2:])
                prepstr = "<prep>%s</prep>\n" % ptag
                try:
                    iobjxp = rec.process_xpath("phrase[@id='%s']/descendant::word" % attrs['arg2'])
                    iobjlist = ['<iobject>']
                    for w in iobjxp:
                        (name, nattrs) = rec._convert_elem(w[0])
                        iobjlist.append('<w pos="%s">%s</w>' % (nattrs['pos'], w[1][2:]))
                    iobjlist.append('</iobject>')
                    iobjstr = ' '.join(iobjlist) + "\n"
                # NOTE(review): bare except deliberately discards both prep
                # and iobject on any failure (best-effort); it also hides
                # real bugs -- consider narrowing to (KeyError, IndexError).
                except:
                    prepstr = ""
                    iobjstr = ""

            verb.append('</verb>')
            verb = ' '.join(verb)
            dg.doctexts.append("<svopi>\n %s\n %s\n %s\n%s%s</svopi>" % (subj, verb, obj, prepstr, iobjstr))

        return dg
246