Package textmining :: Module tmDocumentFactory
[hide private]
[frames | no frames]

Source Code for Module textmining.tmDocumentFactory

  1   
  2  from documentFactory import BaseDocumentStream 
  3  from document import StringDocument 
  4   
  5  import os, re 
  6   
class EnjuRecordDocumentStream(BaseDocumentStream):
    """Document stream that turns one Enju-parsed record into <svopi> documents.

    Each verb word found in the record produces one ``StringDocument``
    holding its subject, its verb chain, its object and, when one can be
    found, a preposition together with its indirect object.
    """

    def open_stream(self, stream):
        """Remember the id of the incoming record and return it unchanged.

        ``stream`` here will be a record in Enju XML schema.
        """
        self.streamLocation = stream.id
        return stream

    def _tagged_words(self, rec, xpath, tag):
        """Return '<tag> <w pos="..">word</w> ... </tag>' for words matching xpath.

        Raises KeyError when a matched word has no ``pos`` attribute; callers
        rely on that to fall back to an empty element.
        """
        parts = ['<%s>' % tag]
        for w in rec.process_xpath(xpath):
            (_name, wattrs) = rec._convert_elem(w[0])
            # The first two characters of w[1] are skipped, as everywhere
            # else in this class where word text is read.
            parts.append('<w pos="%s">%s</w>' % (wattrs['pos'], w[1][2:]))
        parts.append('</%s>' % tag)
        return ' '.join(parts)

    def find_documents(self, session, cache=0):
        """Generate one ``StringDocument`` per verb found in the record.

        session -- not used by this method.
        cache -- 0 yields each document as soon as it is built; 1 raises
            NotImplementedError; any other value collects the documents in
            ``self.documents`` instead of yielding them.

        This is a generator: nothing is processed, and ``self.documents`` is
        not assigned, until it is iterated to exhaustion.
        """
        # NOTE(review): the body previously read an unbound name ``rec``
        # (NameError on first iteration).  open_stream() returns the record;
        # this presumes the base class keeps that return value on
        # ``self.stream`` -- confirm against BaseDocumentStream.
        rec = self.stream

        docs = []
        # ids of verb words already emitted as part of an earlier verb chain
        processed = set()

        # Find verbs, one verb per document
        for v in rec.process_xpath('phrase[@cat="VP"]/word'):
            # Find word, and arg1, arg2
            (_name, vattrs) = rec._convert_elem(v[0])
            prepstr = ""
            iobjstr = ""
            arg1 = vattrs[u'arg1']
            vtxt = v[1][2:]
            vid = vattrs['id']
            if vid in processed:
                continue

            verb = ['<verb>',
                    '<w pos="%s" base="%s">%s</w>'
                    % (vattrs['pos'], vattrs['base'], vtxt)]
            subj = self._tagged_words(
                rec, 'phrase[@id="%s"]/descendant::word' % arg1, 'subject')

            try:
                arg2 = vattrs[u'arg2']
                # arg2 might be more verb
                # eg 'will -> be -> treating'
                el2 = rec.process_xpath('phrase[@id="%s"]' % arg2)
                (_name, nattrs) = rec._convert_elem(el2[0][0])
                nid = nattrs['id']
                while nattrs[u'cat'] == "VP":
                    allv = rec.process_xpath(
                        'phrase[@id="%s"]/descendant::word' % nid)
                    (_name, avattrs) = rec._convert_elem(allv[0][0])
                    verb.append('<w pos="%s" base="%s">%s</w>'
                                % (avattrs['pos'], avattrs['base'],
                                   allv[0][1][2:]))
                    processed.add(avattrs['id'])
                    avarg2 = avattrs['arg2']
                    if avarg2 == arg1:
                        # arg2 points back at the subject; follow arg1 instead
                        avarg2 = avattrs['arg1']
                    if avarg2 == '-1':
                        # no arg2, fall back
                        break
                    el2 = rec.process_xpath('phrase[@id="%s"]' % avarg2)
                    (_name, nattrs) = rec._convert_elem(el2[0][0])
                    nid = nattrs['id']

                obj = self._tagged_words(
                    rec, 'phrase[@id="%s"]/descendant::word' % nid, 'object')
            except KeyError:
                # A missing attribute anywhere above means no usable object.
                obj = "<object/>"

            # Try for Prep + Iobjstr
            # NOTE(review): placed at loop level (run whether or not an
            # object was found); the flattened listing this was restored
            # from does not show the nesting -- confirm.
            ppxp = rec.process_xpath("word[@arg1='%s']" % (int(vid) - 1))
            if ppxp:
                (_name, pattrs) = rec._convert_elem(ppxp[0][0])
                ptag = '<w pos="%s">%s</w>' % (pattrs['pos'], ppxp[0][1][2:])
                prepstr = "<prep>%s</prep>\n" % ptag
                try:
                    iobjstr = self._tagged_words(
                        rec,
                        "phrase[@id='%s']/descendant::word" % pattrs['arg2'],
                        'iobject') + "\n"
                except Exception:
                    # Best effort: without an indirect object the
                    # preposition is dropped as well.
                    prepstr = ""
                    iobjstr = ""

            verb.append('</verb>')
            docstr = "<svopi>\n %s\n %s\n %s\n%s%s</svopi>" % (
                subj, ' '.join(verb), obj, prepstr, iobjstr)
            doc = StringDocument(docstr)
            if cache == 0:
                yield doc
            elif cache == 1:
                raise NotImplementedError
            else:
                docs.append(doc)

        self.documents = docs
        # Falling off the end finishes the generator.  An explicit
        # ``raise StopIteration`` here is turned into RuntimeError on
        # Python 3.7+ (PEP 479).
100