1
2 from document import StringDocument
3 from baseObjects import PreParser
4 import os, re
5 from utils import getFirstData, elementType, verifyXPaths
6
7 from TsujiiC3 import TsujiiObject, EnjuObject, GeniaObject
8
10 """ Base class for deriving Part of Speech PreParsers """
11 pass
12
14
15
16 inh = None
17 outh = None
18
def __init__(self, session, node, parent):
    """Start the external './parser' program and keep its two pipes.

    The program is launched from the directory configured as the
    'chunkerPath' path, or from '../../code/tsujii' when that path is
    empty.  The pair of file objects returned by os.popen2 is stored on
    self.inh and self.outh, in the order os.popen2 returns them.

    The current working directory is changed only for the duration of
    the launch and is always restored, even when changing directory or
    starting the program raises.
    """
    PreParser.__init__(self, session, node, parent)
    original_dir = os.getcwd()
    try:
        # './parser' is a relative command, so it has to be started from
        # inside the directory that contains it.
        os.chdir(self.get_path(session, 'chunkerPath') or '../../code/tsujii')
        # NOTE(review): os.popen2 exists only on Python 2; porting this
        # file to Python 3 would require subprocess.Popen instead.
        (self.inh, self.outh) = os.popen2('./parser')
    finally:
        # chdir affects the whole process, so never leave it switched.
        os.chdir(original_dir)
31
32
45
46
48
49 - def __init__(self, session, node, parent):
52
59
60 -class TsujiiTextPosPreParser(PosPreParser, TsujiiObject):
61
62 - def __init__(self, session, node, parent):
65
def process_document(self, session, doc):
    """Tag the raw content of doc and return it as a 'text/plain' document.

    The raw data of doc is passed to self.tag with xml=0; the items it
    returns are joined with newlines.  The new StringDocument is created
    with this object's id, and with the processHistory and parent taken
    from doc.
    """
    tagged_items = self.tag(session, doc.get_raw(), xml=0)
    return StringDocument(
        '\n'.join(tagged_items),
        self.id,
        doc.processHistory,
        'text/plain',
        doc.parent
    )
71
72 -class EnjuTextPreParser(PosPreParser, EnjuObject):
73 - def __init__(self, session, node, parent):
76
def process_document(self, session, doc):
    """Tag the raw content of doc and wrap the result in a <text> element.

    The items returned by self.tag for the raw data are joined with
    newlines and placed between '<text>' and '</text>'; the resulting
    string is returned as a new StringDocument.
    """
    raw = doc.get_raw()
    body = '\n'.join(self.tag(session, raw))
    wrapped = "<text>%s</text>" % body
    return StringDocument(wrapped)
82
83
class GeniaTextPreParser(PreParser):
    """Rebuild plain text from the full, tab-separated output of Genia.

    Each non-empty input line is expected to hold at least three
    tab-separated columns: word, stem and part-of-speech tag.  Settings:

    'useStem' -- when true, emit the stem column instead of the word.
    'pos'     -- when true, emit each token as 'token/POS'.
    """

    def __init__(self, session, config, parent):
        """Read the 'useStem' and 'pos' settings (both default to 0)."""
        PreParser.__init__(self, session, config, parent)
        self.stem = self.get_setting(session, 'useStem', 0)
        self.pos = self.get_setting(session, 'pos', 0)
        # A space before punctuation that is itself followed by a space or
        # a newline; used to re-attach punctuation to the preceding token.
        self.puncre = re.compile('[ ]([.,;:?!][ \n])')

    def process_document(self, session, doc):
        """Return a StringDocument with the text reconstructed from doc.

        Tokens are joined with single spaces.  An empty input line is
        emitted as a newline token.  Columns after the third are ignored.

        Raises ValueError if a non-empty line has fewer than three
        tab-separated columns.
        """
        use_stem = self.stem
        with_pos = self.pos
        words = []
        # splitlines() drops the line terminators, so no character has to
        # be trimmed from the end of a line before splitting on tabs.
        for line in doc.get_raw().splitlines():
            if not line:
                # Blank line in the tagger output: keep it as a line break.
                words.append('\n')
                continue
            word, stem, pos = line.split('\t')[:3]
            if use_stem:
                token = stem
            else:
                token = word
            if with_pos:
                token = "%s/%s" % (token, pos)
            words.append(token)
        txt = ' '.join(words)
        txt = self.puncre.sub('\\1', txt)
        return StringDocument(txt)
112