1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """Manage the Wordfast Translation Memory format
23
24 Wordfast TM format is the Translation Memory format used by the
25 U{Wordfast<http://www.wordfast.net/>} computer aided translation tool.
26
27 It is a bilingual base class derived format with L{WordfastTMFile}
28 and L{WordfastUnit} providing file and unit level access.
29
30 Wordfast tools
31 ==============
32 Wordfast is a computer aided translation tool. It is an application
33 built on top of Microsoft Word and is implemented as a rather
34 sophisticated set of macros. Understanding that helps us understand
35 many of the seemingly strange choices around this format including:
36 encoding, escaping and file naming.
37
38 Implementation
39 ==============
40 The implementation covers the full requirements of a Wordfast TM file.
41 The files are simple Tab Separated Value (TSV) files that can be read
42 by Microsoft Excel and other spreadsheet programs. They use the .txt
43 extension which does make it more difficult to automatically identify
44 such files.
45
46 The dialect of the TSV files is specified by L{WordfastDialect}.
47
48 Encoding
49 --------
50 The files are UTF-16 or ISO-8859-1 (Latin1) encoded. These choices
51 are most likely because Microsoft Word is the base editing tool for
52 Wordfast.
53
54 The format is tab separated so we are able to detect UTF-16 vs Latin-1
55 by searching for the occurrence of a UTF-16 tab character and then
56 continuing with the parsing.
57
58 Timestamps
59 ----------
60 L{WordfastTime} allows for the correct management of the Wordfast
61 YYYYMMDD~HHMMSS timestamps. However, timestamps on individual units are
62 not updated when edited.
63
64 Header
65 ------
66 L{WordfastHeader} provides header management support. The header
67 functionality is fully implemented through observing the behaviour of the
68 files in real use cases, input from the Wordfast programmers and
69 public documentation.
70
71 Escaping
72 --------
73 Wordfast TM implements a form of escaping that covers two aspects:
74 1. Placeable: bold, formatting, etc. These are left as is and ignored.
75 It is up to the editor and future placeable implementation to manage
76 these.
77 2. Escapes: items that may confuse Excel or translators are
78 escaped as &'XX;. These are fully implemented and are converted to
79 and from Unicode. By observing behaviour and reading documentation
80 we were able to observe all possible escapes. Unfortunately the
81 escaping differs slightly between Windows and Mac version. This
82 might cause errors in future.
83 Functions allow for L{conversion to Unicode<_wf_to_char>} and L{back to
84 Wordfast escapes<_char_to_wf>}.
85
86 Extended Attributes
87 -------------------
88 The last 4 columns allow users to define and manage extended attributes.
89 These are left as is and are not directly managed by our implementation.
90 """
91
92 import csv
93 import sys
94 import time
95 from translate.storage import base
96
# strftime/strptime pattern for Wordfast timestamps, e.g. 20080131~235959.
WF_TIMEFORMAT = "%Y%m%d~%H%M%S"
"""Time format used by Wordfast"""

# Column order of the first (header) row of a TM file.
WF_FIELDNAMES_HEADER = [
    "date",
    "userlist",
    "tucount",
    "src-lang",
    "version",
    "target-lang",
    "license",
    "attr1list",
    "attr2list",
    "attr3list",
    "attr4list",
    "attr5list",
]
"""Field names for the Wordfast header"""

# Column order of every translation unit row.
WF_FIELDNAMES = [
    "date",
    "user",
    "reuse",
    "src-lang",
    "source",
    "target-lang",
    "target",
    "attr1",
    "attr2",
    "attr3",
    "attr4",
]
"""Field names for a Wordfast TU"""

# Populated header values all start with a literal "%".
WF_FIELDNAMES_HEADER_DEFAULTS = {
    "date": "%19000101~121212",
    "userlist": "%User ID,TT,TT Translate-Toolkit",
    "tucount": "%TU=00000001",
    "src-lang": "%EN-US",
    "version": "%Wordfast TM v.5.51w9/00",
    "target-lang": "",
    "license": "%---00000001",
    "attr1list": "",
    "attr2list": "",
    "attr3list": "",
    "attr4list": "",
}
"""Default or minimum header entries for a Wordfast file"""

# Order matters to anything that walks this tuple front to back: the
# ampersand comes first, and characters listed twice hit their first
# (Windows-style) code before the later (Mac-style) one.
WF_ESCAPE_MAP = (
    # Ampersand, the escape introducer itself.
    ("&'26;", u"\u0026"),
    # Codes whose hex value matches the character's Windows-1252 code point.
    ("&'82;", u"\u201A"),
    ("&'85;", u"\u2026"),
    ("&'91;", u"\u2018"),
    ("&'92;", u"\u2019"),
    ("&'93;", u"\u201C"),
    ("&'94;", u"\u201D"),
    ("&'96;", u"\u2013"),
    ("&'97;", u"\u2014"),
    ("&'99;", u"\u2122"),
    # Codes whose hex value equals the Latin-1 / Unicode code point.
    ("&'A0;", u"\u00A0"),
    ("&'A9;", u"\u00A9"),
    ("&'AE;", u"\u00AE"),
    ("&'BC;", u"\u00BC"),
    ("&'BD;", u"\u00BD"),
    ("&'BE;", u"\u00BE"),
    # Codes whose hex value matches the character's Mac OS Roman code point.
    ("&'A8;", u"\u00AE"),
    ("&'AA;", u"\u2122"),
    ("&'C7;", u"\u00AB"),
    ("&'C8;", u"\u00BB"),
    ("&'C9;", u"\u2026"),
    ("&'CA;", u"\u00A0"),
    ("&'D0;", u"\u2013"),
    ("&'D1;", u"\u2014"),
    ("&'D2;", u"\u201C"),
    ("&'D3;", u"\u201D"),
    ("&'D4;", u"\u2018"),
    ("&'D5;", u"\u2019"),
    ("&'E2;", u"\u201A"),
    ("&'E3;", u"\u201E"),
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""

# NUL followed by TAB: the byte pair searched for in the first line of a
# file to recognise UTF-16 content.
TAB_UTF16 = "\x00\x09"
"""The tab \\t character as it would appear in UTF-16 encoding"""
164
# NOTE(review): the "def" line is absent from the listing this file was
# recovered from; the name comes from the module docstring's
# cross-reference and the parameter from the body - confirm against
# callers.
def _char_to_wf(string):
    """Char -> Wordfast &'XX; escapes

    Every character listed in L{WF_ESCAPE_MAP} is searched for as its UTF-8
    byte sequence and replaced by the matching &'XX; code; NEWLINE and TAB
    are then written as the two-character sequences \\n and \\t.

    Full roundtripping is not possible because of the escaping of NEWLINE \\n
    and TAB \\t

    @param string: UTF-8 encoded text; an empty or None value is returned
    unchanged
    @type string: String
    @return: the text with Wordfast escapes applied
    """
    if not string:
        return string
    # The map is walked in order, so the ampersand (first entry) is escaped
    # before any &'XX; code containing an ampersand is inserted.
    for code, char in WF_ESCAPE_MAP:
        string = string.replace(char.encode('utf-8'), code)
    return string.replace("\n", "\\n").replace("\t", "\\t")
176
184
198 csv.register_dialect("wordfast", WordfastDialect)
199
# NOTE(review): the class and method header lines are absent from the
# listing this file was recovered from.  Method names are taken from the
# property() calls and parameters from the bodies; the "newtime=None"
# default and the "object" base (required for the property setters used in
# __init__ to work) are assumptions - confirm against callers.
class WordfastTime(object):
    """Manages time stamps in the Wordfast format of YYYYMMDD~hhmmss"""

    def __init__(self, newtime=None):
        """Create a time stamp from a Wordfast string or a struct_time

        @param newtime: a Wordfast time string, a time.struct_time, or a
        false value for "no time"; any other type leaves the time unset
        @type newtime: String or time.struct_time
        """
        self._time = None
        if not newtime:
            self.time = None
        elif isinstance(newtime, basestring):
            # Goes through set_timestring, which parses the string.
            self.timestring = newtime
        elif isinstance(newtime, time.struct_time):
            self.time = newtime

    def get_timestring(self):
        """Get the time in the Wordfast time format

        @return: the formatted time, or None when no time is set
        """
        if not self._time:
            return None
        return time.strftime(WF_TIMEFORMAT, self._time)

    def set_timestring(self, timestring):
        """Set the time_struct object using a Wordfast time formatted string

        @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
        @type timestring: String
        """
        self._time = time.strptime(timestring, WF_TIMEFORMAT)
    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Get the time_struct object"""
        return self._time

    def set_time(self, newtime):
        """Set the time_struct object

        Anything that is not a time.struct_time clears the stored time.

        @param newtime: a new time object
        @type newtime: time.struct_time
        """
        if isinstance(newtime, time.struct_time):
            self._time = newtime
        else:
            self._time = None
    # Defined last: inside the class body this name shadows the time module,
    # while the methods above still resolve "time" to the module global.
    time = property(get_time, set_time)
242
248
250 """A wordfast translation memory header"""
257
263
265 """Get the header dictionary"""
266 return self._header_dict
267
269 self._header_dict = newheader
270 header = property(getheader, setheader)
271
273 self._header_dict['target-lang'] = '%%%s' % newlang
274 targetlang = property(None, settargetlang)
275
277 self._header_dict['tucount'] = '%%TU=%08d' % count
278 tucount = property(None, settucount)
279
281 """A Wordfast translation memory unit"""
287
291
293 """Get the dictionary of values for a Wordfast line"""
294 return self._dict
295
297 """Set the dictionary of values for a Wordfast line
298
299 @param newdict: a new dictionary with Wordfast line elements
300 @type newdict: Dict
301 """
302
303 self._dict = newdict
304 dict = property(getdict, setdict)
305
307 if self._dict.get(key, None) is None:
308 return None
309 elif self._dict[key]:
310 return _wf_to_char(self._dict[key]).decode('utf-8')
311 else:
312 return ""
313
315 if newvalue is None:
316 self._dict[key] = None
317 if isinstance(newvalue, unicode):
318 newvalue = newvalue.encode('utf-8')
319 newvalue = _char_to_wf(newvalue)
320 if not key in self._dict or newvalue != self._dict[key]:
321 self._dict[key] = newvalue
322 self._update_timestamp()
323
326
330 source = property(getsource, setsource)
331
334
338 target = property(gettarget, settarget)
339
341 self._dict['target-lang'] = newlang
342 targetlang = property(None, settargetlang)
343
345 return str(self._dict)
346
348 if not self._dict.get('source', None):
349 return False
350 return bool(self._dict.get('target', None))
351
352
# NOTE(review): the class and method header lines are absent from the
# listing this file was recovered from.  The base class is taken from the
# explicit base.TranslationStore.__init__ call and "parse" from its call in
# __init__; the "unitclass" default and the "__str__" name of the
# serialiser are assumptions - confirm against callers.
class WordfastTMFile(base.TranslationStore):
    """A Wordfast translation memory file"""
    Name = _("Wordfast Translation Memory")
    Mimetypes = ["application/x-wordfast"]
    Extensions = ["txt"]

    def __init__(self, inputfile=None, unitclass=WordfastUnit):
        """construct a Wordfast TM, optionally reading in from inputfile.

        @param inputfile: a file object or source string handed to
        L{parse}; nothing is parsed when it is None
        @param unitclass: the unit class stored as UnitClass and passed to
        the base store
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self.header = WordfastHeader()
        # Default until parse() has inspected real content.
        self._encoding = 'iso-8859-1'
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """parses the given file or file source string

        @param input: an object with a read() method (it is read fully and
        closed) or the raw file content as a byte string
        @raise ValueError: when the content cannot be decoded with the
        detected encoding
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            # Close the source even when reading fails.
            try:
                tmsrc = input.read()
            finally:
                input.close()
            input = tmsrc
        # A UTF-16 tab in the first (header) line marks a UTF-16 file;
        # anything else is treated as Latin-1.
        if TAB_UTF16 in input.split("\n")[0]:
            self._encoding = 'utf-16'
        else:
            self._encoding = 'iso-8859-1'
        try:
            # Everything after this point works on UTF-8 byte strings.
            input = input.decode(self._encoding).encode('utf-8')
        except UnicodeError:
            raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
        rows = input.split("\n")
        # First row is the header, every following row is a unit.
        for header in csv.DictReader(rows[:1], fieldnames=WF_FIELDNAMES_HEADER, dialect="wordfast"):
            self.header = WordfastHeader(header)
        for line in csv.DictReader(rows[1:], fieldnames=WF_FIELDNAMES, dialect="wordfast"):
            newunit = WordfastUnit()
            newunit.dict = line
            self.addunit(newunit)

    def __str__(self):
        """Serialise the header and all translated units

        Units for which istranslated() is false are left out.

        @return: the file content encoded in the store's encoding, falling
        back to UTF-16 when that encoding cannot represent the text; an
        empty string when there is no translated unit
        """
        output = csv.StringIO()
        writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="wordfast")
        unit_count = 0
        for unit in self.units:
            if unit.istranslated():
                unit_count += 1
                writer.writerow(unit.dict)
        if unit_count == 0:
            return ""
        # The header carries the number of units actually written.
        self.header.tucount = unit_count
        header_output = csv.StringIO()
        outheader = csv.DictWriter(header_output, fieldnames=WF_FIELDNAMES_HEADER, dialect="wordfast")
        outheader.writerow(self.header.header)
        # getvalue() returns the whole buffer regardless of position, so no
        # rewind is needed; reset() is not available on every StringIO type.
        decoded = (header_output.getvalue() + output.getvalue()).decode('utf-8')
        try:
            return decoded.encode(self._encoding)
        except UnicodeEncodeError:
            return decoded.encode('utf-16')
415