Package translate :: Package tools :: Module pogrep
[hide private]
[frames] | no frames]

Source Code for Module translate.tools.pogrep

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2002-2008 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """Grep XLIFF, Gettext PO and TMX localization files 
 23   
 24  Matches are output to snippet files of the same type which can then be reviewed  
 25  and later merged using pomerge 
 26   
 27  See: http://translate.sourceforge.net/wiki/toolkit/pogrep for examples and 
 28  usage instructions 
 29  """ 
 30   
 31  from translate.storage import factory 
 32  from translate.storage.poheader import poheader 
 33  from translate.misc import optrecurse 
 34  from translate.misc.multistring import multistring 
 35  from translate.lang import data 
 36  import re 
 37  import locale 
 38   
 39   
class GrepMatch(object):
    """Just a small data structure that represents a search match.

    Attributes:
      - unit:   the translation unit the match was found in.
      - part:   which part of the unit matched; the accessors understand
                'target', 'source', 'notes' and 'locations'.
      - part_n: index into the part (the plural form for source/target, or
                the index into getnotes()/getlocations()).
      - start, end: offsets of the match inside the text of that part.
    """

    # INITIALIZERS #
    def __init__(self, unit, part='target', part_n=0, start=0, end=0):
        self.unit = unit
        self.part = part
        self.part_n = part_n
        self.start = start
        self.end = end

    # ACCESSORS #
    def get_getter(self):
        """Return a zero-argument callable that fetches the matched text.

        Whether the unit is plural is decided when this method is called,
        the text itself is only read when the returned callable is invoked.
        Returns None if self.part is not one of the known parts.
        """
        if self.part == 'target':
            if self.unit.hasplural():
                getter = lambda: self.unit.target.strings[self.part_n]
            else:
                getter = lambda: self.unit.target
            return getter
        elif self.part == 'source':
            if self.unit.hasplural():
                getter = lambda: self.unit.source.strings[self.part_n]
            else:
                getter = lambda: self.unit.source
            return getter
        elif self.part == 'notes':
            def getter():
                return self.unit.getnotes()[self.part_n]
            return getter
        elif self.part == 'locations':
            def getter():
                return self.unit.getlocations()[self.part_n]
            return getter
        return None

    def get_setter(self):
        """Return a one-argument callable that replaces the matched text.

        Only the 'target' part can be written; for every other part this
        returns None.
        """
        if self.part == 'target':
            if self.unit.hasplural():
                def setter(value):
                    # Replace a single plural form and assign the whole
                    # list back so the unit sees the change.
                    strings = self.unit.target.strings
                    strings[self.part_n] = value
                    self.unit.target = strings
            else:
                def setter(value):
                    self.unit.target = value
            return setter
        return None

    # SPECIAL METHODS #
    def __str__(self):
        """Describe the match, quoting it with up to two characters of
        context on either side."""
        getter = self.get_getter()
        if getter is None:
            # Unknown part: there is no text to show, but describing the
            # object must not raise.
            text = ''
        else:
            text = getter()
        # Only the lower bound needs clamping (a negative slice start would
        # count from the end of the text); slicing already stops at the end.
        context_start = max(self.start - 2, 0)
        context_end = self.end + 2
        matchpart = text[context_start:context_end]
        return '<GrepMatch "%s" part=%s[%d] start=%d end=%d>' % (matchpart, self.part, self.part_n, self.start, self.end)

    def __repr__(self):
        return str(self)

def real_index(string, nfc_index):
    """Calculate the real index in the unnormalized string that corresponds to
    the index nfc_index in the normalized string.

    Starting from nfc_index, the candidate prefix of string is grown one
    character at a time until its normalized form is longer than nfc_index
    characters; the index just before that point is returned. The result is
    never larger than len(string).
    """
    length = nfc_index
    max_length = len(string)
    while len(data.normalize(string[:length])) <= nfc_index:
        # ">=" rather than "==": if nfc_index is already past the end of
        # string, the slice above stops changing and an equality test would
        # never become true, so the loop would never terminate.
        if length >= max_length:
            return max_length
        length += 1
    return length - 1


def find_matches(unit, part, strings, re_search):
    """Return the GrepMatch objects where re_search matches in strings.

    Each non-empty entry of strings is normalized before searching; the
    offsets stored in the resulting GrepMatch objects are translated back to
    positions in the original, unnormalized entry. part_n of every match is
    the position of the entry within strings.
    """
    found = []
    for part_n, text in enumerate(strings):
        if text:
            nfc_text = data.normalize(text)
            for hit in re_search.finditer(nfc_text):
                first = real_index(text, hit.start())
                last = real_index(text, hit.end())
                found.append(GrepMatch(unit, part=part, part_n=part_n, start=first, end=last))
    return found

class GrepFilter:
    """Selects translation units whose text matches a search string.

    filterunit()/filterfile() answer "does this unit match?", while
    getmatches() reports every individual match position as a GrepMatch.
    """

    def __init__(self, searchstring, searchparts, ignorecase=False, useregexp=False,
                 invertmatch=False, accelchar=None, encoding='utf-8',
                 max_matches=0):
        """Build a filter for searchstring.

        - searchstring: text or regular expression to look for; byte strings
          are decoded using encoding.
        - searchparts: names of the unit parts to search ('source', 'target',
          'notes', 'locations', plus the old names 'msgid', 'msgstr' and
          'comment'). If empty or None, source and target are searched.
        - ignorecase: match case-insensitively.
        - useregexp: treat searchstring as a regular expression.
        - invertmatch: report the strings that do not match.
        - accelchar: accelerator marker to strip from text before matching.
        - max_matches: if non-zero, getmatches() raises once more than this
          many matches have been collected.
        """
        # type(u'') is the text type of the running interpreter ("unicode"
        # on Python 2, "str" on Python 3); anything else gets decoded.
        if isinstance(searchstring, type(u'')):
            self.searchstring = searchstring
        else:
            self.searchstring = searchstring.decode(encoding)
        self.searchstring = data.normalize(self.searchstring)
        if searchparts:
            # For now we still support the old terminology, except for the old 'source'
            # which has a new meaning now.
            self.search_source = ('source' in searchparts) or ('msgid' in searchparts)
            self.search_target = ('target' in searchparts) or ('msgstr' in searchparts)
            self.search_notes = ('notes' in searchparts) or ('comment' in searchparts)
            self.search_locations = 'locations' in searchparts
        else:
            self.search_source = True
            self.search_target = True
            self.search_notes = False
            self.search_locations = False
        self.ignorecase = ignorecase
        self.useregexp = useregexp
        if self.useregexp:
            # Never lowercase a regular expression: that would turn escapes
            # such as \S, \W, \D or \B into their opposites. Let the regex
            # engine do the case folding instead.
            flags = 0
            if self.ignorecase:
                flags = re.IGNORECASE | re.UNICODE
            self.searchpattern = re.compile(self.searchstring, flags)
        elif self.ignorecase:
            # Plain substring search: compare lowercased text on both sides.
            self.searchstring = self.searchstring.lower()
        self.invertmatch = invertmatch
        self.accelchar = accelchar
        self.max_matches = max_matches

    def matches(self, teststr):
        """Return True if teststr satisfies the search, honouring invertmatch.

        None never matches, not even when the match is inverted.
        """
        if teststr is None:
            return False
        teststr = data.normalize(teststr)
        if self.ignorecase and not self.useregexp:
            teststr = teststr.lower()
        if self.accelchar:
            # Plain string replacement, so an accelerator that happens to be
            # a regex metacharacter is still taken literally. A doubled
            # accelerator becomes "#", a single one is dropped.
            teststr = teststr.replace(self.accelchar + self.accelchar, "#")
            teststr = teststr.replace(self.accelchar, "")
        if self.useregexp:
            found = self.searchpattern.search(teststr) is not None
        else:
            found = teststr.find(self.searchstring) != -1
        if self.invertmatch:
            found = not found
        return found

    def filterunit(self, unit):
        """Return True if any searched part of unit matches.

        Header units never match.
        """
        if unit.isheader():
            return False

        if self.search_source:
            if isinstance(unit.source, multistring):
                strings = unit.source.strings
            else:
                strings = [unit.source]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_target:
            if isinstance(unit.target, multistring):
                strings = unit.target.strings
            else:
                strings = [unit.target]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_notes:
            if self.matches(unit.getnotes()):
                return True
        if self.search_locations:
            if self.matches(u" ".join(unit.getlocations())):
                return True
        return False

    def filterfile(self, thefile):
        """Return a new store of the same type as thefile holding only the
        units accepted by filterunit().

        Source and target language are copied over, and for stores with a PO
        header the header values of thefile are carried over as well.
        """
        thenewfile = type(thefile)()
        thenewfile.setsourcelanguage(thefile.sourcelanguage)
        thenewfile.settargetlanguage(thefile.targetlanguage)
        for unit in thefile.units:
            if self.filterunit(unit):
                thenewfile.addunit(unit)

        if isinstance(thenewfile, poheader):
            thenewfile.updateheader(add=True, **thefile.parseheader())
        return thenewfile

    def getmatches(self, units):
        """Find every occurrence of the search string in units.

        Returns a tuple (matches, indexes): matches is a list of GrepMatch
        objects and indexes lists the positions in units of the units that
        produced at least one match. With an empty search string both lists
        are empty. The compiled expression is kept in self.re_search.

        Raises Exception("Too many matches found") when max_matches is set
        and exceeded. invertmatch and accelchar are not taken into account
        here.
        """
        if not self.searchstring:
            return [], []

        searchstring = self.searchstring
        # re.LOCALE is deliberately not used: the pattern is always text (see
        # __init__), and LOCALE is meant for byte patterns only. Python 3
        # rejects the combination outright.
        flags = re.MULTILINE | re.UNICODE

        if self.ignorecase:
            flags |= re.IGNORECASE
        if not self.useregexp:
            searchstring = re.escape(searchstring)
        self.re_search = re.compile(u'(%s)' % (searchstring), flags)

        matches = []
        indexes = []

        for index, unit in enumerate(units):
            old_length = len(matches)

            if self.search_target:
                if unit.hasplural():
                    targets = unit.target.strings
                else:
                    targets = [unit.target]
                matches.extend(find_matches(unit, 'target', targets, self.re_search))
            if self.search_source:
                if unit.hasplural():
                    sources = unit.source.strings
                else:
                    sources = [unit.source]
                matches.extend(find_matches(unit, 'source', sources, self.re_search))
            if self.search_notes:
                # NOTE(review): filterunit() treats getnotes() as one string;
                # if that holds, find_matches() walks it character by
                # character here. Kept as is because GrepMatch indexes
                # getnotes() the same way - confirm before changing either.
                matches.extend(find_matches(unit, 'notes', unit.getnotes(), self.re_search))

            if self.search_locations:
                matches.extend(find_matches(unit, 'locations', unit.getlocations(), self.re_search))

            # A search for a single letter or an all-inclusive regular
            # expression could give enough results to cause performance
            # problems. The answer is probably not very useful at this scale.
            if self.max_matches and len(matches) > self.max_matches:
                raise Exception("Too many matches found")

            if len(matches) > old_length:
                indexes.append(index)

        return matches, indexes

class GrepOptionParser(optrecurse.RecursiveOptionParser):
    """Option parser for the grep tool: the first free-standing argument is
    the search string, the remaining ones are input/output paths."""

    def parse_args(self, args=None, values=None):
        """Parse the command line, filling in implicit input/output args.

        The first positional argument becomes options.searchstring. Any
        further positional arguments are used for options.input and
        options.output where those were not given as options. Returns
        (options, args) with args being whatever is left over.
        """
        base_parser = optrecurse.optparse.OptionParser
        options, remaining = base_parser.parse_args(self, args, values)
        # some intelligence as to what reasonable people might give on the command line
        if not remaining:
            self.error("At least one argument must be given for the search string")
        else:
            options.searchstring = remaining[0]
            remaining = remaining[1:]
        if remaining and not options.input:
            if options.output:
                # Output was given explicitly: everything left is input.
                options.input, remaining = remaining, []
            else:
                # Keep the last argument back; it is claimed as output below.
                options.input, remaining = remaining[:-1], remaining[-1:]
        if remaining and not options.output:
            options.output, remaining = remaining[-1], remaining[:-1]
        if remaining:
            self.error("You have used an invalid combination of --input, --output and freestanding args")
        inputs = options.input
        if isinstance(inputs, list) and len(inputs) == 1:
            # A single input is passed on as a plain value, not a list.
            options.input = inputs[0]
        return (options, remaining)

    def set_usage(self, usage=None):
        """Set the usage string; when usage is not given, build it from
        getusagestring() of every registered option."""
        if usage is not None:
            super(GrepOptionParser, self).set_usage(usage)
            return
        option_usages = [self.getusagestring(option) for option in self.option_list]
        self.usage = "%prog searchstring " + " ".join(option_usages)

    def run(self):
        """Parse the arguments, build the GrepFilter and run
        recursiveprocess with the resulting options."""
        options, _leftover = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        filter_args = [
            options.searchstring,
            options.searchparts,
            options.ignorecase,
            options.useregexp,
            options.invertmatch,
            options.accelchar,
            locale.getpreferredencoding(),
        ]
        options.checkfilter = GrepFilter(*filter_args)
        self.usepsyco(options)
        self.recursiveprocess(options)

def rungrep(inputfile, outputfile, templatefile, checkfilter):
    """Filter inputfile with checkfilter and write the result to outputfile.

    Returns False, writing nothing, when no unit survived the filter and
    True otherwise. templatefile is not used.
    """
    source_store = factory.getobject(inputfile)
    filtered_store = checkfilter.filterfile(source_store)
    has_matches = not filtered_store.isempty()
    if has_matches:
        outputfile.write(str(filtered_store))
    return has_matches

def cmdlineparser():
    """Build the GrepOptionParser with all pogrep command line options."""
    # Every supported extension is written back out under the same
    # extension; input without a recognised extension is treated as PO.
    extensions = ("po", "pot", "mo", "gmo", "tmx", "xliff", "xlf", "xlff")
    formats = dict((ext, (ext, rungrep)) for ext in extensions)
    formats[None] = ("po", rungrep)

    grep_parser = GrepOptionParser(formats)
    grep_parser.add_option(
        "", "--search", dest="searchparts",
        action="append", type="choice",
        choices=["source", "target", "notes", "locations", "msgid", "msgstr", "comment"],
        metavar="SEARCHPARTS",
        help="searches the given parts (source, target, notes and locations)")
    grep_parser.add_option(
        "-I", "--ignore-case", dest="ignorecase",
        action="store_true", default=False,
        help="ignore case distinctions")
    grep_parser.add_option(
        "-e", "--regexp", dest="useregexp",
        action="store_true", default=False,
        help="use regular expression matching")
    grep_parser.add_option(
        "-v", "--invert-match", dest="invertmatch",
        action="store_true", default=False,
        help="select non-matching lines")
    grep_parser.add_option(
        "", "--accelerator", dest="accelchar",
        action="store", type="choice", choices=["&", "_", "~"],
        metavar="ACCELERATOR",
        help="ignores the given accelerator when matching")
    grep_parser.set_usage()
    grep_parser.passthrough.append('checkfilter')
    grep_parser.description = __doc__
    return grep_parser

def main():
    """Command line entry point: build the option parser and run it."""
    cmdlineparser().run()


if __name__ == '__main__':
    main()