Module normalizer

Source Code for Module normalizer

from configParser import C3Object
from baseObjects import Normalizer
from c3errors import ConfigFileException
import types, re, os

# The following defaults should be overridden
# SimpleNormalizer should never be used (waste of CPU)
class SimpleNormalizer(Normalizer):
    """ Base normalizer. Simply returns the data (shouldn't be used directly) """

    def __init__(self, session, config, parent):
        Normalizer.__init__(self, session, config, parent)

    def process_string(self, session, data):
        # normalise string into single appropriate form (eg '1' -> 1)
        return data

    def process_hash(self, session, data):
        kw = {}
        has = kw.has_key
        vals = data.values()
        if not vals:
            return kw
        prox = vals[0].has_key('positions')
        process = self.process_string
        for d in vals:
            new = process(session, d['text'])
            if type(new) == types.DictType:
                # from string to hash
                for k in new.values():
                    txt = k['text']
                    if has(txt):
                        kw[txt]['occurences'] += k['occurences']
                        if prox:
                            kw[txt]['positions'].extend(k['positions'])
                    else:
                        kw[txt] = k
            else:
                if new != None:
                    try:
                        kw[new]['occurences'] += d['occurences']
                        if prox:
                            kw[new]['positions'].extend(d['positions'])
                    except KeyError:
                        d = d.copy()
                        d['text'] = new
                        kw[new] = d
        return kw

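For orientation (this example is not part of the original module), the term hashes handled by process_hash are keyed by term; each value carries 'text', 'occurences' and, where proximity information is kept, 'positions'. A minimal sketch of the merge behaviour, with made-up values:

# Illustrative sketch only; the sample terms, counts and positions are assumptions.
incoming = {
    'The': {'text': 'The', 'occurences': 1, 'positions': [1, 0]},
    'the': {'text': 'the', 'occurences': 2, 'positions': [1, 3, 2, 5]},
}
# With a lower-casing process_string (as in CaseNormalizer below), process_hash
# re-keys on the normalized form and merges counts and positions, yielding
# {'the': {'text': 'the', 'occurences': 3, 'positions': [...]}}, where the merged
# positions list holds all three pairs (their order follows dict iteration order).
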
class DataExistsNormalizer(SimpleNormalizer):
    """ Return '1' if any data exists, otherwise '0' """

    def process_string(self, session, data):
        if data:
            return "1"
        else:
            return "0"

class TermExistsNormalizer(SimpleNormalizer):
    """ Un-stoplist anonymizing normalizer. Eg for use with data mining """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        tlstr = self.get_setting(session, 'termlist', '')
        splitter = self.get_setting(session, 'splitChar', ' ')
        self.termlist = tlstr.split(splitter)
        self.frequency = self.get_setting(session, 'frequency', 0)

    def process_string(self, session, data):
        if data in self.termlist:
            return "1"
        else:
            return "0"

    def process_hash(self, session, data):
        vals = data.values()
        if not vals:
            # no data, so no matching terms
            return "0"
        process = self.process_string
        total = 0
        for d in vals:
            new = process(session, d['text'])
            if new == "1":
                if self.frequency:
                    total += d['occurences']
                else:
                    total += 1
        return str(total)

class CaseNormalizer(SimpleNormalizer):
    """ Reduce text to lower case """

    def process_string(self, session, data):
        return data.lower()

class ReverseNormalizer(SimpleNormalizer):
    """ Reverse string (eg for left truncation) """

    def process_string(self, session, data):
        return data[::-1]

class SpaceNormalizer(SimpleNormalizer):
    """ Reduce multiple whitespace to single space character """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        self.whitespace = re.compile("\s+")

    def process_string(self, session, data):
        data = data.strip()
        data = self.whitespace.sub(' ', data)
        return data

class ArticleNormalizer(SimpleNormalizer):
    """ Remove leading english articles (the, a, an) """

    def process_string(self, session, data):
        d = data.lower()
        if (d[:4] == "the "):
            return data[4:]
        elif (d[:2] == "a "):
            return data[2:]
        elif (d[:3] == "an "):
            return data[3:]
        else:
            return data

class NumericEntityNormalizer(SimpleNormalizer):
    """ Replace characters matching regular expression with the equivalent numeric character entity """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        regex = self.get_setting(session, 'regexp')
        if regex:
            self.regexp = re.compile(regex)
        self.function = lambda x: "&#%s;" % ord(x.group(1))

    def process_string(self, session, data):
        return self.regexp.sub(self.function, data)

# Non printable characters (Printable)
# self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')

# Non useful characters (Stripper)
# self.asciiRe = re.compile('["%#@~!*{}]')

class RegexpNormalizer(SimpleNormalizer):
    """ Either strip, replace or keep data which matches a given regular expression """

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        char = self.get_setting(session, 'char')
        regex = self.get_setting(session, 'regexp')
        self.keep = self.get_setting(session, 'keep')
        if regex:
            self.regexp = re.compile(regex)
        if char:
            self.char = char
        else:
            self.char = ''

    def process_string(self, session, data):
        if self.keep:
            l = self.regexp.findall(data)
            return self.char.join(l)
        else:
            return self.regexp.sub(self.char, data)

class PossessiveNormalizer(SimpleNormalizer):
    """ Remove trailing 's or s' from words """

    def process_string(self, session, data):
        # Not totally correct... eg: it's == 'it is', not 'of it'
        if (data[-2:] == "s'"):
            return data[:-1]
        elif (data[-2:] == "'s"):
            return data[:-2]
        else:
            return data

class IntNormalizer(SimpleNormalizer):
    """ Turn a string into an integer """

    def process_string(self, session, data):
        try:
            return long(data)
        except:
            return None

class StringIntNormalizer(SimpleNormalizer):
    """ Turn an integer into a 0 padded string, 12 chrs long """

    def process_string(self, session, data):
        try:
            d = long(data)
            return "%012d" % (d)
        except:
            return None

class StoplistNormalizer(SimpleNormalizer):
    """ Remove words that match a stopword list """
    stoplist = {}

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        p = self.get_path(session, "stoplist")
        if (not os.path.isabs(p)):
            dfp = self.get_path(session, "defaultPath")
            p = os.path.join(dfp, p)
        f = file(p)
        l = f.readlines()
        f.close()
        for sw in l:
            # chomp trailing newline
            self.stoplist[sw[:-1]] = 1

    def process_string(self, session, data):
        if (self.stoplist.has_key(data)):
            return None
        else:
            return data

try:
    import txngstemmer as Stemmer

    class StemNormalizer(SimpleNormalizer):
        """ Use a Snowball stemmer to stem the terms """
        stemmer = None

        def __init__(self, session, config, parent):
            SimpleNormalizer.__init__(self, session, config, parent)
            lang = self.get_setting(session, 'language', 'english')
            try:
                self.stemmer = Stemmer.Stemmer(lang)
            except:
                raise(ConfigFileException("Unknown stemmer language: %s" % (lang)))

        def process_string(self, session, data):
            if (type(data) != type(u"")):
                data = unicode(data, 'utf-8')
            return self.stemmer.stem([data])[0]

    class PhraseStemNormalizer(SimpleNormalizer):
        """ Use a Snowball stemmer to stem multiple words in a phrase (eg from PosPhraseNormalizer) """
        stemmer = None

        def __init__(self, session, config, parent):
            SimpleNormalizer.__init__(self, session, config, parent)
            lang = self.get_setting(session, 'language', 'english')
            self.punctuationRe = re.compile("((?<!s)'|[-.,]((?=\s)|$)|(^|(?<=\s))[-.,']|[~`!@+=\#\&\^*()\[\]{}\\\|\":;<>?/])")
            try:
                self.stemmer = Stemmer.Stemmer(lang)
            except:
                raise(ConfigFileException("Unknown stemmer language: %s" % (lang)))

        def process_string(self, session, data):
            if (type(data) != type(u"")):
                data = unicode(data, 'utf-8')
            s = self.punctuationRe.sub(' ', data)
            # split the punctuation-stripped form, not the raw data
            wds = s.split()
            stemmed = self.stemmer.stem(wds)
            return ' '.join(stemmed)

except:

    class StemNormalizer(SimpleNormalizer):
        def __init__(self, session, config, parent):
            raise(ConfigFileException('Stemmer library not available'))

class DateStringNormalizer(SimpleNormalizer):
    """ Turns a Date object into ISO8601 format """

    def process_string(self, session, data):
        # str() defaults to iso8601 format
        return str(data)

class RangeNormalizer(SimpleNormalizer):
    """ Should normalise ranges... unfinished """

    def process_hash(self, session, data):
        # Need to step through positions in order
        kw = {}
        vals = data.values()
        if not vals:
            return kw
        prox = vals[0].has_key('positions')
        if not prox:
            # Bad. Assume low -> high order
            tmplist = [(d['text'], d) for d in vals]
        else:
            # Need to duplicate across occs, as all in same hash from record
            tmplist = []
            for d in vals:
                for x in range(0, len(d['positions']), 2):
                    tmplist.append(("%s-%s" % (d['positions'][x], d['positions'][x+1]), d))
            tmplist.sort()
        print tmplist

        for t in range(0, len(tmplist), 2):
            base = tmplist[t][1]
            try:
                text = base['text'] + " " + tmplist[t+1][1]['text']
            except:
                text = base['text'] + " " + base['text']
            base['text'] = text
            try:
                del base['positions']
            except:
                pass
            kw[text] = base

        return kw

class KeywordNormalizer(SimpleNormalizer):
    """ Given a string, keyword it with proximity. Eg for chaining after ExactExtractor + other normalizers """

    # Including Proximity

    def __init__(self, session, config, parent):
        SimpleNormalizer.__init__(self, session, config, parent)
        pre = self.get_setting(session, 'regexp', "((?<!s)'|[-.,]((?=\s)|$)|(^|(?<=\s))[-.,']|[~`!@+=\#\&\^*()\[\]{}\\\|\":;<>?/])")
        self.punctuationRe = re.compile(pre)

    def process_string(self, session, data):
        kw = {}
        has = kw.has_key
        s = self.punctuationRe.sub(' ', data)
        # Force proximity
        if (self.get_setting(session, 'prox')):
            prox = 1
        else:
            prox = 0
        w = 0
        for t in s.split():
            if has(t):
                kw[t]['occurences'] += 1
                if prox:
                    kw[t]['positions'].extend([-1, w])
                    w += 1
            elif prox:
                kw[t] = {'text': t, 'occurences': 1,
                         'positions': [-1, w]}
                w += 1
            else:
                kw[t] = {'text': t, 'occurences': 1}
        return kw

    def process_hash(self, session, data):
        kw = {}
        vals = data.values()
        if (vals and vals[0].has_key('positions')) or self.get_setting(session, 'prox'):
            prox = 1
        else:
            prox = 0
        reSub = self.punctuationRe.sub
        has = kw.has_key
        for d in vals:
            t = d['text']
            s = reSub(' ', t)
            w = 0
            if prox:
                try:
                    lno = d['positions'][0]
                except:
                    lno = -1
            for t in s.split():
                if has(t):
                    kw[t]['occurences'] += 1
                    if prox:
                        kw[t]['positions'].extend([lno, w])
                        w += 1
                elif prox:
                    kw[t] = {'text': t, 'occurences': 1,
                             'positions': [lno, w]}
                    w += 1
                else:
                    kw[t] = {'text': t, 'occurences': 1}
        return kw

# These are very simple. Should read in map from file
# Maybe determine expansions file based on context
# etc.

class ExactExpansionNormalizer(SimpleNormalizer):
    # Expand stuff within a string
    # Then maybe pass to keyworder
    map = {
        'USA': 'United States of America',
        'UK': 'United Kingdom',
        'NZ': 'New Zealand',
        'XML': 'Extensible Markup Language',
        'SOAP': 'Simple Object Access Protocol',
        'SRW': 'Search Retrieve Webservice',
        'CQL': 'Common Query Language',
        "isn't": 'is not',
        "don't": "do not",
        "won't": "will not",
        "can't": "can not",
        "wasn't": "was not",
        "hasn't": "has not",
        "I'm": "I am",
        "you're": "you are",
        "he's": "he is",
        "she's": "she is",
        "they're": "they are",
        "we're": "we are",
        "I'd": "I would",
        "she'd": "she would",
        "he'd": "he would",
        "we'd": "we would",
        "they'd": "they would",
        "you'd": "you would"
    }

    def process_string(self, session, data):
        for m in self.map.items():
            data = data.replace(m[0], m[1])
        return data

class WordExpansionNormalizer(SimpleNormalizer):
    # Expand acronyms or abbreviations
    # Only for words, not strings.
    map = {
        'USA': ['United', 'States', 'of', 'America'],
        'UK': ['United', 'Kingdom'],
        'XML': ['Extensible', 'Markup', 'Language'],
        'SOAP': ['Simple', 'Object', 'Access', 'Protocol'],
        'SRW': ['Search', 'Retrieve', 'Webservice']
    }

    def process_string(self, session, data):
        if self.map.has_key(data):
            return ' '.join(self.map[data])
        else:
            # pass through words that have no expansion
            return data

    # Need own process_hash in order to merge
    # Will be same for any 1 -> many normalizer
    def process_hash(self, session, data):
        vals = data.values()
        if vals[0].has_key('positions'):
            raise NotImplementedError
        kw = {}
        has = kw.has_key
        map = self.map
        maphas = map.has_key
        for d in vals:
            t = d['text']
            if maphas(t):
                dlist = map[t]
                for new in dlist:
                    if has(new):
                        kw[new]['occurences'] += 1
                    else:
                        kw[new] = d
            else:
                kw[t] = d
        return kw

# XXX string.maketrans() ? Doesn't like Unicode...

class DiacriticNormalizer(SimpleNormalizer):
    """ Slow implementation of Unicode 4.0 character decomposition. Eg that é -> e """

    # Decomposition as per Unicode 4.0 Data file
    map = {
        u"\u00C0": u"\u0041", u"\u00C1": u"\u0041", u"\u00C2": u"\u0041", u"\u00C3": u"\u0041",
        u"\u00C4": u"\u0041", u"\u00C5": u"\u0041", u"\u00C7": u"\u0043", u"\u00C8": u"\u0045",
        u"\u00C9": u"\u0045", u"\u00CA": u"\u0045", u"\u00CB": u"\u0045", u"\u00CC": u"\u0049",
        u"\u00CD": u"\u0049", u"\u00CE": u"\u0049", u"\u00CF": u"\u0049", u"\u00D1": u"\u004E",
        u"\u00D2": u"\u004F", u"\u00D3": u"\u004F", u"\u00D4": u"\u004F", u"\u00D5": u"\u004F",
        u"\u00D6": u"\u004F", u"\u00D9": u"\u0055", u"\u00DA": u"\u0055", u"\u00DB": u"\u0055",
        u"\u00DC": u"\u0055", u"\u00DD": u"\u0059", u"\u00E0": u"\u0061", u"\u00E1": u"\u0061",
        u"\u00E2": u"\u0061", u"\u00E3": u"\u0061", u"\u00E4": u"\u0061", u"\u00E5": u"\u0061",
        u"\u00E7": u"\u0063", u"\u00E8": u"\u0065", u"\u00E9": u"\u0065", u"\u00EA": u"\u0065",
        u"\u00EB": u"\u0065", u"\u00EC": u"\u0069", u"\u00ED": u"\u0069", u"\u00EE": u"\u0069",
        u"\u00EF": u"\u0069", u"\u00F1": u"\u006E", u"\u00F2": u"\u006F", u"\u00F3": u"\u006F",
        u"\u00F4": u"\u006F", u"\u00F5": u"\u006F", u"\u00F6": u"\u006F", u"\u00F9": u"\u0075",
        u"\u00FA": u"\u0075", u"\u00FB": u"\u0075", u"\u00FC": u"\u0075", u"\u00FD": u"\u0079",
        u"\u00FF": u"\u0079", u"\u0100": u"\u0041", u"\u0101": u"\u0061", u"\u0102": u"\u0041",
        u"\u0103": u"\u0061", u"\u0104": u"\u0041", u"\u0105": u"\u0061", u"\u0106": u"\u0043",
        u"\u0107": u"\u0063", u"\u0108": u"\u0043", u"\u0109": u"\u0063", u"\u010A": u"\u0043",
        u"\u010B": u"\u0063", u"\u010C": u"\u0043", u"\u010D": u"\u0063", u"\u010E": u"\u0044",
        u"\u010F": u"\u0064", u"\u0112": u"\u0045", u"\u0113": u"\u0065", u"\u0114": u"\u0045",
        u"\u0115": u"\u0065", u"\u0116": u"\u0045", u"\u0117": u"\u0065", u"\u0118": u"\u0045",
        u"\u0119": u"\u0065", u"\u011A": u"\u0045", u"\u011B": u"\u0065", u"\u011C": u"\u0047",
        u"\u011D": u"\u0067", u"\u011E": u"\u0047", u"\u011F": u"\u0067", u"\u0120": u"\u0047",
        u"\u0121": u"\u0067", u"\u0122": u"\u0047", u"\u0123": u"\u0067", u"\u0124": u"\u0048",
        u"\u0125": u"\u0068", u"\u0128": u"\u0049", u"\u0129": u"\u0069", u"\u012A": u"\u0049",
        u"\u012B": u"\u0069", u"\u012C": u"\u0049", u"\u012D": u"\u0069", u"\u012E": u"\u0049",
        u"\u012F": u"\u0069", u"\u0130": u"\u0049", u"\u0132": u"\u0049", u"\u0133": u"\u0069",
        u"\u0134": u"\u004A", u"\u0135": u"\u006A", u"\u0136": u"\u004B", u"\u0137": u"\u006B",
        u"\u0139": u"\u004C", u"\u013A": u"\u006C", u"\u013B": u"\u004C", u"\u013C": u"\u006C",
        u"\u013D": u"\u004C", u"\u013E": u"\u006C", u"\u013F": u"\u004C", u"\u0140": u"\u006C",
        u"\u0143": u"\u004E", u"\u0144": u"\u006E", u"\u0145": u"\u004E", u"\u0146": u"\u006E",
        u"\u0147": u"\u004E", u"\u0148": u"\u006E", u"\u0149": u"\u02BC", u"\u014C": u"\u004F",
        u"\u014D": u"\u006F", u"\u014E": u"\u004F", u"\u014F": u"\u006F", u"\u0150": u"\u004F",
        u"\u0151": u"\u006F", u"\u0154": u"\u0052", u"\u0155": u"\u0072", u"\u0156": u"\u0052",
        u"\u0157": u"\u0072", u"\u0158": u"\u0052", u"\u0159": u"\u0072", u"\u015A": u"\u0053",
        u"\u015B": u"\u0073", u"\u015C": u"\u0053", u"\u015D": u"\u0073", u"\u015E": u"\u0053",
        u"\u015F": u"\u0073", u"\u0160": u"\u0053", u"\u0161": u"\u0073", u"\u0162": u"\u0054",
        u"\u0163": u"\u0074", u"\u0164": u"\u0054", u"\u0165": u"\u0074", u"\u0168": u"\u0055",
        u"\u0169": u"\u0075", u"\u016A": u"\u0055", u"\u016B": u"\u0075", u"\u016C": u"\u0055",
        u"\u016D": u"\u0075", u"\u016E": u"\u0055", u"\u016F": u"\u0075", u"\u0170": u"\u0055",
        u"\u0171": u"\u0075", u"\u0172": u"\u0055", u"\u0173": u"\u0075", u"\u0174": u"\u0057",
        u"\u0175": u"\u0077", u"\u0176": u"\u0059", u"\u0177": u"\u0079", u"\u0178": u"\u0059",
        u"\u0179": u"\u005A", u"\u017A": u"\u007A", u"\u017B": u"\u005A", u"\u017C": u"\u007A",
        u"\u017D": u"\u005A", u"\u017E": u"\u007A", u"\u01A0": u"\u004F", u"\u01A1": u"\u006F",
        u"\u01AF": u"\u0055", u"\u01B0": u"\u0075", u"\u01C4": u"\u0044", u"\u01C5": u"\u0044",
        u"\u01C6": u"\u0064", u"\u01C7": u"\u004C", u"\u01C8": u"\u004C", u"\u01C9": u"\u006C",
        u"\u01CA": u"\u004E", u"\u01CB": u"\u004E", u"\u01CC": u"\u006E", u"\u01CD": u"\u0041",
        u"\u01CE": u"\u0061", u"\u01CF": u"\u0049", u"\u01D0": u"\u0069", u"\u01D1": u"\u004F",
        u"\u01D2": u"\u006F", u"\u01D3": u"\u0055", u"\u01D4": u"\u0075", u"\u01D5": u"\u0055",
        u"\u01D6": u"\u0075", u"\u01D7": u"\u0055", u"\u01D8": u"\u0075", u"\u01D9": u"\u0055",
        u"\u01DA": u"\u0075", u"\u01DB": u"\u0055", u"\u01DC": u"\u0075", u"\u01DE": u"\u0041",
        u"\u01DF": u"\u0061", u"\u01E0": u"\u0226", u"\u01E1": u"\u0227", u"\u01E2": u"\u00C6",
        u"\u01E3": u"\u00E6", u"\u01E6": u"\u0047", u"\u01E7": u"\u0067", u"\u01E8": u"\u004B",
        u"\u01E9": u"\u006B", u"\u01EA": u"\u004F", u"\u01EB": u"\u006F", u"\u01EC": u"\u004F",
        u"\u01ED": u"\u006F", u"\u01EE": u"\u01B7", u"\u01EF": u"\u0292", u"\u01F0": u"\u006A",
        u"\u01F1": u"\u0044", u"\u01F2": u"\u0044", u"\u01F3": u"\u0064", u"\u01F4": u"\u0047",
        u"\u01F5": u"\u0067", u"\u01F8": u"\u004E", u"\u01F9": u"\u006E", u"\u01FA": u"\u0041",
        u"\u01FB": u"\u0061", u"\u01FC": u"\u00C6", u"\u01FD": u"\u00E6", u"\u01FE": u"\u00D8",
        u"\u01FF": u"\u00F8", u"\u0200": u"\u0041", u"\u0201": u"\u0061", u"\u0202": u"\u0041",
        u"\u0203": u"\u0061", u"\u0204": u"\u0045", u"\u0205": u"\u0065", u"\u0206": u"\u0045",
        u"\u0207": u"\u0065", u"\u0208": u"\u0049", u"\u0209": u"\u0069", u"\u020A": u"\u0049",
        u"\u020B": u"\u0069", u"\u020C": u"\u004F", u"\u020D": u"\u006F", u"\u020E": u"\u004F",
        u"\u020F": u"\u006F", u"\u0210": u"\u0052", u"\u0211": u"\u0072", u"\u0212": u"\u0052",
        u"\u0213": u"\u0072", u"\u0214": u"\u0055", u"\u0215": u"\u0075", u"\u0216": u"\u0055",
        u"\u0217": u"\u0075", u"\u0218": u"\u0053", u"\u0219": u"\u0073", u"\u021A": u"\u0054",
        u"\u021B": u"\u0074", u"\u021E": u"\u0048", u"\u021F": u"\u0068", u"\u0226": u"\u0041",
        u"\u0227": u"\u0061", u"\u0228": u"\u0045", u"\u0229": u"\u0065", u"\u022A": u"\u004F",
        u"\u022B": u"\u006F", u"\u022C": u"\u004F", u"\u022D": u"\u006F", u"\u022E": u"\u004F",
        u"\u022F": u"\u006F", u"\u0230": u"\u004F", u"\u0231": u"\u006F", u"\u0232": u"\u0059",
        u"\u0233": u"\u0079", u"\u1E00": u"\u0041", u"\u1E01": u"\u0061", u"\u1E02": u"\u0042",
        u"\u1E03": u"\u0062", u"\u1E04": u"\u0042", u"\u1E05": u"\u0062", u"\u1E06": u"\u0042",
        u"\u1E07": u"\u0062", u"\u1E08": u"\u0043", u"\u1E09": u"\u0063", u"\u1E0A": u"\u0044",
        u"\u1E0B": u"\u0064", u"\u1E0C": u"\u0044", u"\u1E0D": u"\u0064", u"\u1E0E": u"\u0044",
        u"\u1E0F": u"\u0064", u"\u1E10": u"\u0044", u"\u1E11": u"\u0064", u"\u1E12": u"\u0044",
        u"\u1E13": u"\u0064", u"\u1E14": u"\u0045", u"\u1E15": u"\u0065", u"\u1E16": u"\u0045",
        u"\u1E17": u"\u0065", u"\u1E18": u"\u0045", u"\u1E19": u"\u0065", u"\u1E1A": u"\u0045",
        u"\u1E1B": u"\u0065", u"\u1E1C": u"\u0045", u"\u1E1D": u"\u0065", u"\u1E1E": u"\u0046",
        u"\u1E1F": u"\u0066", u"\u1E20": u"\u0047", u"\u1E21": u"\u0067", u"\u1E22": u"\u0048",
        u"\u1E23": u"\u0068", u"\u1E24": u"\u0048", u"\u1E25": u"\u0068", u"\u1E26": u"\u0048",
        u"\u1E27": u"\u0068", u"\u1E28": u"\u0048", u"\u1E29": u"\u0068", u"\u1E2A": u"\u0048",
        u"\u1E2B": u"\u0068", u"\u1E2C": u"\u0049", u"\u1E2D": u"\u0069", u"\u1E2E": u"\u0049",
        u"\u1E2F": u"\u0069", u"\u1E30": u"\u004B", u"\u1E31": u"\u006B", u"\u1E32": u"\u004B",
        u"\u1E33": u"\u006B", u"\u1E34": u"\u004B", u"\u1E35": u"\u006B", u"\u1E36": u"\u004C",
        u"\u1E37": u"\u006C", u"\u1E38": u"\u004C", u"\u1E39": u"\u006C", u"\u1E3A": u"\u004C",
        u"\u1E3B": u"\u006C", u"\u1E3C": u"\u004C", u"\u1E3D": u"\u006C", u"\u1E3E": u"\u004D",
        u"\u1E3F": u"\u006D", u"\u1E40": u"\u004D", u"\u1E41": u"\u006D", u"\u1E42": u"\u004D",
        u"\u1E43": u"\u006D", u"\u1E44": u"\u004E", u"\u1E45": u"\u006E", u"\u1E46": u"\u004E",
        u"\u1E47": u"\u006E", u"\u1E48": u"\u004E", u"\u1E49": u"\u006E", u"\u1E4A": u"\u004E",
        u"\u1E4B": u"\u006E", u"\u1E4C": u"\u004F", u"\u1E4D": u"\u006F", u"\u1E4E": u"\u004F",
        u"\u1E4F": u"\u006F", u"\u1E50": u"\u004F", u"\u1E51": u"\u006F", u"\u1E52": u"\u004F",
        u"\u1E53": u"\u006F", u"\u1E54": u"\u0050", u"\u1E55": u"\u0070", u"\u1E56": u"\u0050",
        u"\u1E57": u"\u0070", u"\u1E58": u"\u0052", u"\u1E59": u"\u0072", u"\u1E5A": u"\u0052",
        u"\u1E5B": u"\u0072", u"\u1E5C": u"\u0052", u"\u1E5D": u"\u0072", u"\u1E5E": u"\u0052",
        u"\u1E5F": u"\u0072", u"\u1E60": u"\u0053", u"\u1E61": u"\u0073", u"\u1E62": u"\u0053",
        u"\u1E63": u"\u0073", u"\u1E64": u"\u0053", u"\u1E65": u"\u0073", u"\u1E66": u"\u0053",
        u"\u1E67": u"\u0073", u"\u1E68": u"\u0053", u"\u1E69": u"\u0073", u"\u1E6A": u"\u0054",
        u"\u1E6B": u"\u0074", u"\u1E6C": u"\u0054", u"\u1E6D": u"\u0074", u"\u1E6E": u"\u0054",
        u"\u1E6F": u"\u0074", u"\u1E70": u"\u0054", u"\u1E71": u"\u0074", u"\u1E72": u"\u0055",
        u"\u1E73": u"\u0075", u"\u1E74": u"\u0055", u"\u1E75": u"\u0075", u"\u1E76": u"\u0055",
        u"\u1E77": u"\u0075", u"\u1E78": u"\u0055", u"\u1E79": u"\u0075", u"\u1E7A": u"\u0055",
        u"\u1E7B": u"\u0075", u"\u1E7C": u"\u0056", u"\u1E7D": u"\u0076", u"\u1E7E": u"\u0056",
        u"\u1E7F": u"\u0076", u"\u1E80": u"\u0057", u"\u1E81": u"\u0077", u"\u1E82": u"\u0057",
        u"\u1E83": u"\u0077", u"\u1E84": u"\u0057", u"\u1E85": u"\u0077", u"\u1E86": u"\u0057",
        u"\u1E87": u"\u0077", u"\u1E88": u"\u0057", u"\u1E89": u"\u0077", u"\u1E8A": u"\u0058",
        u"\u1E8B": u"\u0078", u"\u1E8C": u"\u0058", u"\u1E8D": u"\u0078", u"\u1E8E": u"\u0059",
        u"\u1E8F": u"\u0079", u"\u1E90": u"\u005A", u"\u1E91": u"\u007A", u"\u1E92": u"\u005A",
        u"\u1E93": u"\u007A", u"\u1E94": u"\u005A", u"\u1E95": u"\u007A", u"\u1E96": u"\u0068",
        u"\u1E97": u"\u0074", u"\u1E98": u"\u0077", u"\u1E99": u"\u0079", u"\u1E9A": u"\u0061",
        u"\u1E9B": u"\u017F", u"\u1EA0": u"\u0041", u"\u1EA1": u"\u0061", u"\u1EA2": u"\u0041",
        u"\u1EA3": u"\u0061", u"\u1EA4": u"\u0041", u"\u1EA5": u"\u0061", u"\u1EA6": u"\u0041",
        u"\u1EA7": u"\u0061", u"\u1EA8": u"\u0041", u"\u1EA9": u"\u0061", u"\u1EAA": u"\u0041",
        u"\u1EAB": u"\u0061", u"\u1EAC": u"\u0041", u"\u1EAD": u"\u0061", u"\u1EAE": u"\u0041",
        u"\u1EAF": u"\u0061", u"\u1EB0": u"\u0041", u"\u1EB1": u"\u0061", u"\u1EB2": u"\u0041",
        u"\u1EB3": u"\u0061", u"\u1EB4": u"\u0041", u"\u1EB5": u"\u0061", u"\u1EB6": u"\u0041",
        u"\u1EB7": u"\u0061", u"\u1EB8": u"\u0045", u"\u1EB9": u"\u0065", u"\u1EBA": u"\u0045",
        u"\u1EBB": u"\u0065", u"\u1EBC": u"\u0045", u"\u1EBD": u"\u0065", u"\u1EBE": u"\u0045",
        u"\u1EBF": u"\u0065", u"\u1EC0": u"\u0045", u"\u1EC1": u"\u0065", u"\u1EC2": u"\u0045",
        u"\u1EC3": u"\u0065", u"\u1EC4": u"\u0045", u"\u1EC5": u"\u0065", u"\u1EC6": u"\u0045",
        u"\u1EC7": u"\u0065", u"\u1EC8": u"\u0049", u"\u1EC9": u"\u0069", u"\u1ECA": u"\u0049",
        u"\u1ECB": u"\u0069", u"\u1ECC": u"\u004F", u"\u1ECD": u"\u006F", u"\u1ECE": u"\u004F",
        u"\u1ECF": u"\u006F", u"\u1ED0": u"\u004F", u"\u1ED1": u"\u006F", u"\u1ED2": u"\u004F",
        u"\u1ED3": u"\u006F", u"\u1ED4": u"\u004F", u"\u1ED5": u"\u006F", u"\u1ED6": u"\u004F",
        u"\u1ED7": u"\u006F", u"\u1ED8": u"\u004F", u"\u1ED9": u"\u006F", u"\u1EDA": u"\u004F",
        u"\u1EDB": u"\u006F", u"\u1EDC": u"\u004F", u"\u1EDD": u"\u006F", u"\u1EDE": u"\u004F",
        u"\u1EDF": u"\u006F", u"\u1EE0": u"\u004F", u"\u1EE1": u"\u006F", u"\u1EE2": u"\u004F",
        u"\u1EE3": u"\u006F", u"\u1EE4": u"\u0055", u"\u1EE5": u"\u0075", u"\u1EE6": u"\u0055",
        u"\u1EE7": u"\u0075", u"\u1EE8": u"\u0055", u"\u1EE9": u"\u0075", u"\u1EEA": u"\u0055",
        u"\u1EEB": u"\u0075", u"\u1EEC": u"\u0055", u"\u1EED": u"\u0075", u"\u1EEE": u"\u0055",
        u"\u1EEF": u"\u0075", u"\u1EF0": u"\u0055", u"\u1EF1": u"\u0075", u"\u1EF2": u"\u0059",
        u"\u1EF3": u"\u0079", u"\u1EF4": u"\u0059", u"\u1EF5": u"\u0079", u"\u1EF6": u"\u0059",
        u"\u1EF7": u"\u0079", u"\u1EF8": u"\u0059", u"\u1EF9": u"\u0079"
    }

    def process_string(self, session, data):
        d = []
        # XXX Horrifically slow
        for c in data:
            if (self.map.has_key(c)):
                d.append(self.map[c])
            else:
                d.append(c)
        return ''.join(d)
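
The XXX comments above hint at a faster route. A possible alternative (a sketch, not part of the original module) is to let unicodedata perform the decomposition and then drop the combining marks; its behaviour is close to, but not identical with, the hand-built table above:

# Hedged sketch: NFD-decompose, then strip combining marks.
import unicodedata

def strip_diacritics(text):
    if type(text) != type(u""):
        text = unicode(text, 'utf-8')
    decomposed = unicodedata.normalize('NFD', text)
    return u''.join([c for c in decomposed if not unicodedata.combining(c)])

# strip_diacritics(u'caf\u00e9 na\u00efve') -> u'cafe naive'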