Package pytils :: Module translit
[hide private]
[frames | no frames]

Source Code for Module pytils.translit

  1  # -*- coding: utf-8 -*- 
  2  # -*- test-case-name: pytils.test.test_translit -*- 
  3  # License: GNU GPL2 
  4  # Author: Pythy <the.pythy@gmail.com> 
  5  """ 
  6  Simple transliteration 
  7  """ 
  8   
  9  __id__ = __revision__ = "$Id: translit.py 32 2006-10-28 07:10:43Z the.pythy $" 
 10  __url__ = "$URL: https://pythy.googlecode.com/svn/trunk/pytils/pytils/translit.py $" 
 11   
 12  import re 
 13  from pytils import utils 
 14   
 15  TRANSTABLE = ( 
             # Each entry is a pair (source symbol, ASCII replacement).
             # translify() replaces the first item with the second, in table
             # order; detranslify() replaces the second item with the first,
             # in the same order.  Within each case group longer replacements
             # are listed before shorter ones, so detranslify() matches
             # e.g. "Sch" before "S".
 16          (u"‘", u"'"), 
 17          (u"’", u"'"), 
 18          (u"«", u'"'), 
 19          (u"»", u'"'), 
 20          (u"–", u"-"), 
 21          (u"…", u"..."), 
 22          (u"№", u"#"), 
 23          ## upper case 
 24          # three-letter replacements 
 25          (u"Щ", u"Sch"), 
 26          # when translating Russian->English the first replacement 
 27          # is the one that applies, i.e. Sch; 
 28          # when translating English->Russian, both the SCH and 
 29          # the Sch variants are matched 
 30          (u"Щ", u"SCH"),  
 31          # two-letter replacements 
 32          (u"Ё", u"Yo"), 
 33          (u"Ё", u"YO"), 
 34          (u"Ж", u"Zh"), 
 35          (u"Ж", u"ZH"), 
 36          (u"Ц", u"Ts"), 
 37          (u"Ц", u"TS"), 
 38          (u"Ч", u"Ch"), 
 39          (u"Ч", u"CH"), 
 40          (u"Ш", u"Sh"), 
 41          (u"Ш", u"SH"), 
 42          (u"Ы", u"Yi"), 
 43          (u"Ы", u"YI"), 
 44          (u"Ю", u"Yu"), 
 45          (u"Ю", u"YU"), 
 46          (u"Я", u"Ya"), 
 47          (u"Я", u"YA"), 
 48          # one-letter replacements 
 49          (u"А", u"A"), 
 50          (u"Б", u"B"), 
 51          (u"В", u"V"), 
 52          (u"Г", u"G"), 
 53          (u"Д", u"D"), 
 54          (u"Е", u"E"), 
 55          (u"З", u"Z"), 
 56          (u"И", u"I"), 
 57          (u"Й", u"J"), 
 58          (u"К", u"K"), 
 59          (u"Л", u"L"), 
 60          (u"М", u"M"), 
 61          (u"Н", u"N"), 
 62          (u"О", u"O"), 
 63          (u"П", u"P"), 
 64          (u"Р", u"R"), 
 65          (u"С", u"S"), 
 66          (u"Т", u"T"), 
 67          (u"У", u"U"), 
 68          (u"Ф", u"F"), 
 69          (u"Х", u"H"), 
 70          (u"Э", u"E"), 
 71          (u"Ъ", u"`"), 
 72          (u"Ь", u"'"),         
 73          ## lower case 
 74          # three-letter replacements 
 75          (u"щ", u"sch"), 
 76          # two-letter replacements 
 77          (u"ё", u"yo"), 
 78          (u"ж", u"zh"), 
 79          (u"ц", u"ts"), 
 80          (u"ч", u"ch"), 
 81          (u"ш", u"sh"), 
 82          (u"ы", u"yi"), 
 83          (u"ю", u"yu"), 
 84          (u"я", u"ya"), 
 85          # one-letter replacements 
 86          (u"а", u"a"), 
 87          (u"б", u"b"), 
 88          (u"в", u"v"), 
 89          (u"г", u"g"), 
 90          (u"д", u"d"), 
 91          (u"е", u"e"), 
 92          (u"з", u"z"), 
 93          (u"и", u"i"), 
 94          (u"й", u"j"), 
 95          (u"к", u"k"), 
 96          (u"л", u"l"), 
 97          (u"м", u"m"), 
 98          (u"н", u"n"), 
 99          (u"о", u"o"), 
100          (u"п", u"p"), 
101          (u"р", u"r"), 
102          (u"с", u"s"), 
103          (u"т", u"t"), 
104          (u"у", u"u"), 
105          (u"ф", u"f"), 
106          (u"х", u"h"), 
107          (u"э", u"e"), 
108          (u"ъ", u"`"), 
109          (u"ь", u"'"), 
110          # to complete the English alphabet (used by slugify) 
111          # add the English letters and the digits that 
112          # do not occur in the pairs above 
113          (u"c", u"c"), 
114          (u"q", u"q"), 
115          (u"y", u"y"), 
116          (u"x", u"x"), 
117          (u"C", u"C"), 
118          (u"Q", u"Q"), 
119          (u"Y", u"Y"), 
120          (u"X", u"X"), 
121          (u"1", u"1"), 
122          (u"2", u"2"), 
123          (u"3", u"3"), 
124          (u"4", u"4"), 
125          (u"5", u"5"), 
126          (u"6", u"6"), 
127          (u"7", u"7"), 
128          (u"8", u"8"), 
129          (u"9", u"9"), 
130          (u"0", u"0"), 
131          )  #: Translation table 
132   
133  RU_ALPHABET = [x[0] for x in TRANSTABLE] #: Russian alphabet that we can translate 
134  EN_ALPHABET = [x[1] for x in TRANSTABLE] #: English alphabet that we can detransliterate 
135  ALPHABET = RU_ALPHABET + EN_ALPHABET #: Alphabet that we can (de)transliterate 
136   
137   
def translify(in_string):
    """
    Transliterate Russian text to its ASCII (Latin) spelling.

    Every pair of L{TRANSTABLE} is applied in table order: each occurrence
    of the first item of the pair is replaced with the second item.  The
    result is then converted with C{str()}.

    @param in_string: input string
    @type in_string: C{unicode}

    @return: transliterated string
    @rtype: C{str}

    @raise TypeError: when in_string is not C{unicode}
    @raise ValueError: when string doesn't transliterate completely,
        i.e. the C{str()} conversion fails with C{UnicodeEncodeError}
    """
    utils.check_type('in_string', unicode)

    result = in_string
    for source, target in TRANSTABLE:
        result = result.replace(source, target)

    try:
        return str(result)
    except UnicodeEncodeError:
        # the str() conversion failed, so the replacement pass did not
        # turn the whole string into something str() can encode
        raise ValueError("Unicode string doesn't transliterate completely, "
                         "is it russian?")
164
def detranslify(in_string):
    """
    Convert transliterated (Latin) text back to Russian.

    The input is first converted with C{unicode()}, then every pair of
    L{TRANSTABLE} is applied in table order in the reverse direction:
    each occurrence of the second item of the pair is replaced with the
    first item.

    @param in_string: input string
    @type in_string: C{basestring}

    @return: detransliterated string
    @rtype: C{unicode}

    @raise TypeError: when in_string neither C{str}, no C{unicode}
    @raise ValueError: if in_string is C{str}, but it isn't ascii
    """
    utils.check_type('in_string', basestring)

    # convert to unicode
    try:
        text = unicode(in_string)
    except UnicodeDecodeError:
        raise ValueError("We expects when in_string is str type,"
                         "it is an ascii, but now it isn't. Use unicode "
                         "in this case.")

    for cyrillic, latin in TRANSTABLE:
        text = text.replace(latin, cyrillic)
    return text
192
def slugify(in_string):
    """
    Prepare string for slug (i.e. URL or file/dir name)

    Steps, in order: replace C{&amp;} / C{&} with " and ", collapse runs of
    whitespace and hyphens into a single hyphen, drop every symbol that is
    not in L{ALPHABET}, transliterate with L{translify}, remove everything
    that is not a word character, whitespace or hyphen, then strip
    surrounding whitespace and lower-case the result.

    @param in_string: input string
    @type in_string: C{basestring}

    @return: slug-string
    @rtype: C{str}

    @raise TypeError: when in_string isn't C{unicode} or C{str}
    @raise ValueError: if in_string is C{str}, but it isn't ascii
    """
    utils.check_type('in_string', basestring)
    try:
        text = unicode(in_string)
    except UnicodeDecodeError:
        raise ValueError("We expects when in_string is str type,"
                         "it is an ascii, but now it isn't. Use unicode "
                         "in this case.")

    # NOTE: the patterns below are raw strings so that the backslash
    # escapes reach the regex engine verbatim instead of relying on
    # Python passing unknown string escapes through unchanged.

    # convert & (and its HTML entity) to "and"
    text = re.sub(r'\&amp\;|\&', ' and ', text)
    # replace runs of spaces/hyphens by a single hyphen
    text = re.sub(r'[-\s]+', '-', text)
    # remove symbols that are not in the alphabet
    text = u''.join([symb for symb in text if symb in ALPHABET])
    # translify it
    ascii_text = translify(text)
    # remove whatever is not a word character, whitespace or hyphen
    return re.sub(r'[^\w\s-]', '', ascii_text).strip().lower()
226
def dirify(in_string):
    """
    Alias for L{slugify}

    @param in_string: input string, passed to L{slugify} unchanged
    @type in_string: C{basestring}

    @return: slug-string produced by L{slugify}
    @rtype: C{str}
    """
    # the result must be returned, otherwise the alias always yields None
    return slugify(in_string)
232