diff --git a/README.md b/README.md index 66a7a4c..f8df1da 100644 --- a/README.md +++ b/README.md @@ -11,3 +11,10 @@ required more than simple transliteration. >>> slugify.slugify(u'Bän...g (bang)') u'bäng-bang' + + >>> slugify.slugify(u'Bäuma means a tree', replace_latin=True) + u'bauma-means-a-tree' + + >>> slugify.slugify(u'Bakıcı geldi', replace_latin=True) + u'bakici-geldi' + diff --git a/slugify/__init__.py b/slugify/__init__.py index 791c0ea..3fa1a65 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -2,6 +2,79 @@ import six import unicodedata +LATIN_LETTERS = { + u'\N{LATIN SMALL LETTER DOTLESS I}': 'i', + u'\N{LATIN SMALL LETTER S WITH CEDILLA}': 's', + u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'c', + u'\N{LATIN SMALL LETTER G WITH BREVE}': 'g', + u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'o', + u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'u', + u'\N{LATIN SMALL LETTER A WITH GRAVE}' : 'a', + u'\N{LATIN SMALL LETTER A WITH ACUTE}' : 'a', + u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}' : 'a', + u'\N{LATIN SMALL LETTER A WITH TILDE}' : 'a', + u'\N{LATIN SMALL LETTER A WITH DIAERESIS}' : 'a', + u'\N{LATIN SMALL LETTER A WITH RING ABOVE}' : 'a', + u'\N{LATIN SMALL LETTER A WITH MACRON}': 'a', + u'\N{LATIN SMALL LETTER A WITH BREVE}': 'a', + u'\N{LATIN SMALL LETTER AE}' : 'ae', + u'\N{LATIN SMALL LETTER E WITH GRAVE}' : 'e', + u'\N{LATIN SMALL LETTER E WITH ACUTE}' : 'e', + u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}' : 'e', + u'\N{LATIN SMALL LETTER E WITH DIAERESIS}' : 'e', + u'\N{LATIN SMALL LETTER I WITH GRAVE}' : 'i', + u'\N{LATIN SMALL LETTER I WITH ACUTE}' : 'i', + u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}' : 'i', + u'\N{LATIN SMALL LETTER I WITH DIAERESIS}' : 'i', + u'\N{LATIN SMALL LETTER N WITH TILDE}' : 'n', + u'\N{LATIN SMALL LETTER O WITH GRAVE}' : 'o', + u'\N{LATIN SMALL LETTER O WITH ACUTE}' : 'o', + u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}' : 'o', + u'\N{LATIN SMALL LETTER O WITH TILDE}' : 'o', + u'\N{LATIN SMALL LETTER O WITH STROKE}': 'o', 
+ u'\N{LATIN SMALL LETTER U WITH GRAVE}': 'u', + u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'u', + u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}': 'u', + u'\N{LATIN SMALL LETTER Y WITH ACUTE}': 'y', + u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}': 'y' +} + +CAPITAL_LATIN_LETTERS = { + u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}': 'I', + u'\N{LATIN CAPITAL LETTER S WITH CEDILLA}': 'S', + u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}': 'C', + u'\N{LATIN CAPITAL LETTER G WITH BREVE}': 'G', + u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}': 'O', + u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}': 'U', + u'\N{LATIN CAPITAL LETTER A WITH GRAVE}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH ACUTE}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH TILDE}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH MACRON}': 'A', + u'\N{LATIN CAPITAL LETTER A WITH BREVE}': 'A', + u'\N{LATIN CAPITAL LETTER AE}' : 'AE', + u'\N{LATIN CAPITAL LETTER E WITH GRAVE}' : 'E', + u'\N{LATIN CAPITAL LETTER E WITH ACUTE}' : 'E', + u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}' : 'E', + u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}' : 'E', + u'\N{LATIN CAPITAL LETTER I WITH GRAVE}' : 'I', + u'\N{LATIN CAPITAL LETTER I WITH ACUTE}' : 'I', + u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}' : 'I', + u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}' : 'I', + u'\N{LATIN CAPITAL LETTER N WITH TILDE}' : 'N', + u'\N{LATIN CAPITAL LETTER O WITH GRAVE}' : 'O', + u'\N{LATIN CAPITAL LETTER O WITH ACUTE}' : 'O', + u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}' : 'O', + u'\N{LATIN CAPITAL LETTER O WITH TILDE}' : 'O', + u'\N{LATIN CAPITAL LETTER O WITH STROKE}': 'O', + u'\N{LATIN CAPITAL LETTER U WITH GRAVE}': 'U', + u'\N{LATIN CAPITAL LETTER U WITH ACUTE}': 'U', + u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}': 'U', + u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}': 'Y', + u'\N{LATIN CAPITAL LETTER Y WITH 
DIAERESIS}': 'Y' +} def smart_text(s, encoding='utf-8', errors='strict'): if isinstance(s, six.text_type): @@ -21,14 +94,30 @@ def smart_text(s, encoding='utf-8', errors='strict'): s = six.text_type(s) return s - -# Extra characters outside of alphanumerics that we'll allow. SLUG_OK = '-_~' -def slugify(s, ok=SLUG_OK, lower=True, spaces=False): - # L and N signify letter/number. - # http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table +def slugify(s, ok=SLUG_OK, lower=True, spaces=False, replace_latin=False): + """ + Creates a unicode slug for given string with several options. + + L and N signify letter/number. + http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table + + :param s: Your unicode string. + :param ok: Extra characters outside of alphanumerics to be allowed. + :param lower: Lower the output string. + :param spaces: True allows spaces, False replaces a space with a dash (-) + :param replace_latin: True to replace common unicode latin letters with their similar ascii representations. 
+ :type s: String + :type ok: String + :type lower: Bool + :type spaces: Bool + :type replace_latin: Bool + :return: Slugified unicode string + + """ + rv = [] for c in unicodedata.normalize('NFKC', smart_text(s)): cat = unicodedata.category(c)[0] @@ -39,4 +128,17 @@ def slugify(s, ok=SLUG_OK, lower=True, spaces=False): new = ''.join(rv).strip() if not spaces: new = re.sub('[-\s]+', '-', new) - return new.lower() if lower else new + + new = new.lower() if lower else new + + # Smart replace for latin alphabet + if replace_latin == True: + + for char, new_char in LATIN_LETTERS.items(): + new = new.replace(char, new_char) + + if not lower: + for char, new_char in CAPITAL_LATIN_LETTERS.items(): + new = new.replace(char, new_char) + + return new diff --git a/slugify/tests.py b/slugify/tests.py index 4515147..a03286d 100644 --- a/slugify/tests.py +++ b/slugify/tests.py @@ -16,6 +16,12 @@ def test_slugify(): def check(x, y): eq_(slugify(x), y) + def check_replace_latin(x, y): + eq_(slugify(x, replace_latin=True), y) + + def check_replace_latin_capital(x, y): + eq_(slugify(x, lower=False, replace_latin=True), y) + s = [('xx x - "#$@ x', 'xx-x-x'), (u'Bän...g (bang)', u'bäng-bang'), (u, u.lower()), @@ -33,11 +39,24 @@ def check(x, y): # I don't really care what slugify returns. Just don't crash. (u'x𘍿', u'x'), (u'ϧ΃𘒬𘓣', u'\u03e7'), - (u'¿x', u'x')] + (u'¿x', u'x'), + (u'Bakıcı geldi', u'bak\u0131c\u0131-geldi'), + (u'Bäuma means tree', u'b\xe4uma-means-tree')] + + replace_latin = [(u'Bakıcı geldi', u'bakici-geldi'), (u'Bäuma means tree', u'bauma-means-tree')] + + replace_latin_capital = [(u'BÄUMA MEANS TREE', u'BAUMA-MEANS-TREE'), + (u'EMİN WAS HERE', u'EMIN-WAS-HERE')] for val, expected in s: yield check, val, expected + for val, expected in replace_latin: + yield check_replace_latin, val, expected + + for val, expected in replace_latin_capital: + yield check_replace_latin_capital, val, expected + class SmartTextTestCase(unittest.TestCase):