From d624d96f44d3754344b71673f43e762c62066987 Mon Sep 17 00:00:00 2001 From: Emin Bugra Date: Sun, 11 Jan 2015 21:46:20 -0800 Subject: [PATCH 01/10] Add letters. --- slugify/__init__.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/slugify/__init__.py b/slugify/__init__.py index 791c0ea..9d857ab 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -2,6 +2,27 @@ import six import unicodedata +LETTERS = { + u'\u0131': 'i', + u'\u015f': 's', + u'\xe7': 'c', + u'\u011f': 'g', + u'\xf6': 'o', + u'\xfc': 'u', + u'\xe2': 'a', + u'\xee': 'i' +} + +CAPITAL_LETTERS = { + u'\u0130': 'I', + u'\u015e': 'S', + u'\xc7': 'C', + u'\u011e': 'G', + u'\xd6': 'O', + u'\xdc': 'U', + u'\xc2': 'A', + u'\xce': 'I' +} def smart_text(s, encoding='utf-8', errors='strict'): if isinstance(s, six.text_type): From 3d1cd1279e3e34e8570aa4aa0bc8257d95be27d5 Mon Sep 17 00:00:00 2001 From: Emin Bugra Date: Sun, 11 Jan 2015 21:52:36 -0800 Subject: [PATCH 02/10] Add smart replace. --- slugify/__init__.py | 17 +++++++++++++++-- slugify/tests.py | 10 +++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/slugify/__init__.py b/slugify/__init__.py index 9d857ab..0618d65 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -47,7 +47,7 @@ def smart_text(s, encoding='utf-8', errors='strict'): SLUG_OK = '-_~' -def slugify(s, ok=SLUG_OK, lower=True, spaces=False): +def slugify(s, ok=SLUG_OK, lower=True, spaces=False, smart_replace=False): # L and N signify letter/number. 
# http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table rv = [] @@ -60,4 +60,17 @@ def slugify(s, ok=SLUG_OK, lower=True, spaces=False): new = ''.join(rv).strip() if not spaces: new = re.sub('[-\s]+', '-', new) - return new.lower() if lower else new + + new.lower() if lower else new + + # Smart replace + if smart_replace == True: + + for char, new_char in LETTERS.items(): + new = new.replace(char, new_char) + + if not lower: + for char, new_char in CAPITAL_LETTERS.items(): + new = new.replace(char, new_char) + + return new diff --git a/slugify/tests.py b/slugify/tests.py index 4515147..125d811 100644 --- a/slugify/tests.py +++ b/slugify/tests.py @@ -16,6 +16,9 @@ def test_slugify(): def check(x, y): eq_(slugify(x), y) + def check_smart_replace(x, y): + eq_(slugify(x, smart_replace=True), y) + s = [('xx x - "#$@ x', 'xx-x-x'), (u'Bän...g (bang)', u'bäng-bang'), (u, u.lower()), @@ -33,11 +36,16 @@ def check(x, y): # I don't really care what slugify returns. Just don't crash. (u'x𘍿', u'x'), (u'ϧ΃𘒬𘓣', u'\u03e7'), - (u'¿x', u'x')] + (u'¿x', u'x'), + (u'Bakıcı geldi', u'bak\u0131c\u0131-geldi')] + + smart_replace = [(u'Bakıcı geldi', u'bakici-geldi')] for val, expected in s: yield check, val, expected + for val, expected in smart_replace: + yield check_smart_replace, val, expected class SmartTextTestCase(unittest.TestCase): From 4aeb464c733599eb95561db4df397bfdd627fcee Mon Sep 17 00:00:00 2001 From: Emin Bugra Date: Sun, 11 Jan 2015 21:55:40 -0800 Subject: [PATCH 03/10] Typo fix. 
--- slugify/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slugify/__init__.py b/slugify/__init__.py index 0618d65..5a933e5 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -61,7 +61,7 @@ def slugify(s, ok=SLUG_OK, lower=True, spaces=False, smart_replace=False): if not spaces: new = re.sub('[-\s]+', '-', new) - new.lower() if lower else new + new = new.lower() if lower else new # Smart replace if smart_replace == True: From ee9451607edc57e1607f59b50a78da339573a8e9 Mon Sep 17 00:00:00 2001 From: Emin Bugra Date: Sun, 11 Jan 2015 22:24:41 -0800 Subject: [PATCH 04/10] Replace with Unicode UCS-2 versions to make it readable. --- slugify/__init__.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/slugify/__init__.py b/slugify/__init__.py index 5a933e5..0ecd8f6 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -3,25 +3,21 @@ import unicodedata LETTERS = { - u'\u0131': 'i', - u'\u015f': 's', - u'\xe7': 'c', - u'\u011f': 'g', - u'\xf6': 'o', - u'\xfc': 'u', - u'\xe2': 'a', - u'\xee': 'i' + u'\N{LATIN SMALL LETTER DOTLESS I}': 'i', + u'\N{LATIN SMALL LETTER S WITH CEDILLA}': 's', + u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'c', + u'\N{LATIN SMALL LETTER G WITH BREVE}': 'g', + u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'o', + u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'u', } CAPITAL_LETTERS = { - u'\u0130': 'I', - u'\u015e': 'S', - u'\xc7': 'C', - u'\u011e': 'G', - u'\xd6': 'O', - u'\xdc': 'U', - u'\xc2': 'A', - u'\xce': 'I' + u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}': 'I', + u'\N{LATIN CAPITAL LETTER S WITH CEDILLA}': 'S', + u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}': 'C', + u'\N{LATIN CAPITAL LETTER G WITH BREVE}': 'G', + u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}': 'O', + u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}': 'U' } def smart_text(s, encoding='utf-8', errors='strict'): From cb83c7d16721b9a572a412e7a074a8ce93dc29fa Mon Sep 17 00:00:00 2001 From: Emin Bugra 
Date: Sun, 11 Jan 2015 23:08:56 -0800 Subject: [PATCH 05/10] Add more latin letters. --- slugify/__init__.py | 58 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/slugify/__init__.py b/slugify/__init__.py index 0ecd8f6..1ca860e 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -9,6 +9,34 @@ u'\N{LATIN SMALL LETTER G WITH BREVE}': 'g', u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'o', u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'u', + u'\N{LATIN SMALL LETTER A WITH GRAVE}' : 'a', + u'\N{LATIN SMALL LETTER A WITH ACUTE}' : 'a', + u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}' : 'a', + u'\N{LATIN SMALL LETTER A WITH TILDE}' : 'a', + u'\N{LATIN SMALL LETTER A WITH DIAERESIS}' : 'a', + u'\N{LATIN SMALL LETTER A WITH RING ABOVE}' : 'a', + u'\N{LATIN SMALL LETTER A WITH MACRON}': 'a', + u'\N{LATIN SMALL LETTER A WITH BREVE}': 'a', + u'\N{LATIN SMALL LETTER AE}' : 'ae', + u'\N{LATIN SMALL LETTER E WITH GRAVE}' : 'e', + u'\N{LATIN SMALL LETTER E WITH ACUTE}' : 'e', + u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}' : 'e', + u'\N{LATIN SMALL LETTER E WITH DIAERESIS}' : 'e', + u'\N{LATIN SMALL LETTER I WITH GRAVE}' : 'i', + u'\N{LATIN SMALL LETTER I WITH ACUTE}' : 'i', + u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}' : 'i', + u'\N{LATIN SMALL LETTER I WITH DIAERESIS}' : 'i', + u'\N{LATIN SMALL LETTER N WITH TILDE}' : 'n', + u'\N{LATIN SMALL LETTER O WITH GRAVE}' : 'o', + u'\N{LATIN SMALL LETTER O WITH ACUTE}' : 'o', + u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}' : 'o', + u'\N{LATIN SMALL LETTER O WITH TILDE}' : 'o', + u'\N{LATIN SMALL LETTER O WITH STROKE}': 'o', + u'\N{LATIN SMALL LETTER U WITH GRAVE}': 'u', + u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'u', + u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}': 'u', + u'\N{LATIN SMALL LETTER Y WITH ACUTE}': 'y', + u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}': 'y' } CAPITAL_LETTERS = { @@ -17,7 +45,35 @@ u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}': 'C', u'\N{LATIN CAPITAL LETTER G 
WITH BREVE}': 'G', u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}': 'O', - u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}': 'U' + u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}': 'U', + u'\N{LATIN CAPITAL LETTER A WITH GRAVE}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH ACUTE}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH TILDE}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}' : 'A', + u'\N{LATIN CAPITAL LETTER A WITH MACRON}': 'A', + u'\N{LATIN CAPITAL LETTER A WITH BREVE}': 'A', + u'\N{LATIN CAPITAL LETTER AE}' : 'AE', + u'\N{LATIN CAPITAL LETTER E WITH GRAVE}' : 'E', + u'\N{LATIN CAPITAL LETTER E WITH ACUTE}' : 'E', + u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}' : 'E', + u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}' : 'E', + u'\N{LATIN CAPITAL LETTER I WITH GRAVE}' : 'I', + u'\N{LATIN CAPITAL LETTER I WITH ACUTE}' : 'I', + u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}' : 'I', + u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}' : 'I', + u'\N{LATIN CAPITAL LETTER N WITH TILDE}' : 'N', + u'\N{LATIN CAPITAL LETTER O WITH GRAVE}' : 'O', + u'\N{LATIN CAPITAL LETTER O WITH ACUTE}' : 'O', + u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}' : 'O', + u'\N{LATIN CAPITAL LETTER O WITH TILDE}' : 'O', + u'\N{LATIN CAPITAL LETTER O WITH STROKE}': 'O', + u'\N{LATIN CAPITAL LETTER U WITH GRAVE}': 'U', + u'\N{LATIN CAPITAL LETTER U WITH ACUTE}': 'U', + u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}': 'U', + u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}': 'Y', + u'\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}': 'Y' } def smart_text(s, encoding='utf-8', errors='strict'): From 4eacc9f34547c6683a72c2422d8e419918745744 Mon Sep 17 00:00:00 2001 From: Emin Bugra Date: Sun, 11 Jan 2015 23:10:07 -0800 Subject: [PATCH 06/10] Rename parameter name to replace_latin. 
--- slugify/__init__.py | 14 +++++++------- slugify/tests.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/slugify/__init__.py b/slugify/__init__.py index 1ca860e..83b8d92 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -2,7 +2,7 @@ import six import unicodedata -LETTERS = { +LATIN_LETTERS = { u'\N{LATIN SMALL LETTER DOTLESS I}': 'i', u'\N{LATIN SMALL LETTER S WITH CEDILLA}': 's', u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'c', @@ -39,7 +39,7 @@ u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}': 'y' } -CAPITAL_LETTERS = { +CAPITAL_LATIN_LETTERS = { u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}': 'I', u'\N{LATIN CAPITAL LETTER S WITH CEDILLA}': 'S', u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}': 'C', @@ -99,7 +99,7 @@ def smart_text(s, encoding='utf-8', errors='strict'): SLUG_OK = '-_~' -def slugify(s, ok=SLUG_OK, lower=True, spaces=False, smart_replace=False): +def slugify(s, ok=SLUG_OK, lower=True, spaces=False, replace_latin=False): # L and N signify letter/number. 
# http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table rv = [] @@ -115,14 +115,14 @@ def slugify(s, ok=SLUG_OK, lower=True, spaces=False, smart_replace=False): new = new.lower() if lower else new - # Smart replace - if smart_replace == True: + # Smart replace for latin alphabet + if replace_latin == True: - for char, new_char in LETTERS.items(): + for char, new_char in LATIN_LETTERS.items(): new = new.replace(char, new_char) if not lower: - for char, new_char in CAPITAL_LETTERS.items(): + for char, new_char in CAPITAL_LATIN_LETTERS.items(): new = new.replace(char, new_char) return new diff --git a/slugify/tests.py b/slugify/tests.py index 125d811..3773294 100644 --- a/slugify/tests.py +++ b/slugify/tests.py @@ -16,8 +16,8 @@ def test_slugify(): def check(x, y): eq_(slugify(x), y) - def check_smart_replace(x, y): - eq_(slugify(x, smart_replace=True), y) + def check_replace_latin(x, y): + eq_(slugify(x, replace_latin=True), y) s = [('xx x - "#$@ x', 'xx-x-x'), (u'Bän...g (bang)', u'bäng-bang'), @@ -39,13 +39,13 @@ def check_smart_replace(x, y): (u'¿x', u'x'), (u'Bakıcı geldi', u'bak\u0131c\u0131-geldi')] - smart_replace = [(u'Bakıcı geldi', u'bakici-geldi')] + replace_latin = [(u'Bakıcı geldi', u'bakici-geldi')] for val, expected in s: yield check, val, expected - for val, expected in smart_replace: - yield check_smart_replace, val, expected + for val, expected in replace_latin: + yield check_replace_latin, val, expected class SmartTextTestCase(unittest.TestCase): From 2f8ccf007c6155340c96237cc8b048b186dbe1ee Mon Sep 17 00:00:00 2001 From: Emin Bugra Date: Mon, 12 Jan 2015 12:25:14 -0800 Subject: [PATCH 07/10] Add more tests. 
--- slugify/tests.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/slugify/tests.py b/slugify/tests.py index 3773294..a03286d 100644 --- a/slugify/tests.py +++ b/slugify/tests.py @@ -19,6 +19,9 @@ def check(x, y): def check_replace_latin(x, y): eq_(slugify(x, replace_latin=True), y) + def check_replace_latin_capital(x, y): + eq_(slugify(x, lower=False, replace_latin=True), y) + s = [('xx x - "#$@ x', 'xx-x-x'), (u'Bän...g (bang)', u'bäng-bang'), (u, u.lower()), @@ -37,9 +40,13 @@ def check_replace_latin(x, y): (u'x𘍿', u'x'), (u'ϧ΃𘒬𘓣', u'\u03e7'), (u'¿x', u'x'), - (u'Bakıcı geldi', u'bak\u0131c\u0131-geldi')] + (u'Bakıcı geldi', u'bak\u0131c\u0131-geldi'), + (u'Bäuma means tree', u'b\xe4uma-means-tree')] + + replace_latin = [(u'Bakıcı geldi', u'bakici-geldi'), (u'Bäuma means tree', u'bauma-means-tree')] - replace_latin = [(u'Bakıcı geldi', u'bakici-geldi')] + replace_latin_capital = [(u'BÄUMA MEANS TREE', u'BAUMA-MEANS-TREE'), + (u'EMİN WAS HERE', u'EMIN-WAS-HERE')] for val, expected in s: yield check, val, expected @@ -47,6 +54,10 @@ def check_replace_latin(x, y): for val, expected in replace_latin: yield check_replace_latin, val, expected + for val, expected in replace_latin_capital: + yield check_replace_latin_capital, val, expected + + class SmartTextTestCase(unittest.TestCase): def test_smart_text_raises_an_error(self): From eeb291d21ec2244351a10ec88a822e12b411916b Mon Sep 17 00:00:00 2001 From: Emin Bugra Date: Mon, 12 Jan 2015 12:40:46 -0800 Subject: [PATCH 08/10] Add docstring. --- slugify/__init__.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/slugify/__init__.py b/slugify/__init__.py index 83b8d92..3fa1a65 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -94,14 +94,30 @@ def smart_text(s, encoding='utf-8', errors='strict'): s = six.text_type(s) return s - -# Extra characters outside of alphanumerics that we'll allow. 
SLUG_OK = '-_~' def slugify(s, ok=SLUG_OK, lower=True, spaces=False, replace_latin=False): - # L and N signify letter/number. - # http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table + """ + Creates a unicode slug for given string with several options. + + L and N signify letter/number. + http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table + + :param s: Your unicode string. + :param ok: Extra characters outside of alphanumerics to be allowed. + :param lower: Lower the output string. + :param spaces: True allows spaces, False replaces a space with a dash (-) + :param replace_latin: True to replace common unicode latin letters with their similar ascii representations. + :type s: String + :type ok: String + :type lower: Bool + :type spaces: Bool + :type replace_latin: Bool + :return: Slugified unicode string + + """ + rv = [] for c in unicodedata.normalize('NFKC', smart_text(s)): cat = unicodedata.category(c)[0] From f3fff77cd75c543a6519f14f9f8072239a15eab3 Mon Sep 17 00:00:00 2001 From: Emin Bugra Date: Mon, 12 Jan 2015 12:40:57 -0800 Subject: [PATCH 09/10] Add example to readme. --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 66a7a4c..5db9eff 100644 --- a/README.md +++ b/README.md @@ -11,3 +11,6 @@ required more than simple transliteration. >>> slugify.slugify(u'Bän...g (bang)') u'bäng-bang' + + >>> slugify.slugify(u'Bäuma means a tree', replace_latin=True) + u'bauma-means-a-tree' From 266bac4633c3070a1fee0a83551dd952fd2109f2 Mon Sep 17 00:00:00 2001 From: Emin Bugra Date: Mon, 12 Jan 2015 12:45:19 -0800 Subject: [PATCH 10/10] Another example in readme --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 5db9eff..f8df1da 100644 --- a/README.md +++ b/README.md @@ -14,3 +14,7 @@ required more than simple transliteration. 
>>> slugify.slugify(u'Bäuma means a tree', replace_latin=True) u'bauma-means-a-tree' + + >>> slugify.slugify(u'Bakıcı geldi', replace_latin=True) + u'bakici-geldi' +