Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,10 @@ required more than simple transliteration.

>>> slugify.slugify(u'Bän...g (bang)')
u'bäng-bang'

>>> slugify.slugify(u'Bäuma means a tree', replace_latin=True)
u'bauma-means-a-tree'

>>> slugify.slugify(u'Bakıcı geldi', replace_latin=True)
u'bakici-geldi'

114 changes: 108 additions & 6 deletions slugify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,79 @@
import six
import unicodedata

# Lowercase accented/special Latin letters mapped to plain-ASCII stand-ins.
# slugify() applies every entry with a string replace when replace_latin is
# enabled. All replacement values are ASCII; 'ae' is the only two-character one.
LATIN_LETTERS = {
    # Dotless i, cedillas, breve and the o/u diaeresis forms.
    u'\N{LATIN SMALL LETTER DOTLESS I}': 'i',
    u'\N{LATIN SMALL LETTER S WITH CEDILLA}': 's',
    u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'c',
    u'\N{LATIN SMALL LETTER G WITH BREVE}': 'g',
    u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'o',
    u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'u',

    # Variants of "a" (plus the ae ligature).
    u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}': 'a',
    u'\N{LATIN SMALL LETTER A WITH TILDE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH DIAERESIS}': 'a',
    u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH MACRON}': 'a',
    u'\N{LATIN SMALL LETTER A WITH BREVE}': 'a',
    u'\N{LATIN SMALL LETTER AE}': 'ae',

    # Variants of "e".
    u'\N{LATIN SMALL LETTER E WITH GRAVE}': 'e',
    u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'e',
    u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}': 'e',
    u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'e',

    # Variants of "i".
    u'\N{LATIN SMALL LETTER I WITH GRAVE}': 'i',
    u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'i',
    u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}': 'i',
    u'\N{LATIN SMALL LETTER I WITH DIAERESIS}': 'i',

    # Variant of "n".
    u'\N{LATIN SMALL LETTER N WITH TILDE}': 'n',

    # Remaining variants of "o".
    u'\N{LATIN SMALL LETTER O WITH GRAVE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}': 'o',
    u'\N{LATIN SMALL LETTER O WITH TILDE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH STROKE}': 'o',

    # Remaining variants of "u".
    u'\N{LATIN SMALL LETTER U WITH GRAVE}': 'u',
    u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'u',
    u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}': 'u',

    # Variants of "y".
    u'\N{LATIN SMALL LETTER Y WITH ACUTE}': 'y',
    u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}': 'y',
}

# Uppercase counterpart of LATIN_LETTERS: accented/special capital Latin
# letters mapped to plain-ASCII capitals. slugify() applies these only when
# replace_latin is enabled and the output is not being lowercased.
CAPITAL_LATIN_LETTERS = {
    # Dotted capital I, cedillas, breve and the O/U diaeresis forms.
    u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}': 'I',
    u'\N{LATIN CAPITAL LETTER S WITH CEDILLA}': 'S',
    u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}': 'C',
    u'\N{LATIN CAPITAL LETTER G WITH BREVE}': 'G',
    u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}': 'O',
    u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}': 'U',

    # Variants of "A" (plus the AE ligature).
    u'\N{LATIN CAPITAL LETTER A WITH GRAVE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH ACUTE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH TILDE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH MACRON}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH BREVE}': 'A',
    u'\N{LATIN CAPITAL LETTER AE}': 'AE',

    # Variants of "E".
    u'\N{LATIN CAPITAL LETTER E WITH GRAVE}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH ACUTE}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}': 'E',

    # Variants of "I".
    u'\N{LATIN CAPITAL LETTER I WITH GRAVE}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH ACUTE}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}': 'I',

    # Variant of "N".
    u'\N{LATIN CAPITAL LETTER N WITH TILDE}': 'N',

    # Remaining variants of "O".
    u'\N{LATIN CAPITAL LETTER O WITH GRAVE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH ACUTE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH TILDE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH STROKE}': 'O',

    # Remaining variants of "U".
    u'\N{LATIN CAPITAL LETTER U WITH GRAVE}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH ACUTE}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}': 'U',

    # Variants of "Y".
    u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}': 'Y',
    u'\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}': 'Y',
}

def smart_text(s, encoding='utf-8', errors='strict'):
if isinstance(s, six.text_type):
Expand All @@ -21,14 +94,30 @@ def smart_text(s, encoding='utf-8', errors='strict'):
s = six.text_type(s)
return s


# Extra characters outside of alphanumerics that we'll allow.
SLUG_OK = '-_~'


def slugify(s, ok=SLUG_OK, lower=True, spaces=False):
# L and N signify letter/number.
# http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table
def slugify(s, ok=SLUG_OK, lower=True, spaces=False, replace_latin=False):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you wouldn't mind adding a docstring here that would be great too, at least explain what replace_latin does when enabled and when disabled.

"""
Creates a unicode slug for given string with several options.

L and N signify letter/number.
http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table

:param s: Your unicode string.
:param ok: Extra characters outside of alphanumerics to be allowed.
:param lower: Lower the output string.
:param spaces: True allows spaces, False replaces a space with a dash (-)
:param replace_latin: True to replace common unicode latin letters with their similar ascii representations.
:type s: String
:type ok: String
:type lower: Bool
:type spaces: Bool
:type replace_latin: Bool
:return: Slugified unicode string

"""

rv = []
for c in unicodedata.normalize('NFKC', smart_text(s)):
cat = unicodedata.category(c)[0]
Expand All @@ -39,4 +128,17 @@ def slugify(s, ok=SLUG_OK, lower=True, spaces=False):
new = ''.join(rv).strip()
if not spaces:
new = re.sub('[-\s]+', '-', new)
return new.lower() if lower else new

new = new.lower() if lower else new

# Smart replace for latin alphabet
if replace_latin == True:

for char, new_char in LATIN_LETTERS.items():
new = new.replace(char, new_char)

if not lower:
for char, new_char in CAPITAL_LATIN_LETTERS.items():
new = new.replace(char, new_char)

return new
21 changes: 20 additions & 1 deletion slugify/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ def test_slugify():
def check(x, y):
eq_(slugify(x), y)

def check_replace_latin(x, y):
eq_(slugify(x, replace_latin=True), y)

def check_replace_latin_capital(x, y):
eq_(slugify(x, lower=False, replace_latin=True), y)

s = [('xx x - "#$@ x', 'xx-x-x'),
(u'Bän...g (bang)', u'bäng-bang'),
(u, u.lower()),
Expand All @@ -33,11 +39,24 @@ def check(x, y):
# I don't really care what slugify returns. Just don't crash.
(u'x𘍿', u'x'),
(u'ϧ΃𘒬𘓣', u'\u03e7'),
(u'¿x', u'x')]
(u'¿x', u'x'),
(u'Bakıcı geldi', u'bak\u0131c\u0131-geldi'),
(u'Bäuma means tree', u'b\xe4uma-means-tree')]

replace_latin = [(u'Bakıcı geldi', u'bakici-geldi'), (u'Bäuma means tree', u'bauma-means-tree')]

replace_latin_capital = [(u'BÄUMA MEANS TREE', u'BAUMA-MEANS-TREE'),
(u'EMİN WAS HERE', u'EMIN-WAS-HERE')]

for val, expected in s:
yield check, val, expected

for val, expected in replace_latin:
yield check_replace_latin, val, expected

for val, expected in replace_latin_capital:
yield check_replace_latin_capital, val, expected


class SmartTextTestCase(unittest.TestCase):

Expand Down