# This file is from https://github.com/r0oth3x49/udemy-dl/blob/master/udemy/sanitize.py from __future__ import unicode_literals import re import six import unicodedata from unidecode import unidecode def smart_text(s, encoding="utf-8", errors="strict"): if isinstance(s, six.text_type): return s if not isinstance(s, six.string_types): if six.PY3: if isinstance(s, bytes): s = six.text_type(s, encoding, errors) else: s = six.text_type(s) elif hasattr(s, "__unicode__"): s = six.text_type(s) else: s = six.text_type(bytes(s), encoding, errors) else: s = six.text_type(s) return s # Extra characters outside of alphanumerics that we'll allow. SLUG_OK = "-_~" def slugify(s, ok=SLUG_OK, lower=True, spaces=False, only_ascii=False, space_replacement="_"): """ Creates a unicode slug for given string with several options. L and N signify letter/number. http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table :param s: Your unicode string. :param ok: Extra characters outside of alphanumerics to be allowed. Default is '-_~' :param lower: Lower the output string. Default is True :param spaces: True allows spaces, False replaces a space with the "space_replacement" param :param only_ascii: True to replace non-ASCII unicode characters with their ASCII representations. :param space_replacement: Char used to replace spaces if "spaces" is False. Default is dash ("-") or first char in ok if dash not allowed :type s: String :type ok: String :type lower: Bool :type spaces: Bool :type only_ascii: Bool :type space_replacement: String :return: Slugified unicode string """ if only_ascii and ok != SLUG_OK and hasattr(ok, "decode"): try: ok.decode("ascii") except UnicodeEncodeError: raise ValueError( ('You can not use "only_ascii=True" with ' 'a non ascii available chars in "ok" ("%s" given)') % ok) rv = [] for c in unicodedata.normalize("NFKC", smart_text(s)): cat = unicodedata.category(c)[0] if cat in "LN" or c in ok: rv.append(c) elif cat == "Z": # space rv.append(" ") new = "".join(rv).strip() if only_ascii: new = unidecode(new) if not spaces: if space_replacement and space_replacement not in ok: space_replacement = ok[0] if ok else "" new = re.sub("[%s\s]+" % space_replacement, space_replacement, new) if lower: new = new.lower() return new def sanitize(title): _locale = { "194": "A", "199": "C", "286": "G", "304": "I", "206": "I", "214": "O", "350": "S", "219": "U", "226": "a", "231": "c", "287": "g", "305": "i", "238": "i", "246": "o", "351": "s", "251": "u", "191": "", "225": "a", "233": "e", "237": "i", "243": "o", "250": "u", "252": "u", "168u": "u", "241": "n", "193": "A", "201": "E", "205": "I", "211": "O", "218": "U", "220": "U", "168U": "U", "209": "N", "223": "ss", } _temp = "".join([str(ord(i)) if ord(i) > 128 else i for i in title]) for _ascii, _char in _locale.items(): if _ascii in _temp: _temp = _temp.replace(_ascii, _char) ok = re.compile(r'[^\\/:*?"<>]') _title = "".join(x if ok.match(x) else "_" for x in _temp) return _title