mirror of
https://cdm-project.com/Download-Tools/udemy-downloader.git
synced 2025-04-30 01:04:26 +02:00
137 lines
3.8 KiB
Python
137 lines
3.8 KiB
Python
# This file is from https://github.com/r0oth3x49/udemy-dl/blob/master/udemy/sanitize.py
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
import re
|
|
import six
|
|
import unicodedata
|
|
from unidecode import unidecode
|
|
|
|
|
|
def smart_text(s, encoding="utf-8", errors="strict"):
|
|
if isinstance(s, six.text_type):
|
|
return s
|
|
|
|
if not isinstance(s, six.string_types):
|
|
if six.PY3:
|
|
if isinstance(s, bytes):
|
|
s = six.text_type(s, encoding, errors)
|
|
else:
|
|
s = six.text_type(s)
|
|
elif hasattr(s, "__unicode__"):
|
|
s = six.text_type(s)
|
|
else:
|
|
s = six.text_type(bytes(s), encoding, errors)
|
|
else:
|
|
s = six.text_type(s)
|
|
return s
|
|
|
|
|
|
# Extra characters outside of alphanumerics that we'll allow.
|
|
SLUG_OK = "-_~"
|
|
|
|
|
|
def slugify(s,
|
|
ok=SLUG_OK,
|
|
lower=True,
|
|
spaces=False,
|
|
only_ascii=False,
|
|
space_replacement="_"):
|
|
"""
|
|
Creates a unicode slug for given string with several options.
|
|
L and N signify letter/number.
|
|
http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table
|
|
:param s: Your unicode string.
|
|
:param ok: Extra characters outside of alphanumerics to be allowed.
|
|
Default is '-_~'
|
|
:param lower: Lower the output string.
|
|
Default is True
|
|
:param spaces: True allows spaces, False replaces a space with the "space_replacement" param
|
|
:param only_ascii: True to replace non-ASCII unicode characters with
|
|
their ASCII representations.
|
|
:param space_replacement: Char used to replace spaces if "spaces" is False.
|
|
Default is dash ("-") or first char in ok if dash not allowed
|
|
:type s: String
|
|
:type ok: String
|
|
:type lower: Bool
|
|
:type spaces: Bool
|
|
:type only_ascii: Bool
|
|
:type space_replacement: String
|
|
:return: Slugified unicode string
|
|
"""
|
|
|
|
if only_ascii and ok != SLUG_OK and hasattr(ok, "decode"):
|
|
try:
|
|
ok.decode("ascii")
|
|
except UnicodeEncodeError:
|
|
raise ValueError(
|
|
('You can not use "only_ascii=True" with '
|
|
'a non ascii available chars in "ok" ("%s" given)') % ok)
|
|
|
|
rv = []
|
|
for c in unicodedata.normalize("NFKC", smart_text(s)):
|
|
cat = unicodedata.category(c)[0]
|
|
if cat in "LN" or c in ok:
|
|
rv.append(c)
|
|
elif cat == "Z": # space
|
|
rv.append(" ")
|
|
new = "".join(rv).strip()
|
|
|
|
if only_ascii:
|
|
new = unidecode(new)
|
|
if not spaces:
|
|
if space_replacement and space_replacement not in ok:
|
|
space_replacement = ok[0] if ok else ""
|
|
new = re.sub("[%s\s]+" % space_replacement, space_replacement, new)
|
|
if lower:
|
|
new = new.lower()
|
|
|
|
return new
|
|
|
|
|
|
def sanitize(title):
|
|
_locale = {
|
|
"194": "A",
|
|
"199": "C",
|
|
"286": "G",
|
|
"304": "I",
|
|
"206": "I",
|
|
"214": "O",
|
|
"350": "S",
|
|
"219": "U",
|
|
"226": "a",
|
|
"231": "c",
|
|
"287": "g",
|
|
"305": "i",
|
|
"238": "i",
|
|
"246": "o",
|
|
"351": "s",
|
|
"251": "u",
|
|
"191": "",
|
|
"225": "a",
|
|
"233": "e",
|
|
"237": "i",
|
|
"243": "o",
|
|
"250": "u",
|
|
"252": "u",
|
|
"168u": "u",
|
|
"241": "n",
|
|
"193": "A",
|
|
"201": "E",
|
|
"205": "I",
|
|
"211": "O",
|
|
"218": "U",
|
|
"220": "U",
|
|
"168U": "U",
|
|
"209": "N",
|
|
"223": "ss",
|
|
}
|
|
_temp = "".join([str(ord(i)) if ord(i) > 128 else i for i in title])
|
|
for _ascii, _char in _locale.items():
|
|
if _ascii in _temp:
|
|
_temp = _temp.replace(_ascii, _char)
|
|
|
|
ok = re.compile(r'[^\\/:*?"<>]')
|
|
_title = "".join(x if ok.match(x) else "_" for x in _temp)
|
|
return _title
|