mirror of
https://cdm-project.com/Download-Tools/udemy-downloader.git
synced 2025-05-29 10:50:12 +02:00
Subtitle Support
+ Added support for downloading subtitles (see readme for usage)
This commit is contained in:
parent
a867f82f2b
commit
6c5b7870a9
20
README.md
20
README.md
@ -71,15 +71,17 @@ You can now run `python main.py` to start downloading. The course will download
|
|||||||
# Advanced Usage
|
# Advanced Usage
|
||||||
|
|
||||||
```
|
```
|
||||||
usage: main.py [-h] [-d] [-q] [--download-assets]
|
usage: main.py [-h] [-d] [-q] [-l] [--download-assets] [--download-captions]
|
||||||
|
|
||||||
Udemy Downloader
|
Udemy Downloader
|
||||||
|
|
||||||
optional arguments:
|
optional arguments:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
-d, --debug Use test_data.json rather than fetch from the udemy api.
|
-d, --debug Use test_data.json rather than fetch from the udemy api.
|
||||||
-q , --quality Download specific video quality. (144, 360, 480, 720, 1080)
|
-q , --quality Download specific video quality. (144, 360, 480, 720, 1080)
|
||||||
--download-assets Download lecture assets along with lectures
|
-l , --lang The language to download for captions (Default is en)
|
||||||
|
--download-assets If specified, lecture assets will be downloaded.
|
||||||
|
--download-captions If specified, captions will be downloaded.
|
||||||
```
|
```
|
||||||
|
|
||||||
- Download a specific quality
|
- Download a specific quality
|
||||||
@ -88,6 +90,14 @@ optional arguments:
|
|||||||
- `python main.py --download-assets`
|
- `python main.py --download-assets`
|
||||||
- Download assets and specify a quality
|
- Download assets and specify a quality
|
||||||
- `python main.py -q 360 --download-assets`
|
- `python main.py -q 360 --download-assets`
|
||||||
|
- Download captions
|
||||||
|
- `python main.py --download-captions`
|
||||||
|
- Download captions with specific language
|
||||||
|
- `python main.py --download-captions -l en` - English subtitles
|
||||||
|
- `python main.py --download-captions -l es` - Spanish subtitles
|
||||||
|
- `python main.py --download-captions -l it` - Italian subtitles
|
||||||
|
- `python main.py --download-captions -l pl` - Polish subtitles
|
||||||
|
- etc
|
||||||
|
|
||||||
# Getting an error about "Accepting the latest terms of service"?
|
# Getting an error about "Accepting the latest terms of service"?
|
||||||
|
|
||||||
|
86
main.py
86
main.py
@ -9,6 +9,7 @@ from mpegdash.utils import (parse_attr_value, parse_child_nodes,
|
|||||||
parse_node_value, write_attr_value,
|
parse_node_value, write_attr_value,
|
||||||
write_child_node, write_node_value)
|
write_child_node, write_node_value)
|
||||||
from utils import extract_kid
|
from utils import extract_kid
|
||||||
|
from vtt_to_srt import convert
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
@ -24,6 +25,8 @@ retry = 3
|
|||||||
home_dir = os.getcwd()
|
home_dir = os.getcwd()
|
||||||
keyfile_path = "%s\keyfile.json" % os.getcwd()
|
keyfile_path = "%s\keyfile.json" % os.getcwd()
|
||||||
dl_assets = False
|
dl_assets = False
|
||||||
|
dl_captions = False
|
||||||
|
caption_locale = "en"
|
||||||
quality = None # None will download the best possible
|
quality = None # None will download the best possible
|
||||||
valid_qualities = [144, 360, 480, 720, 1080]
|
valid_qualities = [144, 360, 480, 720, 1080]
|
||||||
|
|
||||||
@ -315,7 +318,7 @@ def download(url, path, filename):
|
|||||||
return file_size
|
return file_size
|
||||||
|
|
||||||
|
|
||||||
def process_lecture(lecture, lecture_path, lecture_dir):
|
def process_lecture(lecture, lecture_index, lecture_path, lecture_dir):
|
||||||
lecture_title = lecture["title"]
|
lecture_title = lecture["title"]
|
||||||
lecture_asset = lecture["asset"]
|
lecture_asset = lecture["asset"]
|
||||||
if lecture_asset["media_license_token"] == None:
|
if lecture_asset["media_license_token"] == None:
|
||||||
@ -380,6 +383,48 @@ def process_lecture(lecture, lecture_path, lecture_dir):
|
|||||||
print("> Found %s assets for lecture '%s'" %
|
print("> Found %s assets for lecture '%s'" %
|
||||||
(len(assets), lecture_title))
|
(len(assets), lecture_title))
|
||||||
|
|
||||||
|
# process captions
|
||||||
|
if dl_captions:
|
||||||
|
captions = []
|
||||||
|
for caption in lecture_asset.get("captions"):
|
||||||
|
if not isinstance(caption, dict):
|
||||||
|
continue
|
||||||
|
if caption.get("_class") != "caption":
|
||||||
|
continue
|
||||||
|
download_url = caption.get("url")
|
||||||
|
if not download_url or not isinstance(download_url, str):
|
||||||
|
continue
|
||||||
|
lang = (caption.get("language") or caption.get("srclang")
|
||||||
|
or caption.get("label")
|
||||||
|
or caption.get("locale_id").split("_")[0])
|
||||||
|
ext = "vtt" if "vtt" in download_url.rsplit(".", 1)[-1] else "srt"
|
||||||
|
if caption_locale == "all" or caption_locale == lang:
|
||||||
|
captions.append({
|
||||||
|
"language": lang,
|
||||||
|
"locale_id": caption.get("locale_id"),
|
||||||
|
"ext": ext,
|
||||||
|
"url": download_url
|
||||||
|
})
|
||||||
|
|
||||||
|
for caption in captions:
|
||||||
|
filename = f"%s. %s_%s.%s" % (lecture_index, sanitize(
|
||||||
|
lecture_title), caption.get("locale_id"), caption.get("ext"))
|
||||||
|
filename_no_ext = f"%s. %s_%s" % (lecture_index,
|
||||||
|
sanitize(lecture_title),
|
||||||
|
caption.get("locale_id"))
|
||||||
|
filepath = f"%s\\%s" % (lecture_dir, filename)
|
||||||
|
|
||||||
|
if os.path.isfile(filepath):
|
||||||
|
print("> Captions '%s' already downloaded." % filename)
|
||||||
|
else:
|
||||||
|
print(f"> Downloading captions: '%s'" % filename)
|
||||||
|
download(caption.get("url"), filepath, filename)
|
||||||
|
if caption.get("ext") == "vtt":
|
||||||
|
print("> Converting captions to SRT format...")
|
||||||
|
convert(lecture_dir, filename_no_ext)
|
||||||
|
print("> Caption conversion complete.")
|
||||||
|
os.remove(filepath)
|
||||||
|
|
||||||
|
|
||||||
def parse(data):
|
def parse(data):
|
||||||
chapters = []
|
chapters = []
|
||||||
@ -396,10 +441,10 @@ def parse(data):
|
|||||||
except IndexError:
|
except IndexError:
|
||||||
# This is caused by there not being a starting chapter
|
# This is caused by there not being a starting chapter
|
||||||
lectures.append(obj)
|
lectures.append(obj)
|
||||||
lecture_path = f"%s\\%s. %s.mp4" % (download_dir,
|
lecture_index = lectures.index(obj) + 1
|
||||||
lectures.index(obj) + 1,
|
lecture_path = f"%s\\%s. %s.mp4" % (
|
||||||
sanitize(obj["title"]))
|
download_dir, lecture_index, sanitize(obj["title"]))
|
||||||
process_lecture(obj, lecture_path, download_dir)
|
process_lecture(obj, lecture_index, lecture_path, download_dir)
|
||||||
|
|
||||||
for chapter in chapters:
|
for chapter in chapters:
|
||||||
chapter_dir = f"%s\\%s. %s" % (download_dir, chapters.index(chapter) +
|
chapter_dir = f"%s\\%s. %s" % (download_dir, chapters.index(chapter) +
|
||||||
@ -408,10 +453,13 @@ def parse(data):
|
|||||||
os.mkdir(chapter_dir)
|
os.mkdir(chapter_dir)
|
||||||
|
|
||||||
for lecture in chapter["lectures"]:
|
for lecture in chapter["lectures"]:
|
||||||
lecture_path = f"%s\\%s. %s.mp4" % (
|
lecture_index = chapter["lectures"].index(lecture) + 1
|
||||||
chapter_dir, chapter["lectures"].index(lecture) + 1,
|
lecture_path = f"%s\\%s. %s.mp4" % (chapter_dir, lecture_index,
|
||||||
sanitize(lecture["title"]))
|
sanitize(lecture["title"]))
|
||||||
process_lecture(lecture, lecture_path, chapter_dir)
|
process_lecture(lecture, lecture_index, lecture_path, chapter_dir)
|
||||||
|
print("\n\n\n\n\n\n\n\n=====================")
|
||||||
|
print("All downloads completed for course!")
|
||||||
|
print("=====================")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -431,16 +479,34 @@ if __name__ == "__main__":
|
|||||||
help="Download specific video quality. (144, 360, 480, 720, 1080)",
|
help="Download specific video quality. (144, 360, 480, 720, 1080)",
|
||||||
metavar="",
|
metavar="",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-l",
|
||||||
|
"--lang",
|
||||||
|
dest="lang",
|
||||||
|
type=str,
|
||||||
|
help="The language to download for captions (Default is en)",
|
||||||
|
metavar="",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--download-assets",
|
"--download-assets",
|
||||||
dest="download_assets",
|
dest="download_assets",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Download lecture assets along with lectures",
|
help="If specified, lecture assets will be downloaded.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--download-captions",
|
||||||
|
dest="download_captions",
|
||||||
|
action="store_true",
|
||||||
|
help="If specified, captions will be downloaded.",
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.download_assets:
|
if args.download_assets:
|
||||||
dl_assets = True
|
dl_assets = True
|
||||||
|
if args.lang:
|
||||||
|
caption_locale = args.lang
|
||||||
|
if args.download_captions:
|
||||||
|
dl_captions = True
|
||||||
if args.quality:
|
if args.quality:
|
||||||
if not args.quality in valid_qualities:
|
if not args.quality in valid_qualities:
|
||||||
print("Invalid quality specified! %s" % quality)
|
print("Invalid quality specified! %s" % quality)
|
||||||
|
@ -3,4 +3,6 @@ sanitize_filename
|
|||||||
tqdm
|
tqdm
|
||||||
requests
|
requests
|
||||||
python-dotenv
|
python-dotenv
|
||||||
protobuf
|
protobuf
|
||||||
|
webvtt-py
|
||||||
|
pysrt
|
19
vtt_to_srt.py
Normal file
19
vtt_to_srt.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
from webvtt import WebVTT
|
||||||
|
import html
|
||||||
|
from pysrt.srtitem import SubRipItem
|
||||||
|
from pysrt.srttime import SubRipTime
|
||||||
|
|
||||||
|
|
||||||
|
def convert(directory, filename):
    """Convert a WebVTT caption file to SubRip (SRT) format.

    Reads ``<filename>.vtt`` from ``directory`` and writes ``<filename>.srt``
    into the same directory. Cues are numbered sequentially starting at 1 and
    HTML entities in the cue text are unescaped. The source ``.vtt`` file is
    left in place; removing it is up to the caller.

    Args:
        directory: Directory that holds the ``.vtt`` file and receives the
            ``.srt`` file. It is joined to the file name with a backslash,
            matching how the paths are built here originally.
        filename: Base name of the caption file, without extension.
    """
    vtt_filepath = "%s\\%s.vtt" % (directory, filename)
    srt_filepath = "%s\\%s.srt" % (directory, filename)

    # The context manager guarantees the SRT file is flushed and closed even
    # if parsing the VTT file fails part-way through. UTF-8 is set explicitly
    # so captions with non-ASCII characters do not depend on the platform's
    # default encoding.
    with open(srt_filepath, "w", encoding="utf-8") as srt:
        for index, caption in enumerate(WebVTT().read(vtt_filepath), start=1):
            start = SubRipTime(0, 0, caption.start_in_seconds)
            end = SubRipTime(0, 0, caption.end_in_seconds)
            item = SubRipItem(index, start, end, html.unescape(caption.text))
            # The extra newline separates consecutive SRT entries.
            srt.write(str(item) + "\n")
|
Loading…
x
Reference in New Issue
Block a user