mirror of
https://cdm-project.com/Download-Tools/udemy-downloader.git
synced 2025-05-28 05:20:14 +02:00
Subtitle Support
+ Added support for downloading subtitles (see readme for usage)
This commit is contained in:
parent
a867f82f2b
commit
6c5b7870a9
20
README.md
20
README.md
@ -71,15 +71,17 @@ You can now run `python main.py` to start downloading. The course will download
|
||||
# Advanced Usage
|
||||
|
||||
```
|
||||
usage: main.py [-h] [-d] [-q] [--download-assets]
|
||||
usage: main.py [-h] [-d] [-q] [-l] [--download-assets] [--download-captions]
|
||||
|
||||
Udemy Downloader
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-d, --debug Use test_data.json rather than fetch from the udemy api.
|
||||
-q , --quality Download specific video quality. (144, 360, 480, 720, 1080)
|
||||
--download-assets Download lecture assets along with lectures
|
||||
-h, --help show this help message and exit
|
||||
-d, --debug Use test_data.json rather than fetch from the udemy api.
|
||||
-q , --quality Download specific video quality. (144, 360, 480, 720, 1080)
|
||||
-l , --lang The language to download for captions (Default is en)
|
||||
--download-assets If specified, lecture assets will be downloaded.
|
||||
--download-captions If specified, captions will be downloaded.
|
||||
```
|
||||
|
||||
- Download a specific quality
|
||||
@ -88,6 +90,14 @@ optional arguments:
|
||||
- `python main.py --download-assets`
|
||||
- Download assets and specify a quality
|
||||
- `python main.py -q 360 --download-assets`
|
||||
- Download captions
|
||||
- `python main.py --download-captions`
|
||||
- Download captions with specific language
|
||||
- `python main.py --download-captions -l en` - English subtitles
|
||||
- `python main.py --download-captions -l es` - Spanish subtitles
|
||||
- `python main.py --download-captions -l it` - Italian subtitles
|
||||
- `python main.py --download-captions -l pl` - Polish subtitles
|
||||
- etc
|
||||
|
||||
# Getting an error about "Accepting the latest terms of service"?
|
||||
|
||||
|
86
main.py
86
main.py
@ -9,6 +9,7 @@ from mpegdash.utils import (parse_attr_value, parse_child_nodes,
|
||||
parse_node_value, write_attr_value,
|
||||
write_child_node, write_node_value)
|
||||
from utils import extract_kid
|
||||
from vtt_to_srt import convert
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -24,6 +25,8 @@ retry = 3
|
||||
home_dir = os.getcwd()
|
||||
keyfile_path = "%s\keyfile.json" % os.getcwd()
|
||||
dl_assets = False
|
||||
dl_captions = False
|
||||
caption_locale = "en"
|
||||
quality = None # None will download the best possible
|
||||
valid_qualities = [144, 360, 480, 720, 1080]
|
||||
|
||||
@ -315,7 +318,7 @@ def download(url, path, filename):
|
||||
return file_size
|
||||
|
||||
|
||||
def process_lecture(lecture, lecture_path, lecture_dir):
|
||||
def process_lecture(lecture, lecture_index, lecture_path, lecture_dir):
|
||||
lecture_title = lecture["title"]
|
||||
lecture_asset = lecture["asset"]
|
||||
if lecture_asset["media_license_token"] == None:
|
||||
@ -380,6 +383,48 @@ def process_lecture(lecture, lecture_path, lecture_dir):
|
||||
print("> Found %s assets for lecture '%s'" %
|
||||
(len(assets), lecture_title))
|
||||
|
||||
# process captions
|
||||
if dl_captions:
|
||||
captions = []
|
||||
for caption in lecture_asset.get("captions"):
|
||||
if not isinstance(caption, dict):
|
||||
continue
|
||||
if caption.get("_class") != "caption":
|
||||
continue
|
||||
download_url = caption.get("url")
|
||||
if not download_url or not isinstance(download_url, str):
|
||||
continue
|
||||
lang = (caption.get("language") or caption.get("srclang")
|
||||
or caption.get("label")
|
||||
or caption.get("locale_id").split("_")[0])
|
||||
ext = "vtt" if "vtt" in download_url.rsplit(".", 1)[-1] else "srt"
|
||||
if caption_locale == "all" or caption_locale == lang:
|
||||
captions.append({
|
||||
"language": lang,
|
||||
"locale_id": caption.get("locale_id"),
|
||||
"ext": ext,
|
||||
"url": download_url
|
||||
})
|
||||
|
||||
for caption in captions:
|
||||
filename = f"%s. %s_%s.%s" % (lecture_index, sanitize(
|
||||
lecture_title), caption.get("locale_id"), caption.get("ext"))
|
||||
filename_no_ext = f"%s. %s_%s" % (lecture_index,
|
||||
sanitize(lecture_title),
|
||||
caption.get("locale_id"))
|
||||
filepath = f"%s\\%s" % (lecture_dir, filename)
|
||||
|
||||
if os.path.isfile(filepath):
|
||||
print("> Captions '%s' already downloaded." % filename)
|
||||
else:
|
||||
print(f"> Downloading captions: '%s'" % filename)
|
||||
download(caption.get("url"), filepath, filename)
|
||||
if caption.get("ext") == "vtt":
|
||||
print("> Converting captions to SRT format...")
|
||||
convert(lecture_dir, filename_no_ext)
|
||||
print("> Caption conversion complete.")
|
||||
os.remove(filepath)
|
||||
|
||||
|
||||
def parse(data):
|
||||
chapters = []
|
||||
@ -396,10 +441,10 @@ def parse(data):
|
||||
except IndexError:
|
||||
# This is caused by there not being a starting chapter
|
||||
lectures.append(obj)
|
||||
lecture_path = f"%s\\%s. %s.mp4" % (download_dir,
|
||||
lectures.index(obj) + 1,
|
||||
sanitize(obj["title"]))
|
||||
process_lecture(obj, lecture_path, download_dir)
|
||||
lecture_index = lectures.index(obj) + 1
|
||||
lecture_path = f"%s\\%s. %s.mp4" % (
|
||||
download_dir, lecture_index, sanitize(obj["title"]))
|
||||
process_lecture(obj, lecture_index, lecture_path, download_dir)
|
||||
|
||||
for chapter in chapters:
|
||||
chapter_dir = f"%s\\%s. %s" % (download_dir, chapters.index(chapter) +
|
||||
@ -408,10 +453,13 @@ def parse(data):
|
||||
os.mkdir(chapter_dir)
|
||||
|
||||
for lecture in chapter["lectures"]:
|
||||
lecture_path = f"%s\\%s. %s.mp4" % (
|
||||
chapter_dir, chapter["lectures"].index(lecture) + 1,
|
||||
sanitize(lecture["title"]))
|
||||
process_lecture(lecture, lecture_path, chapter_dir)
|
||||
lecture_index = chapter["lectures"].index(lecture) + 1
|
||||
lecture_path = f"%s\\%s. %s.mp4" % (chapter_dir, lecture_index,
|
||||
sanitize(lecture["title"]))
|
||||
process_lecture(lecture, lecture_index, lecture_path, chapter_dir)
|
||||
print("\n\n\n\n\n\n\n\n=====================")
|
||||
print("All downloads completed for course!")
|
||||
print("=====================")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -431,16 +479,34 @@ if __name__ == "__main__":
|
||||
help="Download specific video quality. (144, 360, 480, 720, 1080)",
|
||||
metavar="",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-l",
|
||||
"--lang",
|
||||
dest="lang",
|
||||
type=str,
|
||||
help="The language to download for captions (Default is en)",
|
||||
metavar="",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--download-assets",
|
||||
dest="download_assets",
|
||||
action="store_true",
|
||||
help="Download lecture assets along with lectures",
|
||||
help="If specified, lecture assets will be downloaded.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--download-captions",
|
||||
dest="download_captions",
|
||||
action="store_true",
|
||||
help="If specified, captions will be downloaded.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.download_assets:
|
||||
dl_assets = True
|
||||
if args.lang:
|
||||
caption_locale = args.lang
|
||||
if args.download_captions:
|
||||
dl_captions = True
|
||||
if args.quality:
|
||||
if not args.quality in valid_qualities:
|
||||
print("Invalid quality specified! %s" % quality)
|
||||
|
@ -3,4 +3,6 @@ sanitize_filename
|
||||
tqdm
|
||||
requests
|
||||
python-dotenv
|
||||
protobuf
|
||||
protobuf
|
||||
webvtt-py
|
||||
pysrt
|
19
vtt_to_srt.py
Normal file
19
vtt_to_srt.py
Normal file
@ -0,0 +1,19 @@
|
||||
from webvtt import WebVTT
|
||||
import html
|
||||
from pysrt.srtitem import SubRipItem
|
||||
from pysrt.srttime import SubRipTime
|
||||
|
||||
|
||||
def convert(directory, filename):
    """Convert a WebVTT caption file into a SubRip (.srt) file.

    Reads ``<filename>.vtt`` from ``directory`` and writes ``<filename>.srt``
    into the same directory, numbering the cues sequentially from 1.

    Args:
        directory: Folder that contains the ``.vtt`` file.
        filename: Caption file name without its extension.
    """
    # Backslash-joined on purpose: main.py builds the path of the downloaded
    # .vtt file the same way, so both sides must agree on the separator.
    vtt_filepath = "%s\\%s.vtt" % (directory, filename)
    srt_filepath = "%s\\%s.srt" % (directory, filename)

    # Parse the source before creating the output so that a missing or
    # malformed .vtt does not leave an empty .srt behind.
    captions = WebVTT().read(vtt_filepath)

    # UTF-8 is requested explicitly because the platform default codec
    # (e.g. cp1252 on Windows) cannot encode many non-English captions.
    # The context manager guarantees the file is flushed and closed even
    # if writing a cue fails.
    with open(srt_filepath, "w", encoding="utf-8") as srt:
        for index, caption in enumerate(captions, start=1):
            start = SubRipTime(0, 0, caption.start_in_seconds)
            end = SubRipTime(0, 0, caption.end_in_seconds)
            # Decode any HTML entities in the cue text before writing it.
            text = html.unescape(caption.text)
            srt.write(str(SubRipItem(index, start, end, text)) + "\n")
|
Loading…
x
Reference in New Issue
Block a user