Subtitle Support

+ Added support for downloading subtitles (see readme for usage)
This commit is contained in:
Puyodead1 2021-05-19 15:59:49 -04:00
parent a867f82f2b
commit 6c5b7870a9
4 changed files with 113 additions and 16 deletions

View File

@ -71,15 +71,17 @@ You can now run `python main.py` to start downloading. The course will download
# Advanced Usage
```
usage: main.py [-h] [-d] [-q] [--download-assets]
usage: main.py [-h] [-d] [-q] [-l] [--download-assets] [--download-captions]
Udemy Downloader
optional arguments:
-h, --help show this help message and exit
-d, --debug Use test_data.json rather than fetch from the udemy api.
-q , --quality Download specific video quality. (144, 360, 480, 720, 1080)
--download-assets Download lecture assets along with lectures
-h, --help show this help message and exit
-d, --debug Use test_data.json rather than fetch from the udemy api.
-q , --quality Download specific video quality. (144, 360, 480, 720, 1080)
-l , --lang The language to download for captions (Default is en)
--download-assets If specified, lecture assets will be downloaded.
--download-captions If specified, captions will be downloaded.
```
- Download a specific quality
@ -88,6 +90,14 @@ optional arguments:
- `python main.py --download-assets`
- Download assets and specify a quality
- `python main.py -q 360 --download-assets`
- Download captions
- `python main.py --download-captions`
- Download captions with specific language
- `python main.py --download-captions -l en` - English subtitles
- `python main.py --download-captions -l es` - Spanish subtitles
- `python main.py --download-captions -l it` - Italian subtitles
- `python main.py --download-captions -l pl` - Polish Subtitles
- etc
# Getting an error about "Accepting the latest terms of service"?

86
main.py
View File

@ -9,6 +9,7 @@ from mpegdash.utils import (parse_attr_value, parse_child_nodes,
parse_node_value, write_attr_value,
write_child_node, write_node_value)
from utils import extract_kid
from vtt_to_srt import convert
load_dotenv()
@ -24,6 +25,8 @@ retry = 3
home_dir = os.getcwd()
keyfile_path = "%s\keyfile.json" % os.getcwd()
dl_assets = False
dl_captions = False
caption_locale = "en"
quality = None # None will download the best possible
valid_qualities = [144, 360, 480, 720, 1080]
@ -315,7 +318,7 @@ def download(url, path, filename):
return file_size
def process_lecture(lecture, lecture_path, lecture_dir):
def process_lecture(lecture, lecture_index, lecture_path, lecture_dir):
lecture_title = lecture["title"]
lecture_asset = lecture["asset"]
if lecture_asset["media_license_token"] == None:
@ -380,6 +383,48 @@ def process_lecture(lecture, lecture_path, lecture_dir):
print("> Found %s assets for lecture '%s'" %
(len(assets), lecture_title))
# process captions
if dl_captions:
captions = []
for caption in lecture_asset.get("captions"):
if not isinstance(caption, dict):
continue
if caption.get("_class") != "caption":
continue
download_url = caption.get("url")
if not download_url or not isinstance(download_url, str):
continue
lang = (caption.get("language") or caption.get("srclang")
or caption.get("label")
or caption.get("locale_id").split("_")[0])
ext = "vtt" if "vtt" in download_url.rsplit(".", 1)[-1] else "srt"
if caption_locale == "all" or caption_locale == lang:
captions.append({
"language": lang,
"locale_id": caption.get("locale_id"),
"ext": ext,
"url": download_url
})
for caption in captions:
filename = f"%s. %s_%s.%s" % (lecture_index, sanitize(
lecture_title), caption.get("locale_id"), caption.get("ext"))
filename_no_ext = f"%s. %s_%s" % (lecture_index,
sanitize(lecture_title),
caption.get("locale_id"))
filepath = f"%s\\%s" % (lecture_dir, filename)
if os.path.isfile(filepath):
print("> Captions '%s' already downloaded." % filename)
else:
print(f"> Downloading captions: '%s'" % filename)
download(caption.get("url"), filepath, filename)
if caption.get("ext") == "vtt":
print("> Converting captions to SRT format...")
convert(lecture_dir, filename_no_ext)
print("> Caption conversion complete.")
os.remove(filepath)
def parse(data):
chapters = []
@ -396,10 +441,10 @@ def parse(data):
except IndexError:
# This is caused by there not being a starting chapter
lectures.append(obj)
lecture_path = f"%s\\%s. %s.mp4" % (download_dir,
lectures.index(obj) + 1,
sanitize(obj["title"]))
process_lecture(obj, lecture_path, download_dir)
lecture_index = lectures.index(obj) + 1
lecture_path = f"%s\\%s. %s.mp4" % (
download_dir, lecture_index, sanitize(obj["title"]))
process_lecture(obj, lecture_index, lecture_path, download_dir)
for chapter in chapters:
chapter_dir = f"%s\\%s. %s" % (download_dir, chapters.index(chapter) +
@ -408,10 +453,13 @@ def parse(data):
os.mkdir(chapter_dir)
for lecture in chapter["lectures"]:
lecture_path = f"%s\\%s. %s.mp4" % (
chapter_dir, chapter["lectures"].index(lecture) + 1,
sanitize(lecture["title"]))
process_lecture(lecture, lecture_path, chapter_dir)
lecture_index = chapter["lectures"].index(lecture) + 1
lecture_path = f"%s\\%s. %s.mp4" % (chapter_dir, lecture_index,
sanitize(lecture["title"]))
process_lecture(lecture, lecture_index, lecture_path, chapter_dir)
print("\n\n\n\n\n\n\n\n=====================")
print("All downloads completed for course!")
print("=====================")
if __name__ == "__main__":
@ -431,16 +479,34 @@ if __name__ == "__main__":
help="Download specific video quality. (144, 360, 480, 720, 1080)",
metavar="",
)
parser.add_argument(
"-l",
"--lang",
dest="lang",
type=str,
help="The language to download for captions (Default is en)",
metavar="",
)
parser.add_argument(
"--download-assets",
dest="download_assets",
action="store_true",
help="Download lecture assets along with lectures",
help="If specified, lecture assets will be downloaded.",
)
parser.add_argument(
"--download-captions",
dest="download_captions",
action="store_true",
help="If specified, captions will be downloaded.",
)
args = parser.parse_args()
if args.download_assets:
dl_assets = True
if args.lang:
caption_locale = args.lang
if args.download_captions:
dl_captions = True
if args.quality:
if not args.quality in valid_qualities:
print("Invalid quality specified! %s" % quality)

View File

@ -3,4 +3,6 @@ sanitize_filename
tqdm
requests
python-dotenv
protobuf
protobuf
webvtt-py
pysrt

19
vtt_to_srt.py Normal file
View File

@ -0,0 +1,19 @@
from webvtt import WebVTT
import html
from pysrt.srtitem import SubRipItem
from pysrt.srttime import SubRipTime
def convert(directory, filename):
index = 0
vtt_filepath = f"%s\\%s.vtt" % (directory, filename)
srt_filepath = f"%s\\%s.srt" % (directory, filename)
srt = open(srt_filepath, "w")
for caption in WebVTT().read(vtt_filepath):
index += 1
start = SubRipTime(0, 0, caption.start_in_seconds)
end = SubRipTime(0, 0, caption.end_in_seconds)
srt.write(
SubRipItem(index, start, end, html.unescape(
caption.text)).__str__() + "\n")