Bug fixes

+ Support for large course content in subscription courses (#92)
+ Attempt to fix encoding problems with caption conversion (#92, #98, #97)
This commit is contained in:
Puyodead1 2022-01-12 19:51:11 -05:00 committed by Puyodead1
parent ad74eed395
commit 15b7d92afc
No known key found for this signature in database
GPG Key ID: A4FA4FEC0DD353FC

95
main.py
View File

@ -780,22 +780,29 @@ class Udemy:
raise Exception("[-] Cloudflare captcha detected!")
# wait for page load
WebDriverWait(selenium.driver, 60).until(EC.visibility_of_element_located((By.TAG_NAME, "pre")))
WebDriverWait(selenium.driver, 60).until(
EC.visibility_of_element_located((By.TAG_NAME, "body")))
time.sleep(2)
# TODO: determine if the course content is large
# get the text from the page
page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
if not page_text or not isinstance(page_text, str):
raise Exception("[-] Could not get page text!")
page_json = json.loads(page_text)
if page_json:
return page_json
body_text = selenium.driver.find_element(By.TAG_NAME, "body").text
if not body_text:
raise Exception("[-] Could not get page body text!")
if "502 Bad Gateway" in body_text:
# its a large course, handle accordingly
logger.info("[+] Detected large course content, using large content extractor...")
return self._extract_large_course_content_sub(url=url, selenium=selenium)
else:
logger.error("[-] Failed to extract course json!")
time.sleep(0.8)
sys.exit(1)
# get the text from the page
page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
if not page_text or not isinstance(page_text, str):
raise Exception("[-] Could not get page pre text!")
page_json = json.loads(page_text)
if page_json:
return page_json
else:
logger.error("[-] Failed to extract course json!")
time.sleep(0.8)
sys.exit(1)
def _extract_large_course_content(self, url):
url = url.replace("10000", "50") if url.endswith("10000") else url
@ -823,6 +830,68 @@ class Udemy:
data["results"].append(d)
return data
def _extract_large_course_content_sub(self, url, selenium: Selenium):
url = url.replace("10000", "50") if url.endswith("10000") else url
try:
selenium.driver.get(url)
time.sleep(2)
if "Attention" in selenium.driver.title:
# cloudflare captcha, panic
raise Exception("[-] Cloudflare captcha detected!")
# wait for page load
WebDriverWait(selenium.driver, 60).until(
EC.visibility_of_element_located((By.TAG_NAME, "body")))
time.sleep(2)
# get the text from the page
page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
if not page_text or not isinstance(page_text, str):
raise Exception("[-] Could not get page pre text!")
data = json.loads(page_text)
logger.debug(data)
except conn_error as error:
logger.fatal(f"[-] Udemy Says: Connection error, {error}")
time.sleep(0.8)
sys.exit(1)
else:
_next = data.get("next")
while _next:
logger.info("> Downloading course information.. ")
try:
selenium.driver.get(_next)
time.sleep(2)
if "Attention" in selenium.driver.title:
# cloudflare captcha, panic
raise Exception("[-] Cloudflare captcha detected!")
# wait for page load
WebDriverWait(selenium.driver, 60).until(
EC.visibility_of_element_located((By.TAG_NAME, "body")))
time.sleep(2)
# get the text from the page
page_text = selenium.driver.find_element(
By.TAG_NAME, "pre").text
if not page_text or not isinstance(page_text, str):
raise Exception("[-] Could not get page pre text!")
resp = json.loads(page_text)
logger.debug(resp)
except conn_error as error:
logger.fatal(f"[-] Udemy Says: Connection error, {error}")
time.sleep(0.8)
sys.exit(1)
else:
_next = resp.get("next")
results = resp.get("results")
if results and isinstance(results, list):
for d in resp["results"]:
data["results"].append(d)
return data
def _extract_course(self, response, course_name):
_temp = {}
if response: