利用python的requests库爬取某机构教育课程
🛷

利用python的requests库爬取某机构教育课程

Tags
python
活动搞到某app会员，内容只能在app或者官网观看，不能下载，会员到期后就无法再看了
参照网上的方法写了个爬虫，实现爬取课程的文档并转成PDF文件，视频则下载保存到本地
 
LaGouSpider.py
# LaGouSpider.py — crawl a Lagou "kaiwu" course: videos (AES-encrypted HLS
# segments listed in an m3u8 playlist) and lesson articles (HTML rendered to
# PDF through wkhtmltopdf).
import threading
from queue import Queue
import re
import requests
import json
from Crypto.Cipher import AES
import time
import os
import pdfkit

# Directory every downloaded video / generated PDF is written to.
VIDEO_DIR = 'D:\\video'

# NOTE(review): this Cookie is a hard-coded personal session credential; it
# expires and should be moved out of source control into configuration.
# The dict was previously duplicated verbatim in both spider classes.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
    'Cookie': 'smidV2=202011262035061f0eaf8ecc0e8384377c9d7cef65238f000bbc964d018f490; LG_HAS_LOGIN=1; sensorsdata2015session=%7B%7D; user-finger=e117bdf8a4eded652b1af27a708b08f2; LG_LOGIN_USER_ID=30ac22a4e177e58ecae4526b8a87cceb3176b59beb20cb5686f1c67b83e97fc9; _putrc=83D1BF86E06960A8123F89F2B170EADC; login=true; unick=%E7%8E%8B%E6%9D%B0; kw_login_authToken="AQKMAWClN3t1ry5bbTeMo+DBQBesYnsGcu+D4WoDouw3sK8jUZeLwi1qyHusIfPUHwIDkUgQ2rqHX54C3Gkn7H8rYMgubtbLwURudujby3Kgrbm3D49bTo/srLBZ7HweLuSVwKfKhYcioIs+rosa9Mk6WGQ6aSLtqal+0MXdWsZ4rucJXOpldXhUiavxhcCELWDotJ+bmNVwmAvQCptcy5e7czUcjiQC32Lco44BMYXrQ+AIOfEccJKHpj0vJ+ngq/27aqj1hWq8tEPFFjdnxMSfKgAnjbIEAX3F9CIW8BSiMHYmPBt7FDDY0CCVFICHr2dp5gQVGvhfbqg7VzvNsw=="; gate_login_token=ef634b463d96a07d041b294b662b5761e4ece03823d39ab40b09114e40118ed9; X_HTTP_TOKEN=42daf4b72327b2811098618061bf5e71415983ed09; thirdDeviceIdInfo=%5B%7B%22channel%22%3A1%2C%22thirdDeviceId%22%3A%22WC39ZUyXRgdERRmmwqwO50cvDHoKO8PUhqEGKvVRXbd5S97/8y2baZd3jw6Lm17N7PFawzHFjsJED95vPo6++I0k/yXhGiwavzDOuEOzxeuXkErcwnzpxUVX5pVVE247jkJlhk/gbBtr8WQCkkJTqhEZ8pCNkGi2yqy5vXPHvYaQuaF3iaWaQExmUnGUl2icew+aXZ9q7GzxMLnY/xNUm/hKGBkMKVxb9tinvKvaMfeF1cvionFIHc04LQfvsOkAw1487577677129%22%7D%2C%7B%22channel%22%3A2%2C%22thirdDeviceId%22%3A%22137%23dTE9hE9o9c/dqB5vccoMgsPWIr9CGAKugaB1YVb3DsJFEvfbZNeIptsm6xZhtVEWfiKs2sc7vhYQOA3BoldL/Y5cDF5D6OcwRvo2FcOr4togbPZzZ5dw76jAqeH0PZskm4h6GJwKeC81UHTHrNZrIqLdgnmVW7Op9mj5jmYq5GVMAGxHj9NoRGiZ5gLDFudZjcGfXTQ/Eylwb3gbQbpVIuAjJTa2RaZZRs1CHpDGc5Hoa6zhxHZmQofJ+GXppRJE1Aey+tppYSUS1lQypXiwQonoBf6b5nVc1IBi+piVYSUx1lLyJdicQonJ+aQjGRUc14Iyiti+YSJS1qQipppmQofJ+GXpVkJc1IBi+ppVYTUx1lgippiI91YMLfP9iIS25VmyNpPlf2uuYZanlre1pij8MEnh0fIK1I7JQSlRSGeFQQT5Wg7LWr9yOONod6YjUVPoN5gP1XjLiWvWkaKhLh3EybUNw4nJDa3jbQSEO0WQU+g2LVCqfK5mSC9EXxtvjsWJfPdrLw0tsKRJKu/iEhLJ4fzZAQZ3ZR/rGJx0K5i0rZeHeCW2m0NDVZKqVJBoRaZC51rG7VuSMXNecFQX8H9Jtr4uz043BxypNqnqF4FGDPjp6MlYxWQYzn6Rl28VmbZ3fbYVBWI/Js5r5eTCbBiQcdlZz8ptdwwFVqTqokS2OWYzSdeBebxNh6g%3D%2CT2gA4UQODJvKrYMZXcSRnx0sNpjDhI_oBMq74opYhZkuAWNPTqHv61Q6Rfjjk82L_IgAKRwWekuXeV4Oq1m_Ou6n%22%7D%5D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2213185584%22%2C%22first_id%22%3A%221766e5598ae46f-004e19416d8a55-5a301e42-1327104-1766e5598afe1c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2287.0.4280.88%22%7D%2C%22%24device_id%22%3A%22176048cd341271-0afacd8f68394c-5a30124a-2073600-176048cd342d4c%22%7D; JSESSIONID=F6732D1C54DB4767BB4BFC1A040B5504',
    'Referer': 'https://kaiwu.lagou.com/',
    'Origin': 'https://kaiwu.lagou.com',
    'Sec-fetch-dest': 'empty',
    'Sec-fetch-mode': 'cors',
    'Sec-fetch-site': 'same-site',
    'x-l-req-header': '{deviceType:1}',
}


def _sanitize_name(name):
    """Make *name* safe to use as a Windows file name.

    BUGFIX: the original tested ``if "|" or '?' or '/' in name:`` — a
    non-empty string literal is always truthy, so the condition was always
    True and the else-branch was dead.  The two classes also used slightly
    different replacement sets; they are unified here so the "already
    downloaded" check in parse_one() matches the name actually written.
    """
    return name.replace("|", "-").replace("/", "-").replace("?", "")


class LaGou_spider:
    """Download every video of one course as decrypted .mp4 files."""

    def __init__(self, url):
        self.url = url                # getCourseLessons endpoint for the course
        self.headers = HEADERS
        self.queue = Queue()          # pending {m3u8_url: title} download jobs
        self.error_queue = Queue()    # ts segment URLs that failed after retries

    def get_id(self):
        """Return the lesson-detail URL for every lesson in the course."""
        html = requests.get(url=self.url, headers=self.headers).text
        detail_urls = []
        for section in json.loads(html)['content']['courseSectionList']:
            for lesson in section["courseLessons"]:
                detail_urls.append(
                    "https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessonDetail?lessonId={}".format(lesson["id"]))
        return detail_urls

    def parse_one(self, ture_url_list):
        """Queue a ``{m3u8_url: title}`` job for each video not yet on disk.

        Returns the job queue.  BUGFIX: the original's ``return self.queue``
        was commented out, so main() crashed with AttributeError calling
        ``.empty()`` on None.
        """
        for ture_url in ture_url_list:
            html = requests.get(url=ture_url, headers=self.headers).text
            content = json.loads(html)['content']
            if content["videoMedia"] is None:   # text-only lesson, no video
                continue
            name = content["theme"]
            m3u8 = content["videoMedia"]["fileUrl"]
            if os.path.exists("{}\\{}.mp4".format(VIDEO_DIR, _sanitize_name(name))):
                print("{}已经存在".format(name))
            else:
                self.queue.put({m3u8: name})
        return self.queue

    def get_key(self, **kwargs):
        """Download the video(s) described by *kwargs* (``{m3u8_url: title}``).

        Fetches the m3u8 playlist, then the AES key it references (all ts
        segments of one playlist share the same key, so one request), then
        decrypts and appends every .ts segment in playlist order.
        """
        for m3u8_url, title in kwargs.items():
            # ts entries in the playlist are relative to the playlist's URL.
            base_url = '/'.join(m3u8_url.split('/')[0:-1])
            playlist = requests.get(url=m3u8_url, headers=self.headers).text
            key_url = re.findall('URI="(.*?)"', playlist)[0]
            key = requests.get(url=key_url, headers=self.headers).content
            name = _sanitize_name(title)
            for line in playlist.split('\n'):
                if '.ts' in line:
                    self.write(key, base_url + '/' + line, name, kwargs)

    def write(self, key, ts_url, name01, m3u8_dict):
        """Fetch one ts segment, AES-CBC-decrypt it and append to the .mp4.

        The server uses the key itself as the IV.  BUGFIX: the original error
        path tested ``if f.close():`` — ``close()`` returns None, so the
        branch never ran and the thread slept in a ``while True`` forever; it
        also spawned a recursive re-crawl thread from inside the ``with``
        block.  Now the segment request is simply retried a few times and the
        URL is recorded in error_queue if it still fails.
        """
        if not os.path.exists(VIDEO_DIR):
            os.makedirs(VIDEO_DIR)
        path = '{}\\{}.mp4'.format(VIDEO_DIR, name01)
        for _ in range(3):
            try:
                data = requests.get(url=ts_url, headers=self.headers).content
                cryptor = AES.new(key, AES.MODE_CBC, iv=key)
                with open(path, 'ab') as f:
                    f.write(cryptor.decrypt(data))
                print('{},{}写入成功'.format(ts_url, name01))
                return
            except Exception:
                print('{}爬取出错'.format(name01))
                time.sleep(2)
        self.error_queue.put(ts_url)   # give up on this segment; keep a record

    def thread_method(self, method, value):
        """Create (but do not start) a worker thread; *value* is a kwargs dict."""
        thread = threading.Thread(target=method, kwargs=value)
        return thread

    def main(self):
        """Drain the job queue with up to 5 download threads per batch."""
        thread_list = []
        jobs = self.parse_one(self.get_id())
        while not jobs.empty():
            for _ in range(5):
                if jobs.empty():
                    break
                job = jobs.get()
                thread = self.thread_method(self.get_key, job)
                thread.start()
                print(thread.name + '启动成功,{}'.format(job))
                time.sleep(1)   # stagger request bursts a little
                thread_list.append(thread)
            for worker in thread_list:
                worker.join()
            thread_list.clear()


class LaGou_Article_Spider:
    """Download every lesson article of one course and render it to PDF."""

    def __init__(self, url):
        self.url = url                # getCourseLessons endpoint for the course
        self.headers = HEADERS
        # Every lesson-detail request shares this prefix; only the id differs.
        self.textUrl = 'https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessonDetail?lessonId='
        self.queue = Queue()          # pending lesson-detail URLs
        self.error_queue = Queue()    # kept for interface compatibility

    def parse_one(self):
        """Queue the lesson-detail URL of every lesson; return the queue."""
        html = requests.get(url=self.url, headers=self.headers).text
        for section in json.loads(html)['content']['courseSectionList']:
            for lesson in section['courseLessons']:
                self.queue.put(self.textUrl + str(lesson['id']))
        return self.queue

    def get_html(self, true_url):
        """Fetch one lesson's HTML body and hand it to htmltopdf()."""
        html = requests.get(url=true_url, timeout=10, headers=self.headers).text
        content = json.loads(html)['content']
        str_html = str(content['textContent'])
        # BUGFIX: same always-True ``if "|" or ...`` condition as the video
        # spider — replaced by the shared sanitizer.
        article_name = _sanitize_name(content['theme'])
        self.htmltopdf(str_html, article_name)

    def htmltopdf(self, str_html, article_name):
        """Render an HTML string to ``D:\\video\\<article_name>.pdf``.

        Requires a local wkhtmltopdf install at the hard-coded path.
        """
        path_wk = r'D:\wkhtmltox\bin\wkhtmltopdf.exe'
        config = pdfkit.configuration(wkhtmltopdf=path_wk)
        options = {
            'page-size': 'Letter',
            'encoding': 'UTF-8',
            'custom-header': [('Accept-Encoding', 'gzip')],
        }
        pdfkit.from_string(str_html, "D:\\video\\{}.pdf".format(article_name),
                           configuration=config, options=options)

    def thread_method(self, method, value):
        """Create (but do not start) a worker thread; *value* is an args tuple."""
        thread = threading.Thread(target=method, args=value)
        return thread

    def main(self):
        """Drain the URL queue with up to 10 worker threads per batch."""
        thread_list = []
        true_url = self.parse_one()
        while not true_url.empty():
            for _ in range(10):
                if true_url.empty():
                    break
                url = true_url.get()
                print(url)
                thread = self.thread_method(self.get_html, (url,))
                thread.start()
                print(thread.name + '启动成功,{}'.format(url))
                thread_list.append(thread)
            # BUGFIX: the original removed items from thread_list while
            # iterating over it, skipping every other thread each pass.
            for worker in thread_list:
                worker.join()
                print('{}线程回收完毕'.format(worker))
            thread_list.clear()
decode.py
# decode.py — reveal the base64-encoded signature string.
import base64

encoded = 'UmVhbExpdVNoYQ=='
res = base64.b64decode(encoded)
print(res)
"""Launcher: prompt for a course id and download its content.

Usage:
    1. Run this script.
    2. Enter the course id (visible in the course page URL).
    3. Articles are saved as PDF; uncomment the video lines to also
       download and decrypt the course videos.
"""
from LaGouSpider import LaGou_Article_Spider
from LaGouSpider import LaGou_spider


def main():
    """Read a numeric course id from stdin and crawl that course."""
    print("请输入课程编号:")
    # BUGFIX: a bare int(input()) crashed with ValueError on non-numeric
    # input; validate and exit gracefully instead.
    try:
        number = int(input())
    except ValueError:
        print("课程编号必须是数字")
        return
    url = 'https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessons?courseId={}'.format(number)
    # Uncomment to also download the course videos:
    # video = LaGou_spider(url)
    # video.main()
    article = LaGou_Article_Spider(url)
    article.main()


# BUGFIX: guard the entry point so importing this module (e.g. from tests)
# no longer blocks on input() and starts a crawl as a side effect.
if __name__ == "__main__":
    main()