参加活动获得了某 App 的会员,但课程只能在 App 或官网上观看,不能下载,会员到期后就无法再观看。
参照网上的方法写了个爬虫:爬取课程的文档并转成 PDF 文件,同时把视频下载保存到本地。
LaGouSpider.py
import threading
from queue import Queue
import re
import requests
import json
from Crypto.Cipher import AES
import time
import os
import pdfkit
class LaGou_spider:
    """Download the video lessons of one Lagou "kaiwu" course as .mp4 files.

    ``main`` drives the whole run: ``get_id`` reads the course outline and
    builds one lesson-detail URL per lesson, ``parse_one`` resolves each
    lesson to its m3u8 playlist and queues the ones not yet on disk, and
    ``get_key``/``write`` fetch the playlist's .ts segments, decrypt them
    with AES-CBC and append them to ``<save_dir>/<lesson name>.mp4``.

    The login cookie is supplied by the caller or read from the
    ``LAGOU_COOKIE`` environment variable instead of being embedded in the
    source, so credentials are not published together with the code.

    Videos that could not be downloaded are left on ``error_queue`` as
    ``{playlist_url: lesson_name}`` dicts.
    """

    LESSON_DETAIL_URL = "https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessonDetail?lessonId={}"
    COOKIE_ENV_VAR = 'LAGOU_COOKIE'
    REQUEST_TIMEOUT = 30  # seconds, applied to every HTTP request
    MAX_RETRIES = 3       # download attempts per segment before giving up
    BATCH_SIZE = 5        # videos downloaded concurrently
    _KEY_URI = re.compile(r'URI="(.*?)"')

    def __init__(self, url, cookie=None, save_dir='D:\\video'):
        """
        :param url: course outline endpoint (``getCourseLessons?courseId=...``)
        :param cookie: value of the ``Cookie`` request header of a logged-in
            session; when omitted, the ``LAGOU_COOKIE`` environment variable
            is used (empty string if that is unset as well)
        :param save_dir: directory the .mp4 files are written to
        """
        self.url = url
        self.save_dir = save_dir
        if cookie is None:
            cookie = os.environ.get(self.COOKIE_ENV_VAR, '')
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
            'Cookie': cookie,
            'Referer': 'https://kaiwu.lagou.com/',
            'Origin': 'https://kaiwu.lagou.com',
            'Sec-fetch-dest': 'empty',
            'Sec-fetch-mode': 'cors',
            'Sec-fetch-site': 'same-site',
            'x-l-req-header': '{deviceType:1}'}
        self.queue = Queue()        # videos still to download
        self.error_queue = Queue()  # videos whose download failed

    @staticmethod
    def _safe_name(name):
        """Return ``name`` with the characters '|', '/' and '?' made file-name safe."""
        return name.replace("|", "-").replace("/", "-").replace("?", "")

    def _video_path(self, name):
        """Return the .mp4 path used for the lesson called ``name``.

        Both the "already downloaded" check and the writer go through this
        helper so that they always agree on the file name.
        """
        return os.path.join(self.save_dir, '{}.mp4'.format(self._safe_name(name)))

    @classmethod
    def _lesson_urls(cls, course):
        """Build the lesson-detail URLs from a decoded course outline."""
        urls = []
        for section in course['content']['courseSectionList']:
            for lesson in section["courseLessons"]:
                urls.append(cls.LESSON_DETAIL_URL.format(lesson["id"]))
        return urls

    @staticmethod
    def _segment_urls(playlist_url, playlist_text):
        """Return the absolute URL of every .ts segment listed in a playlist.

        Segment names are resolved against the playlist's own directory;
        tag lines (starting with '#') are ignored.
        """
        base = '/'.join(playlist_url.split('/')[0:-1])
        urls = []
        for line in playlist_text.split('\n'):
            line = line.strip()
            if '.ts' in line and not line.startswith('#'):
                urls.append(base + '/' + line)
        return urls

    def _discard_partial(self, name):
        """Delete the partially written file of a failed download, if any."""
        try:
            os.remove(self._video_path(name))
        except FileNotFoundError:
            pass
        else:
            print('{}删除成功'.format(name))

    def get_id(self):
        """
        :return: list with one lesson-detail URL per lesson of the course
        """
        html = requests.get(url=self.url, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT).text
        return self._lesson_urls(json.loads(html))

    def parse_one(self, ture_url_list):
        """
        Resolve every lesson to its video playlist (one request per lesson).

        Lessons without video and lessons whose .mp4 already exists are
        skipped; every other lesson is queued as ``{playlist_url: name}``.

        :param ture_url_list: lesson-detail URLs as returned by ``get_id``
        :return: ``self.queue``
        """
        for lesson_url in ture_url_list:
            html = requests.get(url=lesson_url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT).text
            content = json.loads(html).get('content') or {}
            media = content.get("videoMedia")
            if not media:
                continue
            name = content["theme"]
            if os.path.exists(self._video_path(name)):
                print("{}已经存在".format(name))
                continue
            self.queue.put({media["fileUrl"]: name})
        return self.queue

    def get_key(self, **kwargs):
        """
        Download the video(s) described by ``kwargs``.

        Each keyword is an m3u8 playlist URL and its value the lesson name
        (this is how the ``{url: name}`` dicts from the queue arrive when
        they are passed as thread kwargs).  The playlist's key is fetched
        once and reused for all of its segments.  If a segment cannot be
        downloaded, the partial file is removed and the video is recorded
        on ``error_queue``.
        """
        for playlist_url, title in kwargs.items():
            name = self._safe_name(title)
            try:
                playlist = requests.get(url=playlist_url, headers=self.headers,
                                        timeout=self.REQUEST_TIMEOUT).text
                key_uris = self._KEY_URI.findall(playlist)
                key = None  # a playlist without a key URI is treated as unencrypted
                if key_uris:
                    key = requests.get(url=key_uris[0], headers=self.headers,
                                       timeout=self.REQUEST_TIMEOUT).content
            except requests.RequestException as e:
                print('{}爬取出错'.format(name), playlist_url, e)
                self.error_queue.put({playlist_url: title})
                continue
            for ts_url in self._segment_urls(playlist_url, playlist):
                if not self.write(key, ts_url, name, kwargs):
                    # A missing segment makes the file useless: drop it so a
                    # later run does not mistake it for a finished download.
                    self._discard_partial(name)
                    self.error_queue.put({playlist_url: title})
                    break

    def write(self, key, ts_url, name01, m3u8_dict):
        """
        Download one segment, decrypt it and append it to the lesson's .mp4.

        :param key: AES key bytes, or ``None`` to store the segment as is
        :param ts_url: absolute URL of the .ts segment
        :param name01: lesson name (used for the output file name)
        :param m3u8_dict: unused; kept so existing callers keep working
        :return: ``True`` when the segment was written, ``False`` when all
            ``MAX_RETRIES`` attempts failed
        """
        os.makedirs(self.save_dir, exist_ok=True)
        target = self._video_path(name01)
        for _ in range(self.MAX_RETRIES):
            try:
                response = requests.get(url=ts_url, headers=self.headers,
                                        timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                data = response.content
                if key is not None:
                    # NOTE(review): the key doubles as the IV, as in the
                    # original code - confirm this matches the playlist's
                    # EXT-X-KEY attributes.
                    data = AES.new(key, AES.MODE_CBC, iv=key).decrypt(data)
            except (requests.RequestException, ValueError) as e:
                print('{}爬取出错'.format(name01), ts_url, e)
                continue
            # Only touch the file once the whole segment is in memory, so a
            # failed attempt never leaves half a segment behind.
            with open(target, 'ab') as f:
                f.write(data)
            print('{},{}写入成功'.format(ts_url, name01))
            return True
        return False

    def thread_method(self, method, value):
        """Create (but do not start) a thread calling ``method(**value)``."""
        return threading.Thread(target=method, kwargs=value)

    def main(self):
        """
        Download all missing videos of the course, ``BATCH_SIZE`` at a time.

        Thread starts are staggered by one second; each batch is joined
        before the next one begins.  Failed videos stay on ``error_queue``.
        """
        pending = self.parse_one(self.get_id())
        while not pending.empty():
            batch = []
            for _ in range(self.BATCH_SIZE):
                if pending.empty():
                    break
                m3u8 = pending.get()
                thread = self.thread_method(self.get_key, m3u8)
                thread.start()
                print(thread.name + '启动成功,{}'.format(m3u8))
                time.sleep(1)
                batch.append(thread)
            for thread in batch:
                thread.join()
        failed = self.error_queue.qsize()
        if failed:
            print('{}个视频下载失败'.format(failed))
# if __name__ == "__main__":
# run = LaGou_spider("https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessons?courseId=185")
# # run.get_id()
# time1 = time.time()
# run.main()
# time2 = time.time()
# print(time2 - time1)
class LaGou_Article_Spider():
    """Save the article of every lesson of one Lagou "kaiwu" course as a PDF.

    ``main`` reads the course outline (``parse_one``), then fetches each
    lesson's HTML text (``get_html``) and converts it to
    ``<save_dir>/<lesson name>.pdf`` with wkhtmltopdf (``htmltopdf``).

    The login cookie is supplied by the caller or read from the
    ``LAGOU_COOKIE`` environment variable instead of being embedded in the
    source, so credentials are not published together with the code.

    Lesson URLs whose processing failed inside ``main`` are left on
    ``error_queue``.
    """

    COOKIE_ENV_VAR = 'LAGOU_COOKIE'
    BATCH_SIZE = 10  # lessons processed concurrently

    def __init__(self, url, cookie=None, save_dir='D:\\video',
                 wkhtmltopdf_path=r'D:\wkhtmltox\bin\wkhtmltopdf.exe'):
        """
        :param url: course outline endpoint (``getCourseLessons?courseId=...``)
        :param cookie: value of the ``Cookie`` request header of a logged-in
            session; when omitted, the ``LAGOU_COOKIE`` environment variable
            is used (empty string if that is unset as well)
        :param save_dir: directory the PDF files are written to
        :param wkhtmltopdf_path: location of the wkhtmltopdf executable
        """
        self.url = url
        self.save_dir = save_dir
        self.wkhtmltopdf_path = wkhtmltopdf_path
        if cookie is None:
            cookie = os.environ.get(self.COOKIE_ENV_VAR, '')
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
            'Cookie': cookie,
            'Referer': 'https://kaiwu.lagou.com/',
            'Origin': 'https://kaiwu.lagou.com',
            'Sec-fetch-dest': 'empty',
            'Sec-fetch-mode': 'cors',
            'Sec-fetch-site': 'same-site',
            'x-l-req-header': '{deviceType:1}'}
        # All lesson-detail requests share this prefix; only the id differs.
        self.textUrl = 'https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessonDetail?lessonId='
        self.queue = Queue()        # lesson URLs still to process
        self.error_queue = Queue()  # lesson URLs whose processing failed

    @staticmethod
    def _safe_name(name):
        """Return ``name`` with '|', '?' and '/' replaced by '-'."""
        return name.replace("|", "-").replace("?", "-").replace("/", "-")

    def _lesson_urls(self, course):
        """Build the lesson-detail URLs from a decoded course outline."""
        urls = []
        for section in course['content']['courseSectionList']:
            for lesson in section['courseLessons']:
                urls.append(self.textUrl + str(lesson['id']))
        return urls

    def parse_one(self):
        """
        Fetch the course outline and queue one detail URL per lesson.

        :return: ``self.queue``
        """
        html = requests.get(url=self.url, headers=self.headers).text
        for lesson_url in self._lesson_urls(json.loads(html)):
            self.queue.put(lesson_url)
        return self.queue

    def get_html(self, true_url):
        """
        Fetch one lesson and convert its article to a PDF.

        Lessons whose ``textContent`` is null are skipped.  The file name is
        kept in a local variable: this method runs in several threads at
        once and a shared global could be overwritten by another thread
        before the PDF is written.

        :param true_url: lesson-detail URL
        :return: None
        """
        html = requests.get(url=true_url, timeout=10, headers=self.headers).text
        content = json.loads(html)['content']
        article_name = self._safe_name(content['theme'])
        text = content['textContent']
        if text is None:
            print('{}没有文章内容,跳过'.format(article_name))
            return
        self.htmltopdf(str(text), article_name)

    def htmltopdf(self, str_html, article_name):
        """
        Render an HTML string to ``<save_dir>/<article_name>.pdf``.

        :param str_html: article HTML
        :param article_name: file name (without extension) of the PDF
        """
        os.makedirs(self.save_dir, exist_ok=True)
        config = pdfkit.configuration(wkhtmltopdf=self.wkhtmltopdf_path)
        options = {
            'page-size': 'Letter',
            'encoding': 'UTF-8',
            'custom-header': [('Accept-Encoding', 'gzip')]
        }
        target = os.path.join(self.save_dir, '{}.pdf'.format(article_name))
        pdfkit.from_string(str_html, target, configuration=config, options=options)

    def _worker(self, true_url):
        """Thread entry point: run ``get_html`` and record failures."""
        try:
            self.get_html(true_url)
        except Exception as e:  # thread boundary: report instead of dying silently
            print('{}爬取出错: {}'.format(true_url, e))
            self.error_queue.put(true_url)

    def thread_method(self, method, value):
        """Create (but do not start) a thread calling ``method(*value)``."""
        return threading.Thread(target=method, args=value)

    def main(self):
        """
        Convert all lessons of the course, ``BATCH_SIZE`` at a time.

        Each batch of threads is joined before the next one starts.  Failed
        lesson URLs stay on ``error_queue``.
        """
        pending = self.parse_one()
        while not pending.empty():
            batch = []
            for _ in range(self.BATCH_SIZE):
                if pending.empty():
                    break
                lesson_url = pending.get()
                print(lesson_url)
                thread = self.thread_method(self._worker, (lesson_url,))
                thread.start()
                print(thread.name + '启动成功,{}'.format(lesson_url))
                batch.append(thread)
            for thread in batch:
                thread.join()
                print('{}线程回收完毕'.format(thread))
decode.py
import base64

# Decode a fixed Base64 string and show the resulting bytes.
encoded = 'UmVhbExpdVNoYQ=='
res = base64.b64decode(encoded)
print(res)
# Entry script: asks for a course ID and downloads that course's articles.
#
# Usage:
#   1. Run this script; it is the combined launcher for videos and articles.
#   2. Enter the course ID (it is visible in the URL of the course page).
#   3. The content is downloaded automatically; articles are saved as PDF.
# NOTE(review): the video part is currently commented out below, so only the
# articles are downloaded unless those two lines are re-enabled.
from LaGouSpider import LaGou_Article_Spider, LaGou_spider

# 16 31  (presumably example course IDs - confirm)
print("请输入课程编号:")
number = int(input())
url = 'https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessons?courseId=' + str(number)

# Video download (disabled):
# video = LaGou_spider(url)
# video.main()

article = LaGou_Article_Spider(url)
article.main()
Loading Comments...