利用python的requests库爬取某机构教育课程
🛷

利用python的requests库爬取某机构教育课程

Tags
python
活动获得某app会员,只能app或者官网看,不能下载,会员到期后就无法下载
参照网上的方法写了个爬虫,实现爬取课程的文档并转成pdf文件,视频下载保存到本地
 
LaGouSpider.py
import threading
from queue import Queue
import re
import requests
import json
from Crypto.Cipher import AES
import time
import os
import pdfkit


class LaGou_spider:
    """Download the videos of one lagou (拉勾教育) course.

    Workflow: :meth:`get_id` lists the per-lesson detail API urls,
    :meth:`parse_one` queues ``{m3u8_url: video_name}`` jobs for every video
    not already on disk, and :meth:`main` runs up to 5 worker threads, each
    executing :meth:`get_key` -> :meth:`write` to fetch, AES-decrypt and
    append the .ts segments of one video into a single .mp4 file.
    """

    # All videos are written below this directory (the original code mixed
    # "D://video" and 'D:\\video'; both name the same Windows directory).
    VIDEO_DIR = 'D:\\video'

    def __init__(self, url):
        """
        :param url: getCourseLessons API url of one course
                    (".../getCourseLessons?courseId=<id>")
        """
        self.url = url
        # Session cookies are hard-coded: they identify a logged-in member
        # account and must be refreshed manually when they expire.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
            'Cookie': 'smidV2=202011262035061f0eaf8ecc0e8384377c9d7cef65238f000bbc964d018f490; LG_HAS_LOGIN=1; sensorsdata2015session=%7B%7D; user-finger=e117bdf8a4eded652b1af27a708b08f2; LG_LOGIN_USER_ID=30ac22a4e177e58ecae4526b8a87cceb3176b59beb20cb5686f1c67b83e97fc9; _putrc=83D1BF86E06960A8123F89F2B170EADC; login=true; unick=%E7%8E%8B%E6%9D%B0; kw_login_authToken="AQKMAWClN3t1ry5bbTeMo+DBQBesYnsGcu+D4WoDouw3sK8jUZeLwi1qyHusIfPUHwIDkUgQ2rqHX54C3Gkn7H8rYMgubtbLwURudujby3Kgrbm3D49bTo/srLBZ7HweLuSVwKfKhYcioIs+rosa9Mk6WGQ6aSLtqal+0MXdWsZ4rucJXOpldXhUiavxhcCELWDotJ+bmNVwmAvQCptcy5e7czUcjiQC32Lco44BMYXrQ+AIOfEccJKHpj0vJ+ngq/27aqj1hWq8tEPFFjdnxMSfKgAnjbIEAX3F9CIW8BSiMHYmPBt7FDDY0CCVFICHr2dp5gQVGvhfbqg7VzvNsw=="; gate_login_token=ef634b463d96a07d041b294b662b5761e4ece03823d39ab40b09114e40118ed9; X_HTTP_TOKEN=42daf4b72327b2811098618061bf5e71415983ed09; thirdDeviceIdInfo=%5B%7B%22channel%22%3A1%2C%22thirdDeviceId%22%3A%22WC39ZUyXRgdERRmmwqwO50cvDHoKO8PUhqEGKvVRXbd5S97/8y2baZd3jw6Lm17N7PFawzHFjsJED95vPo6++I0k/yXhGiwavzDOuEOzxeuXkErcwnzpxUVX5pVVE247jkJlhk/gbBtr8WQCkkJTqhEZ8pCNkGi2yqy5vXPHvYaQuaF3iaWaQExmUnGUl2icew+aXZ9q7GzxMLnY/xNUm/hKGBkMKVxb9tinvKvaMfeF1cvionFIHc04LQfvsOkAw1487577677129%22%7D%2C%7B%22channel%22%3A2%2C%22thirdDeviceId%22%3A%22137%23dTE9hE9o9c/dqB5vccoMgsPWIr9CGAKugaB1YVb3DsJFEvfbZNeIptsm6xZhtVEWfiKs2sc7vhYQOA3BoldL/Y5cDF5D6OcwRvo2FcOr4togbPZzZ5dw76jAqeH0PZskm4h6GJwKeC81UHTHrNZrIqLdgnmVW7Op9mj5jmYq5GVMAGxHj9NoRGiZ5gLDFudZjcGfXTQ/Eylwb3gbQbpVIuAjJTa2RaZZRs1CHpDGc5Hoa6zhxHZmQofJ+GXppRJE1Aey+tppYSUS1lQypXiwQonoBf6b5nVc1IBi+piVYSUx1lLyJdicQonJ+aQjGRUc14Iyiti+YSJS1qQipppmQofJ+GXpVkJc1IBi+ppVYTUx1lgippiI91YMLfP9iIS25VmyNpPlf2uuYZanlre1pij8MEnh0fIK1I7JQSlRSGeFQQT5Wg7LWr9yOONod6YjUVPoN5gP1XjLiWvWkaKhLh3EybUNw4nJDa3jbQSEO0WQU+g2LVCqfK5mSC9EXxtvjsWJfPdrLw0tsKRJKu/iEhLJ4fzZAQZ3ZR/rGJx0K5i0rZeHeCW2m0NDVZKqVJBoRaZC51rG7VuSMXNecFQX8H9Jtr4uz043BxypNqnqF4FGDPjp6MlYxWQYzn6Rl28VmbZ3fbYVBWI/Js5r5eTCbBiQcdlZz8ptdwwFVqTqokS2OWYzSdeBebxNh6g%3D%2CT2gA4UQODJvKrYMZXcSRnx0sNpjDhI_oBMq74opYhZkuAWNPTqHv61Q6Rfjjk82L_IgAKRwWekuXeV4Oq1m_Ou6n%22%7D%5D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2213185584%22%2C%22first_id%22%3A%221766e5598ae46f-004e19416d8a55-5a301e42-1327104-1766e5598afe1c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2287.0.4280.88%22%7D%2C%22%24device_id%22%3A%22176048cd341271-0afacd8f68394c-5a30124a-2073600-176048cd342d4c%22%7D; JSESSIONID=F6732D1C54DB4767BB4BFC1A040B5504',
            'Referer': 'https://kaiwu.lagou.com/',
            'Origin': 'https://kaiwu.lagou.com',
            'Sec-fetch-dest': 'empty',
            'Sec-fetch-mode': 'cors',
            'Sec-fetch-site': 'same-site',
            'x-l-req-header': '{deviceType:1}'}
        self.queue = Queue()        # pending {m3u8_url: video_name} download jobs
        self.error_queue = Queue()  # unused; kept for backward compatibility

    def get_id(self):
        """Return the lesson-detail API url of every lesson in the course.

        :return: list of ".../getCourseLessonDetail?lessonId=<id>" urls
        """
        html = requests.get(url=self.url, headers=self.headers).text
        content = json.loads(html)['content']
        ture_url_list = []
        for section in content['courseSectionList']:
            for lesson in section["courseLessons"]:
                ture_url_list.append(
                    "https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessonDetail?lessonId={}".format(
                        lesson["id"]))
        return ture_url_list

    def parse_one(self, ture_url_list):
        """Queue a ``{m3u8_url: name}`` job for each video missing locally.

        One request is made per lesson; lessons without a video (pure text)
        and videos whose .mp4 already exists are skipped.

        :param ture_url_list: lesson-detail urls from :meth:`get_id`
        :return: the populated ``self.queue``
        """
        for ture_url in ture_url_list:
            html = requests.get(url=ture_url, headers=self.headers).text
            content = json.loads(html)['content']
            if content["videoMedia"] is None:  # text-only lesson
                continue
            name = content["theme"]
            m3u8 = content["videoMedia"]["fileUrl"]
            # Existence check mirrors the sanitising done before writing.
            safe_name = name.replace("|", "-").replace("/", "-")
            if os.path.exists("{}\\{}.mp4".format(self.VIDEO_DIR, safe_name)):
                print("{}已经存在".format(name))
            else:
                self.queue.put({m3u8: name})
        return self.queue

    def get_key(self, **kwargs):
        """Download one video passed as a ``{m3u8_url: name}`` keyword dict.

        Fetches the m3u8 playlist, downloads its (single) AES key, then
        streams every .ts segment through :meth:`write`.
        """
        for m3u8_url, raw_name in kwargs.items():
            base_url = '/'.join(m3u8_url.split('/')[0:-1])  # prefix for relative ts paths
            playlist = requests.get(url=m3u8_url, headers=self.headers).text
            key_url = re.compile('URI="(.*?)"').findall(playlist)[0]
            # All ts segments of one m3u8 share the same key -> fetch it once.
            key = requests.get(url=key_url, headers=self.headers).content
            # BUG fix: the original `if "|" or '?' or '/' in name1:` is always
            # true (precedence), so the sanitised branch always ran anyway;
            # sanitise unconditionally to keep the filename Windows-safe.
            name = raw_name.replace("|", "-").replace("/", "-").replace("?", "")
            for line in playlist.split('\n'):
                if '.ts' in line:
                    self.write(key, base_url + '/' + line, name, kwargs)

    def write(self, key, ts_url, name01, m3u8_dict):
        """Fetch one ts segment, AES-CBC decrypt it and append it to the mp4.

        On error the partial file is deleted and the whole video is retried
        in a fresh thread.

        :param key: 16-byte AES key shared by all segments (also used as IV,
                    as the original code did — TODO confirm against the m3u8)
        :param ts_url: absolute url of the segment
        :param name01: sanitised video name (output file stem)
        :param m3u8_dict: original ``{m3u8_url: name}`` job, used for retry
        """
        if not os.path.exists(self.VIDEO_DIR):
            os.makedirs(self.VIDEO_DIR)
        cryptor = AES.new(key, AES.MODE_CBC, iv=key)
        path = '{}\\{}.mp4'.format(self.VIDEO_DIR, name01)
        try:
            html = requests.get(url=ts_url, headers=self.headers).content
            with open(path, 'ab') as f:
                f.write(cryptor.decrypt(html))
            print('{},{}写入成功'.format(ts_url, name01))
        except Exception:
            print('{}爬取出错'.format(name01))
            # BUG fix: the original looped on `if f.close():` — close()
            # returns None, so the branch never ran and the thread slept
            # forever; it also called os.remove without the directory.
            # The `with` block above has already closed the file here.
            if os.path.exists(path):
                os.remove(path)
                print('{}删除成功'.format(name01))
            thread = self.thread_method(self.get_key, m3u8_dict)
            print("开启线程{},{}重新爬取".format(thread.name, name01))
            thread.start()
            thread.join()

    def thread_method(self, method, value):
        """Build (but do not start) a thread calling ``method(**value)``."""
        return threading.Thread(target=method, kwargs=value)

    def main(self):
        """Drain the download queue with batches of up to 5 worker threads."""
        ture_url_list = self.get_id()
        jobs = self.parse_one(ture_url_list)
        while not jobs.empty():
            batch = []
            for _ in range(5):
                if jobs.empty():
                    break
                m3u8 = jobs.get()
                thread = self.thread_method(self.get_key, m3u8)
                thread.start()
                print(thread.name + '启动成功,{}'.format(m3u8))
                time.sleep(1)  # stagger requests a little
                batch.append(thread)
            for worker in batch:
                worker.join()


# if __name__ == "__main__":
#     run = LaGou_spider("https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessons?courseId=185")
#     # run.get_id()
#     time1 = time.time()
#     run.main()
#     time2 = time.time()
#     print(time2 - time1)


class LaGou_Article_Spider():
    """Download the text articles of one lagou course and convert each to PDF.

    :meth:`parse_one` queues one lesson-detail url per lesson, then
    :meth:`main` runs up to 10 threads, each executing :meth:`get_html`
    -> :meth:`htmltopdf` to render the article html to a PDF file.
    """

    def __init__(self, url):
        """
        :param url: getCourseLessons API url of one course
        """
        self.url = url
        # Session cookies are hard-coded: they identify a logged-in member
        # account and must be refreshed manually when they expire.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
            'Cookie': 'smidV2=202011262035061f0eaf8ecc0e8384377c9d7cef65238f000bbc964d018f490; LG_HAS_LOGIN=1; sensorsdata2015session=%7B%7D; user-finger=e117bdf8a4eded652b1af27a708b08f2; LG_LOGIN_USER_ID=30ac22a4e177e58ecae4526b8a87cceb3176b59beb20cb5686f1c67b83e97fc9; _putrc=83D1BF86E06960A8123F89F2B170EADC; login=true; unick=%E7%8E%8B%E6%9D%B0; kw_login_authToken="AQKMAWClN3t1ry5bbTeMo+DBQBesYnsGcu+D4WoDouw3sK8jUZeLwi1qyHusIfPUHwIDkUgQ2rqHX54C3Gkn7H8rYMgubtbLwURudujby3Kgrbm3D49bTo/srLBZ7HweLuSVwKfKhYcioIs+rosa9Mk6WGQ6aSLtqal+0MXdWsZ4rucJXOpldXhUiavxhcCELWDotJ+bmNVwmAvQCptcy5e7czUcjiQC32Lco44BMYXrQ+AIOfEccJKHpj0vJ+ngq/27aqj1hWq8tEPFFjdnxMSfKgAnjbIEAX3F9CIW8BSiMHYmPBt7FDDY0CCVFICHr2dp5gQVGvhfbqg7VzvNsw=="; gate_login_token=ef634b463d96a07d041b294b662b5761e4ece03823d39ab40b09114e40118ed9; X_HTTP_TOKEN=42daf4b72327b2811098618061bf5e71415983ed09; thirdDeviceIdInfo=%5B%7B%22channel%22%3A1%2C%22thirdDeviceId%22%3A%22WC39ZUyXRgdERRmmwqwO50cvDHoKO8PUhqEGKvVRXbd5S97/8y2baZd3jw6Lm17N7PFawzHFjsJED95vPo6++I0k/yXhGiwavzDOuEOzxeuXkErcwnzpxUVX5pVVE247jkJlhk/gbBtr8WQCkkJTqhEZ8pCNkGi2yqy5vXPHvYaQuaF3iaWaQExmUnGUl2icew+aXZ9q7GzxMLnY/xNUm/hKGBkMKVxb9tinvKvaMfeF1cvionFIHc04LQfvsOkAw1487577677129%22%7D%2C%7B%22channel%22%3A2%2C%22thirdDeviceId%22%3A%22137%23dTE9hE9o9c/dqB5vccoMgsPWIr9CGAKugaB1YVb3DsJFEvfbZNeIptsm6xZhtVEWfiKs2sc7vhYQOA3BoldL/Y5cDF5D6OcwRvo2FcOr4togbPZzZ5dw76jAqeH0PZskm4h6GJwKeC81UHTHrNZrIqLdgnmVW7Op9mj5jmYq5GVMAGxHj9NoRGiZ5gLDFudZjcGfXTQ/Eylwb3gbQbpVIuAjJTa2RaZZRs1CHpDGc5Hoa6zhxHZmQofJ+GXppRJE1Aey+tppYSUS1lQypXiwQonoBf6b5nVc1IBi+piVYSUx1lLyJdicQonJ+aQjGRUc14Iyiti+YSJS1qQipppmQofJ+GXpVkJc1IBi+ppVYTUx1lgippiI91YMLfP9iIS25VmyNpPlf2uuYZanlre1pij8MEnh0fIK1I7JQSlRSGeFQQT5Wg7LWr9yOONod6YjUVPoN5gP1XjLiWvWkaKhLh3EybUNw4nJDa3jbQSEO0WQU+g2LVCqfK5mSC9EXxtvjsWJfPdrLw0tsKRJKu/iEhLJ4fzZAQZ3ZR/rGJx0K5i0rZeHeCW2m0NDVZKqVJBoRaZC51rG7VuSMXNecFQX8H9Jtr4uz043BxypNqnqF4FGDPjp6MlYxWQYzn6Rl28VmbZ3fbYVBWI/Js5r5eTCbBiQcdlZz8ptdwwFVqTqokS2OWYzSdeBebxNh6g%3D%2CT2gA4UQODJvKrYMZXcSRnx0sNpjDhI_oBMq74opYhZkuAWNPTqHv61Q6Rfjjk82L_IgAKRwWekuXeV4Oq1m_Ou6n%22%7D%5D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2213185584%22%2C%22first_id%22%3A%221766e5598ae46f-004e19416d8a55-5a301e42-1327104-1766e5598afe1c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2287.0.4280.88%22%7D%2C%22%24device_id%22%3A%22176048cd341271-0afacd8f68394c-5a30124a-2073600-176048cd342d4c%22%7D; JSESSIONID=F6732D1C54DB4767BB4BFC1A040B5504',
            'Referer': 'https://kaiwu.lagou.com/',
            'Origin': 'https://kaiwu.lagou.com',
            'Sec-fetch-dest': 'empty',
            'Sec-fetch-mode': 'cors',
            'Sec-fetch-site': 'same-site',
            'x-l-req-header': '{deviceType:1}'}
        # Every article-detail request shares this prefix; only the id differs.
        self.textUrl = 'https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessonDetail?lessonId='
        self.queue = Queue()        # pending article-detail urls
        self.error_queue = Queue()  # unused; kept for backward compatibility

    def parse_one(self):
        """Queue the article-detail url of every lesson in the course.

        :return: the populated ``self.queue``
        """
        html = requests.get(url=self.url, headers=self.headers).text
        content = json.loads(html)['content']
        for section in content['courseSectionList']:
            for lesson in section['courseLessons']:
                self.queue.put(self.textUrl + str(lesson['id']))
        return self.queue

    def get_html(self, true_url):
        """Fetch one article, sanitise its title and hand it to htmltopdf.

        :param true_url: article-detail url from :meth:`parse_one`
        """
        html = requests.get(url=true_url, timeout=10, headers=self.headers).text
        content = json.loads(html)['content']
        str_html = str(content['textContent'])
        # BUG fix: the original `if "|" or '?' or '/' in ...:` is always true
        # (precedence), so the sanitised branch always ran anyway, and the
        # `global article_name` it wrote to was thread-unsafe. Sanitise
        # unconditionally into a local variable instead.
        article_name = content['theme'].replace("|", "-").replace("?", "-").replace("/", "-")
        self.htmltopdf(str_html, article_name)

    def htmltopdf(self, str_html, article_name):
        """Render the article html to D:\\video\\<article_name>.pdf.

        Requires wkhtmltopdf installed at the hard-coded Windows path.
        """
        path_wk = r'D:\wkhtmltox\bin\wkhtmltopdf.exe'
        config = pdfkit.configuration(wkhtmltopdf=path_wk)
        options = {
            'page-size': 'Letter',
            'encoding': 'UTF-8',
            'custom-header': [('Accept-Encoding', 'gzip')]
        }
        pdfkit.from_string(str_html, "D:\\video\\{}.pdf".format(article_name), configuration=config, options=options)

    def thread_method(self, method, value):
        """Build (but do not start) a thread calling ``method(*value)``."""
        return threading.Thread(target=method, args=value)

    def main(self):
        """Drain the url queue with batches of up to 10 worker threads."""
        true_url = self.parse_one()
        while not true_url.empty():
            batch = []
            for _ in range(10):
                if true_url.empty():
                    break
                article_url = true_url.get()
                print(article_url)
                thread = self.thread_method(self.get_html, (article_url,))
                thread.start()
                print(thread.name + '启动成功,{}'.format(article_url))
                batch.append(thread)
            # BUG fix: the original removed elements from thread_list while
            # iterating it, which skips every other thread per pass.
            for worker in batch:
                worker.join()
                print('{}线程回收完毕'.format(worker))
decode.py
# decode.py — entry script: ask for a course id, then download that course's
# articles as PDFs (and, optionally, its videos).
import base64

# Author's signature easter egg: decodes and prints b'RealLiuSha'.
print(base64.b64decode('UmVhbExpdVNoYQ=='))

from LaGouSpider import LaGou_Article_Spider
from LaGouSpider import LaGou_spider

# Course ids used during testing: 16, 31
print("请输入课程编号:")
number = int(input())
url = 'https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessons?courseId={}'.format(number)

# Uncomment to also download the videos:
# video = LaGou_spider(url)
# video.main()

article = LaGou_Article_Spider(url)
article.main()

# Usage:
# 1. Run this script (videos and articles are handled together).
# 2. Enter the course id (visible in the url of the course page).
# 3. Videos and articles download automatically (articles are saved as PDF).

Loading Comments...