利用python的requests库爬取某机构教育课程
🛷

利用python的requests库爬取某机构教育课程

Tags
python
Property
活动搞的某app会员,只能在app或者官网看,不能下载,会员到期后就无法下载
参照网上的方法写了个爬虫,实现爬取课程的文档并转成pdf文件,视频下载保存到本地
 
LaGouSpider.py
import threading
from queue import Queue
import re
import requests
import json
from Crypto.Cipher import AES
import time
import os
import pdfkit


class LaGou_spider:
    """Download course videos from the LaGou "kaiwu" education site.

    Pipeline: list the course's lessons, resolve each lesson's m3u8
    playlist, then fetch the AES-CBC encrypted .ts segments, decrypt them
    and append them into one D:\\video\\<name>.mp4 file per lesson.
    """

    def __init__(self, url):
        # url: the getCourseLessons API endpoint of one course.
        self.url = url
        # NOTE(review): hard-coded, account-specific session cookie — it
        # expires; replace it with a fresh logged-in session before use.
        # (The cookie value was line-wrapped in the original paste and has
        # been re-joined into a single string literal here.)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
            'Cookie': 'smidV2=202011262035061f0eaf8ecc0e8384377c9d7cef65238f000bbc964d018f490; LG_HAS_LOGIN=1; sensorsdata2015session=%7B%7D; user-finger=e117bdf8a4eded652b1af27a708b08f2; LG_LOGIN_USER_ID=30ac22a4e177e58ecae4526b8a87cceb3176b59beb20cb5686f1c67b83e97fc9; _putrc=83D1BF86E06960A8123F89F2B170EADC; login=true; unick=%E7%8E%8B%E6%9D%B0; kw_login_authToken="AQKMAWClN3t1ry5bbTeMo+DBQBesYnsGcu+D4WoDouw3sK8jUZeLwi1qyHusIfPUHwIDkUgQ2rqHX54C3Gkn7H8rYMgubtbLwURudujby3Kgrbm3D49bTo/srLBZ7HweLuSVwKfKhYcioIs+rosa9Mk6WGQ6aSLtqal+0MXdWsZ4rucJXOpldXhUiavxhcCELWDotJ+bmNVwmAvQCptcy5e7czUcjiQC32Lco44BMYXrQ+AIOfEccJKHpj0vJ+ngq/27aqj1hWq8tEPFFjdnxMSfKgAnjbIEAX3F9CIW8BSiMHYmPBt7FDDY0CCVFICHr2dp5gQVGvhfbqg7VzvNsw=="; gate_login_token=ef634b463d96a07d041b294b662b5761e4ece03823d39ab40b09114e40118ed9; X_HTTP_TOKEN=42daf4b72327b2811098618061bf5e71415983ed09; thirdDeviceIdInfo=%5B%7B%22channel%22%3A1%2C%22thirdDeviceId%22%3A%22WC39ZUyXRgdERRmmwqwO50cvDHoKO8PUhqEGKvVRXbd5S97/8y2baZd3jw6Lm17N7PFawzHFjsJED95vPo6++I0k/yXhGiwavzDOuEOzxeuXkErcwnzpxUVX5pVVE247jkJlhk/gbBtr8WQCkkJTqhEZ8pCNkGi2yqy5vXPHvYaQuaF3iaWaQExmUnGUl2icew+aXZ9q7GzxMLnY/xNUm/hKGBkMKVxb9tinvKvaMfeF1cvionFIHc04LQfvsOkAw1487577677129%22%7D%2C%7B%22channel%22%3A2%2C%22thirdDeviceId%22%3A%22137%23dTE9hE9o9c/dqB5vccoMgsPWIr9CGAKugaB1YVb3DsJFEvfbZNeIptsm6xZhtVEWfiKs2sc7vhYQOA3BoldL/Y5cDF5D6OcwRvo2FcOr4togbPZzZ5dw76jAqeH0PZskm4h6GJwKeC81UHTHrNZrIqLdgnmVW7Op9mj5jmYq5GVMAGxHj9NoRGiZ5gLDFudZjcGfXTQ/Eylwb3gbQbpVIuAjJTa2RaZZRs1CHpDGc5Hoa6zhxHZmQofJ+GXppRJE1Aey+tppYSUS1lQypXiwQonoBf6b5nVc1IBi+piVYSUx1lLyJdicQonJ+aQjGRUc14Iyiti+YSJS1qQipppmQofJ+GXpVkJc1IBi+ppVYTUx1lgippiI91YMLfP9iIS25VmyNpPlf2uuYZanlre1pij8MEnh0fIK1I7JQSlRSGeFQQT5Wg7LWr9yOONod6YjUVPoN5gP1XjLiWvWkaKhLh3EybUNw4nJDa3jbQSEO0WQU+g2LVCqfK5mSC9EXxtvjsWJfPdrLw0tsKRJKu/iEhLJ4fzZAQZ3ZR/rGJx0K5i0rZeHeCW2m0NDVZKqVJBoRaZC51rG7VuSMXNecFQX8H9Jtr4uz043BxypNqnqF4FGDPjp6MlYxWQYzn6Rl28VmbZ3fbYVBWI/Js5r5eTCbBiQcdlZz8ptdwwFVqTqokS2OWYzSdeBebxNh6g%3D%2CT2gA4UQODJvKrYMZXcSRnx0sNpjDhI_oBMq74opYhZkuAWNPTqHv61Q6Rfjjk82L_IgAKRwWekuXeV4Oq1m_Ou6n%22%7D%5D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2213185584%22%2C%22first_id%22%3A%221766e5598ae46f-004e19416d8a55-5a301e42-1327104-1766e5598afe1c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2287.0.4280.88%22%7D%2C%22%24device_id%22%3A%22176048cd341271-0afacd8f68394c-5a30124a-2073600-176048cd342d4c%22%7D; JSESSIONID=F6732D1C54DB4767BB4BFC1A040B5504',
            'Referer': 'https://kaiwu.lagou.com/',
            'Origin': 'https://kaiwu.lagou.com',
            'Sec-fetch-dest': 'empty',
            'Sec-fetch-mode': 'cors',
            'Sec-fetch-site': 'same-site',
            'x-l-req-header': '{deviceType:1}'}
        self.queue = Queue()        # pending {m3u8_url: video_name} download jobs
        self.error_queue = Queue()  # reserved for failed jobs (currently unused)

    def get_id(self):
        """Return the lesson-detail API URL for every lesson of the course."""
        lesson_urls = []
        response = requests.get(url=self.url, headers=self.headers).text
        listing = json.loads(response)
        for section in listing['content']['courseSectionList']:
            for lesson in section["courseLessons"]:
                lesson_urls.append(
                    "https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessonDetail?lessonId={}".format(
                        lesson["id"]))
        return lesson_urls

    def parse_one(self, ture_url_list):
        """Fill the queue with one {m3u8_url: name} dict per lesson that has
        a video and is not already downloaded locally.

        :param ture_url_list: lesson-detail URLs produced by get_id()
        :return: the instance's job queue (requested once per lesson)
        """
        video_dir = "D://video"
        for lesson_url in ture_url_list:
            detail = json.loads(requests.get(url=lesson_url, headers=self.headers).text)
            content = detail['content']
            if content["videoMedia"] is None:  # text-only lesson, nothing to download
                continue
            name = content["theme"]
            m3u8 = content["videoMedia"]["fileUrl"]
            # Same sanitization as write() uses for the output filename.
            safe_name = name.replace("|", "-").replace("/", "-")
            if os.path.exists("{}\\{}.mp4".format(video_dir, safe_name)):
                print("{}已经存在".format(name))
            else:
                # key = m3u8 playlist url, value = video name
                self.queue.put({m3u8: name})
        return self.queue

    def get_key(self, **kwargs):
        """Download and decrypt one video.

        Invoked with a single keyword pair {m3u8_url: video_name} — queue
        items are expanded via ``kwargs=`` in thread_method(). Fetches the
        m3u8 playlist, the AES key it references, then every .ts segment.
        """
        for m3u8_url, raw_name in kwargs.items():
            base_url = '/'.join(m3u8_url.split('/')[0:-1])  # prefix for relative .ts paths
            playlist = requests.get(url=m3u8_url, headers=self.headers).text
            # Every .ts segment of one playlist shares a single key, so one
            # request for it is enough.
            key_urls = re.compile('URI="(.*?)"').findall(playlist)
            key = requests.get(url=key_urls[0], headers=self.headers).content
            # Bug fix: the original `if "|" or '?' or '/' in name1` was always
            # True (it tests the truthiness of "|"); sanitizing unconditionally
            # is equivalent to its intended behavior.
            name = raw_name.replace("|", "-").replace("/", "-").replace("?", "")
            for line in playlist.split('\n'):
                if '.ts' in line:
                    self.write(key, base_url + '/' + line, name, kwargs)

    def write(self, key, ts_url, name01, m3u8_dict):
        """Fetch one .ts segment, AES-CBC decrypt it and append it to the
        video file; on failure delete the partial file and retry the whole
        video in a fresh thread.

        :param key: AES key bytes (also used as the IV, matching the site's m3u8)
        :param ts_url: absolute URL of the segment
        :param name01: sanitized video name (output file stem)
        :param m3u8_dict: the original {m3u8_url: name} job, for the retry
        """
        video_dir = 'D:\\video'
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)
        path = '{}\\{}.mp4'.format(video_dir, name01)
        cryptor = AES.new(key, AES.MODE_CBC, iv=key)
        try:
            with open(path, 'ab') as f:
                html = requests.get(url=ts_url, headers=self.headers).content
                f.write(cryptor.decrypt(html))
            print('{},{}写入成功'.format(ts_url, name01))
        except Exception:
            print('{}爬取出错'.format(name01))
            # Bug fix: the original `if f.close():` was always falsy (close()
            # returns None), so the retry loop slept forever, and os.remove()
            # omitted the directory. The `with` block above guarantees the
            # file is closed, so we can delete and retry immediately.
            if os.path.exists(path):
                os.remove(path)
                print('{}删除成功'.format(name01))
            thread = self.thread_method(self.get_key, m3u8_dict)
            print("开启线程{},{}重新爬取".format(thread.name, name01))
            thread.start()
            thread.join()

    def thread_method(self, method, value):
        """Build (but do not start) a thread running ``method(**value)``."""
        return threading.Thread(target=method, kwargs=value)

    def main(self):
        """Drain the job queue, downloading up to 5 videos concurrently."""
        thread_list = []
        jobs = self.parse_one(self.get_id())
        while not jobs.empty():
            for _ in range(5):  # start one batch of worker threads
                if jobs.empty():
                    break
                m3u8 = jobs.get()
                thread = self.thread_method(self.get_key, m3u8)
                thread.start()
                print(thread.name + '启动成功,{}'.format(m3u8))
                time.sleep(1)  # stagger requests slightly
                thread_list.append(thread)
            for worker in thread_list:
                worker.join()  # reap the batch
            # Bug fix: the original never cleared the list, so finished
            # threads were re-joined on every subsequent batch.
            thread_list = []


# if __name__ == "__main__":
#     run = LaGou_spider("https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessons?courseId=185")
#     # run.get_id()
#     time1 = time.time()
#     run.main()
#     time2 = time.time()
#     print(time2 - time1)


class LaGou_Article_Spider():
    """Download course articles from the LaGou "kaiwu" site and save each
    one as a PDF (rendered with wkhtmltopdf via pdfkit)."""

    def __init__(self, url):
        # url: the getCourseLessons API endpoint of one course.
        self.url = url
        # NOTE(review): hard-coded, account-specific session cookie — it
        # expires; replace it with a fresh logged-in session before use.
        # (The cookie value was line-wrapped in the original paste and has
        # been re-joined into a single string literal here.)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
            'Cookie': 'smidV2=202011262035061f0eaf8ecc0e8384377c9d7cef65238f000bbc964d018f490; LG_HAS_LOGIN=1; sensorsdata2015session=%7B%7D; user-finger=e117bdf8a4eded652b1af27a708b08f2; LG_LOGIN_USER_ID=30ac22a4e177e58ecae4526b8a87cceb3176b59beb20cb5686f1c67b83e97fc9; _putrc=83D1BF86E06960A8123F89F2B170EADC; login=true; unick=%E7%8E%8B%E6%9D%B0; kw_login_authToken="AQKMAWClN3t1ry5bbTeMo+DBQBesYnsGcu+D4WoDouw3sK8jUZeLwi1qyHusIfPUHwIDkUgQ2rqHX54C3Gkn7H8rYMgubtbLwURudujby3Kgrbm3D49bTo/srLBZ7HweLuSVwKfKhYcioIs+rosa9Mk6WGQ6aSLtqal+0MXdWsZ4rucJXOpldXhUiavxhcCELWDotJ+bmNVwmAvQCptcy5e7czUcjiQC32Lco44BMYXrQ+AIOfEccJKHpj0vJ+ngq/27aqj1hWq8tEPFFjdnxMSfKgAnjbIEAX3F9CIW8BSiMHYmPBt7FDDY0CCVFICHr2dp5gQVGvhfbqg7VzvNsw=="; gate_login_token=ef634b463d96a07d041b294b662b5761e4ece03823d39ab40b09114e40118ed9; X_HTTP_TOKEN=42daf4b72327b2811098618061bf5e71415983ed09; thirdDeviceIdInfo=%5B%7B%22channel%22%3A1%2C%22thirdDeviceId%22%3A%22WC39ZUyXRgdERRmmwqwO50cvDHoKO8PUhqEGKvVRXbd5S97/8y2baZd3jw6Lm17N7PFawzHFjsJED95vPo6++I0k/yXhGiwavzDOuEOzxeuXkErcwnzpxUVX5pVVE247jkJlhk/gbBtr8WQCkkJTqhEZ8pCNkGi2yqy5vXPHvYaQuaF3iaWaQExmUnGUl2icew+aXZ9q7GzxMLnY/xNUm/hKGBkMKVxb9tinvKvaMfeF1cvionFIHc04LQfvsOkAw1487577677129%22%7D%2C%7B%22channel%22%3A2%2C%22thirdDeviceId%22%3A%22137%23dTE9hE9o9c/dqB5vccoMgsPWIr9CGAKugaB1YVb3DsJFEvfbZNeIptsm6xZhtVEWfiKs2sc7vhYQOA3BoldL/Y5cDF5D6OcwRvo2FcOr4togbPZzZ5dw76jAqeH0PZskm4h6GJwKeC81UHTHrNZrIqLdgnmVW7Op9mj5jmYq5GVMAGxHj9NoRGiZ5gLDFudZjcGfXTQ/Eylwb3gbQbpVIuAjJTa2RaZZRs1CHpDGc5Hoa6zhxHZmQofJ+GXppRJE1Aey+tppYSUS1lQypXiwQonoBf6b5nVc1IBi+piVYSUx1lLyJdicQonJ+aQjGRUc14Iyiti+YSJS1qQipppmQofJ+GXpVkJc1IBi+ppVYTUx1lgippiI91YMLfP9iIS25VmyNpPlf2uuYZanlre1pij8MEnh0fIK1I7JQSlRSGeFQQT5Wg7LWr9yOONod6YjUVPoN5gP1XjLiWvWkaKhLh3EybUNw4nJDa3jbQSEO0WQU+g2LVCqfK5mSC9EXxtvjsWJfPdrLw0tsKRJKu/iEhLJ4fzZAQZ3ZR/rGJx0K5i0rZeHeCW2m0NDVZKqVJBoRaZC51rG7VuSMXNecFQX8H9Jtr4uz043BxypNqnqF4FGDPjp6MlYxWQYzn6Rl28VmbZ3fbYVBWI/Js5r5eTCbBiQcdlZz8ptdwwFVqTqokS2OWYzSdeBebxNh6g%3D%2CT2gA4UQODJvKrYMZXcSRnx0sNpjDhI_oBMq74opYhZkuAWNPTqHv61Q6Rfjjk82L_IgAKRwWekuXeV4Oq1m_Ou6n%22%7D%5D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2213185584%22%2C%22first_id%22%3A%221766e5598ae46f-004e19416d8a55-5a301e42-1327104-1766e5598afe1c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2287.0.4280.88%22%7D%2C%22%24device_id%22%3A%22176048cd341271-0afacd8f68394c-5a30124a-2073600-176048cd342d4c%22%7D; JSESSIONID=F6732D1C54DB4767BB4BFC1A040B5504',
            'Referer': 'https://kaiwu.lagou.com/',
            'Origin': 'https://kaiwu.lagou.com',
            'Sec-fetch-dest': 'empty',
            'Sec-fetch-mode': 'cors',
            'Sec-fetch-site': 'same-site',
            'x-l-req-header': '{deviceType:1}'}
        # Every lesson-detail request shares this prefix; only the id differs.
        self.textUrl = 'https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessonDetail?lessonId='
        self.queue = Queue()        # pending lesson-detail URLs
        self.error_queue = Queue()  # reserved for failed jobs (currently unused)

    def parse_one(self):
        """Queue the detail-API URL of every lesson of the course.

        :return: the instance's URL queue
        """
        listing = json.loads(requests.get(url=self.url, headers=self.headers).text)
        for section in listing['content']['courseSectionList']:
            for lesson in section['courseLessons']:
                self.queue.put(self.textUrl + str(lesson['id']))  # article request url
        return self.queue

    def get_html(self, true_url):
        """Fetch one lesson's article HTML (a str) and convert it to PDF.

        :param true_url: lesson-detail API URL
        """
        detail = json.loads(requests.get(url=true_url, timeout=10, headers=self.headers).text)
        str_html = str(detail['content']['textContent'])
        # Bug fixes: the original `if "|" or '?' or '/' in article_name1` was
        # always True, and the `global article_name` it wrote to was shared
        # across worker threads; sanitize unconditionally into a local.
        article_name = detail['content']['theme'].replace("|", "-").replace("?", "-").replace("/", "-")
        self.htmltopdf(str_html, article_name)

    def htmltopdf(self, str_html, article_name):
        """Render the HTML string to D:\\video\\<article_name>.pdf."""
        path_wk = r'D:\wkhtmltox\bin\wkhtmltopdf.exe'  # local wkhtmltopdf binary
        config = pdfkit.configuration(wkhtmltopdf=path_wk)
        options = {
            'page-size': 'Letter',
            'encoding': 'UTF-8',
            'custom-header': [('Accept-Encoding', 'gzip')]
        }
        pdfkit.from_string(str_html, "D:\\video\\{}.pdf".format(article_name), configuration=config, options=options)

    def thread_method(self, method, value):
        """Build (but do not start) a thread running ``method(*value)``."""
        return threading.Thread(target=method, args=value)

    def main(self):
        """Drain the URL queue, converting up to 10 articles concurrently."""
        thread_list = []
        urls = self.parse_one()
        while not urls.empty():
            for _ in range(10):  # start one batch of worker threads
                if urls.empty():
                    break
                article_url = urls.get()
                print(article_url)
                thread = self.thread_method(self.get_html, (article_url,))
                thread.start()
                print(thread.name + '启动成功,{}'.format(article_url))
                thread_list.append(thread)
            # Bug fix: the original removed items from thread_list while
            # iterating over it; popping until empty is safe and equivalent.
            while thread_list:
                worker = thread_list.pop()
                worker.join()  # reap the batch
                print('{}线程回收完毕'.format(worker))
decode.py
import base64

# Decodes and prints the author's signature: b'RealLiuSha'
print(base64.b64decode('UmVhbExpdVNoYQ=='))

from LaGouSpider import LaGou_Article_Spider, LaGou_spider

# Known course ids for reference: 16, 31
print("请输入课程编号:")
course_id = int(input())
url = 'https://gate.lagou.com/v1/neirong/kaiwu/getCourseLessons?courseId={}'.format(course_id)
# video = LaGou_spider(url)
# video.main()
article = LaGou_Article_Spider(url)
article.main()
# Usage:
# 1. Run this script (videos and articles are handled together)
# 2. Enter the course id (visible in the URL of the course page)
# 3. Videos and articles download automatically (articles are saved as PDF)

Loading Comments...