怎样用Python批量下载PDF

2023年9月3日 | 分类: 【技术】
import requests
import jwt
import json
import time
import aiohttp
import asyncio
import os
import logging

# Root logger setup: every record is printed as "<time> [<LEVEL>]: <message>",
# and records below INFO are dropped.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s]: %(message)s",
)

class Wqxuetang:
    """Download the readable page images of one wqxuetang book.

    Creating an instance performs network requests (the per-book key and the
    book metadata), creates a directory named after the book below the
    current working directory and changes into it. Call :meth:`main`
    afterwards to fetch every page as ``<bookname><page>.png``.

    Args:
        bookid: Identifier of the book, as used in the site's URLs.
        max_threads: Number of concurrent download workers (asyncio
            coroutines, not OS threads).
        interval: Seconds each worker sleeps after handling one page.
    """

    # A response body of exactly this many bytes is treated as a bad page and
    # retried. NOTE(review): presumably the size of the server's placeholder
    # image - confirm against the live service.
    PLACEHOLDER_SIZE = 10400

    # Maps characters that are not allowed in file names on common platforms
    # to "_", so a title such as "C/C++" cannot be read as a nested path.
    _UNSAFE_TABLE = str.maketrans(dict.fromkeys('\\/:*?"<>|', "_"))

    def __init__(self, bookid, max_threads=4, interval=1):
        self.bookid = bookid
        self.max_threads = max_threads
        self.interval = interval
        self.work_queue = asyncio.Queue()
        # Secret used to sign the per-page HS256 tokens (see get_jwt_token).
        self.jwt_secret = "g0NnWdSE8qEjdMD8a1aq12qEYphwErKctvfd3IktWHWiOBpVsgkecur38aBRPn2w"
        self.session = requests.session()
        self.jwtkey = self.get_jwt_key()
        self.timeoutlist = []
        (self.bookname, self.totalpages) = self.bookinfo()
        self.totalpages = int(self.totalpages)
        self.creat_and_enter_book_dir()

    @staticmethod
    def _safe_name(name):
        """Return *name* with characters that are invalid in file names replaced by "_"."""
        return str(name).translate(Wqxuetang._UNSAFE_TABLE)

    @staticmethod
    def _as_text(token):
        """Return *token* as ``str``; PyJWT < 2 yields bytes, PyJWT >= 2 yields str."""
        if isinstance(token, bytes):
            return token.decode('ascii')
        return token

    def creat_and_enter_book_dir(self):
        """Create the book's download directory if needed and make it the cwd.

        The directory is placed below the current working directory and named
        after the (sanitised) book title. An already existing directory is
        reused, so an interrupted download can be started again.
        """
        target = os.path.join(os.getcwd(), self._safe_name(self.bookname))
        os.makedirs(target, exist_ok=True)
        os.chdir(target)

    def bookinfo(self):
        """Fetch the book metadata.

        Returns:
            A ``(name, canreadpages)`` tuple taken from the ``data`` object of
            the ``initread`` response.
        """
        url = f"https://lib-nuanxin.wqxuetang.com/v1/read/initread?bid={self.bookid}"
        # Same timeout as get_jwt_key so a stalled server cannot hang start-up.
        r = self.session.get(url, timeout=5)
        info = json.loads(r.text)
        data = info['data']
        return data['name'], data['canreadpages']

    def get_jwt_key(self):
        """Fetch the per-book key (the ``data`` field of the ``k`` endpoint)."""
        url = "https://lib-nuanxin.wqxuetang.com/v1/read/k?bid=%s" % self.bookid
        r = self.session.get(url, timeout=5)
        j = json.loads(r.text)
        return j['data']

    def get_jwt_token(self, page):
        """Build the signed token that authorises downloading *page*.

        Returns:
            The HS256-signed JWT as ``str``.
        """
        now = int(time.time())
        payload = {
            "p": page,
            "t": now * 1000,
            "b": str(self.bookid),
            "w": 1000,
            "k": json.dumps(self.jwtkey),
            "iat": now,
        }
        token = jwt.encode(payload, self.jwt_secret, algorithm='HS256')
        # jwt.encode returns bytes on PyJWT 1.x and str on 2.x; accept both.
        return self._as_text(token)

    async def download_img(self, page, task_id):
        """Download one page image and save it as ``<bookname><page>.png``.

        If the request fails, or the body has the placeholder size, the page
        is put back on the work queue so that it is retried later.

        Args:
            page: Page number to fetch.
            task_id: Index of the calling worker; used only in log messages.
        """
        token = self.get_jwt_token(page)
        url = f"https://lib-nuanxin.wqxuetang.com/page/img/{self.bookid}/{page}?k={token}"
        headers = {
            'referer': f'https://lib-nuanxin.wqxuetang.com/read/pdf/{self.bookid}',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }
        timeout = aiohttp.ClientTimeout(total=5)
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers, timeout=timeout) as response:
                    body = await response.read()
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
            # Only network-level failures are retried; anything else is a
            # programming error and should surface instead of looping forever.
            logging.warning(f"线程 {task_id} -> 第 {page} 张下载失败, 重回队列！！！")
            self.work_queue.put_nowait(page)
            return

        if len(body) == self.PLACEHOLDER_SIZE:
            logging.warning(f"线程 {task_id} -> 第 {page} 张图片大小异常, 重回队列！！！")
            self.work_queue.put_nowait(page)
            return

        with open(f"{self._safe_name(self.bookname)}{page}.png", "wb") as f:
            f.write(body)
        logging.info(f"线程 {task_id} -> 第 {page} 张下载完成")

    async def handle_tasks(self, task_id):
        """Worker loop: take pages from the queue until it is empty.

        Args:
            task_id: Index of this worker; used only in log messages.
        """
        while not self.work_queue.empty():
            # The queue was just checked and nothing awaited in between, so a
            # non-blocking get cannot fail here.
            page = self.work_queue.get_nowait()
            await self.download_img(page, task_id)
            await asyncio.sleep(self.interval)
        logging.info(f"线程 {task_id} 结束工作！~")

    async def _run_workers(self):
        """Run ``max_threads`` workers concurrently and wait for all of them."""
        workers = [self.handle_tasks(task_id) for task_id in range(self.max_threads)]
        await asyncio.gather(*workers)

    def main(self):
        """Queue pages ``1..totalpages`` and download them with the worker pool."""
        for page in range(1, self.totalpages + 1):
            self.work_queue.put_nowait(page)
        # A dedicated loop avoids the deprecated implicit get_event_loop(), and
        # gather() inside a coroutine replaces asyncio.wait() on bare
        # coroutines, which Python 3.11+ rejects.
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(self._run_workers())
        finally:
            loop.close()

if __name__ == "__main__":
    # The download directory is named after the book title.
    # Arguments: book id, number of concurrent workers, and the pause (in
    # seconds) each worker takes after handling a page.
    bookid = 3208566
    w = Wqxuetang(bookid, max_threads=3, interval=1)
    w.main()