mirror of
https://gitee.com/g1879/DrissionPage.git
synced 2024-12-10 04:00:23 +08:00
用DownloadKit对象替代download()方法
This commit is contained in:
parent
969c25845e
commit
adfe628019
@ -6,6 +6,7 @@
|
||||
"""
|
||||
from typing import Union, List, Tuple
|
||||
|
||||
from DownloadKit import DownloadKit
|
||||
from requests import Response, Session
|
||||
from requests.cookies import RequestsCookieJar
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
@ -52,6 +53,7 @@ class MixPage(SessionPage, DriverPage, BasePage):
|
||||
self._wait_object = None
|
||||
self._response = None
|
||||
self._scroll = None
|
||||
self._download_kit = None
|
||||
|
||||
if self._mode == 'd':
|
||||
try:
|
||||
@ -372,38 +374,12 @@ class MixPage(SessionPage, DriverPage, BasePage):
|
||||
self.change_mode('s', go=False)
|
||||
return super().post(url, data, go_anyway, show_errmsg, retry, interval, **kwargs)
|
||||
|
||||
def download(self,
|
||||
file_url: str,
|
||||
goal_path: str,
|
||||
rename: str = None,
|
||||
file_exists: str = 'rename',
|
||||
post_data: Union[str, dict] = None,
|
||||
show_msg: bool = True,
|
||||
show_errmsg: bool = False,
|
||||
retry: int = None,
|
||||
interval: float = None,
|
||||
**kwargs) -> Tuple[bool, str]:
|
||||
"""下载一个文件 \n
|
||||
d模式下下载前先同步cookies \n
|
||||
:param file_url: 文件url
|
||||
:param goal_path: 存放路径
|
||||
:param rename: 重命名文件,可不写扩展名
|
||||
:param file_exists: 若存在同名文件,可选择 'rename', 'overwrite', 'skip' 方式处理
|
||||
:param post_data: post方式的数据,这个参数不为None时自动转成post方式
|
||||
:param show_msg: 是否显示下载信息
|
||||
:param show_errmsg: 是否显示和抛出异常
|
||||
:param retry: 重试次数
|
||||
:param interval: 重试间隔时间
|
||||
:param kwargs: 连接参数
|
||||
:return: 下载是否成功(bool)和状态信息(成功时信息为文件路径)的元组,跳过时第一位返回None
|
||||
"""
|
||||
@property
|
||||
def download(self) -> DownloadKit:
|
||||
if self.mode == 'd':
|
||||
self.cookies_to_session()
|
||||
return super().download
|
||||
|
||||
return super().download(file_url, goal_path, rename, file_exists, post_data, show_msg, show_errmsg, retry,
|
||||
interval, **kwargs)
|
||||
|
||||
# ----------------重写DriverPage的函数-----------------------
|
||||
def chrome_downloading(self, path: str = None) -> list:
|
||||
"""返回浏览器下载中的文件列表 \n
|
||||
:param path: 下载文件夹路径,默认读取配置信息
|
||||
|
@ -4,19 +4,18 @@
|
||||
@Contact : g1879@qq.com
|
||||
@File : session_page.py
|
||||
"""
|
||||
from os import path as os_PATH, sep
|
||||
from pathlib import Path
|
||||
from os import path as os_PATH
|
||||
from random import randint
|
||||
from re import search, sub
|
||||
from re import search
|
||||
from time import time, sleep
|
||||
from typing import Union, List, Tuple
|
||||
from urllib.parse import urlparse, quote, unquote
|
||||
|
||||
from requests import Session, Response
|
||||
from tldextract import extract
|
||||
from DownloadKit import DownloadKit
|
||||
|
||||
from .base import BasePage
|
||||
from .common import get_usable_path, make_valid_name
|
||||
from .config import _cookie_to_dict
|
||||
from .session_element import SessionElement, make_session_ele
|
||||
|
||||
@ -29,6 +28,7 @@ class SessionPage(BasePage):
|
||||
super().__init__(timeout)
|
||||
self._session = session
|
||||
self._response = None
|
||||
self._download_kit = None
|
||||
|
||||
def __call__(self,
|
||||
loc_or_str: Union[Tuple[str, str], str, SessionElement],
|
||||
@ -216,6 +216,13 @@ class SessionPage(BasePage):
|
||||
"""返回访问url得到的response对象"""
|
||||
return self._response
|
||||
|
||||
@property
|
||||
def download(self) -> DownloadKit:
|
||||
if self._download_kit is None:
|
||||
self._download_kit = DownloadKit(session=self.session, timeout=self.timeout)
|
||||
|
||||
return self._download_kit
|
||||
|
||||
def post(self,
|
||||
url: str,
|
||||
data: Union[dict, str] = None,
|
||||
@ -258,167 +265,6 @@ class SessionPage(BasePage):
|
||||
|
||||
return self._url_available
|
||||
|
||||
def download(self,
|
||||
file_url: str,
|
||||
goal_path: str,
|
||||
rename: str = None,
|
||||
file_exists: str = 'rename',
|
||||
post_data: Union[str, dict] = None,
|
||||
show_msg: bool = False,
|
||||
show_errmsg: bool = True,
|
||||
retry: int = None,
|
||||
interval: float = None,
|
||||
**kwargs) -> tuple:
|
||||
"""下载一个文件 \n
|
||||
:param file_url: 文件url
|
||||
:param goal_path: 存放路径
|
||||
:param rename: 重命名文件,可不写扩展名
|
||||
:param file_exists: 若存在同名文件,可选择 'rename', 'overwrite', 'skip' 方式处理
|
||||
:param post_data: post方式的数据,这个参数不为None时自动转成post方式
|
||||
:param show_msg: 是否显示下载信息
|
||||
:param show_errmsg: 是否抛出和显示异常
|
||||
:param retry: 重试次数
|
||||
:param interval: 重试间隔时间
|
||||
:param kwargs: 连接参数
|
||||
:return: 下载是否成功(bool)和状态信息(成功时信息为文件路径)的元组,跳过时第一位为None
|
||||
"""
|
||||
goal_Path = Path(goal_path)
|
||||
|
||||
# 按windows规则去除路径中的非法字符
|
||||
goal_path = goal_Path.anchor + sub(r'[*:|<>?"]', '', goal_path.lstrip(goal_Path.anchor)).strip()
|
||||
goal_path = Path(goal_path).absolute()
|
||||
goal_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if file_exists == 'skip' and rename and (goal_path / rename).exists():
|
||||
if show_msg:
|
||||
print(f'{file_url}\n{goal_path}{sep}{rename}\n存在同名文件,已跳过。\n')
|
||||
return None, '已跳过,因存在同名文件。'
|
||||
|
||||
def do() -> tuple:
|
||||
kwargs['stream'] = True
|
||||
if 'timeout' not in kwargs:
|
||||
kwargs['timeout'] = 20
|
||||
|
||||
# 生成临时的response
|
||||
mode = 'post' if post_data is not None or kwargs.get('json', None) else 'get'
|
||||
r, info = self._make_response(file_url, mode=mode, data=post_data, show_errmsg=show_errmsg, **kwargs)
|
||||
|
||||
if r is None:
|
||||
if show_msg:
|
||||
print(info)
|
||||
return False, info
|
||||
|
||||
if not r.ok:
|
||||
if show_errmsg:
|
||||
raise ConnectionError(f'连接状态码:{r.status_code}')
|
||||
return False, f'状态码:{r.status_code}'
|
||||
|
||||
# -------------------获取文件名-------------------
|
||||
file_name = _get_download_file_name(file_url, r)
|
||||
|
||||
# -------------------重命名,不改变扩展名-------------------
|
||||
if rename:
|
||||
ext_name = file_name.split('.')[-1]
|
||||
if '.' in rename or ext_name == file_name: # 新文件名带后缀或原文件名没有后缀
|
||||
full_name = rename
|
||||
else:
|
||||
full_name = f'{rename}.{ext_name}'
|
||||
else:
|
||||
full_name = file_name
|
||||
|
||||
full_name = make_valid_name(full_name)
|
||||
|
||||
# -------------------生成路径-------------------
|
||||
full_path = goal_path / full_name
|
||||
skip = False
|
||||
|
||||
if full_path.exists():
|
||||
if file_exists == 'rename':
|
||||
full_path = get_usable_path(full_path)
|
||||
full_name = full_path.name
|
||||
|
||||
elif file_exists == 'skip':
|
||||
skip = True
|
||||
|
||||
elif file_exists == 'overwrite':
|
||||
pass
|
||||
|
||||
else:
|
||||
raise ValueError("file_exists参数只能是'skip'、'overwrite' 或 'rename'。")
|
||||
|
||||
# -------------------打印要下载的文件-------------------
|
||||
if show_msg:
|
||||
print(file_url)
|
||||
print(full_name if file_name == full_name else f'{file_name} -> {full_name}')
|
||||
print(f'正在下载到:{goal_path}')
|
||||
if skip:
|
||||
print('存在同名文件,已跳过。\n')
|
||||
|
||||
# -------------------开始下载-------------------
|
||||
if skip:
|
||||
return None, '已跳过,因存在同名文件。'
|
||||
|
||||
# 获取远程文件大小
|
||||
content_length = r.headers.get('content-length')
|
||||
file_size = int(content_length) if content_length else None
|
||||
|
||||
# 已下载文件大小和下载状态
|
||||
downloaded_size, download_status = 0, False
|
||||
|
||||
try:
|
||||
with open(str(full_path), 'wb') as tmpFile:
|
||||
for chunk in r.iter_content(chunk_size=1024):
|
||||
if chunk:
|
||||
tmpFile.write(chunk)
|
||||
|
||||
# 如表头有返回文件大小,显示进度
|
||||
if show_msg and file_size:
|
||||
downloaded_size += 1024
|
||||
rate = downloaded_size / file_size if downloaded_size < file_size else 1
|
||||
print('\r{:.0%} '.format(rate), end="")
|
||||
|
||||
except Exception as e:
|
||||
if show_errmsg:
|
||||
raise ConnectionError(e)
|
||||
download_status, info = False, f'下载失败。\n{e}'
|
||||
|
||||
else:
|
||||
if full_path.stat().st_size == 0:
|
||||
if show_errmsg:
|
||||
raise ValueError('文件大小为0。')
|
||||
download_status, info = False, '文件大小为0。'
|
||||
|
||||
else:
|
||||
download_status, info = True, str(full_path)
|
||||
|
||||
finally:
|
||||
if download_status is False and full_path.exists():
|
||||
full_path.unlink() # 删除下载出错文件
|
||||
r.close()
|
||||
|
||||
# -------------------显示并返回值-------------------
|
||||
if show_msg:
|
||||
print(info, '\n')
|
||||
|
||||
info = str(full_path) if download_status else info
|
||||
return download_status, info
|
||||
|
||||
retry_times = retry or self.retry_times
|
||||
retry_interval = interval or self.retry_interval
|
||||
result = do()
|
||||
|
||||
if result[0] is False: # 第一位为None表示跳过的情况
|
||||
for i in range(retry_times):
|
||||
sleep(retry_interval)
|
||||
if show_msg:
|
||||
print(f'\n重试 {file_url}')
|
||||
|
||||
result = do()
|
||||
if result[0] is not False:
|
||||
break
|
||||
|
||||
return result
|
||||
|
||||
def _make_response(self,
|
||||
url: str,
|
||||
mode: str = 'get',
|
||||
|
@ -1,9 +1,13 @@
|
||||
selenium 缺乏对浏览器下载文件的有效管理,难以进行检测下载状态、重命名、失败管理。
|
||||
使用 requests 下载文件能较好实现以上功能,但代码较为繁琐。
|
||||
因此 DrissionPage 封装了`download()`方法,整合了两者优点,可从 selenium 获取登录信息,用 requests 进行下载。弥补了 selenium 的不足,使下载简洁高效。
|
||||
因此 DrissionPage 提供了高效可靠的下载工具,整合了两者优点,可从 selenium 获取登录信息,用 requests 进行下载。弥补了 selenium 的不足,使下载简洁高效。
|
||||
|
||||
## 功能
|
||||
?> 为了增强灵活性,该工具现在独立打包成一个库,叫 DownloadKit,详细用法见:[DownloadKit](https://gitee.com/g1879/DownloadKit)
|
||||
|
||||
# 功能
|
||||
|
||||
- 支持多线程同时下载多个文件
|
||||
- 自动管理下载列表,只要用`add()`方法添加任务,该工具会自动在多个线程中调度任务,无须等待
|
||||
- 支持 d 模式下用 requests 下载文件
|
||||
- 可指定下载路径,若路径不存在会自动创建文件夹
|
||||
- 重命名文件,可不填写扩展名,程序自动补充
|
||||
@ -11,8 +15,12 @@ selenium 缺乏对浏览器下载文件的有效管理,难以进行检测下
|
||||
- 显示下载进度
|
||||
- 支持 post 方式
|
||||
- 支持自定义连接参数
|
||||
- 任务失败自动重试
|
||||
|
||||
## download() 方法
|
||||
# 单线程下载方式
|
||||
|
||||
`MixPage`对象的`download`属性是一个`DownloadKit`对象,为尽量与旧版兼容,该属性可直接调用,如`page.download(url, path)`。
|
||||
使用这个方式时效果与旧版一致,会阻塞程序直到任务结束,因此更加建议用后文讲述的多线程方法。
|
||||
|
||||
参数:
|
||||
|
||||
@ -22,15 +30,12 @@ selenium 缺乏对浏览器下载文件的有效管理,难以进行检测下
|
||||
- file_exists:若存在同名文件,可选择`'rename'`,`'overwrite'`,`'skip'`方式处理,若选择重命名,会在文件名后面添加序号
|
||||
- post_data:post 方式的数据,这个参数不为`None`时自动转成 post 方式
|
||||
- show_msg:是否显示下载信息和进度
|
||||
- show_errmsg:出现异常时是否显示和抛出
|
||||
- retry:重试次数,与页面对象的设置一致,默认 3 次
|
||||
- interval:重试间隔时间,与页面对象的设置一致,默认 2 秒
|
||||
- **kwargs;连接参数,s 模式专用,与 requests 的一致
|
||||
- `**kwargs`:连接参数,与 requests 的一致
|
||||
|
||||
返回:下载是否成功(`bool`)和状态信息(成功时信息为文件路径)的元组,跳过时第一位返回 None
|
||||
|
||||
## 示例
|
||||
|
||||
```python
|
||||
from DrissionPage import MixPage
|
||||
|
||||
@ -48,12 +53,116 @@ print(res)
|
||||
|
||||
显示:
|
||||
|
||||
```
|
||||
https://www.baidu.com/img/flexible/logo/pc/result.png
|
||||
result.png -> img.png
|
||||
正在下载到:C:\download
|
||||
100% C:\download\img.png
|
||||
```shell
|
||||
url:https://www.baidu.com/img/flexible/logo/pc/result.png
|
||||
文件名:img.png
|
||||
目标路径:C:\download
|
||||
100% 下载完成 C:\download\img.png
|
||||
|
||||
(True, 'C:\\download\\img.png')
|
||||
```
|
||||
|
||||
# 多线程下载方式
|
||||
|
||||
你可以往`DownloadKit`对象添加个数不限的下载任务,它会自动调配线程去完成这些任务。
|
||||
当前默认为 10 个线程,以后的版本会增加修改线程数功能。
|
||||
|
||||
## `add()`方法
|
||||
|
||||
参数:
|
||||
|
||||
- file_url:文件 url
|
||||
- goal_path:存放路径,填写到文件夹,不填写文件名
|
||||
- session:可指定使用的`Session`对象,默认使用`MixPage`内置的`Session`对象
|
||||
- rename:重命名文件,可不写扩展名,不输入则用网络文件原名
|
||||
- file_exists:若存在同名文件,可选择`'rename'`,`'overwrite'`,`'skip'`方式处理,若选择重命名,会在文件名后面添加序号
|
||||
- post_data:post 方式的数据,这个参数不为`None`时自动转成 post 方式
|
||||
- retry:重试次数,与页面对象的设置一致,默认 3 次
|
||||
- interval:重试间隔时间,与页面对象的设置一致,默认 2 秒
|
||||
- `**kwargs`:连接参数,与 requests 的一致
|
||||
|
||||
返回:任务对象,可通过任务对象查看任务状态和结果
|
||||
|
||||
```python
|
||||
from DrissionPage import MixPage
|
||||
|
||||
page = MixPage('s')
|
||||
# 文件 url
|
||||
url = 'https://www.baidu.com/img/flexible/logo/pc/result.png'
|
||||
# 存放路径
|
||||
save_path = r'C:\download'
|
||||
|
||||
# 返回一个任务对象
|
||||
mission = page.download.add(url, save_path)
|
||||
|
||||
# 通过任务对象查看状态
|
||||
print(mission.rate, mission.info)
|
||||
```
|
||||
|
||||
输出:
|
||||
|
||||
```shell
|
||||
90% '下载中'
|
||||
```
|
||||
|
||||
## `show()`方法
|
||||
|
||||
多线程方式不会实时显示下载进度,可用`show()`方法把下载进度打印出来。
|
||||
|
||||
!> **注意:** <br> 若使用 PyCharm 运行,须在运行配置里勾选“模拟输出控制台中的终端”才能正常显示输出。
|
||||
|
||||
参数:
|
||||
|
||||
- asyn:是否异步进行
|
||||
|
||||
返回:None
|
||||
|
||||
```python
|
||||
url = 'https://example.com/file/abc.zip'
|
||||
mission = page.download.add(url, r'.\files')
|
||||
page.download.show()
|
||||
```
|
||||
|
||||
输出:
|
||||
|
||||
```shell
|
||||
等待任务数:0
|
||||
线程0:97.41% D:\files\abc.zip
|
||||
线程1:None None\None
|
||||
线程2:None None\None
|
||||
.....
|
||||
```
|
||||
|
||||
|
||||
|
||||
## 等待任务结束
|
||||
|
||||
有时须要等待任务结束,以便获取结果,可用`wait()`方法。
|
||||
当传入任务时,等待该任务结束并返回结果,不传入参数时等待所有任务结束。
|
||||
|
||||
参数:
|
||||
|
||||
- mission:任务对象或任务`id`,为`None`时等待所有任务结束
|
||||
- show:是否显示进度
|
||||
|
||||
返回:
|
||||
|
||||
- 指定任务时,返回任务结果和信息组成的两位 tuple。`True`表示成功,`False`表示失败,`None`表示跳过。
|
||||
- 不指定任务时,返回`None`
|
||||
|
||||
!> **注意:** <br> 若使用 PyCharm 运行,须在运行配置里勾选“模拟输出控制台中的终端”才能正常显示输出。
|
||||
|
||||
```python
|
||||
url = 'https://www.baidu.com/img/PCfb_5bf082d29588c07f842ccde3f97243ea.png'
|
||||
mission = page.download.add(url, save_path)
|
||||
page.download.wait(mission)
|
||||
```
|
||||
|
||||
输出:
|
||||
|
||||
```shell
|
||||
url:https://www.baidu.com/img/PCfb_5bf082d29588c07f842ccde3f97243ea.png
|
||||
文件名:PCfb_5bf082d29588c07f842ccde3f97243ea_4.png
|
||||
目标路径:D:\files
|
||||
100% 下载完成 D:\files\PCfb_5bf082d29588c07f842ccde3f97243ea_4.png
|
||||
```
|
||||
|
Loading…
x
Reference in New Issue
Block a user