mirror of
https://gitee.com/g1879/DrissionPage.git
synced 2024-12-10 04:00:23 +08:00
commit
224c4642a6
@ -28,9 +28,9 @@ class DrissionElement(object):
|
|||||||
def is_valid(self):
|
def is_valid(self):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@property
|
# @property
|
||||||
def text(self):
|
# def text(self):
|
||||||
return
|
# return
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def html(self):
|
def html(self):
|
||||||
@ -52,13 +52,13 @@ class DrissionElement(object):
|
|||||||
def prev(self):
|
def prev(self):
|
||||||
return
|
return
|
||||||
|
|
||||||
@property
|
# @property
|
||||||
def css_path(self):
|
# def css_path(self):
|
||||||
return
|
# return
|
||||||
|
#
|
||||||
@property
|
# @property
|
||||||
def xpath(self):
|
# def xpath(self):
|
||||||
return
|
# return
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def ele(self, loc: Union[tuple, str], mode: str = None, show_errmsg: bool = True):
|
def ele(self, loc: Union[tuple, str], mode: str = None, show_errmsg: bool = True):
|
||||||
@ -68,9 +68,9 @@ class DrissionElement(object):
|
|||||||
def eles(self, loc: Union[tuple, str], show_errmsg: bool = True):
|
def eles(self, loc: Union[tuple, str], show_errmsg: bool = True):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
# @abstractmethod
|
||||||
def attr(self, attr: str):
|
# def attr(self, attr: str):
|
||||||
pass
|
# pass
|
||||||
|
|
||||||
|
|
||||||
def get_loc_from_str(loc: str) -> tuple:
|
def get_loc_from_str(loc: str) -> tuple:
|
||||||
|
@ -123,6 +123,15 @@ class DriverElement(DrissionElement):
|
|||||||
'''
|
'''
|
||||||
return self.run_script(js)
|
return self.run_script(js)
|
||||||
|
|
||||||
|
@property
def shadow_root(self):
    """Return the shadow root attached to this element, or None if it has none.

    The root is read in the browser via ``arguments[0].shadowRoot`` and, when
    present, wrapped in a ShadowRootElement whose parent is this element.
    """
    root = self.run_script('return arguments[0].shadowRoot')
    if not root:
        return None
    # Imported only when a shadow root actually exists.
    from .shadow_root_element import ShadowRootElement
    return ShadowRootElement(root, self)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def parent(self):
|
def parent(self):
|
||||||
"""返回父级元素"""
|
"""返回父级元素"""
|
||||||
@ -199,7 +208,7 @@ class DriverElement(DrissionElement):
|
|||||||
ele.ele('xpath://div[@class="ele_class"]') - 返回第一个符合xpath的子元素 \n
|
ele.ele('xpath://div[@class="ele_class"]') - 返回第一个符合xpath的子元素 \n
|
||||||
ele.ele('css:div.ele_class') - 返回第一个符合css selector的子元素 \n
|
ele.ele('css:div.ele_class') - 返回第一个符合css selector的子元素 \n
|
||||||
:param loc_or_str: 元素的定位信息,可以是loc元组,或查询字符串
|
:param loc_or_str: 元素的定位信息,可以是loc元组,或查询字符串
|
||||||
:param mode: 'single' 或 'all‘,对应查找一个或全部
|
:param mode: 'single' 或 'all',对应查找一个或全部
|
||||||
:param timeout: 查找元素超时时间
|
:param timeout: 查找元素超时时间
|
||||||
:param show_errmsg: 出现异常时是否打印信息
|
:param show_errmsg: 出现异常时是否打印信息
|
||||||
:return: DriverElement对象
|
:return: DriverElement对象
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
"""
|
"""
|
||||||
from glob import glob
|
from glob import glob
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from time import time
|
from time import time, sleep
|
||||||
from typing import Union, List, Any
|
from typing import Union, List, Any
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
@ -60,21 +60,49 @@ class DriverPage(object):
|
|||||||
"""返回网页title"""
|
"""返回网页title"""
|
||||||
return self.driver.title
|
return self.driver.title
|
||||||
|
|
||||||
def get(self, url: str, go_anyway: bool = False, show_errmsg: bool = False) -> Union[None, bool]:
|
def _try_to_get(self,
                to_url: str,
                times: int = 0,
                interval: float = 1,
                show_errmsg: bool = False, ):
    """Load a url in the browser, retrying while the page check fails.

    :param to_url: url to visit
    :param times: maximum number of retries after the first attempt;
                  zero or a negative value means no retry
    :param interval: seconds to wait before each retry
    :param show_errmsg: whether to raise when the page is still unavailable
    :return: result of ``self.check_page()`` for the last attempt
    :raises ConnectionError: if the last check returned False and show_errmsg is set
    """
    self.driver.get(to_url)
    is_ok = self.check_page()

    # A bounded loop guarantees termination even when ``times`` is negative;
    # a ``while times`` countdown would never reach zero in that case.
    for _ in range(max(times, 0)):
        # Only an explicit False triggers a retry (None is not retried).
        if is_ok is not False:
            break
        sleep(interval)
        self.driver.get(to_url)
        is_ok = self.check_page()

    if is_ok is False and show_errmsg:
        raise ConnectionError('Connect error.')
    return is_ok
|
||||||
|
|
||||||
|
def get(self,
|
||||||
|
url: str,
|
||||||
|
go_anyway: bool = False,
|
||||||
|
show_errmsg: bool = False,
|
||||||
|
retry: int = 0,
|
||||||
|
interval: float = 1,
|
||||||
|
) -> Union[None, bool]:
|
||||||
"""访问url \n
|
"""访问url \n
|
||||||
:param url: 目标url
|
:param url: 目标url
|
||||||
:param go_anyway: 若目标url与当前url一致,是否强制跳转
|
:param go_anyway: 若目标url与当前url一致,是否强制跳转
|
||||||
:param show_errmsg: 是否显示和抛出异常
|
:param show_errmsg: 是否显示和抛出异常
|
||||||
|
:param retry: 重试次数
|
||||||
|
:param interval: 重试间隔(秒)
|
||||||
:return: 目标url是否可用
|
:return: 目标url是否可用
|
||||||
"""
|
"""
|
||||||
to_url = quote(url, safe='/:&?=%;#@')
|
to_url = quote(url, safe='/:&?=%;#@')
|
||||||
if not url or (not go_anyway and self.url == to_url):
|
if not url or (not go_anyway and self.url == to_url):
|
||||||
return
|
return
|
||||||
self._url = to_url
|
self._url = to_url
|
||||||
self.driver.get(to_url)
|
self._url_available = self._try_to_get(to_url, times=retry, interval=interval, show_errmsg=show_errmsg)
|
||||||
self._url_available = self.check_page()
|
|
||||||
if self._url_available is False and show_errmsg:
|
|
||||||
raise ConnectionError('Connect error.')
|
|
||||||
return self._url_available
|
return self._url_available
|
||||||
|
|
||||||
def ele(self,
|
def ele(self,
|
||||||
|
@ -11,6 +11,7 @@ from requests_html import HTMLSession, Element
|
|||||||
from selenium.webdriver.chrome.webdriver import WebDriver
|
from selenium.webdriver.chrome.webdriver import WebDriver
|
||||||
from selenium.webdriver.remote.webelement import WebElement
|
from selenium.webdriver.remote.webelement import WebElement
|
||||||
|
|
||||||
|
from .config import DriverOptions
|
||||||
from .drission import Drission
|
from .drission import Drission
|
||||||
from .driver_element import DriverElement
|
from .driver_element import DriverElement
|
||||||
from .driver_page import DriverPage
|
from .driver_page import DriverPage
|
||||||
@ -32,16 +33,23 @@ class MixPage(Null, SessionPage, DriverPage):
|
|||||||
这些功能由DriverPage和SessionPage类实现。
|
这些功能由DriverPage和SessionPage类实现。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, drission: Union[Drission, str] = None, mode: str = 'd', timeout: float = 10):
|
def __init__(self,
|
||||||
"""初始化函数 \n
|
drission: Union[Drission, str] = None,
|
||||||
|
mode: str = 'd',
|
||||||
|
timeout: float = 10,
|
||||||
|
driver_options: Union[dict, DriverOptions] = None,
|
||||||
|
session_options: dict = None):
|
||||||
|
"""初始化函数 \n
|
||||||
:param drission: 整合了driver和session的类,传入's'或'd'时快速配置相应模式
|
:param drission: 整合了driver和session的类,传入's'或'd'时快速配置相应模式
|
||||||
:param mode: 默认使用selenium的d模式
|
:param mode: 默认使用selenium的d模式
|
||||||
|
:param driver_options: 浏览器设置,没有传入drission参数时会用这个设置新建Drission对象
|
||||||
|
:param session_options: requests设置,没有传入drission参数时会用这个设置新建Drission对象
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if drission in ['s', 'd', 'S', 'D']:
|
if drission in ['s', 'd', 'S', 'D']:
|
||||||
mode = drission.lower()
|
mode = drission.lower()
|
||||||
drission = None
|
drission = None
|
||||||
self._drission = drission or Drission()
|
self._drission = drission or Drission(driver_options, session_options)
|
||||||
self._session = None
|
self._session = None
|
||||||
self._driver = None
|
self._driver = None
|
||||||
self._url = None
|
self._url = None
|
||||||
@ -227,20 +235,46 @@ class MixPage(Null, SessionPage, DriverPage):
|
|||||||
return super().chrome_downloading(path)
|
return super().chrome_downloading(path)
|
||||||
|
|
||||||
# ----------------以下为共用函数-----------------------
|
# ----------------以下为共用函数-----------------------
|
||||||
|
def _try_to_get(self,
                to_url: str,
                times: int = 0,
                interval: float = 1,
                show_errmsg: bool = False,
                **kwargs):
    """Try to connect, delegating the retry logic to the page class of the current mode.

    :param to_url: url to visit
    :param times: number of retries
    :param interval: seconds between retries
    :param show_errmsg: whether to raise on failure
    :param kwargs: connection arguments, forwarded in s mode only
    :return: whatever the delegated implementation returns
             (per the original docstring: HTMLResponse in s mode, bool in d mode);
             None when the mode is neither 's' nor 'd'
    """
    if self._mode == 's':
        # Next class in the MRO after this one.
        return super()._try_to_get(to_url, times, interval, show_errmsg, **kwargs)
    if self._mode == 'd':
        # Start the MRO lookup after SessionPage so its implementation is skipped.
        return super(SessionPage, self)._try_to_get(to_url, times, interval, show_errmsg)
    return None
|
||||||
|
|
||||||
def get(self, url: str, go_anyway=False, show_errmsg: bool = False, **kwargs) -> Union[bool, None]:
|
def get(self,
|
||||||
|
url: str,
|
||||||
|
go_anyway=False,
|
||||||
|
show_errmsg: bool = False,
|
||||||
|
retry: int = 2,
|
||||||
|
interval: float = 1,
|
||||||
|
**kwargs) -> Union[bool, None]:
|
||||||
"""跳转到一个url \n
|
"""跳转到一个url \n
|
||||||
跳转前先同步cookies,跳转后判断目标url是否可用
|
跳转前先同步cookies,跳转后判断目标url是否可用
|
||||||
:param url: 目标url
|
:param url: 目标url
|
||||||
:param go_anyway: 若目标url与当前url一致,是否强制跳转
|
:param go_anyway: 若目标url与当前url一致,是否强制跳转
|
||||||
:param show_errmsg: 是否显示和抛出异常
|
:param show_errmsg: 是否显示和抛出异常
|
||||||
|
:param retry: 重试次数
|
||||||
|
:param interval: 重试间隔(秒)
|
||||||
:param kwargs: 连接参数,s模式专用
|
:param kwargs: 连接参数,s模式专用
|
||||||
:return: url是否可用
|
:return: url是否可用
|
||||||
"""
|
"""
|
||||||
if self._mode == 'd':
|
if self._mode == 'd':
|
||||||
return super(SessionPage, self).get(url, go_anyway, show_errmsg)
|
return super(SessionPage, self).get(url, go_anyway, show_errmsg, retry, interval)
|
||||||
elif self._mode == 's':
|
elif self._mode == 's':
|
||||||
return super().get(url, go_anyway, show_errmsg, **kwargs)
|
return super().get(url, go_anyway, show_errmsg, retry, interval, **kwargs)
|
||||||
|
|
||||||
def ele(self,
|
def ele(self,
|
||||||
loc_or_ele: Union[tuple, str, DriverElement, SessionElement, Element, WebElement],
|
loc_or_ele: Union[tuple, str, DriverElement, SessionElement, Element, WebElement],
|
||||||
|
@ -9,7 +9,7 @@ from pathlib import Path
|
|||||||
from random import randint
|
from random import randint
|
||||||
from re import search as re_SEARCH
|
from re import search as re_SEARCH
|
||||||
from re import sub as re_SUB
|
from re import sub as re_SUB
|
||||||
from time import time
|
from time import time, sleep
|
||||||
from typing import Union, List
|
from typing import Union, List
|
||||||
from urllib.parse import urlparse, quote
|
from urllib.parse import urlparse, quote
|
||||||
|
|
||||||
@ -142,30 +142,60 @@ class SessionPage(object):
|
|||||||
raise TypeError('Type of loc_or_str can only be tuple or str.')
|
raise TypeError('Type of loc_or_str can only be tuple or str.')
|
||||||
return self.ele(loc_or_str, mode='all', show_errmsg=True)
|
return self.ele(loc_or_str, mode='all', show_errmsg=True)
|
||||||
|
|
||||||
|
def _try_to_get(self,
                to_url: str,
                times: int = 0,
                interval: float = 1,
                show_errmsg: bool = False,
                **kwargs) -> Union['HTMLResponse', None]:
    """Request a url, retrying while the response is falsy or has an empty body.

    :param to_url: url to visit
    :param times: maximum number of retries after the first attempt;
                  zero or a negative value means no retry
    :param interval: seconds to wait before each retry
    :param show_errmsg: forwarded to ``_make_response``
    :param kwargs: connection arguments forwarded to ``_make_response``
    :return: the response of the last attempt (first item of the
             ``_make_response`` result), which may be None
    """
    r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0]

    # A bounded loop guarantees termination even when ``times`` is negative;
    # a ``while times`` countdown would never reach zero in that case.
    for _ in range(max(times, 0)):
        if r and r.content != b'':
            break  # usable response, nothing to retry
        if r is not None and r.status_code in (403, 404):
            break  # retrying is skipped for these status codes
        print('重试', to_url)
        sleep(interval)
        r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0]

    return r
|
||||||
|
|
||||||
def get(self,
|
def get(self,
|
||||||
url: str,
|
url: str,
|
||||||
go_anyway: bool = False,
|
go_anyway: bool = False,
|
||||||
show_errmsg: bool = False,
|
show_errmsg: bool = False,
|
||||||
|
retry: int = 0,
|
||||||
|
interval: float = 1,
|
||||||
**kwargs) -> Union[bool, None]:
|
**kwargs) -> Union[bool, None]:
|
||||||
"""用get方式跳转到url \n
|
"""用get方式跳转到url \n
|
||||||
:param url: 目标url
|
:param url: 目标url
|
||||||
:param go_anyway: 若目标url与当前url一致,是否强制跳转
|
:param go_anyway: 若目标url与当前url一致,是否强制跳转
|
||||||
:param show_errmsg: 是否显示和抛出异常
|
:param show_errmsg: 是否显示和抛出异常
|
||||||
|
:param retry: 重试次数
|
||||||
|
:param interval: 重试间隔(秒)
|
||||||
:param kwargs: 连接参数
|
:param kwargs: 连接参数
|
||||||
:return: url是否可用
|
:return: url是否可用
|
||||||
"""
|
"""
|
||||||
to_url = quote(url, safe='/:&?=%;#@')
|
to_url = quote(url, safe='/:&?=%;#@+')
|
||||||
if not url or (not go_anyway and self.url == to_url):
|
if not url or (not go_anyway and self.url == to_url):
|
||||||
return
|
return
|
||||||
self._url = to_url
|
self._url = to_url
|
||||||
self._response = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0]
|
self._response = self._try_to_get(to_url, times=retry, interval=interval, show_errmsg=show_errmsg, **kwargs)
|
||||||
if self._response is None:
|
if self._response is None:
|
||||||
self._url_available = False
|
self._url_available = False
|
||||||
else:
|
else:
|
||||||
try:
|
stream = tuple(x for x in kwargs if x.lower() == 'stream')
|
||||||
self._response.html.encoding = self._response.encoding # 修复requests_html丢失编码方式的bug
|
if (not stream or not kwargs[stream[0]]) and not self.session.stream:
|
||||||
except:
|
try:
|
||||||
pass
|
self._response.html.encoding = self._response.encoding # 修复requests_html丢失编码方式的bug
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
if self._response.ok:
|
if self._response.ok:
|
||||||
self._url_available = True
|
self._url_available = True
|
||||||
@ -247,31 +277,29 @@ class SessionPage(object):
|
|||||||
raise ConnectionError(f'Status code: {r.status_code}.')
|
raise ConnectionError(f'Status code: {r.status_code}.')
|
||||||
return False, f'Status code: {r.status_code}.'
|
return False, f'Status code: {r.status_code}.'
|
||||||
# -------------------获取文件名-------------------
|
# -------------------获取文件名-------------------
|
||||||
# header里有文件名,则使用它,否则在url里截取,但不能保证url包含文件名
|
if 'Content-disposition' in r.headers: # header里有文件名,则使用它
|
||||||
if 'Content-disposition' in r.headers:
|
|
||||||
file_name = r.headers['Content-disposition'].split('"')[1].encode('ISO-8859-1').decode('utf-8')
|
file_name = r.headers['Content-disposition'].split('"')[1].encode('ISO-8859-1').decode('utf-8')
|
||||||
elif os_PATH.basename(file_url):
|
elif os_PATH.basename(file_url): # 在url里获取文件名
|
||||||
file_name = os_PATH.basename(file_url).split("?")[0]
|
file_name = os_PATH.basename(file_url).split("?")[0]
|
||||||
else:
|
else: # 找不到则用时间和随机数生成文件名
|
||||||
file_name = f'untitled_{time()}_{randint(0, 100)}'
|
file_name = f'untitled_{time()}_{randint(0, 100)}'
|
||||||
|
file_name = re_SUB(r'[\\/*:|<>?"]', '', file_name).strip() # 去除非法字符
|
||||||
file_name = re_SUB(r'[\\/*:|<>?"]', '', file_name).strip()
|
# -------------------重命名文件名-------------------
|
||||||
if rename: # 重命名文件,不改变扩展名
|
if rename: # 重命名文件,不改变扩展名
|
||||||
rename = re_SUB(r'[\\/*:|<>?"]', '', rename).strip()
|
rename = re_SUB(r'[\\/*:|<>?"]', '', rename).strip()
|
||||||
ext_name = file_name.split('.')[-1]
|
ext_name = file_name.split('.')[-1]
|
||||||
if rename.lower().endswith(f'.{ext_name}'.lower()) or ext_name == file_name:
|
if '.' in rename or ext_name == file_name:
|
||||||
full_name = rename
|
full_name = rename
|
||||||
else:
|
else:
|
||||||
full_name = f'{rename}.{ext_name}'
|
full_name = f'{rename}.{ext_name}'
|
||||||
else:
|
else:
|
||||||
full_name = file_name
|
full_name = file_name
|
||||||
|
# -------------------生成路径-------------------
|
||||||
goal_Path = Path(goal_path)
|
goal_Path = Path(goal_path)
|
||||||
goal_path = ''
|
goal_path = ''
|
||||||
for key, i in enumerate(goal_Path.parts): # 去除路径中的非法字符
|
for key, i in enumerate(goal_Path.parts): # 去除路径中的非法字符
|
||||||
goal_path += goal_Path.drive if key == 0 and goal_Path.drive else re_SUB(r'[*:|<>?"]', '', i).strip()
|
goal_path += goal_Path.drive if key == 0 and goal_Path.drive else re_SUB(r'[*:|<>?"]', '', i).strip()
|
||||||
goal_path += '\\' if i != '\\' and key < len(goal_Path.parts) - 1 else ''
|
goal_path += '\\' if i != '\\' and key < len(goal_Path.parts) - 1 else ''
|
||||||
|
|
||||||
goal_Path = Path(goal_path)
|
goal_Path = Path(goal_path)
|
||||||
goal_Path.mkdir(parents=True, exist_ok=True)
|
goal_Path.mkdir(parents=True, exist_ok=True)
|
||||||
goal_path = goal_Path.absolute()
|
goal_path = goal_Path.absolute()
|
||||||
@ -287,8 +315,8 @@ class SessionPage(object):
|
|||||||
full_path = Path(f'{goal_path}\\{full_name}')
|
full_path = Path(f'{goal_path}\\{full_name}')
|
||||||
else:
|
else:
|
||||||
raise ValueError("Argument file_exists can only be 'skip', 'overwrite', 'rename'.")
|
raise ValueError("Argument file_exists can only be 'skip', 'overwrite', 'rename'.")
|
||||||
|
# -------------------打印要下载的文件-------------------
|
||||||
if show_msg: # 打印要下载的文件
|
if show_msg:
|
||||||
print(full_name if file_name == full_name else f'{file_name} -> {full_name}')
|
print(full_name if file_name == full_name else f'{file_name} -> {full_name}')
|
||||||
print(f'Downloading to: {goal_path}')
|
print(f'Downloading to: {goal_path}')
|
||||||
|
|
||||||
@ -317,9 +345,8 @@ class SessionPage(object):
|
|||||||
else:
|
else:
|
||||||
download_status, info = True, 'Success.'
|
download_status, info = True, 'Success.'
|
||||||
finally:
|
finally:
|
||||||
# 删除下载出错文件
|
|
||||||
if not download_status and full_path.exists():
|
if not download_status and full_path.exists():
|
||||||
full_path.unlink()
|
full_path.unlink() # 删除下载出错文件
|
||||||
r.close()
|
r.close()
|
||||||
# -------------------显示并返回值-------------------
|
# -------------------显示并返回值-------------------
|
||||||
if show_msg:
|
if show_msg:
|
||||||
@ -343,7 +370,7 @@ class SessionPage(object):
|
|||||||
"""
|
"""
|
||||||
if mode not in ['get', 'post']:
|
if mode not in ['get', 'post']:
|
||||||
raise ValueError("Argument mode can only be 'get' or 'post'.")
|
raise ValueError("Argument mode can only be 'get' or 'post'.")
|
||||||
url = quote(url, safe='/:&?=%;#@')
|
url = quote(url, safe='/:&?=%;#@+')
|
||||||
|
|
||||||
# 设置referer和host值
|
# 设置referer和host值
|
||||||
kwargs_set = set(x.lower() for x in kwargs)
|
kwargs_set = set(x.lower() for x in kwargs)
|
||||||
@ -374,15 +401,27 @@ class SessionPage(object):
|
|||||||
return None, e
|
return None, e
|
||||||
else:
|
else:
|
||||||
headers = dict(r.headers)
|
headers = dict(r.headers)
|
||||||
if 'Content-Type' not in headers or 'charset' not in headers['Content-Type']:
|
content_type = tuple(x for x in headers if x.lower() == 'content-type')
|
||||||
re_result = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', r.text)
|
stream = tuple(x for x in kwargs if x.lower() == 'stream')
|
||||||
try:
|
charset = None
|
||||||
charset = re_result.group(1)
|
if not content_type or 'charset' not in headers[content_type[0]].lower():
|
||||||
except:
|
if (not stream or not kwargs[stream[0]]) and not self.session.stream:
|
||||||
charset = r.apparent_encoding
|
# ========================
|
||||||
|
re_result = None
|
||||||
|
for chunk in r.iter_content(chunk_size=512):
|
||||||
|
re_result = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', chunk.decode())
|
||||||
|
break
|
||||||
|
# ========================
|
||||||
|
# re_result = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', r.text)
|
||||||
|
try:
|
||||||
|
charset = re_result.group(1)
|
||||||
|
except:
|
||||||
|
charset = r.apparent_encoding
|
||||||
else:
|
else:
|
||||||
charset = headers['Content-Type'].split('=')[1]
|
charset = headers[content_type[0]].split('=')[1]
|
||||||
# 避免存在退格符导致乱码或解析出错
|
# 避免存在退格符导致乱码或解析出错
|
||||||
r._content = r.content if 'stream' in kwargs and kwargs['stream'] else r.content.replace(b'\x08', b'\\b')
|
if (not stream or not kwargs[stream[0]]) and not self.session.stream:
|
||||||
r.encoding = charset
|
r._content = r.content.replace(b'\x08', b'\\b')
|
||||||
|
if charset:
|
||||||
|
r.encoding = charset
|
||||||
return r, 'Success'
|
return r, 'Success'
|
||||||
|
236
DrissionPage/shadow_root_element.py
Normal file
236
DrissionPage/shadow_root_element.py
Normal file
@ -0,0 +1,236 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding:utf-8 -*-
|
||||||
|
from html import unescape
|
||||||
|
from re import split as re_SPLIT
|
||||||
|
from typing import Union, Any
|
||||||
|
|
||||||
|
from selenium.webdriver.remote.webelement import WebElement
|
||||||
|
|
||||||
|
from .common import DrissionElement
|
||||||
|
from .driver_element import execute_driver_find
|
||||||
|
|
||||||
|
|
||||||
|
class ShadowRootElement(DrissionElement):
    """Wrapper around the shadow root of a host element.

    Elements inside the shadow root are looked up with CSS selectors or by
    text; xpath locators are rejected with ValueError.
    """

    def __init__(self, inner_ele: WebElement, parent_ele, timeout: float = 10):
        """Store the shadow root, its host element and the default lookup timeout.

        :param inner_ele: the shadow root object obtained from the browser
        :param parent_ele: the element this shadow root was obtained from
        :param timeout: default timeout (seconds) for ``ele()`` / ``eles()``
        """
        super().__init__(inner_ele)
        self.parent_ele = parent_ele
        self.timeout = timeout
        self._driver = inner_ele.parent

    def __repr__(self):
        return f'<ShadowRootElement in {self.parent_ele} >'

    @property
    def driver(self):
        """Return the WebDriver object that controls this element."""
        return self._driver

    @property
    def tag(self):
        """Return the fixed pseudo tag name 'shadow-root'."""
        return 'shadow-root'

    @property
    def html(self):
        """Return the innerHTML of the shadow root, unescaped, with non-breaking spaces replaced by spaces."""
        return unescape(self.inner_ele.get_attribute('innerHTML')).replace('\xa0', ' ')

    @property
    def parent(self):
        """Return the host element that owns this shadow root."""
        return self.parent_ele

    def parents(self, num: int = 1):
        """Return the num-th level ancestor; level 1 is the host element itself.

        :param num: which ancestor level to return
        :return: the element found by the host element's ``ele()``
        """
        # '.' addresses the host element; every further level appends '/..'.
        loc = 'xpath', f'.{"/.." * (num - 1)}'
        return self.parent_ele.ele(loc, timeout=0.01, show_errmsg=False)

    @property
    def next(self):
        """Return the following sibling element (see ``nexts``)."""
        return self.nexts()

    def nexts(self, num: int = 1):
        """Return the num-th following sibling element.

        NOTE(review): the lookup is ``:nth-child(num)`` run from the host
        element, so it selects among the host's descendants - confirm that
        this is the intended meaning of "sibling" for a shadow root.

        :param num: index passed to ``:nth-child``
        :return: the element found by the host element's ``ele()``
        """
        loc = 'css selector', f':nth-child({num})'
        return self.parent_ele.ele(loc)

    def ele(self,
            loc_or_str: Union[tuple, str],
            mode: str = 'single',
            timeout: float = None,
            show_errmsg: bool = False):
        """Return matching elements inside the shadow root, the first one by default.

        Examples:
        - With a loc tuple:
            ele.ele((By.CLASS_NAME, 'ele_class')) - first element whose class is ele_class
        - With a query string:
            Supported queries: attribute, tag name with attribute, text, css selector.
            '@' marks an attribute, '=' is an exact match, ':' is a fuzzy match;
            a string without any control prefix is searched for as text.
            ele.ele('@class:ele_class')           - first element whose class contains ele_class
            ele.ele('@name=ele_name')             - first element whose name equals ele_name
            ele.ele('@placeholder')               - first element that has a placeholder attribute
            ele.ele('tag:p')                      - first <p> element
            ele.ele('tag:div@class:ele_class')    - first div whose class contains ele_class
            ele.ele('tag:div@class=ele_class')    - first div whose class equals ele_class
            ele.ele('tag:div@text():some_text')   - first div whose text contains some_text
            ele.ele('tag:div@text()=some_text')   - first div whose text equals some_text
            ele.ele('text:some_text')             - first element whose text contains some_text
            ele.ele('some_text')                  - same as the previous line
            ele.ele('text=some_text')             - first element whose text equals some_text
            ele.ele('css:div.ele_class')          - first element matching the css selector

        :param loc_or_str: locator, either a (by, value) tuple or a query string
        :param mode: 'single' or 'all', find one element or all of them
        :param timeout: lookup timeout in seconds; None uses the instance default
        :param show_errmsg: whether to print information when an error occurs
        :return: a DriverElement (or list of them in 'all' mode) for text lookups,
                 otherwise whatever ``execute_driver_find`` returns
        :raises ValueError: for xpath or other unsupported locators, or a wrong argument type
        """
        if isinstance(loc_or_str, str):
            loc_or_str = get_css_from_str(loc_or_str)
        elif isinstance(loc_or_str, tuple) and len(loc_or_str) == 2:
            # Simple strategies such as (By.CLASS_NAME, ...) are mapped to CSS
            # instead of silently falling through and returning None.
            loc_or_str = self._to_css_loc(loc_or_str)
        else:
            raise ValueError('Argument loc_or_str can only be tuple or str.')

        # ``is None`` keeps an explicit timeout of 0 instead of replacing it.
        timeout = self.timeout if timeout is None else timeout

        if loc_or_str[0] == 'css selector':
            return execute_driver_find(self.inner_ele, loc_or_str, mode, show_errmsg, timeout)
        elif loc_or_str[0] == 'text':
            return self._find_eles_by_text(loc_or_str[1], loc_or_str[2], loc_or_str[3], mode)

    def eles(self,
             loc_or_str: Union[tuple, str],
             timeout: float = None,
             show_errmsg: bool = False):
        """Return all matching elements inside the shadow root.

        Accepts the same loc tuples and query strings as ``ele()``, for example:
            ele.eles((By.CLASS_NAME, 'ele_class')) - all elements whose class is ele_class
            ele.eles('@class:ele_class')           - all elements whose class contains ele_class
            ele.eles('@name=ele_name')             - all elements whose name equals ele_name
            ele.eles('@placeholder')               - all elements that have a placeholder attribute
            ele.eles('tag:p')                      - all <p> elements
            ele.eles('tag:div@class:ele_class')    - all divs whose class contains ele_class
            ele.eles('tag:div@class=ele_class')    - all divs whose class equals ele_class
            ele.eles('tag:div@text():some_text')   - all divs whose text contains some_text
            ele.eles('tag:div@text()=some_text')   - all divs whose text equals some_text
            ele.eles('text:some_text')             - all elements whose text contains some_text
            ele.eles('some_text')                  - same as the previous line
            ele.eles('text=some_text')             - all elements whose text equals some_text
            ele.eles('css:div.ele_class')          - all elements matching the css selector

        :param loc_or_str: locator, either a (by, value) tuple or a query string
        :param timeout: lookup timeout in seconds; None uses the instance default
        :param show_errmsg: whether to print information when an error occurs
        :return: result of ``ele()`` in 'all' mode
        """
        return self.ele(loc_or_str, mode='all', show_errmsg=show_errmsg, timeout=timeout)

    def run_script(self, script: str, *args) -> Any:
        """Execute javascript with the shadow root passed as the first argument.

        :param script: javascript source
        :param args: extra arguments, available as arguments[1], arguments[2], ...
        :return: result of the script
        """
        return self.inner_ele.parent.execute_script(script, self.inner_ele, *args)

    def is_enabled(self) -> bool:
        """Return whether the wrapped element reports itself as enabled."""
        return self.inner_ele.is_enabled()

    def is_valid(self) -> bool:
        """Return whether the element is still usable, e.g. after a page navigation."""
        try:
            self.is_enabled()
            return True
        except Exception:  # any failure means the reference can no longer be used
            return False

    @staticmethod
    def _to_css_loc(loc: tuple) -> tuple:
        """Convert a (by, value) locator into an equivalent ('css selector', value) locator.

        :param loc: two-item locator tuple
        :return: locator tuple using the 'css selector' strategy
        :raises ValueError: for xpath and any strategy that has no CSS equivalent here
        """
        by, value = loc
        if by == 'css selector':
            return loc
        if by == 'xpath':
            raise ValueError('不支持xpath')
        if by == 'id':
            return 'css selector', f'[id="{value}"]'
        if by == 'name':
            return 'css selector', f'[name="{value}"]'
        if by == 'class name':
            return 'css selector', f'.{value}'
        if by == 'tag name':
            return 'css selector', value
        raise ValueError(f'Unsupported locator strategy in shadow root: {by}')

    def _find_eles_by_text(self, text: str, tag: str = '', match: str = 'exact', mode: str = 'single'):
        """Find elements inside the shadow root by the text of their first child node.

        :param text: text to look for; an empty string matches elements without text
        :param tag: tag name to restrict the search to, '' for any tag
        :param match: 'exact' or 'fuzzy'
        :param mode: 'single' or 'all', find one element or all of them
        :return: a DriverElement or None in 'single' mode, otherwise a list of DriverElement
        """
        candidates = self.run_script('return arguments[0].querySelectorAll("*")')  # every element in the root
        from .driver_element import DriverElement

        wanted_tag = tag.lower()
        results = []
        for ele in candidates:
            # Case-insensitive so that e.g. 'tag:DIV' still matches.
            if wanted_tag and wanted_tag != ele.tag_name.lower():
                continue

            txt = self.driver.execute_script(
                'if(arguments[0].firstChild!=null){return arguments[0].firstChild.nodeValue}', ele)
            txt = txt or ''

            if text == '' or match == 'exact':  # empty text always uses exact comparison
                matched = text == txt
            elif match == 'fuzzy':
                matched = text in txt
            else:
                matched = False
            if not matched:
                continue

            if mode == 'single':
                return DriverElement(ele)
            elif mode == 'all':
                results.append(DriverElement(ele))

        return None if mode == 'single' else results
|
||||||
|
|
||||||
|
|
||||||
|
def get_css_from_str(loc: str) -> tuple:
    """Translate a query string into a css or text locator.

    Supported queries: attribute, tag name with attribute, text, css selector.
    '=' is an exact match, ':' is a fuzzy match; a string without any control
    prefix is searched for as text.

    Examples:
        @class:ele_class             - elements whose class contains ele_class
        @class=ele_class             - elements whose class equals ele_class
        @class                       - elements that have a class attribute
        tag:div                      - div elements
        tag:div@class:ele_class      - divs whose class contains ele_class
        tag:div@class=ele_class      - divs whose class equals ele_class
        tag:div@text():search_text   - divs whose text contains search_text
        tag:div@text()=search_text   - divs whose text equals search_text
        text:search_text             - elements whose text contains search_text
        text=search_text             - elements whose text equals search_text
        css:div.ele_class            - elements matching the css selector

    :param loc: query string
    :return: ``('css selector', selector)`` or ``('text', text, tag, match)``
             where match is 'exact' or 'fuzzy'
    :raises ValueError: if the query uses the xpath prefix
    """

    def attr_selector(prefix: str, condition: str) -> str:
        """Build ``prefix[name]`` or ``prefix[name<op>"value"]`` from 'name', 'name=value' or 'name:value'."""
        parts = re_SPLIT(r'([:=])', condition, maxsplit=1)
        if len(parts) != 3:
            return f'{prefix}[{parts[0]}]'
        name, sign, value = parts
        operator = '=' if sign == '=' else '*='
        # The value is always quoted (and embedded quotes escaped) so that
        # values containing spaces or quotes still form a valid selector.
        value = value.replace('\\', '\\\\').replace('"', '\\"')
        return f'{prefix}[{name}{operator}"{value}"]'

    loc_by = 'css selector'

    if loc.startswith('@'):  # attribute query
        loc_str = attr_selector('*', loc[1:])

    elif loc.startswith(('tag=', 'tag:')):  # tag name query, optionally with an attribute
        tag, has_attr, condition = loc[4:].partition('@')
        if not has_attr:
            loc_str = tag
        else:
            parts = re_SPLIT(r'([:=])', condition, maxsplit=1)
            if len(parts) == 3 and parts[0] == 'text()':
                # text() is not an attribute: hand over to the text search
                match = 'exact' if parts[1] == '=' else 'fuzzy'
                return 'text', parts[2], tag, match
            loc_str = attr_selector(tag, condition)

    elif loc.startswith(('css=', 'css:')):  # raw css selector
        loc_str = loc[4:]

    elif loc.startswith(('xpath=', 'xpath:')):  # xpath is not supported
        raise ValueError('不支持xpath')

    elif loc.startswith(('text=', 'text:')):  # text query
        match = 'exact' if loc[4] == '=' else 'fuzzy'
        return 'text', loc[5:], '', match

    else:  # no prefix: fuzzy text query
        return 'text', loc, '', 'fuzzy'

    return loc_by, loc_str
|
Loading…
x
Reference in New Issue
Block a user