改进ChromiumFrame,未完成

This commit is contained in:
g1879 2022-12-16 18:25:51 +08:00
parent 925a7400c7
commit 808c1d8745
3 changed files with 268 additions and 46 deletions

View File

@ -6,7 +6,6 @@
"""
from abc import abstractmethod
from re import sub
from typing import Union, List
from urllib.parse import quote
from .common import format_html, get_loc
@ -62,7 +61,7 @@ class BaseElement(BaseParser):
def parent(self, level_or_loc=1):
pass
def prev(self, index=1) -> None:
def prev(self, index=1):
return None # ShadowRootElement直接继承
def prevs(self) -> None:
@ -80,26 +79,26 @@ class DrissionElement(BaseElement):
但不是ShadowRootElement的基类"""
@property
def link(self) -> str:
def link(self):
"""返回href或src绝对url"""
return self.attr('href') or self.attr('src')
@property
def css_path(self) -> str:
def css_path(self):
"""返回css path路径"""
return self._get_ele_path('css')
@property
def xpath(self) -> str:
def xpath(self):
"""返回xpath路径"""
return self._get_ele_path('xpath')
@property
def comments(self) -> list:
def comments(self):
"""返回元素注释文本组成的列表"""
return self.eles('xpath:.//comment()')
def texts(self, text_node_only: bool = False) -> list:
def texts(self, text_node_only=False):
"""返回元素内所有直接子节点的文本,包括元素和文本节点 \n
:param text_node_only: 是否只返回文本节点
:return: 文本列表
@ -111,7 +110,7 @@ class DrissionElement(BaseElement):
return [format_html(x.strip(' ').rstrip('\n')) for x in texts if x and sub('[\r\n\t ]', '', x) != '']
def parent(self, level_or_loc: Union[tuple, str, int] = 1) -> Union['DrissionElement', None]:
def parent(self, level_or_loc=1):
"""返回上面某一级父元素,可指定层数或用查询语法定位 \n
:param level_or_loc: 第几级父元素或定位符
:return: 上级元素对象
@ -132,10 +131,7 @@ class DrissionElement(BaseElement):
return self._ele(loc, timeout=0, relative=True)
def prev(self,
index: int = 1,
filter_loc: Union[tuple, str] = '',
timeout: float = 0) -> Union['DrissionElement', str, None]:
def prev(self, index=1, filter_loc='', timeout=0):
"""返回前面的一个兄弟元素,可用查询语法筛选,可指定返回筛选结果的第几个 \n
:param index: 前面第几个查询结果元素
:param filter_loc: 用于筛选元素的查询语法
@ -145,10 +141,7 @@ class DrissionElement(BaseElement):
nodes = self._get_brothers(index, filter_loc, 'preceding', timeout=timeout)
return nodes[-1] if nodes else None
def next(self,
index: int = 1,
filter_loc: Union[tuple, str] = '',
timeout: float = 0) -> Union['DrissionElement', str, None]:
def next(self, index=1, filter_loc='', timeout=0):
"""返回后面的一个兄弟元素,可用查询语法筛选,可指定返回筛选结果的第几个 \n
:param index: 后面第几个查询结果元素
:param filter_loc: 用于筛选元素的查询语法
@ -158,10 +151,7 @@ class DrissionElement(BaseElement):
nodes = self._get_brothers(index, filter_loc, 'following', timeout=timeout)
return nodes[0] if nodes else None
def before(self,
index: int = 1,
filter_loc: Union[tuple, str] = '',
timeout: float = None) -> Union['DrissionElement', str, None]:
def before(self, index=1, filter_loc='', timeout=None):
"""返回前面的一个兄弟元素,可用查询语法筛选,可指定返回筛选结果的第几个 \n
:param index: 前面第几个查询结果元素
:param filter_loc: 用于筛选元素的查询语法
@ -171,10 +161,7 @@ class DrissionElement(BaseElement):
nodes = self._get_brothers(index, filter_loc, 'preceding', False, timeout=timeout)
return nodes[-1] if nodes else None
def after(self,
index: int = 1,
filter_loc: Union[tuple, str] = '',
timeout: float = None) -> Union['DrissionElement', str, None]:
def after(self, index=1, filter_loc='', timeout=None):
"""返回后面的一个兄弟元素,可用查询语法筛选,可指定返回筛选结果的第几个 \n
:param index: 后面第几个查询结果元素
:param filter_loc: 用于筛选元素的查询语法
@ -184,9 +171,7 @@ class DrissionElement(BaseElement):
nodes = self._get_brothers(index, filter_loc, 'following', False, timeout)
return nodes[0] if nodes else None
def prevs(self,
filter_loc: Union[tuple, str] = '',
timeout: float = 0) -> List[Union['DrissionElement', str]]:
def prevs(self, filter_loc='', timeout=0):
"""返回前面全部兄弟元素或节点组成的列表,可用查询语法筛选 \n
:param filter_loc: 用于筛选元素的查询语法
:param timeout: 查找元素的超时时间
@ -194,9 +179,7 @@ class DrissionElement(BaseElement):
"""
return self._get_brothers(filter_loc=filter_loc, direction='preceding', timeout=timeout)
def nexts(self,
filter_loc: Union[tuple, str] = '',
timeout: float = 0) -> List[Union['DrissionElement', str]]:
def nexts(self, filter_loc='', timeout=0):
"""返回后面全部兄弟元素或节点组成的列表,可用查询语法筛选 \n
:param filter_loc: 用于筛选元素的查询语法
:param timeout: 查找元素的超时时间
@ -204,9 +187,7 @@ class DrissionElement(BaseElement):
"""
return self._get_brothers(filter_loc=filter_loc, direction='following', timeout=timeout)
def befores(self,
filter_loc: Union[tuple, str] = '',
timeout: float = None) -> List[Union['DrissionElement', str]]:
def befores(self, filter_loc='', timeout=None):
"""返回后面全部兄弟元素或节点组成的列表,可用查询语法筛选 \n
:param filter_loc: 用于筛选元素的查询语法
:param timeout: 查找元素的超时时间
@ -214,9 +195,7 @@ class DrissionElement(BaseElement):
"""
return self._get_brothers(filter_loc=filter_loc, direction='preceding', brother=False, timeout=timeout)
def afters(self,
filter_loc: Union[tuple, str] = '',
timeout: float = None) -> List[Union['DrissionElement', str]]:
def afters(self, filter_loc, timeout=None):
"""返回前面全部兄弟元素或节点组成的列表,可用查询语法筛选 \n
:param filter_loc: 用于筛选元素的查询语法
:param timeout: 查找元素的超时时间
@ -224,12 +203,7 @@ class DrissionElement(BaseElement):
"""
return self._get_brothers(filter_loc=filter_loc, direction='following', brother=False, timeout=timeout)
def _get_brothers(self,
index: int = None,
filter_loc: Union[tuple, str] = '',
direction: str = 'following',
brother: bool = True,
timeout: float = .5) -> List[Union['DrissionElement', str]]:
def _get_brothers(self, index=None, filter_loc='', direction='following', brother=True, timeout=.5):
"""按要求返回兄弟元素或节点组成的列表 \n
:param index: 获取第几个该参数不为None时只获取该编号的元素
:param filter_loc: 用于筛选元素的查询语法

View File

@ -0,0 +1,248 @@
# -*- coding:utf-8 -*-
from re import search
from typing import Union, Tuple, List
from urllib.parse import urlparse
from .chromium_element import ChromiumElement
from .chromium_base import ChromiumBase
class ChromiumFrame(object):
def __init__(self, page: ChromiumBase, ele: ChromiumElement):
self.page = page
self._inner_ele = ele
self._is_diff_domain = False
self.frame_id = page.run_cdp('DOM.describeNode', nodeId=ele.node_id)['node'].get('frameId', None)
# 有src属性且域名和主框架不一样为异域frame
src = ele.attr('src')
if src and urlparse(src).netloc != urlparse(page.url).netloc:
self._is_diff_domain = True
self.inner_page = ChromiumBase(page.address, self.frame_id, page.timeout)
self.inner_page.set_page_load_strategy(self.page.page_load_strategy)
self.inner_page.timeouts = self.page.timeouts
def __repr__(self) -> str:
attrs = self._inner_ele.attrs
attrs = [f"{attr}='{attrs[attr]}'" for attr in attrs]
return f'<ChromiumFrame {self._inner_ele.tag} {" ".join(attrs)}>'
@property
def tag(self) -> str:
"""返回元素tag"""
return self._inner_ele.tag
@property
def url(self) -> str:
""""""
if self._is_diff_domain:
return self.inner_page.url
else:
r = self.page.run_cdp('DOM.describeNode', nodeId=self._inner_ele.node_id)
return r['node']['contentDocument']['documentURL']
@property
def html(self) -> str:
"""返回元素outerHTML文本"""
if self._is_diff_domain:
tag = self.tag
out_html = self.page.run_cdp('DOM.getOuterHTML', nodeId=self._inner_ele.node_id)['outerHTML']
in_html = self.inner_page.html
sign = search(rf'<{tag}.*?>', out_html).group(0)
return f'{sign}{in_html}</{tag}>'
else:
return self._inner_ele.html
@property
def title(self) -> str:
d = self.inner_page if self._is_diff_domain else self._inner_ele
ele = d.ele('xpath://title')
return ele.text if ele else None
@property
def cookies(self):
return self.inner_page.cookies if self._is_diff_domain else self.page.cookies
@property
def inner_html(self) -> str:
"""返回元素innerHTML文本"""
return self.inner_page.html if self._is_diff_domain else self._inner_ele.inner_html
@property
def attrs(self) -> dict:
return self._inner_ele.attrs
@property
def frame_size(self) -> dict:
if self._is_diff_domain:
return self.inner_page.size
else:
h = self._inner_ele.run_script('return this.contentDocument.body.scrollHeight;')
w = self._inner_ele.run_script('return this.contentDocument.body.scrollWidth;')
return {'height': h, 'width': w}
@property
def size(self) -> dict:
"""返回frame元素大小"""
return self._inner_ele.size
@property
def obj_id(self) -> str:
"""返回js中的object id"""
return self._inner_ele.obj_id
@property
def node_id(self) -> str:
"""返回cdp中的node id"""
return self._inner_ele.node_id
@property
def location(self) -> dict:
"""返回frame元素左上角的绝对坐标"""
return self._inner_ele.location
@property
def is_displayed(self) -> bool:
"""返回frame元素是否显示"""
return self._inner_ele.is_displayed
def get(self, url):
self.page._get(url, False, None, None, None, self.frame_id)
def ele(self,
loc_or_ele: Union[Tuple[str, str], str, ChromiumElement, 'ChromiumFrame'],
timeout: float = None):
d = self.inner_page if self._is_diff_domain else self._inner_ele
return d.ele(loc_or_ele, timeout)
def eles(self,
loc_or_ele: Union[Tuple[str, str], str],
timeout: float = None):
d = self.inner_page if self._is_diff_domain else self._inner_ele
return d.eles(loc_or_ele, timeout)
# def s_ele(self, loc_or_ele: Union[Tuple[str, str], str, ChromiumElement] = None) \
# -> Union[SessionElement, str, None]:
# """查找第一个符合条件的元素以SessionElement形式返回处理复杂页面时效率很高 \n
# :param loc_or_ele: 元素的定位信息可以是loc元组或查询字符串
# :return: SessionElement对象或属性、文本
# """
# if isinstance(loc_or_ele, ChromiumElement):
# return make_session_ele(loc_or_ele)
# else:
# return make_session_ele(self, loc_or_ele)
#
# def s_eles(self, loc_or_str: Union[Tuple[str, str], str] = None) -> List[Union[SessionElement, str]]:
# """查找所有符合条件的元素以SessionElement列表形式返回 \n
# :param loc_or_str: 元素的定位信息可以是loc元组或查询字符串
# :return: SessionElement对象组成的列表
# """
# return make_session_ele(self, loc_or_str, single=False)
def attr(self, attr: str) -> Union[str, None]:
"""返回frame元素attribute属性值 \n
:param attr: 属性名
:return: 属性值文本没有该属性返回None
"""
return self._inner_ele.attr(attr)
def set_attr(self, attr: str, value: str) -> None:
"""设置frame元素attribute属性 \n
:param attr: 属性名
:param value: 属性值
:return: None
"""
self._inner_ele.set_attr(attr, value)
def remove_attr(self, attr: str) -> None:
"""删除frame元素attribute属性 \n
:param attr: 属性名
:return: None
"""
self._inner_ele.remove_attr(attr)
def parent(self, level_or_loc: Union[tuple, str, int] = 1) -> Union['ChromiumElement', None]:
"""返回上面某一级父元素,可指定层数或用查询语法定位 \n
:param level_or_loc: 第几级父元素或定位符
:return: 上级元素对象
"""
return self._inner_ele.parent(level_or_loc)
def prev(self,
filter_loc: Union[tuple, str] = '',
index: int = 1,
timeout: float = 0) -> Union['ChromiumElement', str, None]:
"""返回前面的一个兄弟元素,可用查询语法筛选,可指定返回筛选结果的第几个 \n
:param filter_loc: 用于筛选元素的查询语法
:param index: 前面第几个查询结果元素
:param timeout: 查找元素的超时时间
:return: 兄弟元素
"""
return self._inner_ele.prev(filter_loc, index, timeout)
def next(self,
filter_loc: Union[tuple, str] = '',
index: int = 1,
timeout: float = 0) -> Union['ChromiumElement', str, None]:
"""返回后面的一个兄弟元素,可用查询语法筛选,可指定返回筛选结果的第几个 \n
:param filter_loc: 用于筛选元素的查询语法
:param index: 后面第几个查询结果元素
:param timeout: 查找元素的超时时间
:return: 兄弟元素
"""
return self._inner_ele.next(filter_loc, index, timeout)
def before(self,
filter_loc: Union[tuple, str] = '',
index: int = 1,
timeout: float = None) -> Union['ChromiumElement', str, None]:
"""返回当前元素前面的一个元素可指定筛选条件和第几个。查找范围不限兄弟元素而是整个DOM文档 \n
:param filter_loc: 用于筛选元素的查询语法
:param index: 前面第几个查询结果元素
:param timeout: 查找元素的超时时间
:return: 本元素前面的某个元素或节点
"""
return self._inner_ele.before(filter_loc, index, timeout)
def after(self,
filter_loc: Union[tuple, str] = '',
index: int = 1,
timeout: float = None) -> Union['ChromiumElement', str, None]:
"""返回当前元素后面的一个元素可指定筛选条件和第几个。查找范围不限兄弟元素而是整个DOM文档 \n
:param filter_loc: 用于筛选元素的查询语法
:param index: 后面第几个查询结果元素
:param timeout: 查找元素的超时时间
:return: 本元素后面的某个元素或节点
"""
return self._inner_ele.after(filter_loc, index, timeout)
def prevs(self,
filter_loc: Union[tuple, str] = '',
timeout: float = 0) -> List[Union['ChromiumElement', str]]:
"""返回前面全部兄弟元素或节点组成的列表,可用查询语法筛选 \n
:param filter_loc: 用于筛选元素的查询语法
:param timeout: 查找元素的超时时间
:return: 兄弟元素或节点文本组成的列表
"""
return self._inner_ele.prevs(filter_loc, timeout)
def nexts(self,
filter_loc: Union[tuple, str] = '',
timeout: float = 0) -> List[Union['ChromiumElement', str]]:
"""返回后面全部兄弟元素或节点组成的列表,可用查询语法筛选 \n
:param filter_loc: 用于筛选元素的查询语法
:param timeout: 查找元素的超时时间
:return: 兄弟元素或节点文本组成的列表
"""
return self._inner_ele.nexts(filter_loc, timeout)
def befores(self,
filter_loc: Union[tuple, str] = '',
timeout: float = None) -> List[Union['ChromiumElement', str]]:
"""返回当前元素后面符合条件的全部兄弟元素或节点组成的列表可用查询语法筛选。查找范围不限兄弟元素而是整个DOM文档 \n
:param filter_loc: 用于筛选元素的查询语法
:param timeout: 查找元素的超时时间
:return: 本元素前面的元素或节点组成的列表
"""
return self._inner_ele.befores(filter_loc, timeout)

View File

@ -467,16 +467,16 @@ d 模式下所有查找元素操作都自带等待,默认为跟随元素所在
```python
# 页面初始化时设置查找元素超时时间为 15 秒
page = MixPage(timeout=15)
page = WebPage(timeout=15)
# 设置查找元素超时时间为 5 秒
page.timeout = 5
# 使用页面超时时间来查找元素5 秒)
ele1 = page.ele('some text')
ele1 = page.ele('search text')
# 为这次查找页面独立设置等待时间1 秒)
ele1 = page.ele('some text', timeout=1)
ele1 = page.ele('search text', timeout=1)
# 查找后代元素使用页面超时时间5 秒)
ele2 = ele1.ele('some text')
ele2 = ele1.ele('search text')
# 查找后代元素使用单独设置的超时时间1 秒)
ele2 = ele1.ele('some text', timeout=1)
```