继续开发新版,未完成

This commit is contained in:
g1879 2022-10-28 19:09:34 +08:00
parent 35f19aa174
commit c447448e8f
8 changed files with 992 additions and 618 deletions

View File

@ -46,14 +46,9 @@ class BaseParser(object):
class BaseElement(BaseParser):
"""各元素类的基类"""
def __init__(self, ele: Union[WebElement, HtmlElement], page=None):
self._inner_ele = ele
def __init__(self, page=None):
self.page = page
@property
def inner_ele(self) -> Union[WebElement, HtmlElement]:
return self._inner_ele
# ----------------以下属性或方法由后代实现----------------
@property
def tag(self):

View File

@ -1,13 +1,17 @@
# -*- coding:utf-8 -*-
# 问题跨iframe查找元素可能出现同名元素如何解决
# 须用DOM.documentUpdated检测元素有效性
from typing import Union, Tuple, List
from .base import DrissionElement
from .common import make_absolute_link, get_loc
class ChromeElement(object):
class ChromeElement(DrissionElement):
def __init__(self, page, node_id: str = None, obj_id: str = None):
self.page = page
super().__init__(page)
if not node_id and not obj_id:
raise TypeError('node_id或obj_id必须传入一个')
raise TypeError('node_id或obj_id必须传入一个')
if node_id:
self._node_id = node_id
@ -17,18 +21,86 @@ class ChromeElement(object):
self._obj_id = obj_id
@property
def html(self):
def html(self) -> str:
"""返回元素outerHTML文本"""
return self.page.driver.DOM.getOuterHTML(nodeId=self._node_id)['outerHTML']
def ele(self, xpath: str):
    """Return the first node matching an XPath inside this element's content document.

    The XPath is evaluated in the browser with ``this.contentDocument`` as the
    context node, requesting result type 9 (first ordered node).

    :param xpath: XPath expression
    :return: the object built by ``_ele`` from the matched node's objectId, or
             the falsy value (None) found when the response carries no objectId
    """
    import json

    # json.dumps() produces a double-quoted, fully escaped string literal, so
    # quotes or backslashes inside the XPath can no longer break the script.
    xpath_literal = json.dumps(xpath)
    js = f'''function(){{
    frame=this.contentDocument;
    return document.evaluate({xpath_literal}, frame, null, 9, null).singleNodeValue;
    }}'''
    response = self.page.driver.Runtime.callFunctionOn(functionDeclaration=js,
                                                       objectId=self._obj_id)
    obj_id = response['result'].get('objectId', None)
    return obj_id if not obj_id else _ele(self.page, obj_id=obj_id)
@property
def inner_html(self) -> str:
    """Return the innerHTML text of the element."""
    # The JS function has to `return` the value, and the call has to name the
    # remote object it runs on; both are passed as keywords like the other
    # callFunctionOn calls in this file.
    js = 'function(){return this.innerHTML;}'
    response = self.page.driver.Runtime.callFunctionOn(functionDeclaration=js,
                                                       objectId=self._obj_id)
    return response['result']['value']
@property
def attrs(self) -> dict:
    """Return all attributes of the element as a ``{name: value}`` dict."""
    # The response is read as a flat list: name, value, name, value, ...
    flat = self.page.driver.DOM.getAttributes(nodeId=self._node_id)['attributes']
    pairs = {}
    for pos in range(0, len(flat), 2):
        pairs[flat[pos]] = flat[pos + 1]
    return pairs
def ele(self,
        loc_or_str: Union[Tuple[str, str], str],
        timeout: float = None) -> Union['ChromeElement', str, None]:
    """Return the first element, attribute or node text below this element that matches.

    :param loc_or_str: locator of the target, either a loc tuple or a query string
    :param timeout: lookup timeout; when None the wait time of the owning page is used
    :return: ChromeElement object, or attribute / text string
    """
    first_match = self._ele(loc_or_str, timeout)
    return first_match
def eles(self,
         loc_or_str: Union[Tuple[str, str], str],
         timeout: float = None) -> List[Union['ChromeElement', str]]:
    """Return every element, attribute or node text below this element that matches.

    :param loc_or_str: locator of the targets, either a loc tuple or a query string
    :param timeout: lookup timeout; when None the wait time of the owning page is used
    :return: list of ChromeElement objects and/or attribute / text strings
    """
    options = {'timeout': timeout, 'single': False}
    return self._ele(loc_or_str, **options)
def _ele(self,
         loc_or_str: Union[Tuple[str, str], str],
         timeout: float = None,
         single: bool = True) -> Union['ChromeElement', str, None, List[Union['ChromeElement', str]]]:
    """Look up child elements, attributes or node texts below this element.

    Thin wrapper that hands the search over to ``make_chrome_ele`` with this
    element as the search root.

    :param loc_or_str: locator of the target, either a loc tuple or a query string
    :param timeout: lookup timeout
    :param single: True returns only the first match, False returns all matches
    :return: whatever ``make_chrome_ele`` returns for these arguments
    """
    return make_chrome_ele(self, loc_or_str, single=single, timeout=timeout)
def attr(self, attr: str) -> Union[str, None]:
    """Return the value of an attribute (or pseudo-attribute) of the element.

    ``href`` and ``src`` are turned into absolute URLs; ``text``, ``innerText``,
    ``html`` / ``outerHTML`` and ``innerHTML`` are answered from the properties
    of the same meaning instead of the attribute table.

    :param attr: attribute name
    :return: attribute value text, or None when the element has no such attribute
    """
    attrs = self.attrs

    if attr == 'href':
        # .get() keeps the documented "None when missing" contract; indexing
        # raised KeyError for elements without an href.
        link = attrs.get('href')
        # Missing/empty links and javascript:/mailto: links are returned untouched.
        if not link or link.lower().startswith(('javascript:', 'mailto:')):
            return link
        # Everything else becomes an absolute url.
        return make_absolute_link(link, self.page)

    if attr == 'src':
        return make_absolute_link(attrs.get('src'), self.page)

    if attr == 'text':
        return self.text

    if attr == 'innerText':
        return self.raw_text

    if attr in ('html', 'outerHTML'):
        return self.html

    if attr == 'innerHTML':
        return self.inner_html

    return attrs.get(attr)
def click(self, by_js: bool = True):
if by_js:
@ -41,6 +113,237 @@ class ChromeElement(object):
def _get_node_id(self, obj_id):
    """Ask ``DOM.requestNode`` for the node id that belongs to a remote object id.

    :param obj_id: remote object id of the node
    :return: the ``nodeId`` entry of the response
    """
    response = self.page.driver.DOM.requestNode(objectId=obj_id)
    return response['nodeId']
@property
def tag(self) -> str:
    """Return the element's tag name (``localName`` as reported by ``DOM.describeNode``)."""
    node_info = self.page.driver.DOM.describeNode(nodeId=self._node_id)['node']
    return node_info['localName']
def _ele(page, node_id=None, obj_id=None) -> ChromeElement:
    """Build a ChromeElement for ``page`` from a node id and/or a remote object id.

    :param page: page the element lives in
    :param node_id: DOM node id of the element
    :param obj_id: remote object id of the element
    :return: the new ChromeElement
    """
    element = ChromeElement(page=page, node_id=node_id, obj_id=obj_id)
    return element
@property
def is_valid(self):
    """Report whether the element can still be used.

    NOTE(review): no real check is performed here; the answer is always True.
    """
    return True
@property
def text(self):
    """Text of the element.

    NOTE(review): not implemented in this revision; always yields None.
    """
    return None
@property
def raw_text(self):
    """Unformatted text of the element.

    NOTE(review): not implemented in this revision; always yields None.
    """
    return None
def _get_ele_path(self, mode):
    """Return the path of the element for the given mode.

    NOTE(review): ``mode`` is ignored for now and an empty string is always returned.

    :param mode: path flavour requested by the caller
    :return: empty string
    """
    path = ''
    return path
def make_chrome_ele(ele: ChromeElement,
                    loc: Union[str, Tuple[str, str]],
                    single: bool = True,
                    timeout: float = None) -> Union[ChromeElement, str, None, List[Union[ChromeElement, str]]]:
    """Search inside a chrome element.

    Only XPath locators are executed; any other locator type falls through and
    yields None.

    :param ele: ChromeElement used as search root
    :param loc: locator tuple or query string
    :param single: True returns the first match, False returns all matches
    :param timeout: lookup timeout
    :return: ChromeElement, attribute / text string, the value of a non-node-set
             XPath, or a list of elements and strings; None (single) or []
             (all) when the XPath matches nothing
    :raises ValueError: when ``loc`` is neither str nor tuple
    :raises RuntimeError: when the browser reports a script error other than
                          "The result is not a node set"
    """
    # --------------- normalise the locator ---------------
    if isinstance(loc, (str, tuple)):
        loc = get_loc(loc)
    else:
        raise ValueError("定位符必须为str或长度为2的tuple对象。")

    loc_str = loc[1]
    if loc[0] == 'xpath' and loc[1].lstrip().startswith('/'):
        # A leading '.' anchors the path at the context node.
        loc_str = f'.{loc_str}'
    elif loc[0] == 'css selector' and loc[1].lstrip().startswith('>'):
        loc_str = f'{ele.css_path}{loc[1]}'
    loc = loc[0], loc_str

    # NOTE(review): the timeout is resolved here but nothing below waits or
    # retries yet.
    timeout = timeout if timeout is not None else ele.page.timeout

    # --------------- run the search ---------------
    if loc[0] != 'xpath':
        return None

    type_txt = '9' if single else '7'
    # Frames are searched through their content document.
    node_txt = 'this.contentDocument' if ele.tag in ('iframe', 'frame') else 'this'
    call = ele.page.driver.Runtime.callFunctionOn

    response = call(functionDeclaration=_make_js(loc[1], type_txt, node_txt), objectId=ele._obj_id)
    result = response['result']

    # Text, attribute and comment nodes are handed back by the script as strings.
    if result.get('type') == 'string':
        return result['value']

    # Not every remote object carries 'subtype' / 'className', hence .get().
    if result.get('subtype') == 'null':
        return None if single else []

    if result.get('className') == 'TypeError':
        if 'The result is not a node set' in result.get('description', ''):
            # The XPath is not a node set (e.g. count()); re-run as a number.
            retry = call(functionDeclaration=_make_js(loc[1], '1', node_txt), objectId=ele._obj_id)
            return retry['result']['value']
        raise RuntimeError(result['description'])

    # Any other script failure must not be mistaken for a found element.
    if 'exceptionDetails' in response:
        raise RuntimeError(result.get('description', ''))

    if 'objectId' not in result:
        return None

    if single:
        # Wrap the node instead of leaking the raw response to the caller.
        return ChromeElement(ele.page, obj_id=result['objectId'])

    props = ele.page.driver.Runtime.getProperties(objectId=result['objectId'])['result']
    found = []
    for prop in props:
        # Stop at the first non-enumerable property (presumably the array's
        # `length`, which follows the indexed items).
        if not prop['enumerable']:
            break
        value = prop['value']
        if 'objectId' in value:
            found.append(ChromeElement(ele.page, obj_id=value['objectId']))
        else:
            # String items have a plain value and no objectId.
            found.append(value.get('value'))
    return found
# try:
# # 使用xpath查找
# if loc[0] == 'xpath':
# js = _make_js()
# r = ele.page.driver.Runtime.callFunctionOn(functionDeclaration=js,
# objectId=self._obj_id)['result'].get('objectId', None)
# return r if not r else _ele(self.page, obj_id=r)
#
# return wait.until(ElementsByXpath(page, loc[1], single, timeout))
#
# # 使用css selector查找
# else:
# if single:
# return DriverElement(wait.until(ec.presence_of_element_located(loc)), page)
# else:
# eles = wait.until(ec.presence_of_all_elements_located(loc))
# return [DriverElement(ele, page) for ele in eles]
#
# except TimeoutException:
# return [] if not single else None
#
# except InvalidElementStateException:
# raise ValueError(f'无效的查找语句:{loc}')
def _make_js(xpath: str, type_txt: str, node_txt: str):
    """Build the JS function declaration that evaluates an XPath with ``document.evaluate``.

    :param xpath: XPath expression
    :param type_txt: XPathResult type as text: '9' first ordered node,
                     '7' ordered snapshot of all nodes, '2' string, '1' number;
                     any other value returns ``singleNodeValue`` as is
    :param node_txt: JS expression used as the context node
    :return: text of the JS function declaration
    """
    import json

    # json.dumps() emits a double-quoted, fully escaped string literal, so
    # XPaths containing quotes (e.g. //a[@id='x']) no longer break the script.
    xpath_literal = json.dumps(xpath)

    for_txt = ''

    # First element, node or attribute.
    if type_txt == '9':
        return_txt = '''
if(e.singleNodeValue==null){return null;}
else if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;}
else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;}
else if(e.singleNodeValue.constructor.name=="Comment"){return e.singleNodeValue.nodeValue;}
else{return e.singleNodeValue;}'''

    # All elements, nodes or attributes, in document order.
    elif type_txt == '7':
        for_txt = """
var a=new Array();
for(var i = 0; i <e.snapshotLength ; i++){
if(e.snapshotItem(i).constructor.name=="Text"){a.push(e.snapshotItem(i).data);}
else if(e.snapshotItem(i).constructor.name=="Attr"){a.push(e.snapshotItem(i).nodeValue);}
else if(e.snapshotItem(i).constructor.name=="Comment"){a.push(e.snapshotItem(i).nodeValue);}
else{a.push(e.snapshotItem(i));}}"""
        return_txt = 'return a;'

    elif type_txt == '2':
        return_txt = 'return e.stringValue;'
    elif type_txt == '1':
        return_txt = 'return e.numberValue;'
    else:
        return_txt = 'return e.singleNodeValue;'

    js = (f'function(){{var e=document.evaluate({xpath_literal},{node_txt},null,{type_txt},null);'
          f'\n{for_txt}\n{return_txt}}}')
    return js
# class ElementsByXpath(object):
# """用js通过xpath获取元素、节点或属性与WebDriverWait配合使用"""
#
# def __init__(self, page, xpath: str = None, single: bool = False, timeout: float = 10):
# """
# :param page: DrissionPage对象
# :param xpath: xpath文本
# :param single: True则返回第一个False则返回全部
# :param timeout: 超时时间
# """
# self.page = page
# self.xpath = xpath
# self.single = single
# self.timeout = timeout
#
# def __call__(self, ele_or_driver: Union[RemoteWebDriver, WebElement]) \
# -> Union[str, DriverElement, None, List[str or DriverElement]]:
#
# def get_nodes(node=None, xpath_txt=None, type_txt='7'):
# """用js通过xpath获取元素、节点或属性
# :param node: 'document' 或 元素对象
# :param xpath_txt: xpath语句
# :param type_txt: resultType,参考 https://developer.mozilla.org/zh-CN/docs/Web/API/Document/evaluate
# :return: 元素对象或属性、文本字符串
# """
# node_txt = 'document' if not node or node == 'document' else 'arguments[0]'
# for_txt = ''
#
# # 获取第一个元素、节点或属性
# if type_txt == '9':
# return_txt = '''
# if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;}
# else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;}
# else if(e.singleNodeValue.constructor.name=="Comment"){return e.singleNodeValue.nodeValue;}
# else{return e.singleNodeValue;}
# '''
#
# # 按顺序获取所有元素、节点或属性
# elif type_txt == '7':
# for_txt = """
# var a=new Array();
# for(var i = 0; i <e.snapshotLength ; i++){
# if(e.snapshotItem(i).constructor.name=="Text"){a.push(e.snapshotItem(i).data);}
# else if(e.snapshotItem(i).constructor.name=="Attr"){a.push(e.snapshotItem(i).nodeValue);}
# else if(e.snapshotItem(i).constructor.name=="Comment"){a.push(e.snapshotItem(i).nodeValue);}
# else{a.push(e.snapshotItem(i));}
# }
# """
# return_txt = 'return a;'
#
# elif type_txt == '2':
# return_txt = 'return e.stringValue;'
# elif type_txt == '1':
# return_txt = 'return e.numberValue;'
# else:
# return_txt = 'return e.singleNodeValue;'
#
# js = """
# var e=document.evaluate(arguments[1], """ + node_txt + """, null, """ + type_txt + """,null);
# """ + for_txt + """
# """ + return_txt + """
# """
# return driver.execute_script(js, node, xpath_txt)
#
# if isinstance(ele_or_driver, RemoteWebDriver):
# driver, the_node = ele_or_driver, 'document'
# else:
# driver, the_node = ele_or_driver.parent, ele_or_driver
#
# # 把lxml元素对象包装成DriverElement对象并按需要返回第一个或全部
# if self.single:
# try:
# e = get_nodes(the_node, xpath_txt=self.xpath, type_txt='9')
#
# if isinstance(e, WebElement):
# return DriverElement(e, self.page)
# elif isinstance(e, str):
# return format_html(e)
# else:
# return e
#
# # 找不到目标时
# except JavascriptException as err:
# if 'The result is not a node set' in err.msg:
# try:
# return get_nodes(the_node, xpath_txt=self.xpath, type_txt='1')
# except JavascriptException:
# return None
# else:
# return None
#
# else: # 返回全部
# return ([DriverElement(x, self.page) if isinstance(x, WebElement)
# else format_html(x)
# for x in get_nodes(the_node, xpath_txt=self.xpath)
# if x != '\n'])

View File

@ -70,6 +70,10 @@ class ChromePage(BasePage):
"""返回当前页面加载状态,"""
return self.driver.Runtime.evaluate(expression='document.readyState;')['result']['value']
@property
def active_ele(self):
    """Element that currently has the focus on the page.

    NOTE(review): not implemented in this revision; always yields None.
    """
    return None
def get(self,
url: str,
show_errmsg: bool = False,
@ -90,6 +94,7 @@ class ChromePage(BasePage):
interval=interval,
show_errmsg=show_errmsg,
timeout=timeout)
self.driver.DOM.getDocument()
return self._url_available
def get_cookies(self, as_dict: bool = False):
@ -101,6 +106,12 @@ class ChromePage(BasePage):
def eles(self, loc_or_ele: Union[Tuple[str, str], str, ChromeElement], timeout: float = None):
    """Return all matches for a locator by delegating to ``self._ele`` with ``single=False``.

    :param loc_or_ele: loc tuple, query string or ChromeElement
    :param timeout: lookup timeout, passed through unchanged
    :return: whatever ``self._ele`` returns for a non-single lookup
    """
    lookup_options = {'timeout': timeout, 'single': False}
    return self._ele(loc_or_ele, **lookup_options)
def s_ele(self):
    """NOTE(review): not implemented in this revision; does nothing and returns None."""
    return None
def s_eles(self):
    """NOTE(review): not implemented in this revision; does nothing and returns None."""
    return None
def _ele(self,
loc_or_ele: Union[Tuple[str, str], str, ChromeElement],
timeout: float = None,
@ -164,6 +175,47 @@ class ChromePage(BasePage):
"""
return self.driver.call_method(cmd, **cmd_args)
def set_user_agent(self, ua: str) -> None:
    """Override the user agent of the current tab; other tabs are not affected.

    :param ua: user agent string
    :return: None
    """
    network = self.driver.Network
    network.setUserAgentOverride(userAgent=ua)
def get_session_storage(self, item: str = None) -> Union[str, dict, None]:
    """Read sessionStorage; without ``item`` the whole storage is returned.

    :param item: key to read; when not given everything is returned
    :return: value stored under the key (None when the response carries no
             value), or a dict with all entries
    """
    import json

    evaluate = self.driver.Runtime.evaluate

    if item:
        # json.dumps() quotes and escapes the key so it cannot break the script.
        js = f'sessionStorage.getItem({json.dumps(str(item))});'
        # The expression is passed by keyword like the other evaluate calls in
        # this file, and the value is unwrapped from the response.
        return evaluate(expression=js)['result'].get('value')

    # Serialise inside the page so the entries come back as one JSON string.
    raw = evaluate(expression='JSON.stringify(sessionStorage);')['result'].get('value')
    return json.loads(raw) if raw else {}
def get_local_storage(self, item: str = None) -> Union[str, dict, None]:
    """Read localStorage; without ``item`` the whole storage is returned.

    :param item: key to read; when not given everything is returned
    :return: value stored under the key (None when the response carries no
             value), or a dict with all entries
    """
    import json

    evaluate = self.driver.Runtime.evaluate

    if item:
        # json.dumps() quotes and escapes the key so it cannot break the script.
        js = f'localStorage.getItem({json.dumps(str(item))});'
        # The expression is passed by keyword like the other evaluate calls in
        # this file, and the value is unwrapped from the response.
        return evaluate(expression=js)['result'].get('value')

    # Serialise inside the page so the entries come back as one JSON string.
    raw = evaluate(expression='JSON.stringify(localStorage);')['result'].get('value')
    return json.loads(raw) if raw else {}
def set_session_storage(self, item: str, value: Union[str, bool]) -> None:
    """Set or delete one sessionStorage entry.

    :param item: key of the entry
    :param value: value to store; passing False deletes the entry
    :return: None
    """
    import json

    # json.dumps() quotes and escapes key and value for the script.
    key = json.dumps(str(item))
    # Deletion is requested through `value`; the former test on `item` could
    # never be true for a string key.
    if value is False:
        js = f'sessionStorage.removeItem({key});'
    else:
        js = f'sessionStorage.setItem({key},{json.dumps(str(value))});'
    self.driver.Runtime.evaluate(expression=js)
def set_local_storage(self, item: str, value: Union[str, bool]) -> None:
    """Set or delete one localStorage entry.

    :param item: key of the entry
    :param value: value to store; passing False deletes the entry
    :return: None
    """
    import json

    # json.dumps() quotes and escapes key and value for the script.
    key = json.dumps(str(item))
    # Deletion is requested through `value`; the former test on `item` could
    # never be true for a string key.
    if value is False:
        js = f'localStorage.removeItem({key});'
    else:
        js = f'localStorage.setItem({key},{json.dumps(str(value))});'
    self.driver.Runtime.evaluate(expression=js)
def create_tab(self, url: str = None) -> None:
"""新建并定位到一个标签页,该标签页在最后面 \n
:param url: 新标签页跳转到的网址
@ -194,6 +246,10 @@ class ChromePage(BasePage):
if activate:
requests_get(f'http://{self.debugger_address}/json/activate/{tab}')
def to_front(self) -> None:
    """Activate the current tab so that it becomes the front-most one."""
    activate_url = f'http://{self.debugger_address}/json/activate/{self.current_tab_handle}'
    requests_get(activate_url)
def close_tabs(self, num_or_handles: Union[int, str, list, tuple, set] = None, others: bool = False) -> None:
"""关闭传入的标签页,默认关闭当前页。可传入多个 \n
注意当程序使用的是接管的浏览器获取到的 handle 顺序和视觉效果不一致不能按序号关闭 \n

View File

@ -10,6 +10,7 @@ from re import split, search, sub
from shutil import rmtree
from typing import Union
from zipfile import ZipFile
from urllib.parse import urlparse, urljoin, urlunparse
def get_ele_txt(e) -> str:
@ -451,3 +452,28 @@ def get_long(txt) -> int:
"""
txt_len = len(txt)
return int((len(txt.encode('utf-8')) - txt_len) / 2 + txt_len)
def make_absolute_link(link, page=None) -> str:
    """Turn a link into an absolute url.

    :param link: hyperlink text
    :param page: page object whose ``url`` serves as the base
    :return: absolute link; a falsy ``link`` is returned unchanged
    """
    if not link:
        return link

    parts = urlparse(link)

    # No host part: a relative path, joined with the page url when a page is given.
    if not parts.netloc:
        if page:
            return urljoin(page.url, link)
        return link

    # Host but no scheme: take the scheme from the page url.
    if not parts.scheme and page:
        page_scheme = urlparse(page.url).scheme
        return urlunparse(parts._replace(scheme=page_scheme))

    # Already absolute and complete.
    return link

File diff suppressed because it is too large Load Diff

View File

@ -30,9 +30,10 @@ class DriverElement(DrissionElement):
:param ele: 被包装的WebElement元素
:param page: 元素所在页面
"""
super().__init__(ele, page)
super().__init__(page)
self._select = None
self._scroll = None
self._inner_ele = ele
def __repr__(self) -> str:
attrs = [f"{attr}='{self.attrs[attr]}'" for attr in self.attrs]
@ -50,6 +51,10 @@ class DriverElement(DrissionElement):
return self.ele(loc_or_str, timeout)
# -----------------共有属性和方法-------------------
@property
def inner_ele(self) -> WebElement:
    """Return the object stored in ``_inner_ele`` (annotated as the wrapped WebElement)."""
    wrapped = self._inner_ele
    return wrapped
@property
def tag(self) -> str:
"""返回元素类型"""

View File

@ -12,7 +12,7 @@ from lxml.etree import tostring
from lxml.html import HtmlElement, fromstring
from .base import DrissionElement, BasePage, BaseElement
from .common import get_ele_txt, get_loc
from .common import get_ele_txt, get_loc, make_absolute_link
class SessionElement(DrissionElement):
@ -23,7 +23,12 @@ class SessionElement(DrissionElement):
:param ele: 被包装的HtmlElement元素
:param page: 元素所在页面对象如果是从 html 文本生成的元素则为 None
"""
super().__init__(ele, page)
super().__init__(page)
self._inner_ele = ele
@property
def inner_ele(self) -> HtmlElement:
    """Return the object stored in ``_inner_ele`` (annotated as the wrapped HtmlElement)."""
    wrapped = self._inner_ele
    return wrapped
def __repr__(self) -> str:
attrs = [f"{attr}='{self.attrs[attr]}'" for attr in self.attrs]
@ -180,10 +185,10 @@ class SessionElement(DrissionElement):
return link
else: # 其它情况直接返回绝对url
return self._make_absolute(link)
return make_absolute_link(link, self.page)
elif attr == 'src':
return self._make_absolute(self.inner_ele.get('src'))
return make_absolute_link(self.inner_ele.get('src'), self.page)
elif attr == 'text':
return self.text
@ -268,30 +273,6 @@ class SessionElement(DrissionElement):
return f':root{path_str[1:]}' if mode == 'css' else path_str
# ----------------session独有方法-----------------------
def _make_absolute(self, link) -> str:
    """Turn a link into an absolute url, using the url of ``self.page`` as the base.

    :param link: hyperlink text
    :return: absolute link; a falsy ``link`` is returned unchanged
    """
    if not link:
        return link

    pieces = urlparse(link)

    # No host part: a relative path, joined with the page url when there is a page.
    if not pieces.netloc:
        return urljoin(self.page.url, link) if self.page else link

    # Complete already, or no page to take the scheme from.
    if pieces.scheme or not self.page:
        return link

    # Host without scheme: take the scheme from the page url.
    repaired = pieces._replace(scheme=urlparse(self.page.url).scheme)
    return urlunparse(repaired)
def make_session_ele(html_or_ele: Union[str, BaseElement, BasePage],
loc: Union[str, Tuple[str, str]] = None,

View File

@ -19,8 +19,13 @@ class ShadowRootElement(BaseElement):
"""ShadowRootElement是用于处理ShadowRoot的类使用方法和DriverElement基本一致"""
def __init__(self, inner_ele: WebElement, parent_ele: DriverElement):
super().__init__(inner_ele, parent_ele.page)
super().__init__(parent_ele.page)
self.parent_ele = parent_ele
self._inner_ele = inner_ele
@property
def inner_ele(self) -> WebElement:
    """Return the object stored in ``_inner_ele`` (annotated as the wrapped WebElement)."""
    wrapped = self._inner_ele
    return wrapped
def __repr__(self) -> str:
    """Return a short description that names the parent element of this shadow root."""
    parent = self.parent_ele
    return f'<ShadowRootElement in {parent} >'