- d模式如指定了调试端口,可自动启动浏览器进程并接入
- 去除对cssselect库依赖
- 提高查找元素效率
- 调整获取元素xpath和css_path逻辑
This commit is contained in:
g1879 2020-12-08 09:33:27 +08:00 committed by Gitee
commit 4d6ca7f699
7 changed files with 70 additions and 53 deletions

View File

@ -20,9 +20,8 @@ from .common import DrissionElement, str_to_loc, get_available_file_name, transl
class DriverElement(DrissionElement):
"""driver模式的元素对象包装了一个WebElement对象并封装了常用功能"""
def __init__(self, ele: WebElement, page=None, timeout: float = 10):
def __init__(self, ele: WebElement, page=None):
super().__init__(ele, page)
self.timeout = timeout
def __repr__(self):
attrs = [f"{attr}='{self.attrs[attr]}'" for attr in self.attrs]
@ -39,7 +38,7 @@ class DriverElement(DrissionElement):
:param timeout: 超时时间
:return: DriverElement对象
"""
return self.ele(loc_or_str, mode, timeout or self.timeout)
return self.ele(loc_or_str, mode, timeout)
# -----------------共有属性-------------------
@property
@ -235,7 +234,6 @@ class DriverElement(DrissionElement):
if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'):
loc_str = f'{self.css_path}{loc_or_str[1]}'
timeout = timeout or self.timeout
loc_or_str = loc_or_str[0], loc_str
return execute_driver_find(self, loc_or_str, mode, timeout)
@ -489,7 +487,7 @@ class DriverElement(DrissionElement):
"""返获取css路径或xpath路径"""
if mode == 'xpath':
txt1 = 'var tag = el.nodeName.toLowerCase();'
txt2 = '''return '//' + tag + '[@id="' + el.id + '"]' + path;'''
# txt2 = '''return '//' + tag + '[@id="' + el.id + '"]' + path;'''
txt3 = ''' && sib.nodeName.toLowerCase()==tag'''
txt4 = '''
if(nth>1){path = '/' + tag + '[' + nth + ']' + path;}
@ -497,7 +495,7 @@ class DriverElement(DrissionElement):
txt5 = '''return path;'''
elif mode == 'css':
txt1 = ''
txt2 = '''return '#' + el.id + path;'''
# txt2 = '''return '#' + el.id + path;'''
txt3 = ''
txt4 = '''path = '>' + ":nth-child(" + nth + ")" + path;'''
txt5 = '''return path.substr(1);'''
@ -510,16 +508,13 @@ class DriverElement(DrissionElement):
var path = '';
while (el.nodeType === Node.ELEMENT_NODE) {
''' + txt1 + '''
if (el.id) {
''' + txt2 + '''
} else {
var sib = el, nth = 0;
while (sib) {
if(sib.nodeType === Node.ELEMENT_NODE''' + txt3 + '''){nth += 1;}
sib = sib.previousSibling;
}
''' + txt4 + '''
}
el = el.parentNode;
}
''' + txt5 + '''
@ -567,7 +562,7 @@ class DriverElement(DrissionElement):
def execute_driver_find(page_or_ele,
loc: Tuple[str, str],
mode: str = 'single',
timeout: float = 10) -> Union[DriverElement, List[DriverElement], str, None]:
timeout: float = None) -> Union[DriverElement, List[DriverElement], str, None]:
"""执行driver模式元素的查找 \n
页面查找元素及元素查找下级元素皆使用此方法 \n
:param page_or_ele: DriverPage对象或DriverElement对象
@ -588,15 +583,19 @@ def execute_driver_find(page_or_ele,
driver = page_or_ele.driver
try:
wait = WebDriverWait(driver, timeout=timeout)
if timeout and timeout != page.timeout:
wait = WebDriverWait(driver, timeout=timeout)
else:
wait = page.wait
if loc[0] == 'xpath':
return wait.until(ElementsByXpath(page, loc[1], mode, timeout))
else:
if mode == 'single':
return DriverElement(wait.until(ec.presence_of_element_located(loc)), page, timeout)
return DriverElement(wait.until(ec.presence_of_element_located(loc)), page)
elif mode == 'all':
eles = wait.until(ec.presence_of_all_elements_located(loc))
return [DriverElement(ele, page, timeout) for ele in eles]
return [DriverElement(ele, page) for ele in eles]
except TimeoutException:
return [] if mode == 'all' else None
@ -622,8 +621,6 @@ class ElementsByXpath(object):
def __call__(self, ele_or_driver: Union[WebDriver, WebElement]) \
-> Union[str, DriverElement, None, List[str or DriverElement]]:
driver, the_node = ((ele_or_driver, 'document') if isinstance(ele_or_driver, WebDriver)
else (ele_or_driver.parent, ele_or_driver))
def get_nodes(node=None, xpath_txt=None, type_txt='7'):
"""用js通过xpath获取元素、节点或属性
@ -669,12 +666,18 @@ class ElementsByXpath(object):
"""
return driver.execute_script(js, node)
# 把lxml元素对象包装成DriverElement对象并按需要返回第一个或全部
if isinstance(ele_or_driver, WebDriver):
driver, the_node = ele_or_driver, 'document'
else:
driver, the_node = ele_or_driver.parent, ele_or_driver
# 把lxml元素对象包装成DriverElement对象并按需要返回第一个或全部
if self.mode == 'single':
try:
e = get_nodes(the_node, xpath_txt=self.xpath, type_txt='9')
if isinstance(e, WebElement):
return DriverElement(e, self.page, self.timeout)
return DriverElement(e, self.page)
elif isinstance(e, str):
return format_html(e)
else:
@ -685,7 +688,7 @@ class ElementsByXpath(object):
return None
elif self.mode == 'all':
return ([DriverElement(x, self.page, self.timeout) if isinstance(x, WebElement)
return ([DriverElement(x, self.page) if isinstance(x, WebElement)
else format_html(x)
for x in get_nodes(the_node, xpath_txt=self.xpath)
if x != '\n'])

View File

@ -13,6 +13,7 @@ from urllib.parse import quote
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.wait import WebDriverWait
from .common import str_to_loc, get_available_file_name, translate_loc, format_html
from .driver_element import DriverElement, execute_driver_find
@ -24,9 +25,10 @@ class DriverPage(object):
def __init__(self, driver: WebDriver, timeout: float = 10):
"""初始化函数接收一个WebDriver对象用来操作网页"""
self._driver = driver
self.timeout = timeout
self._timeout = timeout
self._url = None
self._url_available = None
self._wait = None
@property
def driver(self) -> WebDriver:
@ -60,6 +62,22 @@ class DriverPage(object):
"""返回网页title"""
return self.driver.title
# Page-level timeout as stored in self._timeout.
# NOTE(review): the setter names its parameter `second`, so the unit is presumably seconds - confirm.
@property
def timeout(self) -> float:
return self._timeout
# Store a new page-level timeout and discard the cached WebDriverWait, so the
# `wait` property creates a fresh one with the new value on its next access.
@timeout.setter
def timeout(self, second: float) -> None:
self._timeout = second
# The cached waiter was constructed with the previous timeout; reset it.
self._wait = None
# WebDriverWait for this page's driver, created on demand and cached in self._wait.
@property
def wait(self) -> WebDriverWait:
# Build the waiter only when none is cached (first access, or after the
# timeout setter cleared it); later accesses reuse the same object.
if self._wait is None:
self._wait = WebDriverWait(self.driver, timeout=self.timeout)
return self._wait
def get_cookies(self, as_dict: bool = False) -> Union[list, dict]:
"""返回当前网站cookies"""
if as_dict:
@ -165,7 +183,7 @@ class DriverPage(object):
# 接收到WebElement对象打包成DriverElement对象返回
elif isinstance(loc_or_ele, WebElement):
return DriverElement(loc_or_ele, self, self.timeout)
return DriverElement(loc_or_ele, self)
# 接收到的类型不正确,抛出异常
else:

View File

@ -336,8 +336,8 @@ class MixPage(Null, SessionPage, DriverPage):
def ele(self,
loc_or_ele: Union[Tuple[str, str], str, DriverElement, SessionElement, WebElement],
mode: str = None,
timeout: float = None) -> Union[
DriverElement, SessionElement, str, List[SessionElement], List[DriverElement]]:
timeout: float = None) \
-> Union[DriverElement, SessionElement, str, List[SessionElement], List[DriverElement]]:
"""返回页面中符合条件的元素、属性或节点文本,默认返回第一个 \n
示例 \n
- 接收到元素对象时 \n

View File

@ -8,8 +8,7 @@ import re
from typing import Union, List, Tuple
from urllib.parse import urlparse, urljoin, urlunparse
from cssselect import SelectorSyntaxError
from lxml.etree import tostring, XPathEvalError
from lxml.etree import tostring
from lxml.html import HtmlElement, fromstring
from .common import DrissionElement, str_to_loc, translate_loc, format_html
@ -285,20 +284,20 @@ class SessionElement(DrissionElement):
ele = self
while ele:
ele_id = ele.attr('id')
# ele_id = ele.attr('id')
if ele_id:
return f'#{ele_id}{path_str}' if mode == 'css' else f'//{ele.tag}[@id="{ele_id}"]{path_str}'
# if ele_id:
# return f'#{ele_id}{path_str}' if mode == 'css' else f'//{ele.tag}[@id="{ele_id}"]{path_str}'
# else:
if mode == 'css':
brothers = len(ele.eles(f'xpath:./preceding-sibling::*'))
path_str = f'>:nth-child({brothers + 1}){path_str}'
else:
brothers = len(ele.eles(f'xpath:./preceding-sibling::{ele.tag}'))
path_str = f'/{ele.tag}[{brothers + 1}]{path_str}' if brothers > 0 else f'/{ele.tag}{path_str}'
if mode == 'css':
brothers = len(ele.eles(f'xpath:./preceding-sibling::*'))
path_str = f'>:nth-child({brothers + 1}){path_str}'
else:
brothers = len(ele.eles(f'xpath:./preceding-sibling::{ele.tag}'))
path_str = f'/{ele.tag}[{brothers + 1}]{path_str}' if brothers > 0 else f'/{ele.tag}{path_str}'
ele = ele.parent
ele = ele.parent
return path_str[1:] if mode == 'css' else path_str
@ -383,8 +382,11 @@ def execute_session_find(page_or_ele,
elif mode == 'all':
return [SessionElement(e, page) if isinstance(e, HtmlElement) else e for e in ele if e != '\n']
except XPathEvalError:
raise SyntaxError(f'Invalid xpath syntax. {loc}')
except Exception as e:
except SelectorSyntaxError:
raise SyntaxError(f'Invalid css selector syntax. {loc}')
if 'Invalid expression' in str(e):
raise SyntaxError(f'Invalid xpath syntax. {loc}')
elif 'Expected selector' in str(e):
raise SyntaxError(f'Invalid css selector syntax. {loc}')
raise e

View File

@ -10,10 +10,9 @@ from .driver_element import execute_driver_find, DriverElement
class ShadowRootElement(DrissionElement):
def __init__(self, inner_ele: WebElement, parent_ele: DriverElement, timeout: float = 10):
def __init__(self, inner_ele: WebElement, parent_ele: DriverElement):
super().__init__(inner_ele, parent_ele.page)
self.parent_ele = parent_ele
self.timeout = timeout
def __repr__(self):
return f'<ShadowRootElement in {self.parent_ele} >'
@ -29,7 +28,7 @@ class ShadowRootElement(DrissionElement):
:param timeout: 超时时间
:return: DriverElement对象
"""
return self.ele(loc_or_str, mode, timeout or self.timeout)
return self.ele(loc_or_str, mode, timeout)
@property
def tag(self):
@ -107,8 +106,6 @@ class ShadowRootElement(DrissionElement):
else:
raise ValueError('Argument loc_or_str can only be tuple or str.')
timeout = timeout or self.timeout
if loc_or_str[0] == 'css selector':
return execute_driver_find(self, loc_or_str, mode, timeout)
elif loc_or_str[0] == 'text':
@ -192,18 +189,18 @@ class ShadowRootElement(DrissionElement):
if text == txt:
if mode == 'single':
return DriverElement(ele, self.page, self.timeout)
return DriverElement(ele, self.page)
elif mode == 'all':
results.append(DriverElement(ele, self.page, self.timeout))
results.append(DriverElement(ele, self.page))
# 模糊匹配
elif match == 'fuzzy':
if text in txt:
if mode == 'single':
return DriverElement(ele, self.page, self.timeout)
return DriverElement(ele, self.page)
elif mode == 'all':
results.append(DriverElement(ele, self.page, self.timeout))
results.append(DriverElement(ele, self.page))
return None if mode == 'single' else results

View File

@ -1846,7 +1846,6 @@ Parameter Description:
- ele: WebElement - the WebElement object
- page: DriverPage - the page object where the element is located
- timeout: float - timeout for finding elements (it can also be set separately on each search)
@ -2555,7 +2554,6 @@ Parameter Description:
- parent_ele: DriverElement - the element to which the shadow-root is attached
- timeout: float - timeout for finding elements

View File

@ -1,5 +1,4 @@
selenium
requests
tldextract
lxml
cssselect
lxml