- d模式如指定了调试端口,可自动启动浏览器进程并接入
- 去除对cssselect库依赖
- 提高查找元素效率
- 调整获取元素xpath和css_path逻辑
This commit is contained in:
g1879 2020-12-08 09:33:27 +08:00 committed by Gitee
commit 4d6ca7f699
7 changed files with 70 additions and 53 deletions

View File

@ -20,9 +20,8 @@ from .common import DrissionElement, str_to_loc, get_available_file_name, transl
class DriverElement(DrissionElement): class DriverElement(DrissionElement):
"""driver模式的元素对象包装了一个WebElement对象并封装了常用功能""" """driver模式的元素对象包装了一个WebElement对象并封装了常用功能"""
def __init__(self, ele: WebElement, page=None, timeout: float = 10): def __init__(self, ele: WebElement, page=None):
super().__init__(ele, page) super().__init__(ele, page)
self.timeout = timeout
def __repr__(self): def __repr__(self):
attrs = [f"{attr}='{self.attrs[attr]}'" for attr in self.attrs] attrs = [f"{attr}='{self.attrs[attr]}'" for attr in self.attrs]
@ -39,7 +38,7 @@ class DriverElement(DrissionElement):
:param timeout: 超时时间 :param timeout: 超时时间
:return: DriverElement对象 :return: DriverElement对象
""" """
return self.ele(loc_or_str, mode, timeout or self.timeout) return self.ele(loc_or_str, mode, timeout)
# -----------------共有属性------------------- # -----------------共有属性-------------------
@property @property
@ -235,7 +234,6 @@ class DriverElement(DrissionElement):
if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'): if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'):
loc_str = f'{self.css_path}{loc_or_str[1]}' loc_str = f'{self.css_path}{loc_or_str[1]}'
timeout = timeout or self.timeout
loc_or_str = loc_or_str[0], loc_str loc_or_str = loc_or_str[0], loc_str
return execute_driver_find(self, loc_or_str, mode, timeout) return execute_driver_find(self, loc_or_str, mode, timeout)
@ -489,7 +487,7 @@ class DriverElement(DrissionElement):
"""返获取css路径或xpath路径""" """返获取css路径或xpath路径"""
if mode == 'xpath': if mode == 'xpath':
txt1 = 'var tag = el.nodeName.toLowerCase();' txt1 = 'var tag = el.nodeName.toLowerCase();'
txt2 = '''return '//' + tag + '[@id="' + el.id + '"]' + path;''' # txt2 = '''return '//' + tag + '[@id="' + el.id + '"]' + path;'''
txt3 = ''' && sib.nodeName.toLowerCase()==tag''' txt3 = ''' && sib.nodeName.toLowerCase()==tag'''
txt4 = ''' txt4 = '''
if(nth>1){path = '/' + tag + '[' + nth + ']' + path;} if(nth>1){path = '/' + tag + '[' + nth + ']' + path;}
@ -497,7 +495,7 @@ class DriverElement(DrissionElement):
txt5 = '''return path;''' txt5 = '''return path;'''
elif mode == 'css': elif mode == 'css':
txt1 = '' txt1 = ''
txt2 = '''return '#' + el.id + path;''' # txt2 = '''return '#' + el.id + path;'''
txt3 = '' txt3 = ''
txt4 = '''path = '>' + ":nth-child(" + nth + ")" + path;''' txt4 = '''path = '>' + ":nth-child(" + nth + ")" + path;'''
txt5 = '''return path.substr(1);''' txt5 = '''return path.substr(1);'''
@ -510,16 +508,13 @@ class DriverElement(DrissionElement):
var path = ''; var path = '';
while (el.nodeType === Node.ELEMENT_NODE) { while (el.nodeType === Node.ELEMENT_NODE) {
''' + txt1 + ''' ''' + txt1 + '''
if (el.id) {
''' + txt2 + '''
} else {
var sib = el, nth = 0; var sib = el, nth = 0;
while (sib) { while (sib) {
if(sib.nodeType === Node.ELEMENT_NODE''' + txt3 + '''){nth += 1;} if(sib.nodeType === Node.ELEMENT_NODE''' + txt3 + '''){nth += 1;}
sib = sib.previousSibling; sib = sib.previousSibling;
} }
''' + txt4 + ''' ''' + txt4 + '''
}
el = el.parentNode; el = el.parentNode;
} }
''' + txt5 + ''' ''' + txt5 + '''
@ -567,7 +562,7 @@ class DriverElement(DrissionElement):
def execute_driver_find(page_or_ele, def execute_driver_find(page_or_ele,
loc: Tuple[str, str], loc: Tuple[str, str],
mode: str = 'single', mode: str = 'single',
timeout: float = 10) -> Union[DriverElement, List[DriverElement], str, None]: timeout: float = None) -> Union[DriverElement, List[DriverElement], str, None]:
"""执行driver模式元素的查找 \n """执行driver模式元素的查找 \n
页面查找元素及元素查找下级元素皆使用此方法 \n 页面查找元素及元素查找下级元素皆使用此方法 \n
:param page_or_ele: DriverPage对象或DriverElement对象 :param page_or_ele: DriverPage对象或DriverElement对象
@ -588,15 +583,19 @@ def execute_driver_find(page_or_ele,
driver = page_or_ele.driver driver = page_or_ele.driver
try: try:
wait = WebDriverWait(driver, timeout=timeout) if timeout and timeout != page.timeout:
wait = WebDriverWait(driver, timeout=timeout)
else:
wait = page.wait
if loc[0] == 'xpath': if loc[0] == 'xpath':
return wait.until(ElementsByXpath(page, loc[1], mode, timeout)) return wait.until(ElementsByXpath(page, loc[1], mode, timeout))
else: else:
if mode == 'single': if mode == 'single':
return DriverElement(wait.until(ec.presence_of_element_located(loc)), page, timeout) return DriverElement(wait.until(ec.presence_of_element_located(loc)), page)
elif mode == 'all': elif mode == 'all':
eles = wait.until(ec.presence_of_all_elements_located(loc)) eles = wait.until(ec.presence_of_all_elements_located(loc))
return [DriverElement(ele, page, timeout) for ele in eles] return [DriverElement(ele, page) for ele in eles]
except TimeoutException: except TimeoutException:
return [] if mode == 'all' else None return [] if mode == 'all' else None
@ -622,8 +621,6 @@ class ElementsByXpath(object):
def __call__(self, ele_or_driver: Union[WebDriver, WebElement]) \ def __call__(self, ele_or_driver: Union[WebDriver, WebElement]) \
-> Union[str, DriverElement, None, List[str or DriverElement]]: -> Union[str, DriverElement, None, List[str or DriverElement]]:
driver, the_node = ((ele_or_driver, 'document') if isinstance(ele_or_driver, WebDriver)
else (ele_or_driver.parent, ele_or_driver))
def get_nodes(node=None, xpath_txt=None, type_txt='7'): def get_nodes(node=None, xpath_txt=None, type_txt='7'):
"""用js通过xpath获取元素、节点或属性 """用js通过xpath获取元素、节点或属性
@ -669,12 +666,18 @@ class ElementsByXpath(object):
""" """
return driver.execute_script(js, node) return driver.execute_script(js, node)
# 把lxml元素对象包装成DriverElement对象并按需要返回第一个或全部 if isinstance(ele_or_driver, WebDriver):
driver, the_node = ele_or_driver, 'document'
else:
driver, the_node = ele_or_driver.parent, ele_or_driver
# 把lxml元素对象包装成DriverElement对象并按需要返回第一个或全部
if self.mode == 'single': if self.mode == 'single':
try: try:
e = get_nodes(the_node, xpath_txt=self.xpath, type_txt='9') e = get_nodes(the_node, xpath_txt=self.xpath, type_txt='9')
if isinstance(e, WebElement): if isinstance(e, WebElement):
return DriverElement(e, self.page, self.timeout) return DriverElement(e, self.page)
elif isinstance(e, str): elif isinstance(e, str):
return format_html(e) return format_html(e)
else: else:
@ -685,7 +688,7 @@ class ElementsByXpath(object):
return None return None
elif self.mode == 'all': elif self.mode == 'all':
return ([DriverElement(x, self.page, self.timeout) if isinstance(x, WebElement) return ([DriverElement(x, self.page) if isinstance(x, WebElement)
else format_html(x) else format_html(x)
for x in get_nodes(the_node, xpath_txt=self.xpath) for x in get_nodes(the_node, xpath_txt=self.xpath)
if x != '\n']) if x != '\n'])

View File

@ -13,6 +13,7 @@ from urllib.parse import quote
from selenium.common.exceptions import NoAlertPresentException from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.chrome.webdriver import WebDriver from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.wait import WebDriverWait
from .common import str_to_loc, get_available_file_name, translate_loc, format_html from .common import str_to_loc, get_available_file_name, translate_loc, format_html
from .driver_element import DriverElement, execute_driver_find from .driver_element import DriverElement, execute_driver_find
@ -24,9 +25,10 @@ class DriverPage(object):
def __init__(self, driver: WebDriver, timeout: float = 10): def __init__(self, driver: WebDriver, timeout: float = 10):
"""初始化函数接收一个WebDriver对象用来操作网页""" """初始化函数接收一个WebDriver对象用来操作网页"""
self._driver = driver self._driver = driver
self.timeout = timeout self._timeout = timeout
self._url = None self._url = None
self._url_available = None self._url_available = None
self._wait = None
@property @property
def driver(self) -> WebDriver: def driver(self) -> WebDriver:
@ -60,6 +62,22 @@ class DriverPage(object):
"""返回网页title""" """返回网页title"""
return self.driver.title return self.driver.title
@property
def timeout(self) -> float:
return self._timeout
@timeout.setter
def timeout(self, second: float) -> None:
self._timeout = second
self._wait = None
@property
def wait(self) -> WebDriverWait:
if self._wait is None:
self._wait = WebDriverWait(self.driver, timeout=self.timeout)
return self._wait
def get_cookies(self, as_dict: bool = False) -> Union[list, dict]: def get_cookies(self, as_dict: bool = False) -> Union[list, dict]:
"""返回当前网站cookies""" """返回当前网站cookies"""
if as_dict: if as_dict:
@ -165,7 +183,7 @@ class DriverPage(object):
# 接收到WebElement对象打包成DriverElement对象返回 # 接收到WebElement对象打包成DriverElement对象返回
elif isinstance(loc_or_ele, WebElement): elif isinstance(loc_or_ele, WebElement):
return DriverElement(loc_or_ele, self, self.timeout) return DriverElement(loc_or_ele, self)
# 接收到的类型不正确,抛出异常 # 接收到的类型不正确,抛出异常
else: else:

View File

@ -336,8 +336,8 @@ class MixPage(Null, SessionPage, DriverPage):
def ele(self, def ele(self,
loc_or_ele: Union[Tuple[str, str], str, DriverElement, SessionElement, WebElement], loc_or_ele: Union[Tuple[str, str], str, DriverElement, SessionElement, WebElement],
mode: str = None, mode: str = None,
timeout: float = None) -> Union[ timeout: float = None) \
DriverElement, SessionElement, str, List[SessionElement], List[DriverElement]]: -> Union[DriverElement, SessionElement, str, List[SessionElement], List[DriverElement]]:
"""返回页面中符合条件的元素、属性或节点文本,默认返回第一个 \n """返回页面中符合条件的元素、属性或节点文本,默认返回第一个 \n
示例 \n 示例 \n
- 接收到元素对象时 \n - 接收到元素对象时 \n

View File

@ -8,8 +8,7 @@ import re
from typing import Union, List, Tuple from typing import Union, List, Tuple
from urllib.parse import urlparse, urljoin, urlunparse from urllib.parse import urlparse, urljoin, urlunparse
from cssselect import SelectorSyntaxError from lxml.etree import tostring
from lxml.etree import tostring, XPathEvalError
from lxml.html import HtmlElement, fromstring from lxml.html import HtmlElement, fromstring
from .common import DrissionElement, str_to_loc, translate_loc, format_html from .common import DrissionElement, str_to_loc, translate_loc, format_html
@ -285,20 +284,20 @@ class SessionElement(DrissionElement):
ele = self ele = self
while ele: while ele:
ele_id = ele.attr('id') # ele_id = ele.attr('id')
if ele_id: # if ele_id:
return f'#{ele_id}{path_str}' if mode == 'css' else f'//{ele.tag}[@id="{ele_id}"]{path_str}' # return f'#{ele_id}{path_str}' if mode == 'css' else f'//{ele.tag}[@id="{ele_id}"]{path_str}'
# else:
if mode == 'css':
brothers = len(ele.eles(f'xpath:./preceding-sibling::*'))
path_str = f'>:nth-child({brothers + 1}){path_str}'
else: else:
brothers = len(ele.eles(f'xpath:./preceding-sibling::{ele.tag}'))
path_str = f'/{ele.tag}[{brothers + 1}]{path_str}' if brothers > 0 else f'/{ele.tag}{path_str}'
if mode == 'css': ele = ele.parent
brothers = len(ele.eles(f'xpath:./preceding-sibling::*'))
path_str = f'>:nth-child({brothers + 1}){path_str}'
else:
brothers = len(ele.eles(f'xpath:./preceding-sibling::{ele.tag}'))
path_str = f'/{ele.tag}[{brothers + 1}]{path_str}' if brothers > 0 else f'/{ele.tag}{path_str}'
ele = ele.parent
return path_str[1:] if mode == 'css' else path_str return path_str[1:] if mode == 'css' else path_str
@ -383,8 +382,11 @@ def execute_session_find(page_or_ele,
elif mode == 'all': elif mode == 'all':
return [SessionElement(e, page) if isinstance(e, HtmlElement) else e for e in ele if e != '\n'] return [SessionElement(e, page) if isinstance(e, HtmlElement) else e for e in ele if e != '\n']
except XPathEvalError: except Exception as e:
raise SyntaxError(f'Invalid xpath syntax. {loc}')
except SelectorSyntaxError: if 'Invalid expression' in str(e):
raise SyntaxError(f'Invalid css selector syntax. {loc}') raise SyntaxError(f'Invalid xpath syntax. {loc}')
elif 'Expected selector' in str(e):
raise SyntaxError(f'Invalid css selector syntax. {loc}')
raise e

View File

@ -10,10 +10,9 @@ from .driver_element import execute_driver_find, DriverElement
class ShadowRootElement(DrissionElement): class ShadowRootElement(DrissionElement):
def __init__(self, inner_ele: WebElement, parent_ele: DriverElement, timeout: float = 10): def __init__(self, inner_ele: WebElement, parent_ele: DriverElement):
super().__init__(inner_ele, parent_ele.page) super().__init__(inner_ele, parent_ele.page)
self.parent_ele = parent_ele self.parent_ele = parent_ele
self.timeout = timeout
def __repr__(self): def __repr__(self):
return f'<ShadowRootElement in {self.parent_ele} >' return f'<ShadowRootElement in {self.parent_ele} >'
@ -29,7 +28,7 @@ class ShadowRootElement(DrissionElement):
:param timeout: 超时时间 :param timeout: 超时时间
:return: DriverElement对象 :return: DriverElement对象
""" """
return self.ele(loc_or_str, mode, timeout or self.timeout) return self.ele(loc_or_str, mode, timeout)
@property @property
def tag(self): def tag(self):
@ -107,8 +106,6 @@ class ShadowRootElement(DrissionElement):
else: else:
raise ValueError('Argument loc_or_str can only be tuple or str.') raise ValueError('Argument loc_or_str can only be tuple or str.')
timeout = timeout or self.timeout
if loc_or_str[0] == 'css selector': if loc_or_str[0] == 'css selector':
return execute_driver_find(self, loc_or_str, mode, timeout) return execute_driver_find(self, loc_or_str, mode, timeout)
elif loc_or_str[0] == 'text': elif loc_or_str[0] == 'text':
@ -192,18 +189,18 @@ class ShadowRootElement(DrissionElement):
if text == txt: if text == txt:
if mode == 'single': if mode == 'single':
return DriverElement(ele, self.page, self.timeout) return DriverElement(ele, self.page)
elif mode == 'all': elif mode == 'all':
results.append(DriverElement(ele, self.page, self.timeout)) results.append(DriverElement(ele, self.page))
# 模糊匹配 # 模糊匹配
elif match == 'fuzzy': elif match == 'fuzzy':
if text in txt: if text in txt:
if mode == 'single': if mode == 'single':
return DriverElement(ele, self.page, self.timeout) return DriverElement(ele, self.page)
elif mode == 'all': elif mode == 'all':
results.append(DriverElement(ele, self.page, self.timeout)) results.append(DriverElement(ele, self.page))
return None if mode == 'single' else results return None if mode == 'single' else results

View File

@ -1846,7 +1846,6 @@ Parameter Description:
- ele: WebElement- WebElement object - ele: WebElement- WebElement object
- page: DriverPage- the page object where the element is located - page: DriverPage- the page object where the element is located
- timeout: float - Find the timeout of the element (it can be set separately each time the element is searched)
@ -2555,7 +2554,6 @@ Parameter Description:
- parent_ele: DriverElement-the element to which the shadow-root is attached - parent_ele: DriverElement-the element to which the shadow-root is attached
- timeout: float-timeout

View File

@ -1,5 +1,4 @@
selenium selenium
requests requests
tldextract tldextract
lxml lxml
cssselect