diff --git a/DrissionPage/common.py b/DrissionPage/common.py index fffd47f..01b1728 100644 --- a/DrissionPage/common.py +++ b/DrissionPage/common.py @@ -5,24 +5,25 @@ @File : common.py """ from abc import abstractmethod +from html import unescape from pathlib import Path from re import split as re_SPLIT from shutil import rmtree from typing import Union -from lxml.etree import _Element +from lxml.html import HtmlElement from selenium.webdriver.remote.webelement import WebElement class DrissionElement(object): """SessionElement和DriverElement的基类""" - def __init__(self, ele: Union[WebElement, _Element], page=None): + def __init__(self, ele: Union[WebElement, HtmlElement], page=None): self._inner_ele = ele self.page = page @property - def inner_ele(self) -> Union[WebElement, _Element]: + def inner_ele(self) -> Union[WebElement, HtmlElement]: return self._inner_ele @property @@ -74,7 +75,7 @@ class DrissionElement(object): # pass -def get_loc_from_str(loc: str) -> tuple: +def str_to_loc(loc: str) -> tuple: """处理元素查找语句 \n 查找方式:属性、tag name及属性、文本、xpath、css selector \n =表示精确匹配,:表示模糊匹配,无控制字符串时默认搜索该字符串 \n @@ -182,7 +183,11 @@ def _make_search_str(search_str: str) -> str: return search_str -def translate_loc_to_xpath(loc: tuple) -> tuple: +def format_html(text: str) -> str: + return unescape(text).replace('\xa0', ' ') + + +def translate_loc(loc: tuple) -> tuple: """把By类型的loc元组转换为css selector或xpath类型的 \n :param loc: By类型的loc元组 :return: css selector或xpath类型的loc元组 diff --git a/DrissionPage/driver_page.py b/DrissionPage/driver_page.py index 4770e0c..2ea5896 100644 --- a/DrissionPage/driver_page.py +++ b/DrissionPage/driver_page.py @@ -14,7 +14,7 @@ from selenium.common.exceptions import NoAlertPresentException from selenium.webdriver.chrome.webdriver import WebDriver from selenium.webdriver.remote.webelement import WebElement -from .common import get_loc_from_str, get_available_file_name, translate_loc_to_xpath +from .common import str_to_loc, get_available_file_name, translate_loc, format_html from .driver_element import DriverElement, execute_driver_find @@ -43,7 +43,7 @@ class DriverPage(object): @property def html(self) -> str: """返回页面html文本""" - return self.driver.find_element_by_xpath("//*").get_attribute("outerHTML") + return format_html(self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")) @property def url_available(self) -> bool: @@ -139,11 +139,11 @@ class DriverPage(object): # 接收到字符串或元组,获取定位loc元组 if isinstance(loc_or_ele, (str, tuple)): if isinstance(loc_or_ele, str): - loc_or_ele = get_loc_from_str(loc_or_ele) + loc_or_ele = str_to_loc(loc_or_ele) else: if len(loc_or_ele) != 2: raise ValueError("Len of loc_or_ele must be 2 when it's a tuple.") - loc_or_ele = translate_loc_to_xpath(loc_or_ele) + loc_or_ele = translate_loc(loc_or_ele) if loc_or_ele[0] == 'xpath' and not loc_or_ele[1].startswith(('/', '(')): loc_or_ele = loc_or_ele[0], f'//{loc_or_ele[1]}' @@ -219,7 +219,7 @@ class DriverPage(object): elif isinstance(loc_or_ele, WebElement): is_ele = True elif isinstance(loc_or_ele, str): - loc_or_ele = get_loc_from_str(loc_or_ele) + loc_or_ele = str_to_loc(loc_or_ele) elif isinstance(loc_or_ele, tuple): pass else: diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index 9fd6567..f17e923 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -16,7 +16,7 @@ from urllib.parse import urlparse, quote, unquote from requests import Session, Response -from .common import get_loc_from_str, translate_loc_to_xpath, get_available_file_name +from .common import str_to_loc, translate_loc, get_available_file_name, format_html from .config import OptionsManager from .session_element import SessionElement, execute_session_find @@ -65,7 +65,7 @@ class SessionPage(object): @property def html(self) -> str: """返回页面html文本""" - return self.response.text + return format_html(self.response.text) def ele(self, loc_or_ele: Union[Tuple[str, str], str, SessionElement], @@ -98,11 +98,12 @@ class SessionPage(object): """ if isinstance(loc_or_ele, (str, tuple)): if isinstance(loc_or_ele, str): - loc_or_ele = get_loc_from_str(loc_or_ele) + loc_or_ele = str_to_loc(loc_or_ele) else: if len(loc_or_ele) != 2: raise ValueError("Len of loc_or_ele must be 2 when it's a tuple.") - loc_or_ele = translate_loc_to_xpath(loc_or_ele) + loc_or_ele = translate_loc(loc_or_ele) + if loc_or_ele[0] == 'xpath' and not loc_or_ele[1].startswith(('/', '(')): loc_or_ele = loc_or_ele[0], f'//{loc_or_ele[1]}' diff --git a/DrissionPage/shadow_root_element.py b/DrissionPage/shadow_root_element.py index 2224263..b854529 100644 --- a/DrissionPage/shadow_root_element.py +++ b/DrissionPage/shadow_root_element.py @@ -1,12 +1,11 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from html import unescape from re import split as re_SPLIT from typing import Union, Any from selenium.webdriver.remote.webelement import WebElement -from .common import DrissionElement +from .common import DrissionElement, format_html from .driver_element import execute_driver_find @@ -31,7 +30,7 @@ class ShadowRootElement(DrissionElement): @property def html(self): - return unescape(self.inner_ele.get_attribute('innerHTML')).replace('\xa0', ' ') + return format_html(self.inner_ele.get_attribute('innerHTML')) @property def parent(self): @@ -87,7 +86,7 @@ class ShadowRootElement(DrissionElement): :return: DriverElement对象 """ if isinstance(loc_or_str, str): - loc_or_str = get_css_from_str(loc_or_str) + loc_or_str = str_to_css_loc(loc_or_str) elif isinstance(loc_or_str, tuple) and len(loc_or_str) == 2: if loc_or_str[0] == 'xpath': raise ValueError('不支持xpath') @@ -189,7 +188,7 @@ class ShadowRootElement(DrissionElement): return None if mode == 'single' else results -def get_css_from_str(loc: str) -> tuple: +def str_to_css_loc(loc: str) -> tuple: """处理元素查找语句 \n 查找方式:属性、tag name及属性、文本、css selector \n =表示精确匹配,:表示模糊匹配,无控制字符串时默认搜索该字符串 \n @@ -212,6 +211,7 @@ def get_css_from_str(loc: str) -> tuple: # 根据属性查找 if loc.startswith('@'): r = re_SPLIT(r'([:=])', loc[1:], maxsplit=1) + if len(r) == 3: mode = '=' if r[1] == '=' else '*=' loc_str = f'*[{r[0]}{mode}{r[2]}]' @@ -225,6 +225,7 @@ def get_css_from_str(loc: str) -> tuple: else: at_lst = loc[4:].split('@', maxsplit=1) r = re_SPLIT(r'([:=])', at_lst[1], maxsplit=1) + if len(r) == 3: if r[0] == 'text()': match = 'exact' if r[1] == '=' else 'fuzzy'