元素的html属性改为返回outerHTML,增加inner_html属性;

处理xpath时,只对以“/”开头的语句在最前面添加“.”;
在统一的函数中处理html转码及空格替换
This commit is contained in:
g1879 2020-11-12 18:09:01 +08:00
parent 78437f604e
commit eb866ba29e
2 changed files with 93 additions and 62 deletions

View File

@ -4,8 +4,6 @@
@Contact : g1879@qq.com @Contact : g1879@qq.com
@File : driver_element.py @File : driver_element.py
""" """
import re
from html import unescape
from pathlib import Path from pathlib import Path
from time import sleep from time import sleep
from typing import Union, List, Any, Tuple from typing import Union, List, Any, Tuple
@ -16,7 +14,7 @@ from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as ec from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support.wait import WebDriverWait
from .common import DrissionElement, get_loc_from_str, get_available_file_name, translate_loc_to_xpath from .common import DrissionElement, str_to_loc, get_available_file_name, translate_loc, format_html
class DriverElement(DrissionElement): class DriverElement(DrissionElement):
@ -39,8 +37,13 @@ class DriverElement(DrissionElement):
# -----------------共有属性------------------- # -----------------共有属性-------------------
@property @property
def html(self) -> str: def html(self) -> str:
"""返回元素outerHTML文本"""
return self.attr('outerHTML')
@property
def inner_html(self) -> str:
"""返回元素innerHTML文本""" """返回元素innerHTML文本"""
return unescape(self.attr('innerHTML')).replace('\xa0', ' ') return self.attr('innerHTML')
@property @property
def tag(self) -> str: def tag(self) -> str:
@ -69,7 +72,7 @@ class DriverElement(DrissionElement):
@property @property
def text(self) -> str: def text(self) -> str:
"""返回元素内所有文本""" """返回元素内所有文本"""
return unescape(self.attr('innerText')).replace('\xa0', ' ') return self.attr('innerText')
@property @property
def css_path(self) -> str: def css_path(self) -> str:
@ -164,7 +167,8 @@ class DriverElement(DrissionElement):
:param attr: 属性名 :param attr: 属性名
:return: 属性值文本 :return: 属性值文本
""" """
return self.text if attr == 'text' else self.inner_ele.get_attribute(attr) attr = 'innerText' if attr == 'text' else attr
return format_html(self.inner_ele.get_attribute(attr))
def ele(self, def ele(self,
loc_or_str: Union[Tuple[str, str], str], loc_or_str: Union[Tuple[str, str], str],
@ -197,29 +201,35 @@ class DriverElement(DrissionElement):
""" """
if isinstance(loc_or_str, (str, tuple)): if isinstance(loc_or_str, (str, tuple)):
if isinstance(loc_or_str, str): if isinstance(loc_or_str, str):
loc_or_str = get_loc_from_str(loc_or_str) loc_or_str = str_to_loc(loc_or_str)
else: else:
if len(loc_or_str) != 2: if len(loc_or_str) != 2:
raise ValueError("Len of loc_or_str must be 2 when it's a tuple.") raise ValueError("Len of loc_or_str must be 2 when it's a tuple.")
loc_or_str = translate_loc_to_xpath(loc_or_str)
loc_or_str = translate_loc(loc_or_str)
else: else:
raise ValueError('Argument loc_or_str can only be tuple or str.') raise ValueError('Argument loc_or_str can only be tuple or str.')
if loc_or_str[0] == 'xpath': loc_str = loc_or_str[1]
# 处理语句最前面的( # if loc_or_str[0] == 'xpath':
brackets = len(re.match(r'\(*', loc_or_str[1]).group(0)) # # 处理语句最前面的(
bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:] # brackets = len(re.match(r'\(*', loc_or_str[1]).group(0))
# bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:]
#
# # 确保查询语句最前面是.
# loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}'
# loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
# loc_str = f'{bracket}{loc_str}'
# 确保查询语句最前面是. if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'):
loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}' loc_str = f'.{loc_str}'
loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
loc_or_str = loc_or_str[0], f'{bracket}{loc_str}'
elif loc_or_str[0] == 'css selector': if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'):
if loc_or_str[1].lstrip().startswith('>'): loc_str = f'{self.css_path}{loc_or_str[1]}'
loc_or_str = loc_or_str[0], f'{self.css_path}{loc_or_str[1]}'
timeout = timeout or self.timeout timeout = timeout or self.timeout
loc_or_str = loc_or_str[0], loc_str
return execute_driver_find(self, loc_or_str, mode, timeout) return execute_driver_find(self, loc_or_str, mode, timeout)
@ -571,7 +581,7 @@ def execute_driver_find(page_or_ele,
return [] if mode == 'all' else None return [] if mode == 'all' else None
except InvalidElementStateException: except InvalidElementStateException:
raise ValueError('Invalid query syntax.', loc) raise ValueError(f'Invalid query syntax. {loc}')
class ElementsByXpath(object): class ElementsByXpath(object):
@ -641,17 +651,20 @@ class ElementsByXpath(object):
if self.mode == 'single': if self.mode == 'single':
try: try:
e = get_nodes(the_node, xpath_txt=self.xpath, type_txt='9') e = get_nodes(the_node, xpath_txt=self.xpath, type_txt='9')
return (DriverElement(e, self.page, self.timeout) if isinstance(e, WebElement):
if isinstance(e, WebElement) else unescape(e).replace('\xa0', ' ')) return DriverElement(e, self.page, self.timeout)
elif isinstance(e, str):
return format_html(e)
else:
return e
# 找不到目标时 # 找不到目标时
except JavascriptException: except JavascriptException:
return None return None
elif self.mode == 'all': elif self.mode == 'all':
e = get_nodes(the_node, xpath_txt=self.xpath) # 去除元素间换行符
return ([DriverElement(x, self.page, self.timeout) if isinstance(x, WebElement)
# 去除元素间换行符并替换空格 else format_html(x)
e = (unescape(x).replace('\xa0', ' ') if isinstance(x, str) else x for x in e if x != '\n') for x in get_nodes(the_node, xpath_txt=self.xpath)
if x != '\n'])
return [DriverElement(x, self.page, self.timeout) if isinstance(x, WebElement) else x for x in e]

View File

@ -5,20 +5,20 @@
@File : session_element.py @File : session_element.py
""" """
import re import re
from html import unescape
from typing import Union, List, Tuple from typing import Union, List, Tuple
from urllib.parse import urlparse, urljoin, urlunparse from urllib.parse import urlparse, urljoin, urlunparse
from cssselect import SelectorSyntaxError from cssselect import SelectorSyntaxError
from lxml.etree import tostring, HTML, _Element, XPathEvalError from lxml.etree import tostring, XPathEvalError
from lxml.html import HtmlElement, fromstring
from .common import DrissionElement, get_loc_from_str, translate_loc_to_xpath from .common import DrissionElement, str_to_loc, translate_loc, format_html
class SessionElement(DrissionElement): class SessionElement(DrissionElement):
"""session模式的元素对象包装了一个lxml的Element对象并封装了常用功能""" """session模式的元素对象包装了一个lxml的Element对象并封装了常用功能"""
def __init__(self, ele: _Element, page=None): def __init__(self, ele: HtmlElement, page=None):
super().__init__(ele, page) super().__init__(ele, page)
def __repr__(self): def __repr__(self):
@ -30,8 +30,13 @@ class SessionElement(DrissionElement):
@property @property
def html(self) -> str: def html(self) -> str:
"""返回元素outerHTML文本"""
return format_html(tostring(self._inner_ele).decode())
@property
def inner_html(self) -> str:
"""返回元素innerHTML文本""" """返回元素innerHTML文本"""
html = unescape(tostring(self._inner_ele).decode()).replace('\xa0', ' ') html = format_html(tostring(self._inner_ele).decode())
r = re.match(r'<.*?>(.*)</.*?>', html, flags=re.DOTALL) r = re.match(r'<.*?>(.*)</.*?>', html, flags=re.DOTALL)
return None if not r else r.group(1) return None if not r else r.group(1)
@ -48,7 +53,7 @@ class SessionElement(DrissionElement):
@property @property
def text(self) -> str: def text(self) -> str:
"""返回元素内所有文本""" """返回元素内所有文本"""
return unescape(self._inner_ele.text).replace('\xa0', ' ') return self._inner_ele.text_content()
@property @property
def css_path(self) -> str: def css_path(self) -> str:
@ -81,9 +86,20 @@ class SessionElement(DrissionElement):
:return: 文本列表 :return: 文本列表
""" """
if text_node_only: if text_node_only:
return self.eles('xpath:./*/text()') return self.eles('xpath:/text()')
else: else:
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./*/node()')] texts = []
for node in self.eles('xpath:/node()'):
if isinstance(node, str):
text = node
else:
text = node.text
if text:
texts.append(text)
return texts
def parents(self, num: int = 1): def parents(self, num: int = 1):
"""返回上面第num级父元素 \n """返回上面第num级父元素 \n
@ -128,14 +144,14 @@ class SessionElement(DrissionElement):
elif attr == 'src': elif attr == 'src':
return self._make_absolute(self.inner_ele.get('src')) return self._make_absolute(self.inner_ele.get('src'))
elif attr == 'text': elif attr in ['text', 'innerText']:
return self.text return self.text
elif attr == 'outerHTML': elif attr == 'outerHTML':
return unescape(tostring(self._inner_ele).decode()).replace('\xa0', ' ') return self.html
elif attr == 'innerHTML': elif attr == 'innerHTML':
return self.html return self.inner_html
else: else:
return self.inner_ele.get(attr) return self.inner_ele.get(attr)
@ -167,30 +183,33 @@ class SessionElement(DrissionElement):
""" """
if isinstance(loc_or_str, (str, tuple)): if isinstance(loc_or_str, (str, tuple)):
if isinstance(loc_or_str, str): if isinstance(loc_or_str, str):
loc_or_str = get_loc_from_str(loc_or_str) loc_or_str = str_to_loc(loc_or_str)
else: else:
if len(loc_or_str) != 2: if len(loc_or_str) != 2:
raise ValueError("Len of loc_or_str must be 2 when it's a tuple.") raise ValueError("Len of loc_or_str must be 2 when it's a tuple.")
loc_or_str = translate_loc_to_xpath(loc_or_str) loc_or_str = translate_loc(loc_or_str)
else: else:
raise ValueError('Argument loc_or_str can only be tuple or str.') raise ValueError('Argument loc_or_str can only be tuple or str.')
element = self element = self
if loc_or_str[0] == 'xpath': loc_str = loc_or_str[1]
brackets = len(re.match(r'\(*', loc_or_str[1]).group(0)) # if loc_or_str[0] == 'xpath':
bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:] # brackets = len(re.match(r'\(*', loc_or_str[1]).group(0))
loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}' # bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:]
loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}' # loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}'
loc_str = f'{bracket}{loc_str}' # loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
# loc_str = f'{bracket}{loc_str}'
else: # css selector if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'):
if loc_or_str[1][0].startswith('>'): loc_str = f'.{loc_str}'
# 若css以>开头表示找元素的直接子元素要用page以绝对路径才能找到
if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'):
loc_str = f'{self.css_path}{loc_or_str[1]}' loc_str = f'{self.css_path}{loc_or_str[1]}'
element = self.page element = self.page
else:
loc_str = loc_or_str[1]
loc_or_str = loc_or_str[0], loc_str loc_or_str = loc_or_str[0], loc_str
return execute_session_find(element, loc_or_str, mode) return execute_session_find(element, loc_or_str, mode)
def eles(self, loc_or_str: Union[Tuple[str, str], str]): def eles(self, loc_or_str: Union[Tuple[str, str], str]):
@ -298,7 +317,7 @@ class SessionElement(DrissionElement):
def execute_session_find(page_or_ele, def execute_session_find(page_or_ele,
loc: Tuple[str, str], loc: Tuple[str, str],
mode: str = 'single', ) -> Union[SessionElement, List[SessionElement or str], None]: mode: str = 'single', ) -> Union[SessionElement, List[SessionElement or str], str, None]:
"""执行session模式元素的查找 \n """执行session模式元素的查找 \n
页面查找元素及元素查找下级元素皆使用此方法 \n 页面查找元素及元素查找下级元素皆使用此方法 \n
:param page_or_ele: SessionPage对象或SessionElement对象 :param page_or_ele: SessionPage对象或SessionElement对象
@ -316,7 +335,7 @@ def execute_session_find(page_or_ele,
page_or_ele = page_or_ele.inner_ele page_or_ele = page_or_ele.inner_ele
else: # 传入的是SessionPage对象 else: # 传入的是SessionPage对象
page = page_or_ele page = page_or_ele
page_or_ele = HTML(page_or_ele.response.text) page_or_ele = fromstring(page_or_ele.html)
try: try:
# 用lxml内置方法获取lxml的元素对象列表 # 用lxml内置方法获取lxml的元素对象列表
@ -329,20 +348,19 @@ def execute_session_find(page_or_ele,
if mode == 'single': if mode == 'single':
ele = ele[0] if ele else None ele = ele[0] if ele else None
if isinstance(ele, _Element): if isinstance(ele, HtmlElement):
return SessionElement(ele, page) return SessionElement(ele, page)
elif isinstance(ele, str): elif isinstance(ele, str):
return unescape(ele).replace('\xa0', ' ') return ele
else: else:
return None return None
elif mode == 'all': elif mode == 'all':
# 去除元素间换行符并替换空格 # 去除元素间换行符
ele = (unescape(x).replace('\xa0', ' ') if isinstance(x, str) else x for x in ele if x != '\n') return [SessionElement(e, page) if isinstance(e, HtmlElement) else e for e in ele if e != '\n']
return [SessionElement(e, page) if isinstance(e, _Element) else e for e in ele]
except XPathEvalError: except XPathEvalError:
raise SyntaxError('Invalid xpath syntax.', loc) raise SyntaxError(f'Invalid xpath syntax. {loc}')
except SelectorSyntaxError: except SelectorSyntaxError:
raise SyntaxError('Invalid css selector syntax.', loc) raise SyntaxError(f'Invalid css selector syntax. {loc}')