mirror of
https://gitee.com/g1879/DrissionPage.git
synced 2024-12-10 04:00:23 +08:00
元素的html属性改为返回outerHTML,增加inner_html属性;
处理xpath时只对/开头的情况添加.; 在统一的函数处理html转码及空格替换
This commit is contained in:
parent
78437f604e
commit
eb866ba29e
@ -4,8 +4,6 @@
|
|||||||
@Contact : g1879@qq.com
|
@Contact : g1879@qq.com
|
||||||
@File : driver_element.py
|
@File : driver_element.py
|
||||||
"""
|
"""
|
||||||
import re
|
|
||||||
from html import unescape
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from typing import Union, List, Any, Tuple
|
from typing import Union, List, Any, Tuple
|
||||||
@ -16,7 +14,7 @@ from selenium.webdriver.remote.webelement import WebElement
|
|||||||
from selenium.webdriver.support import expected_conditions as ec
|
from selenium.webdriver.support import expected_conditions as ec
|
||||||
from selenium.webdriver.support.wait import WebDriverWait
|
from selenium.webdriver.support.wait import WebDriverWait
|
||||||
|
|
||||||
from .common import DrissionElement, get_loc_from_str, get_available_file_name, translate_loc_to_xpath
|
from .common import DrissionElement, str_to_loc, get_available_file_name, translate_loc, format_html
|
||||||
|
|
||||||
|
|
||||||
class DriverElement(DrissionElement):
|
class DriverElement(DrissionElement):
|
||||||
@ -39,8 +37,13 @@ class DriverElement(DrissionElement):
|
|||||||
# -----------------共有属性-------------------
|
# -----------------共有属性-------------------
|
||||||
@property
|
@property
|
||||||
def html(self) -> str:
|
def html(self) -> str:
|
||||||
|
"""返回元素outerHTML文本"""
|
||||||
|
return self.attr('outerHTML')
|
||||||
|
|
||||||
|
@property
|
||||||
|
def inner_html(self) -> str:
|
||||||
"""返回元素innerHTML文本"""
|
"""返回元素innerHTML文本"""
|
||||||
return unescape(self.attr('innerHTML')).replace('\xa0', ' ')
|
return self.attr('innerHTML')
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tag(self) -> str:
|
def tag(self) -> str:
|
||||||
@ -69,7 +72,7 @@ class DriverElement(DrissionElement):
|
|||||||
@property
|
@property
|
||||||
def text(self) -> str:
|
def text(self) -> str:
|
||||||
"""返回元素内所有文本"""
|
"""返回元素内所有文本"""
|
||||||
return unescape(self.attr('innerText')).replace('\xa0', ' ')
|
return self.attr('innerText')
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def css_path(self) -> str:
|
def css_path(self) -> str:
|
||||||
@ -164,7 +167,8 @@ class DriverElement(DrissionElement):
|
|||||||
:param attr: 属性名
|
:param attr: 属性名
|
||||||
:return: 属性值文本
|
:return: 属性值文本
|
||||||
"""
|
"""
|
||||||
return self.text if attr == 'text' else self.inner_ele.get_attribute(attr)
|
attr = 'innerText' if attr == 'text' else attr
|
||||||
|
return format_html(self.inner_ele.get_attribute(attr))
|
||||||
|
|
||||||
def ele(self,
|
def ele(self,
|
||||||
loc_or_str: Union[Tuple[str, str], str],
|
loc_or_str: Union[Tuple[str, str], str],
|
||||||
@ -197,29 +201,35 @@ class DriverElement(DrissionElement):
|
|||||||
"""
|
"""
|
||||||
if isinstance(loc_or_str, (str, tuple)):
|
if isinstance(loc_or_str, (str, tuple)):
|
||||||
if isinstance(loc_or_str, str):
|
if isinstance(loc_or_str, str):
|
||||||
loc_or_str = get_loc_from_str(loc_or_str)
|
loc_or_str = str_to_loc(loc_or_str)
|
||||||
else:
|
else:
|
||||||
if len(loc_or_str) != 2:
|
if len(loc_or_str) != 2:
|
||||||
raise ValueError("Len of loc_or_str must be 2 when it's a tuple.")
|
raise ValueError("Len of loc_or_str must be 2 when it's a tuple.")
|
||||||
loc_or_str = translate_loc_to_xpath(loc_or_str)
|
|
||||||
|
loc_or_str = translate_loc(loc_or_str)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError('Argument loc_or_str can only be tuple or str.')
|
raise ValueError('Argument loc_or_str can only be tuple or str.')
|
||||||
|
|
||||||
if loc_or_str[0] == 'xpath':
|
loc_str = loc_or_str[1]
|
||||||
# 处理语句最前面的(
|
# if loc_or_str[0] == 'xpath':
|
||||||
brackets = len(re.match(r'\(*', loc_or_str[1]).group(0))
|
# # 处理语句最前面的(
|
||||||
bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:]
|
# brackets = len(re.match(r'\(*', loc_or_str[1]).group(0))
|
||||||
|
# bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:]
|
||||||
|
#
|
||||||
|
# # 确保查询语句最前面是.
|
||||||
|
# loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}'
|
||||||
|
# loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
|
||||||
|
# loc_str = f'{bracket}{loc_str}'
|
||||||
|
|
||||||
# 确保查询语句最前面是.
|
if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'):
|
||||||
loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}'
|
loc_str = f'.{loc_str}'
|
||||||
loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
|
|
||||||
loc_or_str = loc_or_str[0], f'{bracket}{loc_str}'
|
|
||||||
|
|
||||||
elif loc_or_str[0] == 'css selector':
|
if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'):
|
||||||
if loc_or_str[1].lstrip().startswith('>'):
|
loc_str = f'{self.css_path}{loc_or_str[1]}'
|
||||||
loc_or_str = loc_or_str[0], f'{self.css_path}{loc_or_str[1]}'
|
|
||||||
|
|
||||||
timeout = timeout or self.timeout
|
timeout = timeout or self.timeout
|
||||||
|
loc_or_str = loc_or_str[0], loc_str
|
||||||
|
|
||||||
return execute_driver_find(self, loc_or_str, mode, timeout)
|
return execute_driver_find(self, loc_or_str, mode, timeout)
|
||||||
|
|
||||||
@ -571,7 +581,7 @@ def execute_driver_find(page_or_ele,
|
|||||||
return [] if mode == 'all' else None
|
return [] if mode == 'all' else None
|
||||||
|
|
||||||
except InvalidElementStateException:
|
except InvalidElementStateException:
|
||||||
raise ValueError('Invalid query syntax.', loc)
|
raise ValueError(f'Invalid query syntax. {loc}')
|
||||||
|
|
||||||
|
|
||||||
class ElementsByXpath(object):
|
class ElementsByXpath(object):
|
||||||
@ -641,17 +651,20 @@ class ElementsByXpath(object):
|
|||||||
if self.mode == 'single':
|
if self.mode == 'single':
|
||||||
try:
|
try:
|
||||||
e = get_nodes(the_node, xpath_txt=self.xpath, type_txt='9')
|
e = get_nodes(the_node, xpath_txt=self.xpath, type_txt='9')
|
||||||
return (DriverElement(e, self.page, self.timeout)
|
if isinstance(e, WebElement):
|
||||||
if isinstance(e, WebElement) else unescape(e).replace('\xa0', ' '))
|
return DriverElement(e, self.page, self.timeout)
|
||||||
|
elif isinstance(e, str):
|
||||||
|
return format_html(e)
|
||||||
|
else:
|
||||||
|
return e
|
||||||
|
|
||||||
# 找不到目标时
|
# 找不到目标时
|
||||||
except JavascriptException:
|
except JavascriptException:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
elif self.mode == 'all':
|
elif self.mode == 'all':
|
||||||
e = get_nodes(the_node, xpath_txt=self.xpath)
|
# 去除元素间换行符
|
||||||
|
return ([DriverElement(x, self.page, self.timeout) if isinstance(x, WebElement)
|
||||||
# 去除元素间换行符并替换空格
|
else format_html(x)
|
||||||
e = (unescape(x).replace('\xa0', ' ') if isinstance(x, str) else x for x in e if x != '\n')
|
for x in get_nodes(the_node, xpath_txt=self.xpath)
|
||||||
|
if x != '\n'])
|
||||||
return [DriverElement(x, self.page, self.timeout) if isinstance(x, WebElement) else x for x in e]
|
|
||||||
|
@ -5,20 +5,20 @@
|
|||||||
@File : session_element.py
|
@File : session_element.py
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from html import unescape
|
|
||||||
from typing import Union, List, Tuple
|
from typing import Union, List, Tuple
|
||||||
from urllib.parse import urlparse, urljoin, urlunparse
|
from urllib.parse import urlparse, urljoin, urlunparse
|
||||||
|
|
||||||
from cssselect import SelectorSyntaxError
|
from cssselect import SelectorSyntaxError
|
||||||
from lxml.etree import tostring, HTML, _Element, XPathEvalError
|
from lxml.etree import tostring, XPathEvalError
|
||||||
|
from lxml.html import HtmlElement, fromstring
|
||||||
|
|
||||||
from .common import DrissionElement, get_loc_from_str, translate_loc_to_xpath
|
from .common import DrissionElement, str_to_loc, translate_loc, format_html
|
||||||
|
|
||||||
|
|
||||||
class SessionElement(DrissionElement):
|
class SessionElement(DrissionElement):
|
||||||
"""session模式的元素对象,包装了一个lxml的Element对象,并封装了常用功能"""
|
"""session模式的元素对象,包装了一个lxml的Element对象,并封装了常用功能"""
|
||||||
|
|
||||||
def __init__(self, ele: _Element, page=None):
|
def __init__(self, ele: HtmlElement, page=None):
|
||||||
super().__init__(ele, page)
|
super().__init__(ele, page)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
@ -30,8 +30,13 @@ class SessionElement(DrissionElement):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def html(self) -> str:
|
def html(self) -> str:
|
||||||
|
"""返回元素outerHTML文本"""
|
||||||
|
return format_html(tostring(self._inner_ele).decode())
|
||||||
|
|
||||||
|
@property
|
||||||
|
def inner_html(self) -> str:
|
||||||
"""返回元素innerHTML文本"""
|
"""返回元素innerHTML文本"""
|
||||||
html = unescape(tostring(self._inner_ele).decode()).replace('\xa0', ' ')
|
html = format_html(tostring(self._inner_ele).decode())
|
||||||
r = re.match(r'<.*?>(.*)</.*?>', html, flags=re.DOTALL)
|
r = re.match(r'<.*?>(.*)</.*?>', html, flags=re.DOTALL)
|
||||||
return None if not r else r.group(1)
|
return None if not r else r.group(1)
|
||||||
|
|
||||||
@ -48,7 +53,7 @@ class SessionElement(DrissionElement):
|
|||||||
@property
|
@property
|
||||||
def text(self) -> str:
|
def text(self) -> str:
|
||||||
"""返回元素内所有文本"""
|
"""返回元素内所有文本"""
|
||||||
return unescape(self._inner_ele.text).replace('\xa0', ' ')
|
return self._inner_ele.text_content()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def css_path(self) -> str:
|
def css_path(self) -> str:
|
||||||
@ -81,9 +86,20 @@ class SessionElement(DrissionElement):
|
|||||||
:return: 文本列表
|
:return: 文本列表
|
||||||
"""
|
"""
|
||||||
if text_node_only:
|
if text_node_only:
|
||||||
return self.eles('xpath:./*/text()')
|
return self.eles('xpath:/text()')
|
||||||
else:
|
else:
|
||||||
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./*/node()')]
|
texts = []
|
||||||
|
|
||||||
|
for node in self.eles('xpath:/node()'):
|
||||||
|
if isinstance(node, str):
|
||||||
|
text = node
|
||||||
|
else:
|
||||||
|
text = node.text
|
||||||
|
|
||||||
|
if text:
|
||||||
|
texts.append(text)
|
||||||
|
|
||||||
|
return texts
|
||||||
|
|
||||||
def parents(self, num: int = 1):
|
def parents(self, num: int = 1):
|
||||||
"""返回上面第num级父元素 \n
|
"""返回上面第num级父元素 \n
|
||||||
@ -128,14 +144,14 @@ class SessionElement(DrissionElement):
|
|||||||
elif attr == 'src':
|
elif attr == 'src':
|
||||||
return self._make_absolute(self.inner_ele.get('src'))
|
return self._make_absolute(self.inner_ele.get('src'))
|
||||||
|
|
||||||
elif attr == 'text':
|
elif attr in ['text', 'innerText']:
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
elif attr == 'outerHTML':
|
elif attr == 'outerHTML':
|
||||||
return unescape(tostring(self._inner_ele).decode()).replace('\xa0', ' ')
|
return self.html
|
||||||
|
|
||||||
elif attr == 'innerHTML':
|
elif attr == 'innerHTML':
|
||||||
return self.html
|
return self.inner_html
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return self.inner_ele.get(attr)
|
return self.inner_ele.get(attr)
|
||||||
@ -167,30 +183,33 @@ class SessionElement(DrissionElement):
|
|||||||
"""
|
"""
|
||||||
if isinstance(loc_or_str, (str, tuple)):
|
if isinstance(loc_or_str, (str, tuple)):
|
||||||
if isinstance(loc_or_str, str):
|
if isinstance(loc_or_str, str):
|
||||||
loc_or_str = get_loc_from_str(loc_or_str)
|
loc_or_str = str_to_loc(loc_or_str)
|
||||||
else:
|
else:
|
||||||
if len(loc_or_str) != 2:
|
if len(loc_or_str) != 2:
|
||||||
raise ValueError("Len of loc_or_str must be 2 when it's a tuple.")
|
raise ValueError("Len of loc_or_str must be 2 when it's a tuple.")
|
||||||
loc_or_str = translate_loc_to_xpath(loc_or_str)
|
loc_or_str = translate_loc(loc_or_str)
|
||||||
else:
|
else:
|
||||||
raise ValueError('Argument loc_or_str can only be tuple or str.')
|
raise ValueError('Argument loc_or_str can only be tuple or str.')
|
||||||
|
|
||||||
element = self
|
element = self
|
||||||
if loc_or_str[0] == 'xpath':
|
loc_str = loc_or_str[1]
|
||||||
brackets = len(re.match(r'\(*', loc_or_str[1]).group(0))
|
# if loc_or_str[0] == 'xpath':
|
||||||
bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:]
|
# brackets = len(re.match(r'\(*', loc_or_str[1]).group(0))
|
||||||
loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}'
|
# bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:]
|
||||||
loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
|
# loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}'
|
||||||
loc_str = f'{bracket}{loc_str}'
|
# loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
|
||||||
|
# loc_str = f'{bracket}{loc_str}'
|
||||||
|
|
||||||
else: # css selector
|
if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'):
|
||||||
if loc_or_str[1][0].startswith('>'):
|
loc_str = f'.{loc_str}'
|
||||||
loc_str = f'{self.css_path}{loc_or_str[1]}'
|
|
||||||
element = self.page
|
# 若css以>开头,表示找元素的直接子元素,要用page以绝对路径才能找到
|
||||||
else:
|
if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'):
|
||||||
loc_str = loc_or_str[1]
|
loc_str = f'{self.css_path}{loc_or_str[1]}'
|
||||||
|
element = self.page
|
||||||
|
|
||||||
loc_or_str = loc_or_str[0], loc_str
|
loc_or_str = loc_or_str[0], loc_str
|
||||||
|
|
||||||
return execute_session_find(element, loc_or_str, mode)
|
return execute_session_find(element, loc_or_str, mode)
|
||||||
|
|
||||||
def eles(self, loc_or_str: Union[Tuple[str, str], str]):
|
def eles(self, loc_or_str: Union[Tuple[str, str], str]):
|
||||||
@ -298,7 +317,7 @@ class SessionElement(DrissionElement):
|
|||||||
|
|
||||||
def execute_session_find(page_or_ele,
|
def execute_session_find(page_or_ele,
|
||||||
loc: Tuple[str, str],
|
loc: Tuple[str, str],
|
||||||
mode: str = 'single', ) -> Union[SessionElement, List[SessionElement or str], None]:
|
mode: str = 'single', ) -> Union[SessionElement, List[SessionElement or str], str, None]:
|
||||||
"""执行session模式元素的查找 \n
|
"""执行session模式元素的查找 \n
|
||||||
页面查找元素及元素查找下级元素皆使用此方法 \n
|
页面查找元素及元素查找下级元素皆使用此方法 \n
|
||||||
:param page_or_ele: SessionPage对象或SessionElement对象
|
:param page_or_ele: SessionPage对象或SessionElement对象
|
||||||
@ -316,7 +335,7 @@ def execute_session_find(page_or_ele,
|
|||||||
page_or_ele = page_or_ele.inner_ele
|
page_or_ele = page_or_ele.inner_ele
|
||||||
else: # 传入的是SessionPage对象
|
else: # 传入的是SessionPage对象
|
||||||
page = page_or_ele
|
page = page_or_ele
|
||||||
page_or_ele = HTML(page_or_ele.response.text)
|
page_or_ele = fromstring(page_or_ele.html)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 用lxml内置方法获取lxml的元素对象列表
|
# 用lxml内置方法获取lxml的元素对象列表
|
||||||
@ -329,20 +348,19 @@ def execute_session_find(page_or_ele,
|
|||||||
if mode == 'single':
|
if mode == 'single':
|
||||||
ele = ele[0] if ele else None
|
ele = ele[0] if ele else None
|
||||||
|
|
||||||
if isinstance(ele, _Element):
|
if isinstance(ele, HtmlElement):
|
||||||
return SessionElement(ele, page)
|
return SessionElement(ele, page)
|
||||||
elif isinstance(ele, str):
|
elif isinstance(ele, str):
|
||||||
return unescape(ele).replace('\xa0', ' ')
|
return ele
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
elif mode == 'all':
|
elif mode == 'all':
|
||||||
# 去除元素间换行符并替换空格
|
# 去除元素间换行符
|
||||||
ele = (unescape(x).replace('\xa0', ' ') if isinstance(x, str) else x for x in ele if x != '\n')
|
return [SessionElement(e, page) if isinstance(e, HtmlElement) else e for e in ele if e != '\n']
|
||||||
return [SessionElement(e, page) if isinstance(e, _Element) else e for e in ele]
|
|
||||||
|
|
||||||
except XPathEvalError:
|
except XPathEvalError:
|
||||||
raise SyntaxError('Invalid xpath syntax.', loc)
|
raise SyntaxError(f'Invalid xpath syntax. {loc}')
|
||||||
|
|
||||||
except SelectorSyntaxError:
|
except SelectorSyntaxError:
|
||||||
raise SyntaxError('Invalid css selector syntax.', loc)
|
raise SyntaxError(f'Invalid css selector syntax. {loc}')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user