mirror of https://gitee.com/g1879/DrissionPage.git
synced 2024-12-10 04:00:23 +08:00
元素的html属性改为返回outerHTML,增加inner_html属性;
处理xpath时只对/开头的情况添加.; 在统一的函数处理html转码及空格替换
This commit is contained in:
parent 78437f604e
commit eb866ba29e
@ -4,8 +4,6 @@
|
||||
@Contact : g1879@qq.com
|
||||
@File : driver_element.py
|
||||
"""
|
||||
import re
|
||||
from html import unescape
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
from typing import Union, List, Any, Tuple
|
||||
@ -16,7 +14,7 @@ from selenium.webdriver.remote.webelement import WebElement
|
||||
from selenium.webdriver.support import expected_conditions as ec
|
||||
from selenium.webdriver.support.wait import WebDriverWait
|
||||
|
||||
from .common import DrissionElement, get_loc_from_str, get_available_file_name, translate_loc_to_xpath
|
||||
from .common import DrissionElement, str_to_loc, get_available_file_name, translate_loc, format_html
|
||||
|
||||
|
||||
class DriverElement(DrissionElement):
|
||||
@ -39,8 +37,13 @@ class DriverElement(DrissionElement):
|
||||
# -----------------共有属性-------------------
|
||||
@property
|
||||
def html(self) -> str:
|
||||
"""返回元素outerHTML文本"""
|
||||
return self.attr('outerHTML')
|
||||
|
||||
@property
|
||||
def inner_html(self) -> str:
|
||||
"""返回元素innerHTML文本"""
|
||||
return unescape(self.attr('innerHTML')).replace('\xa0', ' ')
|
||||
return self.attr('innerHTML')
|
||||
|
||||
@property
|
||||
def tag(self) -> str:
|
||||
@ -69,7 +72,7 @@ class DriverElement(DrissionElement):
|
||||
@property
|
||||
def text(self) -> str:
|
||||
"""返回元素内所有文本"""
|
||||
return unescape(self.attr('innerText')).replace('\xa0', ' ')
|
||||
return self.attr('innerText')
|
||||
|
||||
@property
|
||||
def css_path(self) -> str:
|
||||
@ -164,7 +167,8 @@ class DriverElement(DrissionElement):
|
||||
:param attr: 属性名
|
||||
:return: 属性值文本
|
||||
"""
|
||||
return self.text if attr == 'text' else self.inner_ele.get_attribute(attr)
|
||||
attr = 'innerText' if attr == 'text' else attr
|
||||
return format_html(self.inner_ele.get_attribute(attr))
|
||||
|
||||
def ele(self,
|
||||
loc_or_str: Union[Tuple[str, str], str],
|
||||
@ -197,29 +201,35 @@ class DriverElement(DrissionElement):
|
||||
"""
|
||||
if isinstance(loc_or_str, (str, tuple)):
|
||||
if isinstance(loc_or_str, str):
|
||||
loc_or_str = get_loc_from_str(loc_or_str)
|
||||
loc_or_str = str_to_loc(loc_or_str)
|
||||
else:
|
||||
if len(loc_or_str) != 2:
|
||||
raise ValueError("Len of loc_or_str must be 2 when it's a tuple.")
|
||||
loc_or_str = translate_loc_to_xpath(loc_or_str)
|
||||
|
||||
loc_or_str = translate_loc(loc_or_str)
|
||||
|
||||
else:
|
||||
raise ValueError('Argument loc_or_str can only be tuple or str.')
|
||||
|
||||
if loc_or_str[0] == 'xpath':
|
||||
# 处理语句最前面的(
|
||||
brackets = len(re.match(r'\(*', loc_or_str[1]).group(0))
|
||||
bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:]
|
||||
loc_str = loc_or_str[1]
|
||||
# if loc_or_str[0] == 'xpath':
|
||||
# # 处理语句最前面的(
|
||||
# brackets = len(re.match(r'\(*', loc_or_str[1]).group(0))
|
||||
# bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:]
|
||||
#
|
||||
# # 确保查询语句最前面是.
|
||||
# loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}'
|
||||
# loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
|
||||
# loc_str = f'{bracket}{loc_str}'
|
||||
|
||||
# 确保查询语句最前面是.
|
||||
loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}'
|
||||
loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
|
||||
loc_or_str = loc_or_str[0], f'{bracket}{loc_str}'
|
||||
if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'):
|
||||
loc_str = f'.{loc_str}'
|
||||
|
||||
elif loc_or_str[0] == 'css selector':
|
||||
if loc_or_str[1].lstrip().startswith('>'):
|
||||
loc_or_str = loc_or_str[0], f'{self.css_path}{loc_or_str[1]}'
|
||||
if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'):
|
||||
loc_str = f'{self.css_path}{loc_or_str[1]}'
|
||||
|
||||
timeout = timeout or self.timeout
|
||||
loc_or_str = loc_or_str[0], loc_str
|
||||
|
||||
return execute_driver_find(self, loc_or_str, mode, timeout)
|
||||
|
||||
@ -571,7 +581,7 @@ def execute_driver_find(page_or_ele,
|
||||
return [] if mode == 'all' else None
|
||||
|
||||
except InvalidElementStateException:
|
||||
raise ValueError('Invalid query syntax.', loc)
|
||||
raise ValueError(f'Invalid query syntax. {loc}')
|
||||
|
||||
|
||||
class ElementsByXpath(object):
|
||||
@ -641,17 +651,20 @@ class ElementsByXpath(object):
|
||||
if self.mode == 'single':
|
||||
try:
|
||||
e = get_nodes(the_node, xpath_txt=self.xpath, type_txt='9')
|
||||
return (DriverElement(e, self.page, self.timeout)
|
||||
if isinstance(e, WebElement) else unescape(e).replace('\xa0', ' '))
|
||||
if isinstance(e, WebElement):
|
||||
return DriverElement(e, self.page, self.timeout)
|
||||
elif isinstance(e, str):
|
||||
return format_html(e)
|
||||
else:
|
||||
return e
|
||||
|
||||
# 找不到目标时
|
||||
except JavascriptException:
|
||||
return None
|
||||
|
||||
elif self.mode == 'all':
|
||||
e = get_nodes(the_node, xpath_txt=self.xpath)
|
||||
|
||||
# 去除元素间换行符并替换空格
|
||||
e = (unescape(x).replace('\xa0', ' ') if isinstance(x, str) else x for x in e if x != '\n')
|
||||
|
||||
return [DriverElement(x, self.page, self.timeout) if isinstance(x, WebElement) else x for x in e]
|
||||
# 去除元素间换行符
|
||||
return ([DriverElement(x, self.page, self.timeout) if isinstance(x, WebElement)
|
||||
else format_html(x)
|
||||
for x in get_nodes(the_node, xpath_txt=self.xpath)
|
||||
if x != '\n'])
|
||||
|
@ -5,20 +5,20 @@
|
||||
@File : session_element.py
|
||||
"""
|
||||
import re
|
||||
from html import unescape
|
||||
from typing import Union, List, Tuple
|
||||
from urllib.parse import urlparse, urljoin, urlunparse
|
||||
|
||||
from cssselect import SelectorSyntaxError
|
||||
from lxml.etree import tostring, HTML, _Element, XPathEvalError
|
||||
from lxml.etree import tostring, XPathEvalError
|
||||
from lxml.html import HtmlElement, fromstring
|
||||
|
||||
from .common import DrissionElement, get_loc_from_str, translate_loc_to_xpath
|
||||
from .common import DrissionElement, str_to_loc, translate_loc, format_html
|
||||
|
||||
|
||||
class SessionElement(DrissionElement):
|
||||
"""session模式的元素对象,包装了一个lxml的Element对象,并封装了常用功能"""
|
||||
|
||||
def __init__(self, ele: _Element, page=None):
|
||||
def __init__(self, ele: HtmlElement, page=None):
|
||||
super().__init__(ele, page)
|
||||
|
||||
def __repr__(self):
|
||||
@ -30,8 +30,13 @@ class SessionElement(DrissionElement):
|
||||
|
||||
@property
|
||||
def html(self) -> str:
|
||||
"""返回元素outerHTML文本"""
|
||||
return format_html(tostring(self._inner_ele).decode())
|
||||
|
||||
@property
|
||||
def inner_html(self) -> str:
|
||||
"""返回元素innerHTML文本"""
|
||||
html = unescape(tostring(self._inner_ele).decode()).replace('\xa0', ' ')
|
||||
html = format_html(tostring(self._inner_ele).decode())
|
||||
r = re.match(r'<.*?>(.*)</.*?>', html, flags=re.DOTALL)
|
||||
return None if not r else r.group(1)
|
||||
|
||||
@ -48,7 +53,7 @@ class SessionElement(DrissionElement):
|
||||
@property
|
||||
def text(self) -> str:
|
||||
"""返回元素内所有文本"""
|
||||
return unescape(self._inner_ele.text).replace('\xa0', ' ')
|
||||
return self._inner_ele.text_content()
|
||||
|
||||
@property
|
||||
def css_path(self) -> str:
|
||||
@ -81,9 +86,20 @@ class SessionElement(DrissionElement):
|
||||
:return: 文本列表
|
||||
"""
|
||||
if text_node_only:
|
||||
return self.eles('xpath:./*/text()')
|
||||
return self.eles('xpath:/text()')
|
||||
else:
|
||||
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./*/node()')]
|
||||
texts = []
|
||||
|
||||
for node in self.eles('xpath:/node()'):
|
||||
if isinstance(node, str):
|
||||
text = node
|
||||
else:
|
||||
text = node.text
|
||||
|
||||
if text:
|
||||
texts.append(text)
|
||||
|
||||
return texts
|
||||
|
||||
def parents(self, num: int = 1):
|
||||
"""返回上面第num级父元素 \n
|
||||
@ -128,14 +144,14 @@ class SessionElement(DrissionElement):
|
||||
elif attr == 'src':
|
||||
return self._make_absolute(self.inner_ele.get('src'))
|
||||
|
||||
elif attr == 'text':
|
||||
elif attr in ['text', 'innerText']:
|
||||
return self.text
|
||||
|
||||
elif attr == 'outerHTML':
|
||||
return unescape(tostring(self._inner_ele).decode()).replace('\xa0', ' ')
|
||||
return self.html
|
||||
|
||||
elif attr == 'innerHTML':
|
||||
return self.html
|
||||
return self.inner_html
|
||||
|
||||
else:
|
||||
return self.inner_ele.get(attr)
|
||||
@ -167,30 +183,33 @@ class SessionElement(DrissionElement):
|
||||
"""
|
||||
if isinstance(loc_or_str, (str, tuple)):
|
||||
if isinstance(loc_or_str, str):
|
||||
loc_or_str = get_loc_from_str(loc_or_str)
|
||||
loc_or_str = str_to_loc(loc_or_str)
|
||||
else:
|
||||
if len(loc_or_str) != 2:
|
||||
raise ValueError("Len of loc_or_str must be 2 when it's a tuple.")
|
||||
loc_or_str = translate_loc_to_xpath(loc_or_str)
|
||||
loc_or_str = translate_loc(loc_or_str)
|
||||
else:
|
||||
raise ValueError('Argument loc_or_str can only be tuple or str.')
|
||||
|
||||
element = self
|
||||
if loc_or_str[0] == 'xpath':
|
||||
brackets = len(re.match(r'\(*', loc_or_str[1]).group(0))
|
||||
bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:]
|
||||
loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}'
|
||||
loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
|
||||
loc_str = f'{bracket}{loc_str}'
|
||||
loc_str = loc_or_str[1]
|
||||
# if loc_or_str[0] == 'xpath':
|
||||
# brackets = len(re.match(r'\(*', loc_or_str[1]).group(0))
|
||||
# bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:]
|
||||
# loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}'
|
||||
# loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}'
|
||||
# loc_str = f'{bracket}{loc_str}'
|
||||
|
||||
else: # css selector
|
||||
if loc_or_str[1][0].startswith('>'):
|
||||
loc_str = f'{self.css_path}{loc_or_str[1]}'
|
||||
element = self.page
|
||||
else:
|
||||
loc_str = loc_or_str[1]
|
||||
if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'):
|
||||
loc_str = f'.{loc_str}'
|
||||
|
||||
# 若css以>开头,表示找元素的直接子元素,要用page以绝对路径才能找到
|
||||
if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'):
|
||||
loc_str = f'{self.css_path}{loc_or_str[1]}'
|
||||
element = self.page
|
||||
|
||||
loc_or_str = loc_or_str[0], loc_str
|
||||
|
||||
return execute_session_find(element, loc_or_str, mode)
|
||||
|
||||
def eles(self, loc_or_str: Union[Tuple[str, str], str]):
|
||||
@ -298,7 +317,7 @@ class SessionElement(DrissionElement):
|
||||
|
||||
def execute_session_find(page_or_ele,
|
||||
loc: Tuple[str, str],
|
||||
mode: str = 'single', ) -> Union[SessionElement, List[SessionElement or str], None]:
|
||||
mode: str = 'single', ) -> Union[SessionElement, List[SessionElement or str], str, None]:
|
||||
"""执行session模式元素的查找 \n
|
||||
页面查找元素及元素查找下级元素皆使用此方法 \n
|
||||
:param page_or_ele: SessionPage对象或SessionElement对象
|
||||
@ -316,7 +335,7 @@ def execute_session_find(page_or_ele,
|
||||
page_or_ele = page_or_ele.inner_ele
|
||||
else: # 传入的是SessionPage对象
|
||||
page = page_or_ele
|
||||
page_or_ele = HTML(page_or_ele.response.text)
|
||||
page_or_ele = fromstring(page_or_ele.html)
|
||||
|
||||
try:
|
||||
# 用lxml内置方法获取lxml的元素对象列表
|
||||
@ -329,20 +348,19 @@ def execute_session_find(page_or_ele,
|
||||
if mode == 'single':
|
||||
ele = ele[0] if ele else None
|
||||
|
||||
if isinstance(ele, _Element):
|
||||
if isinstance(ele, HtmlElement):
|
||||
return SessionElement(ele, page)
|
||||
elif isinstance(ele, str):
|
||||
return unescape(ele).replace('\xa0', ' ')
|
||||
return ele
|
||||
else:
|
||||
return None
|
||||
|
||||
elif mode == 'all':
|
||||
# 去除元素间换行符并替换空格
|
||||
ele = (unescape(x).replace('\xa0', ' ') if isinstance(x, str) else x for x in ele if x != '\n')
|
||||
return [SessionElement(e, page) if isinstance(e, _Element) else e for e in ele]
|
||||
# 去除元素间换行符
|
||||
return [SessionElement(e, page) if isinstance(e, HtmlElement) else e for e in ele if e != '\n']
|
||||
|
||||
except XPathEvalError:
|
||||
raise SyntaxError('Invalid xpath syntax.', loc)
|
||||
raise SyntaxError(f'Invalid xpath syntax. {loc}')
|
||||
|
||||
except SelectorSyntaxError:
|
||||
raise SyntaxError('Invalid css selector syntax.', loc)
|
||||
raise SyntaxError(f'Invalid css selector syntax. {loc}')
|
||||
|
Loading…
x
Reference in New Issue
Block a user