From eb866ba29eda29a40efaa3090e9d3eaf0ef69692 Mon Sep 17 00:00:00 2001 From: g1879 Date: Thu, 12 Nov 2020 18:09:01 +0800 Subject: [PATCH] =?UTF-8?q?=E5=85=83=E7=B4=A0=E7=9A=84html=E5=B1=9E?= =?UTF-8?q?=E6=80=A7=E6=94=B9=E4=B8=BA=E8=BF=94=E5=9B=9EouterHTML=EF=BC=8C?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0inner=5Fhtml=E5=B1=9E=E6=80=A7=EF=BC=9B=20?= =?UTF-8?q?=E5=A4=84=E7=90=86xpath=E6=97=B6=E5=8F=AA=E5=AF=B9/=E5=BC=80?= =?UTF-8?q?=E5=A4=B4=E7=9A=84=E6=83=85=E5=86=B5=E6=B7=BB=E5=8A=A0.?= =?UTF-8?q?=EF=BC=9B=20=E5=9C=A8=E7=BB=9F=E4=B8=80=E7=9A=84=E5=87=BD?= =?UTF-8?q?=E6=95=B0=E5=A4=84=E7=90=86html=E8=BD=AC=E7=A0=81=E5=8F=8A?= =?UTF-8?q?=E7=A9=BA=E6=A0=BC=E6=9B=BF=E6=8D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/driver_element.py | 69 +++++++++++++++----------- DrissionPage/session_element.py | 86 ++++++++++++++++++++------------- 2 files changed, 93 insertions(+), 62 deletions(-) diff --git a/DrissionPage/driver_element.py b/DrissionPage/driver_element.py index 2ebf78a..5cddc7f 100644 --- a/DrissionPage/driver_element.py +++ b/DrissionPage/driver_element.py @@ -4,8 +4,6 @@ @Contact : g1879@qq.com @File : driver_element.py """ -import re -from html import unescape from pathlib import Path from time import sleep from typing import Union, List, Any, Tuple @@ -16,7 +14,7 @@ from selenium.webdriver.remote.webelement import WebElement from selenium.webdriver.support import expected_conditions as ec from selenium.webdriver.support.wait import WebDriverWait -from .common import DrissionElement, get_loc_from_str, get_available_file_name, translate_loc_to_xpath +from .common import DrissionElement, str_to_loc, get_available_file_name, translate_loc, format_html class DriverElement(DrissionElement): @@ -39,8 +37,13 @@ class DriverElement(DrissionElement): # -----------------共有属性------------------- @property def html(self) -> str: + """返回元素outerHTML文本""" + return self.attr('outerHTML') + + @property + def inner_html(self) -> str: """返回元素innerHTML文本""" - return unescape(self.attr('innerHTML')).replace('\xa0', ' ') + return self.attr('innerHTML') @property def tag(self) -> str: @@ -69,7 +72,7 @@ class DriverElement(DrissionElement): @property def text(self) -> str: """返回元素内所有文本""" - return unescape(self.attr('innerText')).replace('\xa0', ' ') + return self.attr('innerText') @property def css_path(self) -> str: @@ -164,7 +167,8 @@ class DriverElement(DrissionElement): :param attr: 属性名 :return: 属性值文本 """ - return self.text if attr == 'text' else self.inner_ele.get_attribute(attr) + attr = 'innerText' if attr == 'text' else attr + return format_html(self.inner_ele.get_attribute(attr)) def ele(self, loc_or_str: Union[Tuple[str, str], str], @@ -197,29 +201,35 @@ class DriverElement(DrissionElement): """ if isinstance(loc_or_str, (str, tuple)): if isinstance(loc_or_str, str): - loc_or_str = get_loc_from_str(loc_or_str) + loc_or_str = str_to_loc(loc_or_str) else: if len(loc_or_str) != 2: raise ValueError("Len of loc_or_str must be 2 when it's a tuple.") - loc_or_str = translate_loc_to_xpath(loc_or_str) + + loc_or_str = translate_loc(loc_or_str) + else: raise ValueError('Argument loc_or_str can only be tuple or str.') - if loc_or_str[0] == 'xpath': - # 处理语句最前面的( - brackets = len(re.match(r'\(*', loc_or_str[1]).group(0)) - bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:] + loc_str = loc_or_str[1] + # if loc_or_str[0] == 'xpath': + # # 处理语句最前面的( + # brackets = len(re.match(r'\(*', loc_or_str[1]).group(0)) + # bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:] + # + # # 确保查询语句最前面是. + # loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}' + # loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}' + # loc_str = f'{bracket}{loc_str}' - # 确保查询语句最前面是. - loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}' - loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}' - loc_or_str = loc_or_str[0], f'{bracket}{loc_str}' + if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'): + loc_str = f'.{loc_str}' - elif loc_or_str[0] == 'css selector': - if loc_or_str[1].lstrip().startswith('>'): - loc_or_str = loc_or_str[0], f'{self.css_path}{loc_or_str[1]}' + if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'): + loc_str = f'{self.css_path}{loc_or_str[1]}' timeout = timeout or self.timeout + loc_or_str = loc_or_str[0], loc_str return execute_driver_find(self, loc_or_str, mode, timeout) @@ -571,7 +581,7 @@ def execute_driver_find(page_or_ele, return [] if mode == 'all' else None except InvalidElementStateException: - raise ValueError('Invalid query syntax.', loc) + raise ValueError(f'Invalid query syntax. {loc}') class ElementsByXpath(object): @@ -641,17 +651,20 @@ class ElementsByXpath(object): if self.mode == 'single': try: e = get_nodes(the_node, xpath_txt=self.xpath, type_txt='9') - return (DriverElement(e, self.page, self.timeout) - if isinstance(e, WebElement) else unescape(e).replace('\xa0', ' ')) + if isinstance(e, WebElement): + return DriverElement(e, self.page, self.timeout) + elif isinstance(e, str): + return format_html(e) + else: + return e # 找不到目标时 except JavascriptException: return None elif self.mode == 'all': - e = get_nodes(the_node, xpath_txt=self.xpath) - - # 去除元素间换行符并替换空格 - e = (unescape(x).replace('\xa0', ' ') if isinstance(x, str) else x for x in e if x != '\n') - - return [DriverElement(x, self.page, self.timeout) if isinstance(x, WebElement) else x for x in e] + # 去除元素间换行符 + return ([DriverElement(x, self.page, self.timeout) if isinstance(x, WebElement) + else format_html(x) + for x in get_nodes(the_node, xpath_txt=self.xpath) + if x != '\n']) diff --git a/DrissionPage/session_element.py b/DrissionPage/session_element.py index 4a2613b..2bcd3c0 100644 --- a/DrissionPage/session_element.py +++ b/DrissionPage/session_element.py @@ -5,20 +5,20 @@ @File : session_element.py """ import re -from html import unescape from typing import Union, List, Tuple from urllib.parse import urlparse, urljoin, urlunparse from cssselect import SelectorSyntaxError -from lxml.etree import tostring, HTML, _Element, XPathEvalError +from lxml.etree import tostring, XPathEvalError +from lxml.html import HtmlElement, fromstring -from .common import DrissionElement, get_loc_from_str, translate_loc_to_xpath +from .common import DrissionElement, str_to_loc, translate_loc, format_html class SessionElement(DrissionElement): """session模式的元素对象,包装了一个lxml的Element对象,并封装了常用功能""" - def __init__(self, ele: _Element, page=None): + def __init__(self, ele: HtmlElement, page=None): super().__init__(ele, page) def __repr__(self): @@ -30,8 +30,13 @@ class SessionElement(DrissionElement): @property def html(self) -> str: + """返回元素outerHTML文本""" + return format_html(tostring(self._inner_ele).decode()) + + @property + def inner_html(self) -> str: """返回元素innerHTML文本""" - html = unescape(tostring(self._inner_ele).decode()).replace('\xa0', ' ') + html = format_html(tostring(self._inner_ele).decode()) r = re.match(r'<.*?>(.*)', html, flags=re.DOTALL) return None if not r else r.group(1) @@ -48,7 +53,7 @@ class SessionElement(DrissionElement): @property def text(self) -> str: """返回元素内所有文本""" - return unescape(self._inner_ele.text).replace('\xa0', ' ') + return self._inner_ele.text_content() @property def css_path(self) -> str: @@ -81,9 +86,20 @@ class SessionElement(DrissionElement): :return: 文本列表 """ if text_node_only: - return self.eles('xpath:./*/text()') + return self.eles('xpath:/text()') else: - return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./*/node()')] + texts = [] + + for node in self.eles('xpath:/node()'): + if isinstance(node, str): + text = node + else: + text = node.text + + if text: + texts.append(text) + + return texts def parents(self, num: int = 1): """返回上面第num级父元素 \n @@ -128,14 +144,14 @@ class SessionElement(DrissionElement): elif attr == 'src': return self._make_absolute(self.inner_ele.get('src')) - elif attr == 'text': + elif attr in ['text', 'innerText']: return self.text elif attr == 'outerHTML': - return unescape(tostring(self._inner_ele).decode()).replace('\xa0', ' ') + return self.html elif attr == 'innerHTML': - return self.html + return self.inner_html else: return self.inner_ele.get(attr) @@ -167,30 +183,33 @@ class SessionElement(DrissionElement): """ if isinstance(loc_or_str, (str, tuple)): if isinstance(loc_or_str, str): - loc_or_str = get_loc_from_str(loc_or_str) + loc_or_str = str_to_loc(loc_or_str) else: if len(loc_or_str) != 2: raise ValueError("Len of loc_or_str must be 2 when it's a tuple.") - loc_or_str = translate_loc_to_xpath(loc_or_str) + loc_or_str = translate_loc(loc_or_str) else: raise ValueError('Argument loc_or_str can only be tuple or str.') element = self - if loc_or_str[0] == 'xpath': - brackets = len(re.match(r'\(*', loc_or_str[1]).group(0)) - bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:] - loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}' - loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}' - loc_str = f'{bracket}{loc_str}' + loc_str = loc_or_str[1] + # if loc_or_str[0] == 'xpath': + # brackets = len(re.match(r'\(*', loc_or_str[1]).group(0)) + # bracket, loc_str = '(' * brackets, loc_or_str[1][brackets:] + # loc_str = loc_str if loc_str.startswith(('.', '/')) else f'.//{loc_str}' + # loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}' + # loc_str = f'{bracket}{loc_str}' - else: # css selector - if loc_or_str[1][0].startswith('>'): - loc_str = f'{self.css_path}{loc_or_str[1]}' - element = self.page - else: - loc_str = loc_or_str[1] + if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'): + loc_str = f'.{loc_str}' + + # 若css以>开头,表示找元素的直接子元素,要用page以绝对路径才能找到 + if loc_or_str[0] == 'css selector' and loc_or_str[1].lstrip().startswith('>'): + loc_str = f'{self.css_path}{loc_or_str[1]}' + element = self.page loc_or_str = loc_or_str[0], loc_str + return execute_session_find(element, loc_or_str, mode) def eles(self, loc_or_str: Union[Tuple[str, str], str]): @@ -298,7 +317,7 @@ class SessionElement(DrissionElement): def execute_session_find(page_or_ele, loc: Tuple[str, str], - mode: str = 'single', ) -> Union[SessionElement, List[SessionElement or str], None]: + mode: str = 'single', ) -> Union[SessionElement, List[SessionElement or str], str, None]: """执行session模式元素的查找 \n 页面查找元素及元素查找下级元素皆使用此方法 \n :param page_or_ele: SessionPage对象或SessionElement对象 @@ -316,7 +335,7 @@ def execute_session_find(page_or_ele, page_or_ele = page_or_ele.inner_ele else: # 传入的是SessionPage对象 page = page_or_ele - page_or_ele = HTML(page_or_ele.response.text) + page_or_ele = fromstring(page_or_ele.html) try: # 用lxml内置方法获取lxml的元素对象列表 @@ -329,20 +348,19 @@ def execute_session_find(page_or_ele, if mode == 'single': ele = ele[0] if ele else None - if isinstance(ele, _Element): + if isinstance(ele, HtmlElement): return SessionElement(ele, page) elif isinstance(ele, str): - return unescape(ele).replace('\xa0', ' ') + return ele else: return None elif mode == 'all': - # 去除元素间换行符并替换空格 - ele = (unescape(x).replace('\xa0', ' ') if isinstance(x, str) else x for x in ele if x != '\n') - return [SessionElement(e, page) if isinstance(e, _Element) else e for e in ele] + # 去除元素间换行符 + return [SessionElement(e, page) if isinstance(e, HtmlElement) else e for e in ele if e != '\n'] except XPathEvalError: - raise SyntaxError('Invalid xpath syntax.', loc) + raise SyntaxError(f'Invalid xpath syntax. {loc}') except SelectorSyntaxError: - raise SyntaxError('Invalid css selector syntax.', loc) + raise SyntaxError(f'Invalid css selector syntax. {loc}')