From 744e09c6492ffc9db43ae735acbe570b86919139 Mon Sep 17 00:00:00 2001 From: g1879 Date: Mon, 4 Jan 2021 17:23:13 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BB=A7=E7=BB=AD=E5=AE=8C=E5=96=84text?= =?UTF-8?q?=E5=B1=9E=E6=80=A7=E8=BF=94=E5=9B=9E=E5=86=85=E5=AE=B9=EF=BC=9B?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0comments=E5=B1=9E=E6=80=A7=EF=BC=9B=E5=AE=8C?= =?UTF-8?q?=E5=96=84=E5=AF=B9=E5=85=83=E7=B4=A0=E5=86=85=E6=9C=89=E6=B3=A8?= =?UTF-8?q?=E9=87=8A=E6=97=B6=E8=8A=82=E7=82=B9=E7=9A=84=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/driver_element.py | 25 ++++++----- DrissionPage/session_element.py | 73 ++++++++++++++++----------------- 2 files changed, 51 insertions(+), 47 deletions(-) diff --git a/DrissionPage/driver_element.py b/DrissionPage/driver_element.py index 65a3b37..01cc60e 100644 --- a/DrissionPage/driver_element.py +++ b/DrissionPage/driver_element.py @@ -4,7 +4,6 @@ @Contact : g1879@qq.com @File : driver_element.py """ -import re from pathlib import Path from time import sleep from typing import Union, List, Any, Tuple @@ -79,13 +78,13 @@ class DriverElement(DrissionElement): @property def text(self) -> str: """返回元素内所有文本""" + return format_html(self.inner_ele.get_attribute('innerText'), False) # return self.inner_ele.get_attribute('innerText') - re_str = self.inner_ele.get_attribute('innerText') - re_str = re.sub(r'\n{2,}', '\n', re_str) - re_str = re.sub(r' {2,}', ' ', re_str) - - return format_html(re_str.strip('\n ')) - # return re_str.strip('\n ') + # re_str = self.inner_ele.get_attribute('innerText') + # re_str = re.sub(r'\n{2,}', '\n', re_str) + # re_str = re.sub(r' {2,}', ' ', re_str) + # + # return format_html(re_str.strip('\n ')) @property def link(self) -> str: @@ -116,6 +115,10 @@ class DriverElement(DrissionElement): """返回前一个兄弟元素""" return self._get_brother(1, 'ele', 'prev') + @property + def comments(self): + return self.eles('xpath:.//comment()') + # -----------------driver独占属性------------------- @property def size(self) -> dict: @@ -152,9 +155,9 @@ class DriverElement(DrissionElement): :return: 文本列表 """ if text_node_only: - return self.eles('xpath:./text()') + return self.eles('xpath:/text()') else: - return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./node()')] + return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')] def parents(self, num: int = 1): """返回上面第num级父元素 \n @@ -576,7 +579,7 @@ class DriverElement(DrissionElement): ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout) # 跳过元素间的换行符 - while ele_or_node == '\n': + while isinstance(ele_or_node, str) and ele_or_node.replace('\n', '').replace('\t', '').replace(' ', '') == '': num += 1 ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout) @@ -662,6 +665,7 @@ class ElementsByXpath(object): return_txt = ''' if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;} else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;} + else if(e.singleNodeValue.constructor.name=="Comment"){return e.singleNodeValue.nodeValue;} else{return e.singleNodeValue;} ''' @@ -672,6 +676,7 @@ class ElementsByXpath(object): for(var i = 0; i str: """返回元素内所有文本""" + # 为尽量保证与浏览器结果一致,弄得比较复杂 + def get_node(ele, pre: bool = False): + str_list = [] + if ele.tag == 'pre': + pre = True + for el in ele.eles('xpath:./text() | *'): + if isinstance(el, str): + if el.replace(' ', '').replace('\n', '') != '': + if pre: + str_list.append(el) + else: + str_list.append(el.replace('\n', ' ').strip(' \t')) + + elif '\n' in el and str_list and str_list[-1] != '\n': + str_list.append('\n') + else: + str_list.append(' ') + else: + str_list.extend(get_node(el, pre)) + if el.tag in ('br', 'p',) and str_list and str_list[-1] != '\n': + str_list.append('\n') + + return str_list + + re_str = ''.join(get_node(self)) + re_str = re.sub(r' {2,}', ' ', re_str) + return format_html(re_str, False) + # re_str = str(self._inner_ele.text_content()) # # re_str = re.sub(r'
', '\n', re_str) # re_str = re.sub(r'\n{2,}', '\n', re_str) @@ -56,30 +84,6 @@ class SessionElement(DrissionElement): # return format_html(re_str.strip('\n ')) # # return format_html(re_str) - # 为尽量保证与浏览器结果一致,弄得比较复杂 - def get_node(ele): - str_list = [] - for el in ele.eles('xpath:./node()'): - if isinstance(el, str): - if el.replace(' ', '').replace('\n', '') != '': - # str_list.append(el.replace('\xa0', ' ').replace('\n', ' ').strip()) - str_list.append(el.replace('\n', ' ').strip(' ')) - elif '\n' in el: - str_list.append('\n') - else: - str_list.append(' ') - else: - str_list.extend(get_node(el)) - if el.tag in ('br', 'p',): - str_list.append('\n') - - return str_list - - re_str = ''.join(get_node(self)) - re_str = re.sub(r'\n{2,}', '\n', re_str) - re_str = re.sub(r' {2,}', ' ', re_str) - return format_html(re_str.strip('\n ')) - @property def tag(self) -> str: """返回元素类型""" @@ -120,26 +124,21 @@ class SessionElement(DrissionElement): """返回前一个兄弟元素""" return self._get_brother(1, 'ele', 'prev') + @property + def comments(self): + return self.eles('xpath:.//comment()') + def texts(self, text_node_only: bool = False) -> list: """返回元素内所有直接子节点的文本,包括元素和文本节点 \n :param text_node_only: 是否只返回文本节点 :return: 文本列表 """ if text_node_only: - return self.eles('xpath:/text()') + texts = self.eles('xpath:/text()') else: - texts = [] + texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')] - for node in self.eles('xpath:/node()'): - if isinstance(node, str): - text = node - else: - text = node.text - - if text: - texts.append(text) - - return texts + return [format_html(x) for x in texts if x and x.replace('\n', '').replace('\t', '').replace(' ', '') != ''] def parents(self, num: int = 1): """返回上面第num级父元素 \n @@ -392,7 +391,7 @@ def execute_session_find(page_or_ele, page_or_ele = page_or_ele.inner_ele else: # 传入的是SessionPage对象 page = page_or_ele - page_or_ele = fromstring(page_or_ele.response.text) + page_or_ele = fromstring(re.sub(r' ?', ' ', page_or_ele.response.text)) try: # 用lxml内置方法获取lxml的元素对象列表