From e03a67c3a5dc0429c82ea2d1ad5d523c599bb529 Mon Sep 17 00:00:00 2001 From: g1879 Date: Mon, 4 Jan 2021 17:42:45 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=96=84texts()=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/driver_element.py | 6 ++++-- DrissionPage/session_element.py | 12 +++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/DrissionPage/driver_element.py b/DrissionPage/driver_element.py index 01cc60e..e8376c0 100644 --- a/DrissionPage/driver_element.py +++ b/DrissionPage/driver_element.py @@ -155,9 +155,11 @@ class DriverElement(DrissionElement): :return: 文本列表 """ if text_node_only: - return self.eles('xpath:/text()') + texts = self.eles('xpath:/text()') else: - return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')] + texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')] + + return [x.strip(' ') for x in texts if x and x.replace('\n', '').replace('\t', '').replace(' ', '') != ''] def parents(self, num: int = 1): """返回上面第num级父元素 \n diff --git a/DrissionPage/session_element.py b/DrissionPage/session_element.py index 498c7d3..c76d9e3 100644 --- a/DrissionPage/session_element.py +++ b/DrissionPage/session_element.py @@ -54,7 +54,12 @@ class SessionElement(DrissionElement): str_list = [] if ele.tag == 'pre': pre = True + + current_tag = None for el in ele.eles('xpath:./text() | *'): + if current_tag in ('br', 'p') and str_list and str_list[-1] != '\n': + str_list.append('\n') + if isinstance(el, str): if el.replace(' ', '').replace('\n', '') != '': if pre: @@ -66,10 +71,10 @@ class SessionElement(DrissionElement): str_list.append('\n') else: str_list.append(' ') + current_tag = None else: str_list.extend(get_node(el, pre)) - if el.tag in ('br', 'p',) and str_list and str_list[-1] != '\n': - str_list.append('\n') + current_tag = el.tag return str_list @@ -138,7 +143,8 @@ class SessionElement(DrissionElement): else: texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')] - return [format_html(x) for x in texts if x and x.replace('\n', '').replace('\t', '').replace(' ', '') != ''] + return [format_html(x.strip(' ')) for x in texts if + x and x.replace('\n', '').replace('\t', '').replace(' ', '') != ''] def parents(self, num: int = 1): """返回上面第num级父元素 \n