From 348b4e164f33a98b73e0828d16de4fc1767c7ca8 Mon Sep 17 00:00:00 2001 From: g1879 Date: Tue, 30 Nov 2021 22:26:20 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9C=AA=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/session_element.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/DrissionPage/session_element.py b/DrissionPage/session_element.py index f15fc89..5588953 100644 --- a/DrissionPage/session_element.py +++ b/DrissionPage/session_element.py @@ -60,10 +60,15 @@ class SessionElement(DrissionElement): """返回元素内所有文本""" # 为尽量保证与浏览器结果一致,弄得比较复杂 - nowrap_list = ('a', 'font', 'b', 'span', 'style', 's', 'i', 'del', 'br', 'img', 'td') # 前面无须换行的元素 - wrap_after_list = ('p', 'div', 'br') # 后面添加换行的元素 - noText_list = ('script',) # 不获取文本的元素 - tab_list = ('td',) + # 前面无须换行的元素 + nowrap_list = ('sub', 'em', 'strong', 'a', 'font', 'b', 'span', 's', 'i', 'del', 'ins', 'br', 'img', 'td', 'th', + 'abbr', 'bdi', 'bdo', 'cite', 'code', 'data', 'dfn', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby', + 'samp', 'small', 'sub', 'time', 'u', 'var', 'wbr', 'button', 'slot', 'content') + # 后面添加换行的元素 + wrap_after_list = ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'li', 'blockquote') + noText_list = ( + 'script', 'style', 'video', 'audio', 'iframe', 'embed', 'noscript', 'canvas', 'template') # 不获取文本的元素 + tab_list = ('td', 'th') def get_node_txt(ele, pre: bool = False): str_list = [] @@ -75,7 +80,8 @@ class SessionElement(DrissionElement): pre = True nodes = ele.eles('xpath:./text() | *') - for k, el in enumerate(nodes): + prev_ele = '' + for el in nodes: if isinstance(el, str): # 字符节点 if pre: str_list.append(el) @@ -91,9 +97,10 @@ class SessionElement(DrissionElement): else: # 元素节点 if el.tag.lower() not in nowrap_list and str_list and str_list[-1] != '\n': # 元素间换行的情况 str_list.append('\n') - if el.tag.lower() in tab_list: + if el.tag.lower() in tab_list and prev_ele in tab_list: str_list.append('\t') str_list.extend(get_node_txt(el, pre)) + prev_ele = el.tag.lower() if tag in wrap_after_list: # 有些元素后面要添加回车 str_list.append('\n')