def get_ele_txt(e) -> str:
    """Return the text of an element, laid out roughly as a browser would show it.

    The element's child nodes are walked recursively.  Text nodes are
    whitespace-normalised (except inside ``<pre>``), line breaks are inserted
    around block-level tags and for ``<br>``, and adjacent table cells are
    separated by a tab.

    :param e: element object exposing ``tag``, ``raw_text`` and ``eles()``
    :return: the collected text, stripped of leading/trailing spaces and newlines
    """
    # Inline tags: no line break is inserted in front of them.
    # 'br' belongs here because it emits its own '\n'; without it "a<br>b"
    # would get a separator newline *and* the break itself.
    nowrap_list = ('br', 'sub', 'sup', 'em', 'strong', 'a', 'font', 'b', 'span', 's', 'i', 'del', 'ins', 'img',
                   'td', 'th', 'abbr', 'bdi', 'bdo', 'cite', 'code', 'data', 'dfn', 'kbd', 'mark', 'q', 'rp',
                   'rt', 'ruby', 'samp', 'small', 'time', 'u', 'var', 'wbr', 'button', 'slot', 'content')
    # Block tags: a line break is appended after their content.
    # NOTE: 'address' and 'article' must be separate items; a missing comma
    # silently concatenates them into the single string 'addressarticle'.
    wrap_after_list = ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'li', 'blockquote', 'header',
                       'footer', 'address', 'article', 'aside', 'main', 'nav', 'section', 'figcaption',
                       'summary')
    # Tags whose content is not collected during the recursive walk.
    noText_list = ('script', 'style', 'video', 'audio', 'iframe', 'embed', 'noscript', 'canvas', 'template')
    # Table cells: consecutive cells are separated by '\t'.
    tab_list = ('td', 'th')

    # When the requested element itself is one of the skipped tags, hand back
    # its raw text.  Compare case-insensitively, like the recursive helper does.
    if e.tag.lower() in noText_list:
        return e.raw_text

    def get_node_txt(ele, pre: bool = False) -> list:
        """Collect the text fragments of ``ele`` and its descendants as a list of strings."""
        tag = ele.tag.lower()

        if tag in noText_list:  # e.g. script content is never returned
            return []
        if tag == 'br':
            return ['\n']
        if tag == 'pre':  # whitespace is preserved for everything below <pre>
            pre = True

        str_list = []
        prev_ele = ''  # tag name of the previous *element* sibling
        for el in ele.eles('xpath:./text() | *'):
            if isinstance(el, str):  # text node
                if pre:
                    str_list.append(el)
                elif sub('[ \n]', '', el) != '':  # holds more than spaces/newlines
                    txt = el.replace('\n', ' ').strip(' ')
                    str_list.append(sub(r' {2,}', ' ', txt))

            else:  # element node
                el_tag = el.tag.lower()
                # Start a new line before a block-level child.
                if el_tag not in nowrap_list and str_list and str_list[-1] != '\n':
                    str_list.append('\n')
                # Separate neighbouring cells of a table row.
                if el_tag in tab_list and prev_ele in tab_list:
                    str_list.append('\t')

                str_list.extend(get_node_txt(el, pre))
                prev_ele = el_tag

        # Some tags are followed by a line break.
        if tag in wrap_after_list and str_list and str_list[-1] != '\n':
            str_list.append('\n')

        return str_list

    re_str = ''.join(get_node_txt(e))
    return format_html(re_str, False).strip(' \n')
@property
def text(self) -> str:
    """Return all text contained in this element.

    The extraction itself is delegated to ``get_ele_txt``.
    """
    element_text = get_ele_txt(self)
    return element_text