From 219e2c8cc30d07b9ce25b74abd536959d3703feb Mon Sep 17 00:00:00 2001
From: g1879 <g1879@qq.com>
Date: Wed, 1 Dec 2021 00:10:44 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=96=84=E8=8E=B7=E5=8F=96=E5=85=83?=
 =?UTF-8?q?=E7=B4=A0text=EF=BC=8C=E7=BB=9F=E4=B8=80=E4=B8=A4=E7=A7=8D?=
 =?UTF-8?q?=E6=A8=A1=E5=BC=8F=E8=8E=B7=E5=8F=96text=E7=9A=84=E6=96=B9?=
 =?UTF-8?q?=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 DrissionPage/common.py          | 60 +++++++++++++++++++++++++++++++++
 DrissionPage/driver_element.py  |  9 ++---
 DrissionPage/session_element.py | 56 ++----------------------------
 3 files changed, 65 insertions(+), 60 deletions(-)

diff --git a/DrissionPage/common.py b/DrissionPage/common.py
index 5fb44c0..c842529 100644
--- a/DrissionPage/common.py
+++ b/DrissionPage/common.py
@@ -115,6 +115,66 @@ def str_to_loc(loc: str) -> tuple:
     return loc_by, loc_str
 
 
+def get_ele_txt(e) -> str:
+    """Return the text inside element ``e``, laid out to stay close to what a browser shows.
+    :param e: element object exposing ``tag``, ``raw_text`` and ``eles()``
+    :return: collected text, passed through ``format_html`` and stripped of outer spaces/newlines
+    """
+    # Inline elements: no newline is inserted before them
+    nowrap_list = ('sub', 'em', 'strong', 'a', 'font', 'b', 'span', 's', 'i', 'del', 'ins', 'img', 'td', 'th',
+                   'abbr', 'bdi', 'bdo', 'cite', 'code', 'data', 'dfn', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby',
+                   'samp', 'small', 'sup', 'time', 'u', 'var', 'wbr', 'button', 'slot', 'content')
+    # Block elements: a newline is appended after them
+    wrap_after_list = ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'li', 'blockquote', 'header',
+                       'footer', 'address', 'article', 'aside', 'main', 'nav', 'section', 'figcaption', 'summary')
+    # Elements whose content is never collected as text
+    noText_list = ('script', 'style', 'video', 'audio', 'iframe', 'embed', 'noscript', 'canvas', 'template')
+    # Table cells: adjacent cells are separated by '\t'
+    tab_list = ('td', 'th')
+
+    if e.tag.lower() in noText_list:  # asked directly for such an element: hand back its raw text
+        return e.raw_text
+
+    def get_node_txt(ele, pre: bool = False) -> list:
+        """Recursively collect the text fragments of ``ele``; ``pre`` keeps text nodes verbatim."""
+        str_list = []
+        tag = ele.tag.lower()
+
+        if tag in noText_list:  # e.g. text inside a nested <script> is not returned
+            return str_list
+        if tag == 'br':
+            return ['\n']  # a list, like every other return path (the caller extend()s it)
+        if tag == 'pre':
+            pre = True
+
+        nodes = ele.eles('xpath:./text() | *')
+        prev_ele = ''
+        for el in nodes:
+            if isinstance(el, str):  # text node
+                if pre:
+                    str_list.append(el)
+                elif sub('[ \n]', '', el) != '':  # holds something besides spaces and line breaks
+                    txt = el.replace('\n', ' ').strip(' ')
+                    str_list.append(sub(r' {2,}', ' ', txt))
+
+            else:  # element node
+                if el.tag.lower() not in nowrap_list and str_list and str_list[-1] != '\n':  # break between elements
+                    str_list.append('\n')
+                if el.tag.lower() in tab_list and prev_ele in tab_list:  # consecutive cells of a table row
+                    str_list.append('\t')
+
+                str_list.extend(get_node_txt(el, pre))
+                prev_ele = el.tag.lower()
+
+        if tag in wrap_after_list and str_list and str_list[-1] != '\n':  # some elements end with a newline
+            str_list.append('\n')
+
+        return str_list
+
+    re_str = ''.join(get_node_txt(e))
+    return format_html(re_str, False).strip(' \n')
+
+
 def _make_xpath_str(tag: str, arg: str, val: str, mode: str = 'fuzzy') -> str:
     """生成xpath语句                                          \n
     :param tag: 标签名
diff --git a/DrissionPage/driver_element.py b/DrissionPage/driver_element.py
index 3035b0e..ab7f4e6 100644
--- a/DrissionPage/driver_element.py
+++ b/DrissionPage/driver_element.py
@@ -16,7 +16,7 @@ from selenium.webdriver.support import expected_conditions as ec
 from selenium.webdriver.support.wait import WebDriverWait
 
 from .base import DrissionElement, BaseElement
-from .common import str_to_loc, get_usable_path, translate_loc, format_html
+from .common import str_to_loc, get_usable_path, translate_loc, format_html, get_ele_txt
 from .session_element import make_session_ele
 
 
@@ -80,12 +80,7 @@ class DriverElement(DrissionElement):
     @property
     def text(self) -> str:
         """返回元素内所有文本"""
-        # re_str = self.inner_ele.text
-        re_str = self.inner_ele.get_attribute('innerText')
-        # re_str = sub(r'\n{2,}', '\n', re_str)
-        # re_str = sub(r' {2,}', ' ', re_str)
-
-        return format_html(re_str.strip('\n '), False)
+        return get_ele_txt(self)
 
     @property
     def raw_text(self) -> str:
diff --git a/DrissionPage/session_element.py b/DrissionPage/session_element.py
index 5588953..05ecaac 100644
--- a/DrissionPage/session_element.py
+++ b/DrissionPage/session_element.py
@@ -4,7 +4,7 @@
 @Contact :   g1879@qq.com
 @File    :   session_element.py
 """
-from re import match, DOTALL, sub
+from re import match, DOTALL
 from typing import Union, List, Tuple
 from urllib.parse import urlparse, urljoin, urlunparse
 
@@ -12,7 +12,7 @@ from lxml.etree import tostring
 from lxml.html import HtmlElement, fromstring
 
 from .base import DrissionElement, BasePage, BaseElement
-from .common import str_to_loc, translate_loc, format_html
+from .common import str_to_loc, translate_loc, format_html, get_ele_txt
 
 
 class SessionElement(DrissionElement):
@@ -58,57 +58,7 @@ class SessionElement(DrissionElement):
     @property
     def text(self) -> str:
         """返回元素内所有文本"""
-
-        # 为尽量保证与浏览器结果一致，弄得比较复杂
-        # 前面无须换行的元素
-        nowrap_list = ('sub', 'em', 'strong', 'a', 'font', 'b', 'span', 's', 'i', 'del', 'ins', 'br', 'img', 'td', 'th',
-                       'abbr', 'bdi', 'bdo', 'cite', 'code', 'data', 'dfn', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby',
-                       'samp', 'small', 'sub', 'time', 'u', 'var', 'wbr', 'button', 'slot', 'content')
-        # 后面添加换行的元素
-        wrap_after_list = ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'li', 'blockquote')
-        noText_list = (
-        'script', 'style', 'video', 'audio', 'iframe', 'embed', 'noscript', 'canvas', 'template')  # 不获取文本的元素
-        tab_list = ('td', 'th')
-
-        def get_node_txt(ele, pre: bool = False):
-            str_list = []
-            tag = ele.tag.lower()
-            if tag in noText_list:  # script标签内的文本不返回
-                return str_list
-
-            if tag == 'pre':
-                pre = True
-
-            nodes = ele.eles('xpath:./text() | *')
-            prev_ele = ''
-            for el in nodes:
-                if isinstance(el, str):  # 字符节点
-                    if pre:
-                        str_list.append(el)
-
-                    else:
-                        if sub('[ \n]', '', el) != '':  # 字符除了回车和空格还有其它内容
-                            txt = el
-                            if not pre:
-                                txt = txt.replace('\n', ' ').strip(' \t')
-                                txt = sub(r' {2,}', ' ', txt)
-                            str_list.append(txt)
-
-                else:  # 元素节点
-                    if el.tag.lower() not in nowrap_list and str_list and str_list[-1] != '\n':  # 元素间换行的情况
-                        str_list.append('\n')
-                    if el.tag.lower() in tab_list and prev_ele in tab_list:
-                        str_list.append('\t')
-                    str_list.extend(get_node_txt(el, pre))
-                    prev_ele = el.tag.lower()
-
-            if tag in wrap_after_list:  # 有些元素后面要添加回车
-                str_list.append('\n')
-
-            return str_list
-
-        re_str = ''.join(get_node_txt(self))
-        return format_html(re_str, False).strip('\n')
+        return get_ele_txt(self)
 
     @property
     def raw_text(self) -> str: