继续完善text属性返回内容；增加comments属性；完善对元素内有注释时节点的获取

2024-12-10 04:00:23 +08:00 · 2021-01-04 17:23:13 +08:00 · 2021-01-04 17:23:13 +08:00 · 744e09c649
commit 744e09c649
parent dfc557b2df
2 changed files with 51 additions and 47 deletions
--- a/DrissionPage/driver_element.py
+++ b/DrissionPage/driver_element.py
@@ -4,7 +4,6 @@
@Contact :   g1879@qq.com
@File    :   driver_element.py
 """
-import re
 from pathlib import Path
 from time import sleep
 from typing import Union, List, Any, Tuple
@@ -79,13 +78,13 @@ class DriverElement(DrissionElement):
    @property
    def text(self) -> str:
        """返回元素内所有文本"""
+        return format_html(self.inner_ele.get_attribute('innerText'), False)
        # return self.inner_ele.get_attribute('innerText')
-        re_str = self.inner_ele.get_attribute('innerText')
-        re_str = re.sub(r'\n{2,}', '\n', re_str)
-        re_str = re.sub(r' {2,}', ' ', re_str)
-
-        return format_html(re_str.strip('\n '))
-        # return re_str.strip('\n ')
+        # re_str = self.inner_ele.get_attribute('innerText')
+        # re_str = re.sub(r'\n{2,}', '\n', re_str)
+        # re_str = re.sub(r' {2,}', ' ', re_str)
+        #
+        # return format_html(re_str.strip('\n '))

    @property
    def link(self) -> str:
@@ -116,6 +115,10 @@ class DriverElement(DrissionElement):
        """返回前一个兄弟元素"""
        return self._get_brother(1, 'ele', 'prev')

+    @property
+    def comments(self):
+        return self.eles('xpath:.//comment()')
+
    # -----------------driver独占属性-------------------
    @property
    def size(self) -> dict:
@@ -152,9 +155,9 @@ class DriverElement(DrissionElement):
        :return: 文本列表
        """
        if text_node_only:
-            return self.eles('xpath:./text()')
+            return self.eles('xpath:/text()')
        else:
-            return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./node()')]
+            return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]

    def parents(self, num: int = 1):
        """返回上面第num级父元素              \n
@@ -576,7 +579,7 @@ class DriverElement(DrissionElement):
        ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)

        # 跳过元素间的换行符
-        while ele_or_node == '\n':
+        while isinstance(ele_or_node, str) and ele_or_node.replace('\n', '').replace('\t', '').replace(' ', '') == '':
            num += 1
            ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)

@@ -662,6 +665,7 @@ class ElementsByXpath(object):
                return_txt = '''
                    if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;}
                    else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;}
+                    else if(e.singleNodeValue.constructor.name=="Comment"){return e.singleNodeValue.nodeValue;}
                    else{return e.singleNodeValue;}
                    '''

@@ -672,6 +676,7 @@ class ElementsByXpath(object):
                    for(var i = 0; i <e.snapshotLength ; i++){
                        if(e.snapshotItem(i).constructor.name=="Text"){a.push(e.snapshotItem(i).data);}
                        else if(e.snapshotItem(i).constructor.name=="Attr"){a.push(e.snapshotItem(i).nodeValue);}
+                        else if(e.snapshotItem(i).constructor.name=="Comment"){a.push(e.snapshotItem(i).nodeValue);}
                        else{a.push(e.snapshotItem(i));}
                    }
                    """
--- a/DrissionPage/session_element.py
+++ b/DrissionPage/session_element.py
@@ -49,6 +49,34 @@ class SessionElement(DrissionElement):
    def text(self) -> str:
        """返回元素内所有文本"""

+        # 为尽量保证与浏览器结果一致，弄得比较复杂
+        def get_node(ele, pre: bool = False):
+            str_list = []
+            if ele.tag == 'pre':
+                pre = True
+            for el in ele.eles('xpath:./text() | *'):
+                if isinstance(el, str):
+                    if el.replace(' ', '').replace('\n', '') != '':
+                        if pre:
+                            str_list.append(el)
+                        else:
+                            str_list.append(el.replace('\n', ' ').strip(' \t'))
+
+                    elif '\n' in el and str_list and str_list[-1] != '\n':
+                        str_list.append('\n')
+                    else:
+                        str_list.append(' ')
+                else:
+                    str_list.extend(get_node(el, pre))
+                    if el.tag in ('br', 'p',) and str_list and str_list[-1] != '\n':
+                        str_list.append('\n')
+
+            return str_list
+
+        re_str = ''.join(get_node(self))
+        re_str = re.sub(r' {2,}', ' ', re_str)
+        return format_html(re_str, False)
+
        # re_str = str(self._inner_ele.text_content())
        # # re_str = re.sub(r'<br */?>', '\n', re_str)
        # re_str = re.sub(r'\n{2,}', '\n', re_str)
@@ -56,30 +84,6 @@ class SessionElement(DrissionElement):
        # return format_html(re_str.strip('\n '))
        # # return format_html(re_str)

-        # 为尽量保证与浏览器结果一致，弄得比较复杂
-        def get_node(ele):
-            str_list = []
-            for el in ele.eles('xpath:./node()'):
-                if isinstance(el, str):
-                    if el.replace(' ', '').replace('\n', '') != '':
-                        # str_list.append(el.replace('\xa0', '&nbsp;').replace('\n', ' ').strip())
-                        str_list.append(el.replace('\n', ' ').strip(' '))
-                    elif '\n' in el:
-                        str_list.append('\n')
-                    else:
-                        str_list.append(' ')
-                else:
-                    str_list.extend(get_node(el))
-                    if el.tag in ('br', 'p',):
-                        str_list.append('\n')
-
-            return str_list
-
-        re_str = ''.join(get_node(self))
-        re_str = re.sub(r'\n{2,}', '\n', re_str)
-        re_str = re.sub(r' {2,}', ' ', re_str)
-        return format_html(re_str.strip('\n '))
-
    @property
    def tag(self) -> str:
        """返回元素类型"""
@@ -120,26 +124,21 @@ class SessionElement(DrissionElement):
        """返回前一个兄弟元素"""
        return self._get_brother(1, 'ele', 'prev')

+    @property
+    def comments(self):
+        return self.eles('xpath:.//comment()')
+
    def texts(self, text_node_only: bool = False) -> list:
        """返回元素内所有直接子节点的文本，包括元素和文本节点   \n
        :param text_node_only: 是否只返回文本节点
        :return: 文本列表
        """
        if text_node_only:
-            return self.eles('xpath:/text()')
+            texts = self.eles('xpath:/text()')
        else:
-            texts = []
+            texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]

-            for node in self.eles('xpath:/node()'):
-                if isinstance(node, str):
-                    text = node
-                else:
-                    text = node.text
-
-                if text:
-                    texts.append(text)
-
-            return texts
+        return [format_html(x) for x in texts if x and x.replace('\n', '').replace('\t', '').replace(' ', '') != '']

    def parents(self, num: int = 1):
        """返回上面第num级父元素                                         \n
@@ -392,7 +391,7 @@ def execute_session_find(page_or_ele,
        page_or_ele = page_or_ele.inner_ele
    else:  # 传入的是SessionPage对象
        page = page_or_ele
-        page_or_ele = fromstring(page_or_ele.response.text)
+        page_or_ele = fromstring(re.sub(r'&nbsp;?', '&nbsp;', page_or_ele.response.text))

    try:
        # 用lxml内置方法获取lxml的元素对象列表