From f1eb7809635a01388fb602349c714839bdfdf95e Mon Sep 17 00:00:00 2001 From: g1879 Date: Tue, 30 Nov 2021 17:27:48 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BB=A7=E7=BB=AD=E7=A0=94=E7=A9=B6SessionELem?= =?UTF-8?q?ent=E7=9A=84text=EF=BC=8C=E6=9C=AA=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/driver_element.py | 4 ++-- DrissionPage/session_element.py | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/DrissionPage/driver_element.py b/DrissionPage/driver_element.py index 2521e62..3035b0e 100644 --- a/DrissionPage/driver_element.py +++ b/DrissionPage/driver_element.py @@ -80,8 +80,8 @@ class DriverElement(DrissionElement): @property def text(self) -> str: """返回元素内所有文本""" - re_str = self.inner_ele.text - # re_str = self.inner_ele.get_attribute('innerText') + # re_str = self.inner_ele.text + re_str = self.inner_ele.get_attribute('innerText') # re_str = sub(r'\n{2,}', '\n', re_str) # re_str = sub(r' {2,}', ' ', re_str) diff --git a/DrissionPage/session_element.py b/DrissionPage/session_element.py index acfcfc1..f15fc89 100644 --- a/DrissionPage/session_element.py +++ b/DrissionPage/session_element.py @@ -60,16 +60,16 @@ class SessionElement(DrissionElement): """返回元素内所有文本""" # 为尽量保证与浏览器结果一致,弄得比较复杂 - nowrap_list = ('a', 'font', 'b', 'span', 'style', 's', 'i', 'del', 'br') # 前面无须换行的元素 + nowrap_list = ('a', 'font', 'b', 'span', 'style', 's', 'i', 'del', 'br', 'img', 'td') # 前面无须换行的元素 + wrap_after_list = ('p', 'div', 'br') # 后面添加换行的元素 noText_list = ('script',) # 不获取文本的元素 + tab_list = ('td',) def get_node_txt(ele, pre: bool = False): str_list = [] tag = ele.tag.lower() if tag in noText_list: # script标签内的文本不返回 return str_list - elif tag == 'br': - return ['\n'] if tag == 'pre': pre = True @@ -91,9 +91,11 @@ class SessionElement(DrissionElement): else: # 元素节点 if el.tag.lower() not in nowrap_list and str_list and str_list[-1] != '\n': # 元素间换行的情况 str_list.append('\n') + if el.tag.lower() in tab_list: + str_list.append('\t') str_list.extend(get_node_txt(el, pre)) - if tag in ('p', 'div'): + if tag in wrap_after_list: # 有些元素后面要添加回车 str_list.append('\n') return str_list