From 5187a6b5b12ee0166e34f67e9ee8760df4a84269 Mon Sep 17 00:00:00 2001 From: g1879 Date: Thu, 31 Dec 2020 00:33:59 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96s=E6=A8=A1=E5=BC=8Fhtml?= =?UTF-8?q?=E5=92=8Ctext=EF=BC=8C=E5=8E=BB=E9=99=A4=E7=A9=BA=E6=A0=BC?= =?UTF-8?q?=E5=92=8C=E5=9B=9E=E8=BD=A6=EF=BC=8C=E6=9C=AA=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/common.py | 7 +++++-- DrissionPage/session_element.py | 31 +++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/DrissionPage/common.py b/DrissionPage/common.py index c062606..da2d22c 100644 --- a/DrissionPage/common.py +++ b/DrissionPage/common.py @@ -201,9 +201,12 @@ def _make_search_str(search_str: str) -> str: return search_str -def format_html(text: str) -> str: +def format_html(text: str, replace_space: bool = True) -> str: """处理html编码字符""" - return unescape(text).replace('\xa0', ' ') if text else text + if text: + return unescape(text).replace('\xa0', ' ') if replace_space else unescape(text) + else: + return text def translate_loc(loc: tuple) -> tuple: diff --git a/DrissionPage/session_element.py b/DrissionPage/session_element.py index fe01d14..bfb3f95 100644 --- a/DrissionPage/session_element.py +++ b/DrissionPage/session_element.py @@ -37,8 +37,15 @@ class SessionElement(DrissionElement): def html(self) -> str: """返回元素outerHTML文本""" # tostring()会把跟紧元素的文本节点也带上,因此要去掉 + # print(tostring(self._inner_ele, method="html").decode()) html = format_html(tostring(self._inner_ele, method="html").decode()) + # print(html) return html[:html.rfind('>') + 1] + # return format_html(html[:html.rfind('>') + 1],False) + + # def _html(self) -> str: + # html = tostring(self._inner_ele, method="html").decode() + # return html[:html.rfind('>') + 1] @property def inner_html(self) -> str: @@ -59,7 +66,19 @@ class SessionElement(DrissionElement): @property def text(self) -> str: """返回元素内所有文本""" - return str(self._inner_ele.text_content()) + html = format_html(tostring(self._inner_ele, method="html").decode(), False) + html = html[:html.rfind('>') + 1] + + txt = re.sub(r'<.*?>', '', html).replace('\n', ' ') + txt = re.sub(r' {2,}', ' ', txt).strip() + # return format_html(txt) + return txt + + # return t + # return str(self._inner_ele.text_content()) + # return self._inner_ele.text_content() + + # txt = str(self._inner_ele.text_content()).replace('\n', ' ') @property def link(self) -> str: @@ -284,12 +303,6 @@ class SessionElement(DrissionElement): ele = self while ele: - # ele_id = ele.attr('id') - - # if ele_id: - # return f'#{ele_id}{path_str}' if mode == 'css' else f'//{ele.tag}[@id="{ele_id}"]{path_str}' - # else: - if mode == 'css': brothers = len(ele.eles(f'xpath:./preceding-sibling::*')) path_str = f'>:nth-child({brothers + 1}){path_str}' @@ -357,7 +370,8 @@ def execute_session_find(page_or_ele, page_or_ele = page_or_ele.inner_ele else: # 传入的是SessionPage对象 page = page_or_ele - page_or_ele = fromstring(page_or_ele.html) + # page_or_ele = fromstring(page_or_ele.html) + page_or_ele = fromstring(page_or_ele.response.text, False) try: # 用lxml内置方法获取lxml的元素对象列表 @@ -368,6 +382,7 @@ def execute_session_find(page_or_ele, else: ele = page_or_ele.cssselect(loc[1]) + # 结果不是列表,如数字 if not isinstance(ele, list): return ele