完善获取元素text,统一两种模式获取text的方式

This commit is contained in:
g1879 2021-12-01 00:10:44 +08:00
parent 348b4e164f
commit 219e2c8cc3
3 changed files with 65 additions and 60 deletions

View File

@ -115,6 +115,66 @@ def str_to_loc(loc: str) -> tuple:
return loc_by, loc_str return loc_by, loc_str
def get_ele_txt(e) -> str:
    """Return the text contained in an element, laid out with line breaks and tabs.

    The element tree is walked recursively. Text nodes are whitespace-normalised
    (except inside ``<pre>``), line breaks are inserted around block-level tags,
    and adjacent table cells are separated by a tab.

    :param e: element object exposing ``tag``, ``raw_text`` and ``eles()``
    :return: the assembled text with leading/trailing spaces and newlines removed
    """
    # Tags that do NOT force a line break before them.
    nowrap_list = ('sub', 'em', 'strong', 'a', 'font', 'b', 'span', 's', 'i', 'del', 'ins', 'img', 'td', 'th',
                   'abbr', 'bdi', 'bdo', 'cite', 'code', 'data', 'dfn', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby',
                   'samp', 'small', 'time', 'u', 'var', 'wbr', 'button', 'slot', 'content')
    # Tags that get a line break appended after their content.
    # NOTE: 'address' and 'article' were previously fused into the single entry
    # 'addressarticle' by a missing comma, so neither tag matched.
    wrap_after_list = ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'li', 'blockquote', 'header',
                       'footer', 'address', 'article', 'aside', 'main', 'nav', 'section', 'figcaption', 'summary')
    # Tags whose inner text is not collected while walking the tree.
    noText_list = ('script', 'style', 'video', 'audio', 'iframe', 'embed', 'noscript', 'canvas', 'template')
    # Tags separated from a preceding sibling of the same kind by '\t'.
    tab_list = ('td', 'th')

    # When the requested element itself is one of the "no text" tags, hand back
    # its raw text instead of an empty result.
    if e.tag in noText_list:
        return e.raw_text

    def get_node_txt(ele, pre: bool = False) -> list:
        """Collect the text fragments of ``ele`` and its descendants.

        :param ele: element to walk
        :param pre: True when inside a ``<pre>``; text is then kept verbatim
        :return: list of string fragments to be joined by the caller
        """
        str_list = []
        tag = ele.tag.lower()

        if tag in noText_list:  # e.g. text inside <script> is not returned
            return str_list
        if tag == 'br':
            # Returned as a list so that every path yields the same type; the
            # callers (``extend`` / ``''.join``) see the same single '\n'.
            return ['\n']
        if tag == 'pre':
            pre = True

        # Assumes eles() yields plain str for text nodes and element objects
        # otherwise - the loop below dispatches on isinstance(el, str).
        nodes = ele.eles('xpath:./text() | *')
        prev_ele = ''
        for el in nodes:
            if isinstance(el, str):  # text node
                if pre:
                    str_list.append(el)
                elif sub('[ \n]', '', el) != '':  # more than just spaces/newlines
                    txt = el.replace('\n', ' ').strip(' ')
                    txt = sub(r' {2,}', ' ', txt)
                    str_list.append(txt)

            else:  # element node
                child_tag = el.tag.lower()
                # Break the line before a block-level child, unless one is already there.
                if child_tag not in nowrap_list and str_list and str_list[-1] != '\n':
                    str_list.append('\n')
                # Consecutive table cells in a row are tab-separated.
                if child_tag in tab_list and prev_ele in tab_list:
                    str_list.append('\t')

                str_list.extend(get_node_txt(el, pre))
                prev_ele = child_tag

        # Some tags are followed by a line break, unless one is already there.
        if tag in wrap_after_list and str_list and str_list[-1] != '\n':
            str_list.append('\n')

        return str_list

    re_str = ''.join(get_node_txt(e))
    # format_html is defined elsewhere in this module; presumably it unescapes
    # HTML entities in the joined text - confirm against its definition.
    return format_html(re_str, False).strip(' \n')
def _make_xpath_str(tag: str, arg: str, val: str, mode: str = 'fuzzy') -> str: def _make_xpath_str(tag: str, arg: str, val: str, mode: str = 'fuzzy') -> str:
"""生成xpath语句 \n """生成xpath语句 \n
:param tag: 标签名 :param tag: 标签名

View File

@ -16,7 +16,7 @@ from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support.wait import WebDriverWait
from .base import DrissionElement, BaseElement from .base import DrissionElement, BaseElement
from .common import str_to_loc, get_usable_path, translate_loc, format_html from .common import str_to_loc, get_usable_path, translate_loc, format_html, get_ele_txt
from .session_element import make_session_ele from .session_element import make_session_ele
@ -80,12 +80,7 @@ class DriverElement(DrissionElement):
@property @property
def text(self) -> str: def text(self) -> str:
"""返回元素内所有文本""" """返回元素内所有文本"""
# re_str = self.inner_ele.text return get_ele_txt(self)
re_str = self.inner_ele.get_attribute('innerText')
# re_str = sub(r'\n{2,}', '\n', re_str)
# re_str = sub(r' {2,}', ' ', re_str)
return format_html(re_str.strip('\n '), False)
@property @property
def raw_text(self) -> str: def raw_text(self) -> str:

View File

@ -4,7 +4,7 @@
@Contact : g1879@qq.com @Contact : g1879@qq.com
@File : session_element.py @File : session_element.py
""" """
from re import match, DOTALL, sub from re import match, DOTALL
from typing import Union, List, Tuple from typing import Union, List, Tuple
from urllib.parse import urlparse, urljoin, urlunparse from urllib.parse import urlparse, urljoin, urlunparse
@ -12,7 +12,7 @@ from lxml.etree import tostring
from lxml.html import HtmlElement, fromstring from lxml.html import HtmlElement, fromstring
from .base import DrissionElement, BasePage, BaseElement from .base import DrissionElement, BasePage, BaseElement
from .common import str_to_loc, translate_loc, format_html from .common import str_to_loc, translate_loc, format_html, get_ele_txt
class SessionElement(DrissionElement): class SessionElement(DrissionElement):
@ -58,57 +58,7 @@ class SessionElement(DrissionElement):
@property @property
def text(self) -> str: def text(self) -> str:
"""返回元素内所有文本""" """返回元素内所有文本"""
return get_ele_txt(self)
# 为尽量保证与浏览器结果一致,弄得比较复杂
# 前面无须换行的元素
nowrap_list = ('sub', 'em', 'strong', 'a', 'font', 'b', 'span', 's', 'i', 'del', 'ins', 'br', 'img', 'td', 'th',
'abbr', 'bdi', 'bdo', 'cite', 'code', 'data', 'dfn', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby',
'samp', 'small', 'sub', 'time', 'u', 'var', 'wbr', 'button', 'slot', 'content')
# 后面添加换行的元素
wrap_after_list = ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'li', 'blockquote')
noText_list = (
'script', 'style', 'video', 'audio', 'iframe', 'embed', 'noscript', 'canvas', 'template') # 不获取文本的元素
tab_list = ('td', 'th')
def get_node_txt(ele, pre: bool = False):
str_list = []
tag = ele.tag.lower()
if tag in noText_list: # script标签内的文本不返回
return str_list
if tag == 'pre':
pre = True
nodes = ele.eles('xpath:./text() | *')
prev_ele = ''
for el in nodes:
if isinstance(el, str): # 字符节点
if pre:
str_list.append(el)
else:
if sub('[ \n]', '', el) != '': # 字符除了回车和空格还有其它内容
txt = el
if not pre:
txt = txt.replace('\n', ' ').strip(' \t')
txt = sub(r' {2,}', ' ', txt)
str_list.append(txt)
else: # 元素节点
if el.tag.lower() not in nowrap_list and str_list and str_list[-1] != '\n': # 元素间换行的情况
str_list.append('\n')
if el.tag.lower() in tab_list and prev_ele in tab_list:
str_list.append('\t')
str_list.extend(get_node_txt(el, pre))
prev_ele = el.tag.lower()
if tag in wrap_after_list: # 有些元素后面要添加回车
str_list.append('\n')
return str_list
re_str = ''.join(get_node_txt(self))
return format_html(re_str, False).strip('\n')
@property @property
def raw_text(self) -> str: def raw_text(self) -> str: