未完成

This commit is contained in:
g1879 2021-11-30 22:26:20 +08:00
parent f1eb780963
commit 348b4e164f

View File

@@ -60,10 +60,15 @@ class SessionElement(DrissionElement):
"""返回元素内所有文本"""
# 为尽量保证与浏览器结果一致,弄得比较复杂
nowrap_list = ('a', 'font', 'b', 'span', 'style', 's', 'i', 'del', 'br', 'img', 'td') # 前面无须换行的元素
wrap_after_list = ('p', 'div', 'br') # 后面添加换行的元素
noText_list = ('script',) # 不获取文本的元素
tab_list = ('td',)
# 前面无须换行的元素
nowrap_list = ('sub', 'em', 'strong', 'a', 'font', 'b', 'span', 's', 'i', 'del', 'ins', 'br', 'img', 'td', 'th',
'abbr', 'bdi', 'bdo', 'cite', 'code', 'data', 'dfn', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby',
'samp', 'small', 'sub', 'time', 'u', 'var', 'wbr', 'button', 'slot', 'content')
# 后面添加换行的元素
wrap_after_list = ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'li', 'blockquote')
noText_list = (
'script', 'style', 'video', 'audio', 'iframe', 'embed', 'noscript', 'canvas', 'template') # 不获取文本的元素
tab_list = ('td', 'th')
def get_node_txt(ele, pre: bool = False):
str_list = []
@@ -75,7 +80,8 @@ class SessionElement(DrissionElement):
pre = True
nodes = ele.eles('xpath:./text() | *')
for k, el in enumerate(nodes):
prev_ele = ''
for el in nodes:
if isinstance(el, str): # 字符节点
if pre:
str_list.append(el)
@@ -91,9 +97,10 @@ class SessionElement(DrissionElement):
else: # 元素节点
if el.tag.lower() not in nowrap_list and str_list and str_list[-1] != '\n': # 元素间换行的情况
str_list.append('\n')
if el.tag.lower() in tab_list:
if el.tag.lower() in tab_list and prev_ele in tab_list:
str_list.append('\t')
str_list.extend(get_node_txt(el, pre))
prev_ele = el.tag.lower()
if tag in wrap_after_list: # 有些元素后面要添加回车
str_list.append('\n')