修改SessionElement的text属性,未完成

This commit is contained in:
g1879 2021-11-29 17:30:18 +08:00
parent ca494ec6e8
commit 6e761b74a2

View File

@@ -60,41 +60,45 @@ class SessionElement(DrissionElement):
"""返回元素内所有文本""" """返回元素内所有文本"""
# 为尽量保证与浏览器结果一致,弄得比较复杂 # 为尽量保证与浏览器结果一致,弄得比较复杂
def get_node(ele, pre: bool = False): nowrap_list = ('a', 'font', 'b', 'span', 'style', 's', 'i', 'del', 'br') # 前面无须换行的元素
noText_list = ('script',) # 不获取文本的元素
def get_node_txt(ele, pre: bool = False):
str_list = [] str_list = []
if ele.tag == 'pre': tag = ele.tag.lower()
if tag in noText_list: # script标签内的文本不返回
return str_list
elif tag == 'br':
return ['\n']
if tag == 'pre':
pre = True pre = True
current_tag = None nodes = ele.eles('xpath:./text() | *')
for el in ele.eles('xpath:./text() | *'): for k, el in enumerate(nodes):
if current_tag in ('br', 'p') and str_list and str_list[-1] != '\n': if isinstance(el, str): # 字符节点
str_list.append('\n') if pre:
str_list.append(el)
if isinstance(el, str): else:
if sub('[ \n]', '', el) != '': # 字符除了回车和空格还有其它内容 if sub('[ \n]', '', el) != '': # 字符除了回车和空格还有其它内容
if pre: txt = el
str_list.append(el) if not pre:
else: txt = txt.replace('\n', ' ').strip(' \t')
str_list.append(el.replace('\n', ' ').strip(' \t')) txt = sub(r' {2,}', ' ', txt)
str_list.append(txt)
elif '\n' in el and str_list and str_list[-1] != '\n': # 元素间换行的情况 else: # 元素节点
if el.tag.lower() not in nowrap_list and str_list and str_list[-1] != '\n': # 元素间换行的情况
str_list.append('\n') str_list.append('\n')
else: # 整个字符由回车和空格组成 str_list.extend(get_node_txt(el, pre))
str_list.append(' ')
current_tag = None
elif el.tag.lower() == 'script': if tag in ('p', 'div'):
current_tag = None str_list.append('\n')
else:
str_list.extend(get_node(el, pre))
current_tag = el.tag
return str_list return str_list
re_str = ''.join(get_node(self)) re_str = ''.join(get_node_txt(self))
re_str = sub(r' {2,}', ' ', re_str)
re_str = sub(r'\n{2,}', '\n', re_str)
return format_html(re_str, False).strip('\n') return format_html(re_str, False).strip('\n')
@property @property