完善texts()方法

This commit is contained in:
g1879 2021-01-04 17:42:45 +08:00
parent 0f9f52b1f6
commit e03a67c3a5
2 changed files with 13 additions and 5 deletions

View File

@@ -155,9 +155,11 @@ class DriverElement(DrissionElement):
:return: 文本列表 :return: 文本列表
""" """
if text_node_only: if text_node_only:
return self.eles('xpath:/text()') texts = self.eles('xpath:/text()')
else: else:
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')] texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]
return [x.strip(' ') for x in texts if x and x.replace('\n', '').replace('\t', '').replace(' ', '') != '']
def parents(self, num: int = 1): def parents(self, num: int = 1):
"""返回上面第num级父元素 \n """返回上面第num级父元素 \n

View File

@@ -54,7 +54,12 @@ class SessionElement(DrissionElement):
str_list = [] str_list = []
if ele.tag == 'pre': if ele.tag == 'pre':
pre = True pre = True
current_tag = None
for el in ele.eles('xpath:./text() | *'): for el in ele.eles('xpath:./text() | *'):
if current_tag in ('br', 'p') and str_list and str_list[-1] != '\n':
str_list.append('\n')
if isinstance(el, str): if isinstance(el, str):
if el.replace(' ', '').replace('\n', '') != '': if el.replace(' ', '').replace('\n', '') != '':
if pre: if pre:
@@ -66,10 +71,10 @@ class SessionElement(DrissionElement):
str_list.append('\n') str_list.append('\n')
else: else:
str_list.append(' ') str_list.append(' ')
current_tag = None
else: else:
str_list.extend(get_node(el, pre)) str_list.extend(get_node(el, pre))
if el.tag in ('br', 'p',) and str_list and str_list[-1] != '\n': current_tag = el.tag
str_list.append('\n')
return str_list return str_list
@@ -138,7 +143,8 @@ class SessionElement(DrissionElement):
else: else:
texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')] texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]
return [format_html(x) for x in texts if x and x.replace('\n', '').replace('\t', '').replace(' ', '') != ''] return [format_html(x.strip(' ')) for x in texts if
x and x.replace('\n', '').replace('\t', '').replace(' ', '') != '']
def parents(self, num: int = 1): def parents(self, num: int = 1):
"""返回上面第num级父元素 \n """返回上面第num级父元素 \n