s模式元素text尽量与d模式保持一致

This commit is contained in:
g1879 2021-01-01 00:33:45 +08:00
parent 4ff724e6d4
commit c7a8af6fe4
2 changed files with 19 additions and 42 deletions

View File

@ -201,12 +201,9 @@ def _make_search_str(search_str: str) -> str:
return search_str return search_str
def format_html(text: str, replace_space: bool = True) -> str: def format_html(text: str) -> str:
"""处理html编码字符""" """处理html编码字符"""
if text: return unescape(text) if text else text
return unescape(text).replace('\xa0', ' ') if replace_space else unescape(text)
else:
return text
def translate_loc(loc: tuple) -> tuple: def translate_loc(loc: tuple) -> tuple:

View File

@ -38,7 +38,7 @@ class SessionElement(DrissionElement):
"""返回元素outerHTML文本""" """返回元素outerHTML文本"""
# tostring()会把跟紧元素的文本节点也带上,因此要去掉 # tostring()会把跟紧元素的文本节点也带上,因此要去掉
html = format_html(tostring(self._inner_ele, method="html").decode()) html = format_html(tostring(self._inner_ele, method="html").decode())
return html[:html.rfind('>') + 1] return format_html(html[:html.rfind('>') + 1])
@property @property
def inner_html(self) -> str: def inner_html(self) -> str:
@ -50,48 +50,27 @@ class SessionElement(DrissionElement):
def text(self) -> str: def text(self) -> str:
"""返回元素内所有文本""" """返回元素内所有文本"""
# html = tostring(self._inner_ele, method="html").decode() # 为尽量保证与浏览器结果一致,弄得比较复杂
# html = html[:html.rfind('>') + 1]
# html = re.sub(r'<.*?>', '', html).strip('\n ')
# html = format_html(re.sub(r' {2,}', ' ', html))
# html = format_html(re.sub(r'\n{2,}', '\n', html))
# html = format_html(re.sub(r'( \n){2,}', '\n', html))
# html = format_html(re.sub(r'(\n ){2,}', '\n', html))
# return html
# return format_html(str(self._inner_ele.text_content()))
# return format_html(str(self._inner_ele.text_content()))#.replace('\n','')
def get_node(ele): def get_node(ele):
l = [] str_list = []
for el in ele.eles('xpath:./node()'): for el in ele.eles('xpath:./node()'):
if isinstance(el, str): if isinstance(el, str):
s = el.replace(' ', '').replace('\n', '') if el.replace(' ', '').replace('\n', '') != '':
# print('字符串', [s]) str_list.append(el.replace('\xa0', '&nbsp;').replace('\n', ' ').strip())
if s != '': elif '\n' in el:
l.append(s.strip(' \n')) str_list.append('\n')
else: else:
l.append('\n') str_list.append(' ')
else: else:
# print('元素', el) str_list.extend(get_node(el))
l.extend(get_node(el))
return l
# for i in self.eles('xpath:./*'): return str_list
# print([i])
# l = [] re_str = ''.join(get_node(self))
# for el in get_node(self): re_str = re.sub(r'\n{2,}', '\n', re_str)
# if isinstance(el,str): re_str = re.sub(r' {2,}', ' ', re_str)
# print('字符串')
# print(el) return format_html(re_str.strip('\n '))
# l.append(el)
# else:
# print('元素')
# print(el._inner_ele.text)
# l.append(el._inner_ele.text)
s = ''.join(get_node(self))
return re.sub(r'\n{2,}', '\n', s)
@property @property
def tag(self) -> str: def tag(self) -> str:
@ -393,7 +372,8 @@ def execute_session_find(page_or_ele,
page_or_ele = page_or_ele.inner_ele page_or_ele = page_or_ele.inner_ele
else: # 传入的是SessionPage对象 else: # 传入的是SessionPage对象
page = page_or_ele page = page_or_ele
page_or_ele = fromstring(format_html(page_or_ele.response.text, False)) # page_or_ele = fromstring(format_html(page_or_ele.response.text, False))
page_or_ele = fromstring(page_or_ele.response.text)
try: try:
# 用lxml内置方法获取lxml的元素对象列表 # 用lxml内置方法获取lxml的元素对象列表