继续研究text的显示,未完成

This commit is contained in:
g1879 2020-12-31 18:16:26 +08:00
parent 5187a6b5b1
commit eec2a510c4

View File

@ -37,15 +37,8 @@ class SessionElement(DrissionElement):
def html(self) -> str:
    """Return the element's outerHTML text."""
    serialized = tostring(self._inner_ele, method="html").decode()
    formatted = format_html(serialized)
    # tostring() also emits the text node that immediately follows the
    # element, so keep only what precedes (and includes) the last '>'.
    cut = formatted.rfind('>') + 1
    return formatted[:cut]
@property
def inner_html(self) -> str:
@ -53,6 +46,53 @@ class SessionElement(DrissionElement):
r = re.match(r'<.*?>(.*)</.*?>', self.html, flags=re.DOTALL)
return '' if not r else r.group(1)
@property
def text(self) -> str:
    """Return all text contained in the element.

    The element's child nodes (``xpath:./node()``) are walked depth-first in
    document order. Each text node that contains something other than spaces
    and newlines contributes its text with leading/trailing spaces and
    newlines stripped; a text node made up only of spaces/newlines contributes
    a single newline. Child elements are descended into recursively. Finally,
    runs of consecutive newlines are collapsed into one.

    :return: the collected text
    """

    def get_node(ele):
        """Return the list of text pieces found under ``ele``, in order."""
        pieces = []
        for el in ele.eles('xpath:./node()'):
            if not isinstance(el, str):
                # Not a text node: gather the text of its own children.
                pieces.extend(get_node(el))
                continue

            # Strip only the edges. The previous implementation appended a
            # copy with *every* space and newline removed, which glued the
            # words of a sentence together.
            stripped = el.strip(' \n')
            pieces.append(stripped if stripped else '\n')
        return pieces

    joined = ''.join(get_node(self))
    return re.sub(r'\n{2,}', '\n', joined)
@property
def tag(self) -> str:
"""返回元素类型"""
@ -63,23 +103,6 @@ class SessionElement(DrissionElement):
"""返回元素所有属性及值"""
return {attr: self.attr(attr) for attr, val in self.inner_ele.items()}
@property
def text(self) -> str:
    """Return all text inside the element as one whitespace-normalized line."""
    markup = tostring(self._inner_ele, method="html").decode()
    markup = format_html(markup, False)
    # Discard anything serialized after the final '>'.
    last_bracket = markup.rfind('>')
    markup = markup[:last_bracket + 1]
    # Remove tags, then turn line breaks into spaces.
    without_tags = re.sub(r'<.*?>', '', markup)
    single_line = without_tags.replace('\n', ' ')
    # Squeeze runs of spaces and trim both ends.
    return re.sub(r' {2,}', ' ', single_line).strip()
@property
def link(self) -> str:
"""返回href或src绝对url"""
@ -315,7 +338,7 @@ class SessionElement(DrissionElement):
return path_str[1:] if mode == 'css' else path_str
def _get_brother(self, num: int = 1, mode: str = 'ele', direction: str = 'next'):
"""返回前面第num个兄弟元素或节点 \n
"""返回前面或后面第num个兄弟元素或节点 \n
:param num: 前面第几个兄弟元素或节点
:param mode: 'ele', 'node' 'text'匹配元素节点或文本节点
:param direction: 'next' 'prev'查找的方向
@ -370,8 +393,7 @@ def execute_session_find(page_or_ele,
page_or_ele = page_or_ele.inner_ele
else: # 传入的是SessionPage对象
page = page_or_ele
# page_or_ele = fromstring(page_or_ele.html)
page_or_ele = fromstring(page_or_ele.response.text, False)
page_or_ele = fromstring(format_html(page_or_ele.response.text, False))
try:
# 用lxml内置方法获取lxml的元素对象列表