继续完善text属性返回内容;增加comments属性;完善对元素内有注释时节点的获取

This commit is contained in:
g1879 2021-01-04 17:23:13 +08:00
parent dfc557b2df
commit 744e09c649
2 changed files with 51 additions and 47 deletions

View File

@ -4,7 +4,6 @@
@Contact : g1879@qq.com
@File : driver_element.py
"""
import re
from pathlib import Path
from time import sleep
from typing import Union, List, Any, Tuple
@ -79,13 +78,13 @@ class DriverElement(DrissionElement):
@property
def text(self) -> str:
"""返回元素内所有文本"""
return format_html(self.inner_ele.get_attribute('innerText'), False)
# return self.inner_ele.get_attribute('innerText')
re_str = self.inner_ele.get_attribute('innerText')
re_str = re.sub(r'\n{2,}', '\n', re_str)
re_str = re.sub(r' {2,}', ' ', re_str)
return format_html(re_str.strip('\n '))
# return re_str.strip('\n ')
# re_str = self.inner_ele.get_attribute('innerText')
# re_str = re.sub(r'\n{2,}', '\n', re_str)
# re_str = re.sub(r' {2,}', ' ', re_str)
#
# return format_html(re_str.strip('\n '))
@property
def link(self) -> str:
@ -116,6 +115,10 @@ class DriverElement(DrissionElement):
"""返回前一个兄弟元素"""
return self._get_brother(1, 'ele', 'prev')
@property
def comments(self) -> list:
    """Return the comment nodes located anywhere beneath this element.

    :return: whatever ``self.eles`` yields for the ``.//comment()`` XPath query
    """
    return self.eles('xpath:.//comment()')
# -----------------driver独占属性-------------------
@property
def size(self) -> dict:
@ -152,9 +155,9 @@ class DriverElement(DrissionElement):
:return: 文本列表
"""
if text_node_only:
return self.eles('xpath:./text()')
return self.eles('xpath:/text()')
else:
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./node()')]
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]
def parents(self, num: int = 1):
"""返回上面第num级父元素 \n
@ -576,7 +579,7 @@ class DriverElement(DrissionElement):
ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)
# 跳过元素间的换行符
while ele_or_node == '\n':
while isinstance(ele_or_node, str) and ele_or_node.replace('\n', '').replace('\t', '').replace(' ', '') == '':
num += 1
ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)
@ -662,6 +665,7 @@ class ElementsByXpath(object):
return_txt = '''
if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;}
else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;}
else if(e.singleNodeValue.constructor.name=="Comment"){return e.singleNodeValue.nodeValue;}
else{return e.singleNodeValue;}
'''
@ -672,6 +676,7 @@ class ElementsByXpath(object):
for(var i = 0; i <e.snapshotLength ; i++){
if(e.snapshotItem(i).constructor.name=="Text"){a.push(e.snapshotItem(i).data);}
else if(e.snapshotItem(i).constructor.name=="Attr"){a.push(e.snapshotItem(i).nodeValue);}
else if(e.snapshotItem(i).constructor.name=="Comment"){a.push(e.snapshotItem(i).nodeValue);}
else{a.push(e.snapshotItem(i));}
}
"""

View File

@ -49,6 +49,34 @@ class SessionElement(DrissionElement):
def text(self) -> str:
"""返回元素内所有文本"""
# 为尽量保证与浏览器结果一致,弄得比较复杂
def get_node(ele, pre: bool = False):
    """Collect the text fragments of ``ele`` and its descendants in document order.

    :param ele: element exposing ``tag`` and ``eles()``
    :param pre: True once an enclosing ``pre`` element has been entered
    :return: list of string fragments, meant to be joined by the caller
    """
    str_list = []
    if ele.tag == 'pre':
        pre = True

    # './text() | *' selects text nodes and child elements only, so comment
    # nodes never reach the loop.
    for el in ele.eles('xpath:./text() | *'):
        if isinstance(el, str):
            if el.strip(' \t\n'):
                # Real text: verbatim inside <pre>, otherwise flattened to one line.
                if pre:
                    str_list.append(el)
                else:
                    str_list.append(el.replace('\n', ' ').strip(' \t'))
            elif '\n' in el and str_list and str_list[-1] != '\n':
                # Whitespace-only node holding a line break (tabs count as
                # whitespace too): emit one newline, never two in a row.
                str_list.append('\n')
            else:
                str_list.append(' ')
        else:
            str_list.extend(get_node(el, pre))
            # <br> and <p> end a line unless one has just been ended.
            if el.tag in ('br', 'p') and str_list and str_list[-1] != '\n':
                str_list.append('\n')

    return str_list
re_str = ''.join(get_node(self))
re_str = re.sub(r' {2,}', ' ', re_str)
return format_html(re_str, False)
# re_str = str(self._inner_ele.text_content())
# # re_str = re.sub(r'<br */?>', '\n', re_str)
# re_str = re.sub(r'\n{2,}', '\n', re_str)
@ -56,30 +84,6 @@ class SessionElement(DrissionElement):
# return format_html(re_str.strip('\n '))
# # return format_html(re_str)
# 为尽量保证与浏览器结果一致,弄得比较复杂
def get_node(ele):
    """Collect the text fragments of ``ele`` and its descendants in document order.

    :param ele: element exposing ``tag`` and ``eles()``
    :return: list of string fragments, meant to be joined by the caller
    """
    str_list = []
    for el in ele.eles('xpath:./node()'):
        if isinstance(el, str):
            if el.replace(' ', '').replace('\n', '') != '':
                # Real text: flatten line breaks and trim surrounding spaces.
                str_list.append(el.replace('\n', ' ').strip(' '))
            elif '\n' in el:
                str_list.append('\n')
            else:
                str_list.append(' ')
        else:
            str_list.extend(get_node(el))
            # <br> and <p> end a line.
            if el.tag in ('br', 'p'):
                str_list.append('\n')
    return str_list
re_str = ''.join(get_node(self))
re_str = re.sub(r'\n{2,}', '\n', re_str)
re_str = re.sub(r' {2,}', ' ', re_str)
return format_html(re_str.strip('\n '))
@property
def tag(self) -> str:
"""返回元素类型"""
@ -120,26 +124,21 @@ class SessionElement(DrissionElement):
"""返回前一个兄弟元素"""
return self._get_brother(1, 'ele', 'prev')
@property
def comments(self) -> list:
    """Return the comment nodes located anywhere beneath this element.

    :return: whatever ``self.eles`` yields for the ``.//comment()`` XPath query
    """
    return self.eles('xpath:.//comment()')
def texts(self, text_node_only: bool = False) -> list:
"""返回元素内所有直接子节点的文本,包括元素和文本节点 \n
:param text_node_only: 是否只返回文本节点
:return: 文本列表
"""
if text_node_only:
return self.eles('xpath:/text()')
texts = self.eles('xpath:/text()')
else:
texts = []
texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]
for node in self.eles('xpath:/node()'):
if isinstance(node, str):
text = node
else:
text = node.text
if text:
texts.append(text)
return texts
return [format_html(x) for x in texts if x and x.replace('\n', '').replace('\t', '').replace(' ', '') != '']
def parents(self, num: int = 1):
"""返回上面第num级父元素 \n
@ -392,7 +391,7 @@ def execute_session_find(page_or_ele,
page_or_ele = page_or_ele.inner_ele
else: # 传入的是SessionPage对象
page = page_or_ele
page_or_ele = fromstring(page_or_ele.response.text)
page_or_ele = fromstring(re.sub(r'&nbsp;?', '&nbsp;', page_or_ele.response.text))
try:
# 用lxml内置方法获取lxml的元素对象列表