继续完善text属性返回内容;增加comments属性;完善对元素内有注释时节点的获取

This commit is contained in:
g1879 2021-01-04 17:23:13 +08:00
parent dfc557b2df
commit 744e09c649
2 changed files with 51 additions and 47 deletions

View File

@ -4,7 +4,6 @@
@Contact : g1879@qq.com @Contact : g1879@qq.com
@File : driver_element.py @File : driver_element.py
""" """
import re
from pathlib import Path from pathlib import Path
from time import sleep from time import sleep
from typing import Union, List, Any, Tuple from typing import Union, List, Any, Tuple
@ -79,13 +78,13 @@ class DriverElement(DrissionElement):
@property @property
def text(self) -> str: def text(self) -> str:
"""返回元素内所有文本""" """返回元素内所有文本"""
return format_html(self.inner_ele.get_attribute('innerText'), False)
# return self.inner_ele.get_attribute('innerText') # return self.inner_ele.get_attribute('innerText')
re_str = self.inner_ele.get_attribute('innerText') # re_str = self.inner_ele.get_attribute('innerText')
re_str = re.sub(r'\n{2,}', '\n', re_str) # re_str = re.sub(r'\n{2,}', '\n', re_str)
re_str = re.sub(r' {2,}', ' ', re_str) # re_str = re.sub(r' {2,}', ' ', re_str)
#
return format_html(re_str.strip('\n ')) # return format_html(re_str.strip('\n '))
# return re_str.strip('\n ')
@property @property
def link(self) -> str: def link(self) -> str:
@ -116,6 +115,10 @@ class DriverElement(DrissionElement):
"""返回前一个兄弟元素""" """返回前一个兄弟元素"""
return self._get_brother(1, 'ele', 'prev') return self._get_brother(1, 'ele', 'prev')
@property
def comments(self) -> list:
    """Return the comment nodes found anywhere beneath this element.

    Delegates to ``self.eles`` with the XPath ``.//comment()``, which
    selects every descendant comment node.

    NOTE(review): the JS snippets in ElementsByXpath push ``nodeValue`` for
    Comment nodes, so the items are presumably plain strings - confirm.

    :return: whatever ``self.eles`` returns for that XPath
    """
    return self.eles('xpath:.//comment()')
# -----------------driver独占属性------------------- # -----------------driver独占属性-------------------
@property @property
def size(self) -> dict: def size(self) -> dict:
@ -152,9 +155,9 @@ class DriverElement(DrissionElement):
:return: 文本列表 :return: 文本列表
""" """
if text_node_only: if text_node_only:
return self.eles('xpath:./text()') return self.eles('xpath:/text()')
else: else:
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./node()')] return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]
def parents(self, num: int = 1): def parents(self, num: int = 1):
"""返回上面第num级父元素 \n """返回上面第num级父元素 \n
@ -576,7 +579,7 @@ class DriverElement(DrissionElement):
ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout) ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)
# 跳过元素间的换行符 # 跳过元素间的换行符
while ele_or_node == '\n': while isinstance(ele_or_node, str) and ele_or_node.replace('\n', '').replace('\t', '').replace(' ', '') == '':
num += 1 num += 1
ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout) ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)
@ -662,6 +665,7 @@ class ElementsByXpath(object):
return_txt = ''' return_txt = '''
if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;} if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;}
else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;} else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;}
else if(e.singleNodeValue.constructor.name=="Comment"){return e.singleNodeValue.nodeValue;}
else{return e.singleNodeValue;} else{return e.singleNodeValue;}
''' '''
@ -672,6 +676,7 @@ class ElementsByXpath(object):
for(var i = 0; i <e.snapshotLength ; i++){ for(var i = 0; i <e.snapshotLength ; i++){
if(e.snapshotItem(i).constructor.name=="Text"){a.push(e.snapshotItem(i).data);} if(e.snapshotItem(i).constructor.name=="Text"){a.push(e.snapshotItem(i).data);}
else if(e.snapshotItem(i).constructor.name=="Attr"){a.push(e.snapshotItem(i).nodeValue);} else if(e.snapshotItem(i).constructor.name=="Attr"){a.push(e.snapshotItem(i).nodeValue);}
else if(e.snapshotItem(i).constructor.name=="Comment"){a.push(e.snapshotItem(i).nodeValue);}
else{a.push(e.snapshotItem(i));} else{a.push(e.snapshotItem(i));}
} }
""" """

View File

@ -49,6 +49,34 @@ class SessionElement(DrissionElement):
def text(self) -> str: def text(self) -> str:
"""返回元素内所有文本""" """返回元素内所有文本"""
# 为尽量保证与浏览器结果一致,弄得比较复杂
def get_node(ele, pre: bool = False):
    """Recursively collect the text fragments of ``ele`` in document order.

    Visits the direct text nodes and child elements of ``ele`` (XPath
    ``./text() | *``, so comment nodes are not selected) and returns the
    pieces as a list for the caller to join.

    :param ele: object exposing ``tag`` and ``eles(xpath)``
    :param pre: True when an enclosing element is a ``pre`` tag; text with
                real content is then kept verbatim
    :return: list of text fragments and ``'\\n'`` / ``' '`` separators
    """
    str_list = []
    if ele.tag == 'pre':
        pre = True

    # Outside <pre>, tabs are just indentation and must count as blank;
    # otherwise a node such as "\n\t" would be appended as "" and the line
    # break between sibling elements would be lost. Inside <pre> a tab is
    # content, so the blank test there stays limited to spaces and newlines.
    blank_chars = ' \n' if pre else ' \n\t'

    for el in ele.eles('xpath:./text() | *'):
        if isinstance(el, str):
            if el.strip(blank_chars):
                # Real text: verbatim in <pre>, otherwise flattened to one line
                str_list.append(el if pre else el.replace('\n', ' ').strip(' \t'))
            elif '\n' in el and str_list and str_list[-1] != '\n':
                # Whitespace containing a line break: emit one newline, never
                # at the very start and never two in a row
                str_list.append('\n')
            else:
                str_list.append(' ')
        else:
            str_list.extend(get_node(el, pre))
            # <br> and <p> end the current line, again without doubling up
            if el.tag in ('br', 'p',) and str_list and str_list[-1] != '\n':
                str_list.append('\n')

    return str_list
re_str = ''.join(get_node(self))
re_str = re.sub(r' {2,}', ' ', re_str)
return format_html(re_str, False)
# re_str = str(self._inner_ele.text_content()) # re_str = str(self._inner_ele.text_content())
# # re_str = re.sub(r'<br */?>', '\n', re_str) # # re_str = re.sub(r'<br */?>', '\n', re_str)
# re_str = re.sub(r'\n{2,}', '\n', re_str) # re_str = re.sub(r'\n{2,}', '\n', re_str)
@ -56,30 +84,6 @@ class SessionElement(DrissionElement):
# return format_html(re_str.strip('\n ')) # return format_html(re_str.strip('\n '))
# # return format_html(re_str) # # return format_html(re_str)
# 为尽量保证与浏览器结果一致,弄得比较复杂
def get_node(ele):
str_list = []
for el in ele.eles('xpath:./node()'):
if isinstance(el, str):
if el.replace(' ', '').replace('\n', '') != '':
# str_list.append(el.replace('\xa0', '&nbsp;').replace('\n', ' ').strip())
str_list.append(el.replace('\n', ' ').strip(' '))
elif '\n' in el:
str_list.append('\n')
else:
str_list.append(' ')
else:
str_list.extend(get_node(el))
if el.tag in ('br', 'p',):
str_list.append('\n')
return str_list
re_str = ''.join(get_node(self))
re_str = re.sub(r'\n{2,}', '\n', re_str)
re_str = re.sub(r' {2,}', ' ', re_str)
return format_html(re_str.strip('\n '))
@property @property
def tag(self) -> str: def tag(self) -> str:
"""返回元素类型""" """返回元素类型"""
@ -120,26 +124,21 @@ class SessionElement(DrissionElement):
"""返回前一个兄弟元素""" """返回前一个兄弟元素"""
return self._get_brother(1, 'ele', 'prev') return self._get_brother(1, 'ele', 'prev')
@property
def comments(self) -> list:
    """Return the comment nodes found anywhere beneath this element.

    Delegates to ``self.eles`` with the XPath ``.//comment()``, which
    selects every descendant comment node.

    NOTE(review): the type of each returned item depends on how ``eles``
    wraps comment nodes, which this view does not show - confirm.

    :return: whatever ``self.eles`` returns for that XPath
    """
    return self.eles('xpath:.//comment()')
def texts(self, text_node_only: bool = False) -> list: def texts(self, text_node_only: bool = False) -> list:
"""返回元素内所有直接子节点的文本,包括元素和文本节点 \n """返回元素内所有直接子节点的文本,包括元素和文本节点 \n
:param text_node_only: 是否只返回文本节点 :param text_node_only: 是否只返回文本节点
:return: 文本列表 :return: 文本列表
""" """
if text_node_only: if text_node_only:
return self.eles('xpath:/text()') texts = self.eles('xpath:/text()')
else: else:
texts = [] texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]
for node in self.eles('xpath:/node()'): return [format_html(x) for x in texts if x and x.replace('\n', '').replace('\t', '').replace(' ', '') != '']
if isinstance(node, str):
text = node
else:
text = node.text
if text:
texts.append(text)
return texts
def parents(self, num: int = 1): def parents(self, num: int = 1):
"""返回上面第num级父元素 \n """返回上面第num级父元素 \n
@ -392,7 +391,7 @@ def execute_session_find(page_or_ele,
page_or_ele = page_or_ele.inner_ele page_or_ele = page_or_ele.inner_ele
else: # 传入的是SessionPage对象 else: # 传入的是SessionPage对象
page = page_or_ele page = page_or_ele
page_or_ele = fromstring(page_or_ele.response.text) page_or_ele = fromstring(re.sub(r'&nbsp;?', '&nbsp;', page_or_ele.response.text))
try: try:
# 用lxml内置方法获取lxml的元素对象列表 # 用lxml内置方法获取lxml的元素对象列表