Mirror of https://gitee.com/g1879/DrissionPage.git (synced 2024-12-10 04:00:23 +08:00)
继续完善text属性返回内容;增加comments属性;完善对元素内有注释时节点的获取
This commit is contained in:
parent: dfc557b2df
commit: 744e09c649
@ -4,7 +4,6 @@
|
|||||||
@Contact : g1879@qq.com
|
@Contact : g1879@qq.com
|
||||||
@File : driver_element.py
|
@File : driver_element.py
|
||||||
"""
|
"""
|
||||||
import re
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from typing import Union, List, Any, Tuple
|
from typing import Union, List, Any, Tuple
|
||||||
@ -79,13 +78,13 @@ class DriverElement(DrissionElement):
|
|||||||
@property
|
@property
|
||||||
def text(self) -> str:
|
def text(self) -> str:
|
||||||
"""返回元素内所有文本"""
|
"""返回元素内所有文本"""
|
||||||
|
return format_html(self.inner_ele.get_attribute('innerText'), False)
|
||||||
# return self.inner_ele.get_attribute('innerText')
|
# return self.inner_ele.get_attribute('innerText')
|
||||||
re_str = self.inner_ele.get_attribute('innerText')
|
# re_str = self.inner_ele.get_attribute('innerText')
|
||||||
re_str = re.sub(r'\n{2,}', '\n', re_str)
|
# re_str = re.sub(r'\n{2,}', '\n', re_str)
|
||||||
re_str = re.sub(r' {2,}', ' ', re_str)
|
# re_str = re.sub(r' {2,}', ' ', re_str)
|
||||||
|
#
|
||||||
return format_html(re_str.strip('\n '))
|
# return format_html(re_str.strip('\n '))
|
||||||
# return re_str.strip('\n ')
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def link(self) -> str:
|
def link(self) -> str:
|
||||||
@ -116,6 +115,10 @@ class DriverElement(DrissionElement):
|
|||||||
"""返回前一个兄弟元素"""
|
"""返回前一个兄弟元素"""
|
||||||
return self._get_brother(1, 'ele', 'prev')
|
return self._get_brother(1, 'ele', 'prev')
|
||||||
|
|
||||||
|
@property
|
||||||
|
def comments(self):
|
||||||
|
return self.eles('xpath:.//comment()')
|
||||||
|
|
||||||
# -----------------driver独占属性-------------------
|
# -----------------driver独占属性-------------------
|
||||||
@property
|
@property
|
||||||
def size(self) -> dict:
|
def size(self) -> dict:
|
||||||
@ -152,9 +155,9 @@ class DriverElement(DrissionElement):
|
|||||||
:return: 文本列表
|
:return: 文本列表
|
||||||
"""
|
"""
|
||||||
if text_node_only:
|
if text_node_only:
|
||||||
return self.eles('xpath:./text()')
|
return self.eles('xpath:/text()')
|
||||||
else:
|
else:
|
||||||
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./node()')]
|
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]
|
||||||
|
|
||||||
def parents(self, num: int = 1):
|
def parents(self, num: int = 1):
|
||||||
"""返回上面第num级父元素 \n
|
"""返回上面第num级父元素 \n
|
||||||
@ -576,7 +579,7 @@ class DriverElement(DrissionElement):
|
|||||||
ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)
|
ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)
|
||||||
|
|
||||||
# 跳过元素间的换行符
|
# 跳过元素间的换行符
|
||||||
while ele_or_node == '\n':
|
while isinstance(ele_or_node, str) and ele_or_node.replace('\n', '').replace('\t', '').replace(' ', '') == '':
|
||||||
num += 1
|
num += 1
|
||||||
ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)
|
ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)
|
||||||
|
|
||||||
@ -662,6 +665,7 @@ class ElementsByXpath(object):
|
|||||||
return_txt = '''
|
return_txt = '''
|
||||||
if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;}
|
if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;}
|
||||||
else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;}
|
else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;}
|
||||||
|
else if(e.singleNodeValue.constructor.name=="Comment"){return e.singleNodeValue.nodeValue;}
|
||||||
else{return e.singleNodeValue;}
|
else{return e.singleNodeValue;}
|
||||||
'''
|
'''
|
||||||
|
|
||||||
@ -672,6 +676,7 @@ class ElementsByXpath(object):
|
|||||||
for(var i = 0; i <e.snapshotLength ; i++){
|
for(var i = 0; i <e.snapshotLength ; i++){
|
||||||
if(e.snapshotItem(i).constructor.name=="Text"){a.push(e.snapshotItem(i).data);}
|
if(e.snapshotItem(i).constructor.name=="Text"){a.push(e.snapshotItem(i).data);}
|
||||||
else if(e.snapshotItem(i).constructor.name=="Attr"){a.push(e.snapshotItem(i).nodeValue);}
|
else if(e.snapshotItem(i).constructor.name=="Attr"){a.push(e.snapshotItem(i).nodeValue);}
|
||||||
|
else if(e.snapshotItem(i).constructor.name=="Comment"){a.push(e.snapshotItem(i).nodeValue);}
|
||||||
else{a.push(e.snapshotItem(i));}
|
else{a.push(e.snapshotItem(i));}
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
@ -49,6 +49,34 @@ class SessionElement(DrissionElement):
|
|||||||
def text(self) -> str:
|
def text(self) -> str:
|
||||||
"""返回元素内所有文本"""
|
"""返回元素内所有文本"""
|
||||||
|
|
||||||
|
# 为尽量保证与浏览器结果一致,弄得比较复杂
|
||||||
|
def get_node(ele, pre: bool = False):
|
||||||
|
str_list = []
|
||||||
|
if ele.tag == 'pre':
|
||||||
|
pre = True
|
||||||
|
for el in ele.eles('xpath:./text() | *'):
|
||||||
|
if isinstance(el, str):
|
||||||
|
if el.replace(' ', '').replace('\n', '') != '':
|
||||||
|
if pre:
|
||||||
|
str_list.append(el)
|
||||||
|
else:
|
||||||
|
str_list.append(el.replace('\n', ' ').strip(' \t'))
|
||||||
|
|
||||||
|
elif '\n' in el and str_list and str_list[-1] != '\n':
|
||||||
|
str_list.append('\n')
|
||||||
|
else:
|
||||||
|
str_list.append(' ')
|
||||||
|
else:
|
||||||
|
str_list.extend(get_node(el, pre))
|
||||||
|
if el.tag in ('br', 'p',) and str_list and str_list[-1] != '\n':
|
||||||
|
str_list.append('\n')
|
||||||
|
|
||||||
|
return str_list
|
||||||
|
|
||||||
|
re_str = ''.join(get_node(self))
|
||||||
|
re_str = re.sub(r' {2,}', ' ', re_str)
|
||||||
|
return format_html(re_str, False)
|
||||||
|
|
||||||
# re_str = str(self._inner_ele.text_content())
|
# re_str = str(self._inner_ele.text_content())
|
||||||
# # re_str = re.sub(r'<br */?>', '\n', re_str)
|
# # re_str = re.sub(r'<br */?>', '\n', re_str)
|
||||||
# re_str = re.sub(r'\n{2,}', '\n', re_str)
|
# re_str = re.sub(r'\n{2,}', '\n', re_str)
|
||||||
@ -56,30 +84,6 @@ class SessionElement(DrissionElement):
|
|||||||
# return format_html(re_str.strip('\n '))
|
# return format_html(re_str.strip('\n '))
|
||||||
# # return format_html(re_str)
|
# # return format_html(re_str)
|
||||||
|
|
||||||
# 为尽量保证与浏览器结果一致,弄得比较复杂
|
|
||||||
def get_node(ele):
|
|
||||||
str_list = []
|
|
||||||
for el in ele.eles('xpath:./node()'):
|
|
||||||
if isinstance(el, str):
|
|
||||||
if el.replace(' ', '').replace('\n', '') != '':
|
|
||||||
# str_list.append(el.replace('\xa0', ' ').replace('\n', ' ').strip())
|
|
||||||
str_list.append(el.replace('\n', ' ').strip(' '))
|
|
||||||
elif '\n' in el:
|
|
||||||
str_list.append('\n')
|
|
||||||
else:
|
|
||||||
str_list.append(' ')
|
|
||||||
else:
|
|
||||||
str_list.extend(get_node(el))
|
|
||||||
if el.tag in ('br', 'p',):
|
|
||||||
str_list.append('\n')
|
|
||||||
|
|
||||||
return str_list
|
|
||||||
|
|
||||||
re_str = ''.join(get_node(self))
|
|
||||||
re_str = re.sub(r'\n{2,}', '\n', re_str)
|
|
||||||
re_str = re.sub(r' {2,}', ' ', re_str)
|
|
||||||
return format_html(re_str.strip('\n '))
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tag(self) -> str:
|
def tag(self) -> str:
|
||||||
"""返回元素类型"""
|
"""返回元素类型"""
|
||||||
@ -120,26 +124,21 @@ class SessionElement(DrissionElement):
|
|||||||
"""返回前一个兄弟元素"""
|
"""返回前一个兄弟元素"""
|
||||||
return self._get_brother(1, 'ele', 'prev')
|
return self._get_brother(1, 'ele', 'prev')
|
||||||
|
|
||||||
|
@property
|
||||||
|
def comments(self):
|
||||||
|
return self.eles('xpath:.//comment()')
|
||||||
|
|
||||||
def texts(self, text_node_only: bool = False) -> list:
|
def texts(self, text_node_only: bool = False) -> list:
|
||||||
"""返回元素内所有直接子节点的文本,包括元素和文本节点 \n
|
"""返回元素内所有直接子节点的文本,包括元素和文本节点 \n
|
||||||
:param text_node_only: 是否只返回文本节点
|
:param text_node_only: 是否只返回文本节点
|
||||||
:return: 文本列表
|
:return: 文本列表
|
||||||
"""
|
"""
|
||||||
if text_node_only:
|
if text_node_only:
|
||||||
return self.eles('xpath:/text()')
|
texts = self.eles('xpath:/text()')
|
||||||
else:
|
else:
|
||||||
texts = []
|
texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]
|
||||||
|
|
||||||
for node in self.eles('xpath:/node()'):
|
return [format_html(x) for x in texts if x and x.replace('\n', '').replace('\t', '').replace(' ', '') != '']
|
||||||
if isinstance(node, str):
|
|
||||||
text = node
|
|
||||||
else:
|
|
||||||
text = node.text
|
|
||||||
|
|
||||||
if text:
|
|
||||||
texts.append(text)
|
|
||||||
|
|
||||||
return texts
|
|
||||||
|
|
||||||
def parents(self, num: int = 1):
|
def parents(self, num: int = 1):
|
||||||
"""返回上面第num级父元素 \n
|
"""返回上面第num级父元素 \n
|
||||||
@ -392,7 +391,7 @@ def execute_session_find(page_or_ele,
|
|||||||
page_or_ele = page_or_ele.inner_ele
|
page_or_ele = page_or_ele.inner_ele
|
||||||
else: # 传入的是SessionPage对象
|
else: # 传入的是SessionPage对象
|
||||||
page = page_or_ele
|
page = page_or_ele
|
||||||
page_or_ele = fromstring(page_or_ele.response.text)
|
page_or_ele = fromstring(re.sub(r'&nbsp;?', '&nbsp;', page_or_ele.response.text))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 用lxml内置方法获取lxml的元素对象列表
|
# 用lxml内置方法获取lxml的元素对象列表
|
||||||
|
Loading…
x
Reference in New Issue
Block a user