mirror of
https://gitee.com/g1879/DrissionPage.git
synced 2024-12-10 04:00:23 +08:00
继续完善text属性返回内容;增加comments属性;完善对元素内有注释时节点的获取
This commit is contained in:
parent
dfc557b2df
commit
744e09c649
@ -4,7 +4,6 @@
|
||||
@Contact : g1879@qq.com
|
||||
@File : driver_element.py
|
||||
"""
|
||||
import re
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
from typing import Union, List, Any, Tuple
|
||||
@ -79,13 +78,13 @@ class DriverElement(DrissionElement):
|
||||
@property
|
||||
def text(self) -> str:
|
||||
"""返回元素内所有文本"""
|
||||
return format_html(self.inner_ele.get_attribute('innerText'), False)
|
||||
# return self.inner_ele.get_attribute('innerText')
|
||||
re_str = self.inner_ele.get_attribute('innerText')
|
||||
re_str = re.sub(r'\n{2,}', '\n', re_str)
|
||||
re_str = re.sub(r' {2,}', ' ', re_str)
|
||||
|
||||
return format_html(re_str.strip('\n '))
|
||||
# return re_str.strip('\n ')
|
||||
# re_str = self.inner_ele.get_attribute('innerText')
|
||||
# re_str = re.sub(r'\n{2,}', '\n', re_str)
|
||||
# re_str = re.sub(r' {2,}', ' ', re_str)
|
||||
#
|
||||
# return format_html(re_str.strip('\n '))
|
||||
|
||||
@property
|
||||
def link(self) -> str:
|
||||
@ -116,6 +115,10 @@ class DriverElement(DrissionElement):
|
||||
"""返回前一个兄弟元素"""
|
||||
return self._get_brother(1, 'ele', 'prev')
|
||||
|
||||
@property
|
||||
def comments(self):
|
||||
return self.eles('xpath:.//comment()')
|
||||
|
||||
# -----------------driver独占属性-------------------
|
||||
@property
|
||||
def size(self) -> dict:
|
||||
@ -152,9 +155,9 @@ class DriverElement(DrissionElement):
|
||||
:return: 文本列表
|
||||
"""
|
||||
if text_node_only:
|
||||
return self.eles('xpath:./text()')
|
||||
return self.eles('xpath:/text()')
|
||||
else:
|
||||
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./node()')]
|
||||
return [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]
|
||||
|
||||
def parents(self, num: int = 1):
|
||||
"""返回上面第num级父元素 \n
|
||||
@ -576,7 +579,7 @@ class DriverElement(DrissionElement):
|
||||
ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)
|
||||
|
||||
# 跳过元素间的换行符
|
||||
while ele_or_node == '\n':
|
||||
while isinstance(ele_or_node, str) and ele_or_node.replace('\n', '').replace('\t', '').replace(' ', '') == '':
|
||||
num += 1
|
||||
ele_or_node = self.ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)
|
||||
|
||||
@ -662,6 +665,7 @@ class ElementsByXpath(object):
|
||||
return_txt = '''
|
||||
if(e.singleNodeValue.constructor.name=="Text"){return e.singleNodeValue.data;}
|
||||
else if(e.singleNodeValue.constructor.name=="Attr"){return e.singleNodeValue.nodeValue;}
|
||||
else if(e.singleNodeValue.constructor.name=="Comment"){return e.singleNodeValue.nodeValue;}
|
||||
else{return e.singleNodeValue;}
|
||||
'''
|
||||
|
||||
@ -672,6 +676,7 @@ class ElementsByXpath(object):
|
||||
for(var i = 0; i <e.snapshotLength ; i++){
|
||||
if(e.snapshotItem(i).constructor.name=="Text"){a.push(e.snapshotItem(i).data);}
|
||||
else if(e.snapshotItem(i).constructor.name=="Attr"){a.push(e.snapshotItem(i).nodeValue);}
|
||||
else if(e.snapshotItem(i).constructor.name=="Comment"){a.push(e.snapshotItem(i).nodeValue);}
|
||||
else{a.push(e.snapshotItem(i));}
|
||||
}
|
||||
"""
|
||||
|
@ -49,6 +49,34 @@ class SessionElement(DrissionElement):
|
||||
def text(self) -> str:
|
||||
"""返回元素内所有文本"""
|
||||
|
||||
# 为尽量保证与浏览器结果一致,弄得比较复杂
|
||||
def get_node(ele, pre: bool = False):
|
||||
str_list = []
|
||||
if ele.tag == 'pre':
|
||||
pre = True
|
||||
for el in ele.eles('xpath:./text() | *'):
|
||||
if isinstance(el, str):
|
||||
if el.replace(' ', '').replace('\n', '') != '':
|
||||
if pre:
|
||||
str_list.append(el)
|
||||
else:
|
||||
str_list.append(el.replace('\n', ' ').strip(' \t'))
|
||||
|
||||
elif '\n' in el and str_list and str_list[-1] != '\n':
|
||||
str_list.append('\n')
|
||||
else:
|
||||
str_list.append(' ')
|
||||
else:
|
||||
str_list.extend(get_node(el, pre))
|
||||
if el.tag in ('br', 'p',) and str_list and str_list[-1] != '\n':
|
||||
str_list.append('\n')
|
||||
|
||||
return str_list
|
||||
|
||||
re_str = ''.join(get_node(self))
|
||||
re_str = re.sub(r' {2,}', ' ', re_str)
|
||||
return format_html(re_str, False)
|
||||
|
||||
# re_str = str(self._inner_ele.text_content())
|
||||
# # re_str = re.sub(r'<br */?>', '\n', re_str)
|
||||
# re_str = re.sub(r'\n{2,}', '\n', re_str)
|
||||
@ -56,30 +84,6 @@ class SessionElement(DrissionElement):
|
||||
# return format_html(re_str.strip('\n '))
|
||||
# # return format_html(re_str)
|
||||
|
||||
# 为尽量保证与浏览器结果一致,弄得比较复杂
|
||||
def get_node(ele):
|
||||
str_list = []
|
||||
for el in ele.eles('xpath:./node()'):
|
||||
if isinstance(el, str):
|
||||
if el.replace(' ', '').replace('\n', '') != '':
|
||||
# str_list.append(el.replace('\xa0', ' ').replace('\n', ' ').strip())
|
||||
str_list.append(el.replace('\n', ' ').strip(' '))
|
||||
elif '\n' in el:
|
||||
str_list.append('\n')
|
||||
else:
|
||||
str_list.append(' ')
|
||||
else:
|
||||
str_list.extend(get_node(el))
|
||||
if el.tag in ('br', 'p',):
|
||||
str_list.append('\n')
|
||||
|
||||
return str_list
|
||||
|
||||
re_str = ''.join(get_node(self))
|
||||
re_str = re.sub(r'\n{2,}', '\n', re_str)
|
||||
re_str = re.sub(r' {2,}', ' ', re_str)
|
||||
return format_html(re_str.strip('\n '))
|
||||
|
||||
@property
|
||||
def tag(self) -> str:
|
||||
"""返回元素类型"""
|
||||
@ -120,26 +124,21 @@ class SessionElement(DrissionElement):
|
||||
"""返回前一个兄弟元素"""
|
||||
return self._get_brother(1, 'ele', 'prev')
|
||||
|
||||
@property
|
||||
def comments(self):
|
||||
return self.eles('xpath:.//comment()')
|
||||
|
||||
def texts(self, text_node_only: bool = False) -> list:
|
||||
"""返回元素内所有直接子节点的文本,包括元素和文本节点 \n
|
||||
:param text_node_only: 是否只返回文本节点
|
||||
:return: 文本列表
|
||||
"""
|
||||
if text_node_only:
|
||||
return self.eles('xpath:/text()')
|
||||
texts = self.eles('xpath:/text()')
|
||||
else:
|
||||
texts = []
|
||||
texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]
|
||||
|
||||
for node in self.eles('xpath:/node()'):
|
||||
if isinstance(node, str):
|
||||
text = node
|
||||
else:
|
||||
text = node.text
|
||||
|
||||
if text:
|
||||
texts.append(text)
|
||||
|
||||
return texts
|
||||
return [format_html(x) for x in texts if x and x.replace('\n', '').replace('\t', '').replace(' ', '') != '']
|
||||
|
||||
def parents(self, num: int = 1):
|
||||
"""返回上面第num级父元素 \n
|
||||
@ -392,7 +391,7 @@ def execute_session_find(page_or_ele,
|
||||
page_or_ele = page_or_ele.inner_ele
|
||||
else: # 传入的是SessionPage对象
|
||||
page = page_or_ele
|
||||
page_or_ele = fromstring(page_or_ele.response.text)
|
||||
page_or_ele = fromstring(re.sub(r'&nbsp;?', '&nbsp;', page_or_ele.response.text))
|
||||
|
||||
try:
|
||||
# 用lxml内置方法获取lxml的元素对象列表
|
||||
|
Loading…
x
Reference in New Issue
Block a user