mirror of
https://gitee.com/g1879/DrissionPage.git
synced 2024-12-10 04:00:23 +08:00
274 lines
7.5 KiB
Python
274 lines
7.5 KiB
Python
# -*- coding:utf-8 -*-
|
||
"""
|
||
@Author : g1879
|
||
@Contact : g1879@qq.com
|
||
@File : base.py
|
||
"""
|
||
from abc import abstractmethod
|
||
from re import sub
|
||
from typing import Union, Tuple
|
||
|
||
from lxml.html import HtmlElement
|
||
from selenium.webdriver.remote.webelement import WebElement
|
||
|
||
from .common import format_html
|
||
|
||
|
||
class BaseParser(object):
    """Common base class of every page class and element class.

    The public lookup helpers (``ele``, ``eles`` and the call shortcut) all
    funnel into ``_ele``, which descendants are expected to provide.

    Note: the class derives from ``object`` rather than ``abc.ABC``, so the
    ``@abstractmethod`` decorators below only mark intent; Python does not
    enforce them at instantiation time.
    """

    def __call__(self, loc_or_str, timeout=None):
        """Shortcut for ``ele``: ``obj(loc)`` behaves like ``obj.ele(loc)``.

        :param loc_or_str: locator handed to ``ele``
        :param timeout: optional timeout handed to ``ele``; omitted when None
        :return: whatever ``ele`` returns
        """
        # Only forward timeout when the caller supplied one, so the plain
        # ``obj(loc)`` form keeps calling ``ele`` exactly as before.
        if timeout is None:
            return self.ele(loc_or_str)
        return self.ele(loc_or_str, timeout=timeout)

    def ele(self, loc_or_ele, timeout=None):
        """Look up a single result by calling ``_ele`` with ``single=True``.

        :param loc_or_ele: locator (or element) handed to ``_ele``
        :param timeout: timeout handed to ``_ele``
        :return: whatever ``_ele`` returns
        """
        return self._ele(loc_or_ele, timeout, True)

    def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout=None):
        """Look up all results by calling ``_ele`` with ``single=False``.

        :param loc_or_str: locator tuple or locator string handed to ``_ele``
        :param timeout: timeout handed to ``_ele``
        :return: whatever ``_ele`` returns
        """
        return self._ele(loc_or_str, timeout, False)

    # ---------------- attributes and methods below are to be implemented by descendants ----------------
    @property
    def html(self) -> str:
        """Placeholder; this base implementation returns an empty string."""
        return ''

    @abstractmethod
    def s_ele(self, loc_or_ele):
        """To be implemented by descendants."""
        pass

    @abstractmethod
    def s_eles(self, loc_or_str):
        """To be implemented by descendants."""
        pass

    @abstractmethod
    def _ele(self, loc_or_ele, timeout=None, single=True):
        """Lookup backend used by ``ele`` and ``eles``; implemented by descendants.

        :param loc_or_ele: locator (or element) to look up
        :param timeout: timeout for the lookup
        :param single: True when called from ``ele``, False from ``eles``
        """
        pass
|
||
|
||
|
||
class BaseElement(BaseParser):
    """Common base class of the element classes.

    Wraps an underlying element object and keeps an optional page reference.
    """

    def __init__(self, ele: Union[WebElement, HtmlElement], page=None):
        """Remember the wrapped element and the page object.

        :param ele: the underlying WebElement or HtmlElement
        :param page: page object stored on ``self.page``; None by default
        """
        self._inner_ele = ele
        self.page = page

    @property
    def inner_ele(self) -> Union[WebElement, HtmlElement]:
        """The wrapped element object given to the constructor."""
        wrapped = self._inner_ele
        return wrapped

    @property
    def next(self):
        """Return the next sibling element (``nexts()`` with its defaults)."""
        sibling = self.nexts()
        return sibling

    # ---------------- attributes and methods below are implemented by descendants ----------------
    @property
    def tag(self):
        """Placeholder; returns None in this base class."""
        return None

    @property
    def parent(self):
        """Placeholder; returns None in this base class."""
        return None

    @property
    def prev(self):
        """Placeholder; returns None in this base class."""
        return None

    @property
    def is_valid(self):
        """Placeholder; this base implementation always reports True."""
        return True

    @abstractmethod
    def nexts(self, num: int = 1):
        """To be implemented by descendants; used by the ``next`` property.

        :param num: which following sibling to return
        """
        pass
|
||
|
||
|
||
class DrissionElement(BaseElement):
    """Base class of DriverElement and SessionElement, but not of ShadowRootElement."""

    @property
    def parent(self):
        """Return the parent element (``parents()`` with its defaults)."""
        return self.parents()

    @property
    def prev(self):
        """Return the previous sibling element (``prevs()`` with its defaults)."""
        return self.prevs()

    @property
    def link(self) -> str:
        """Return the ``href`` attribute, or ``src`` when ``href`` is falsy.

        The original note describes the result as an absolute url; whether it
        is depends on the descendant's ``attr`` implementation.
        """
        return self.attr('href') or self.attr('src')

    @property
    def css_path(self) -> str:
        """Return the css path of the element."""
        return self._get_ele_path('css')

    @property
    def xpath(self) -> str:
        """Return the xpath of the element."""
        return self._get_ele_path('xpath')

    @property
    def comments(self) -> list:
        """Return the comment nodes found anywhere below this element."""
        return self.eles('xpath:.//comment()')

    def texts(self, text_node_only: bool = False) -> list:
        """Return the texts of the element's direct child nodes.

        :param text_node_only: when True only text nodes are collected,
                               otherwise child elements contribute their text too
        :return: list of texts; entries that are empty or consist only of
                 newlines, tabs and spaces are dropped
        """
        if text_node_only:
            # Relative path, like every other lookup in this class; a bare
            # '/text()' is an absolute path in standard XPath.
            texts = self.eles('xpath:./text()')
        else:
            # Text nodes arrive as strings, child elements as objects with .text
            texts = [x if isinstance(x, str) else x.text for x in self.eles('xpath:./text() | *')]

        return [format_html(x.strip(' ').rstrip('\n')) for x in texts if x and sub('[\n\t ]', '', x) != '']

    def nexts(self, num: int = 1, mode: str = 'ele'):
        """Return the num-th following sibling element or node.

        :param num: which following sibling to return
        :param mode: 'ele', 'node' or 'text' - match elements, nodes or text nodes
        :return: whatever ``_get_brother`` finds
        """
        return self._get_brother(num, mode, 'next')

    def prevs(self, num: int = 1, mode: str = 'ele'):
        """Return the num-th preceding sibling element or node.

        :param num: which preceding sibling to return
        :param mode: 'ele', 'node' or 'text' - match elements, nodes or text nodes
        :return: whatever ``_get_brother`` finds
        """
        return self._get_brother(num, mode, 'prev')

    def _get_brother(self, num: int = 1, mode: str = 'ele', direction: str = 'next'):
        """Return the num-th sibling node or element in the given direction.

        :param num: which sibling to return, counted away from this element
        :param mode: 'ele', 'node' or 'text' - match elements, nodes or text nodes
        :param direction: 'next' or 'prev', the direction to search in
        :return: the result of ``_ele``; string results made up only of
                 newlines, tabs and spaces are skipped
        :raises ValueError: if mode or direction is not one of the allowed values
        """
        # Kind of node to look for
        if mode == 'ele':
            node_txt = '*'
        elif mode == 'node':
            node_txt = 'node()'
        elif mode == 'text':
            node_txt = 'text()'
        else:
            raise ValueError(f"mode参数只能是'node'、'ele'或'text',现在是:'{mode}'。")

        # Direction to search in
        if direction == 'next':
            direction_txt = 'following'
        elif direction == 'prev':
            direction_txt = 'preceding'
        else:
            raise ValueError(f"direction参数只能是'next'或'prev',现在是:'{direction}'。")

        # NOTE(review): no wait for 'prev', half a second for 'next' - the
        # reason is not visible here; presumably earlier siblings already exist.
        timeout = 0 if direction == 'prev' else .5

        while True:
            ele_or_node = self._ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{num}]', timeout=timeout)

            # Skip whitespace-only text between elements by moving one
            # sibling further; anything else is the answer.
            if not (isinstance(ele_or_node, str) and sub('[\n\t ]', '', ele_or_node) == ''):
                return ele_or_node

            num += 1

    # ---------------- attributes and methods below are implemented by descendants ----------------
    @property
    def attrs(self):
        """Placeholder; returns None in this base class."""
        return

    @property
    def text(self):
        """Placeholder; returns None in this base class."""
        return

    @property
    def raw_text(self):
        """Placeholder; returns None in this base class."""
        return

    @abstractmethod
    def parents(self, num: int = 1):
        """To be implemented by descendants; used by the ``parent`` property.

        :param num: how many levels up to go
        """
        pass

    @abstractmethod
    def attr(self, attr: str):
        """To be implemented by descendants; this base version returns ''.

        :param attr: attribute name
        """
        return ''

    def _get_ele_path(self, mode):
        """Placeholder used by ``css_path`` and ``xpath``; returns '' here.

        :param mode: 'css' or 'xpath'
        """
        return ''
|
||
|
||
|
||
class BasePage(BaseParser):
    """Common base class of the page classes."""

    def __init__(self, timeout: float = 10):
        """Set up the default state of a page object.

        :param timeout: seconds to wait when looking up elements
        """
        self._url = None
        self.timeout = timeout  # goes through the timeout setter below
        self.retry_times = 3
        self.retry_interval = 2
        self._url_available = None

    @property
    def title(self) -> Union[str, None]:
        """Return the page title, or None when no title element is found."""
        title_ele = self.ele('xpath:/html/head/title')
        if title_ele:
            return title_ele.text
        return None

    @property
    def timeout(self) -> float:
        """Seconds to wait when looking up elements."""
        return self._timeout

    @timeout.setter
    def timeout(self, second: float) -> None:
        """Set the seconds to wait when looking up elements."""
        self._timeout = second

    @property
    def cookies(self) -> dict:
        """Return the cookies, i.e. the result of ``get_cookies(True)``."""
        return self.get_cookies(True)

    @property
    def url_available(self) -> bool:
        """Return the stored availability flag of the currently visited url."""
        return self._url_available

    # ---------------- attributes and methods below are implemented by descendants ----------------
    @property
    def url(self):
        """Placeholder; returns None in this base class."""
        return None

    @property
    def json(self):
        """Placeholder; returns None in this base class."""
        return None

    @abstractmethod
    def get_cookies(self, as_dict: bool = False):
        """To be implemented by descendants; this base version returns {}.

        :param as_dict: the ``cookies`` property calls this with True
        """
        return {}

    @abstractmethod
    def get(self,
            url: str,
            go_anyway: bool = False,
            show_errmsg: bool = False,
            retry: int = None,
            interval: float = None):
        """To be implemented by descendants."""
        pass

    @abstractmethod
    def _try_to_connect(self,
                        to_url: str,
                        times: int = 0,
                        interval: float = 1,
                        show_errmsg: bool = False, ):
        """To be implemented by descendants."""
        pass
|