274 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding:utf-8 -*-
"""
@Author : g1879
@Contact : g1879@qq.com
@File : base.py
"""
from abc import abstractmethod
from re import sub
from typing import Union, Tuple
from lxml.html import HtmlElement
from selenium.webdriver.remote.webelement import WebElement
from .common import format_html
class BaseParser(object):
    """Common base class for every page class and element class.

    The public lookup entry points (calling the object, ``ele`` and ``eles``)
    all funnel into ``_ele``, which subclasses are expected to implement.
    """

    def __call__(self, loc_or_str, timeout=None):
        """Make ``obj(loc)`` a shortcut for ``obj.ele(loc)``.

        :param loc_or_str: locator handed to ``ele``
        :param timeout: optional wait time handed to ``ele``; when omitted,
            ``ele`` is called with the locator only
        :return: whatever ``ele`` returns
        """
        if timeout is None:
            # Keep the one-argument call so subclasses that override ``ele``
            # with a different signature behave exactly as before.
            return self.ele(loc_or_str)
        return self.ele(loc_or_str, timeout=timeout)

    def ele(self, loc_or_ele, timeout=None):
        """Look up a single result by delegating to ``_ele`` with ``single=True``.

        :param loc_or_ele: locator (or element) handed to ``_ele``
        :param timeout: wait time handed to ``_ele``
        :return: whatever ``_ele`` returns
        """
        return self._ele(loc_or_ele, timeout, True)

    def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout=None):
        """Look up all results by delegating to ``_ele`` with ``single=False``.

        :param loc_or_str: locator, either a ``(by, value)`` tuple or a string
        :param timeout: wait time handed to ``_ele``
        :return: whatever ``_ele`` returns
        """
        return self._ele(loc_or_str, timeout, False)

    # -------- attributes and methods below are meant to be implemented by subclasses --------
    @property
    def html(self) -> str:
        """Placeholder: returns an empty string in this base class."""
        return ''

    @abstractmethod
    def s_ele(self, loc_or_ele):
        """Placeholder to be implemented by subclasses; does nothing here."""
        pass

    @abstractmethod
    def s_eles(self, loc_or_str):
        """Placeholder to be implemented by subclasses; does nothing here."""
        pass

    @abstractmethod
    def _ele(self, loc_or_ele, timeout=None, single=True):
        """Lookup hook used by ``ele`` and ``eles``; does nothing here.

        :param loc_or_ele: locator (or element) to look up
        :param timeout: wait time for the lookup
        :param single: ``True`` when called from ``ele``, ``False`` from ``eles``
        """
        pass
class BaseElement(BaseParser):
    """Base class shared by all element classes."""

    def __init__(self, ele: Union[WebElement, HtmlElement], page=None):
        """Remember the wrapped element and the page object passed in.

        :param ele: the underlying ``WebElement`` or ``HtmlElement``
        :param page: page object to associate with this element (optional)
        """
        self._inner_ele = ele
        self.page = page

    @property
    def inner_ele(self) -> Union[WebElement, HtmlElement]:
        """The wrapped element object given to the constructor."""
        return self._inner_ele

    @property
    def next(self):
        """Result of ``nexts()`` called with its default arguments."""
        return self.nexts()

    # -------- attributes and methods below are implemented by subclasses --------
    @property
    def tag(self):
        """Placeholder: returns ``None`` in this base class."""
        return None

    @property
    def parent(self):
        """Placeholder: returns ``None`` in this base class."""
        return None

    @property
    def prev(self):
        """Placeholder: returns ``None`` in this base class."""
        return None

    @property
    def is_valid(self):
        """Placeholder: always ``True`` in this base class."""
        return True

    @abstractmethod
    def nexts(self, num: int = 1):
        """Placeholder to be implemented by subclasses; does nothing here.

        :param num: which following sibling to fetch
        """
        pass
class DrissionElement(BaseElement):
    """Base class of DriverElement and SessionElement (but not of ShadowRootElement)."""

    @property
    def parent(self):
        """Result of ``parents()`` called with its default arguments."""
        return self.parents()

    @property
    def prev(self):
        """Result of ``prevs()`` called with its default arguments."""
        return self.prevs()

    @property
    def link(self) -> str:
        """The ``href`` attribute, or ``src`` when ``href`` is empty or missing.

        NOTE(review): whether the value is an absolute url depends on the
        subclass's ``attr`` implementation -- confirm there.
        """
        return self.attr('href') or self.attr('src')

    @property
    def css_path(self) -> str:
        """Element path in css form, as produced by ``_get_ele_path('css')``."""
        return self._get_ele_path('css')

    @property
    def xpath(self) -> str:
        """Element path in xpath form, as produced by ``_get_ele_path('xpath')``."""
        return self._get_ele_path('xpath')

    @property
    def comments(self) -> list:
        """Comment nodes found anywhere below this element (``.//comment()``)."""
        return self.eles('xpath:.//comment()')

    def texts(self, text_node_only: bool = False) -> list:
        """Return the texts of this element's direct children.

        :param text_node_only: when true, only direct text nodes are queried;
            otherwise direct text nodes and child elements are both queried and
            each child element contributes its ``text``
        :return: list of cleaned texts; empty and whitespace-only entries
            (newline, tab, space) are dropped
        """
        if text_node_only:
            # Explicitly relative, matching the branch below. The previous
            # '/text()' was an absolute path and only worked if the subclass
            # lookup rewrote it to a relative one.
            raw_items = self.eles('xpath:./text()')
        else:
            raw_items = [node if isinstance(node, str) else node.text
                         for node in self.eles('xpath:./text() | *')]

        return [format_html(item.strip(' ').rstrip('\n'))
                for item in raw_items
                if item and sub('[\n\t ]', '', item) != '']

    def nexts(self, num: int = 1, mode: str = 'ele'):
        """Return the ``num``-th following sibling element or node.

        :param num: which following sibling to fetch (1 is the nearest)
        :param mode: ``'ele'``, ``'node'`` or ``'text'`` -- match elements,
            any nodes, or text nodes
        :return: whatever ``_get_brother`` returns for the ``'next'`` direction
        """
        return self._get_brother(num, mode, 'next')

    def prevs(self, num: int = 1, mode: str = 'ele'):
        """Return the ``num``-th preceding sibling element or node.

        :param num: which preceding sibling to fetch (1 is the nearest)
        :param mode: ``'ele'``, ``'node'`` or ``'text'`` -- match elements,
            any nodes, or text nodes
        :return: whatever ``_get_brother`` returns for the ``'prev'`` direction
        """
        return self._get_brother(num, mode, 'prev')

    def _get_brother(self, num: int = 1, mode: str = 'ele', direction: str = 'next'):
        """Return the ``num``-th sibling element or node in the given direction.

        :param num: which sibling to fetch, counted away from this element
        :param mode: ``'ele'``, ``'node'`` or ``'text'`` -- match elements,
            any nodes, or text nodes
        :param direction: ``'next'`` or ``'prev'`` -- which way to search
        :return: the result of the ``_ele`` lookup (an element object or a string)
        :raises ValueError: if ``mode`` or ``direction`` is not one of the
            accepted values
        """
        # Node test of the xpath step, chosen by ``mode``.
        if mode == 'ele':
            node_txt = '*'
        elif mode == 'node':
            node_txt = 'node()'
        elif mode == 'text':
            node_txt = 'text()'
        else:
            raise ValueError(f"mode参数只能是'node''ele''text',现在是:'{mode}'")

        # Sibling axis of the xpath step, chosen by ``direction``.
        if direction == 'next':
            direction_txt = 'following'
        elif direction == 'prev':
            direction_txt = 'preceding'
        else:
            raise ValueError(f"direction参数只能是'next''prev',现在是:'{direction}'")

        # Preceding siblings are looked up without waiting; following ones get
        # a short wait.
        timeout = 0 if direction == 'prev' else .5

        def lookup(index):
            """Fetch the sibling at position ``index`` on the chosen axis."""
            return self._ele(f'xpath:./{direction_txt}-sibling::{node_txt}[{index}]', timeout=timeout)

        ele_or_node = lookup(num)

        # Skip whitespace-only text (line breaks between elements) by moving
        # on to the next position until something else is returned.
        while isinstance(ele_or_node, str) and sub('[\n\t ]', '', ele_or_node) == '':
            num += 1
            ele_or_node = lookup(num)

        return ele_or_node

    # -------- attributes and methods below are implemented by subclasses --------
    @property
    def attrs(self):
        """Placeholder: returns ``None`` in this base class."""
        return

    @property
    def text(self):
        """Placeholder: returns ``None`` in this base class."""
        return

    @property
    def raw_text(self):
        """Placeholder: returns ``None`` in this base class."""
        return

    @abstractmethod
    def parents(self, num: int = 1):
        """Placeholder to be implemented by subclasses; does nothing here.

        :param num: which ancestor level to fetch
        """
        pass

    @abstractmethod
    def attr(self, attr: str):
        """Placeholder to be implemented by subclasses; returns ``''`` here.

        :param attr: attribute name
        """
        return ''

    def _get_ele_path(self, mode):
        """Placeholder to be implemented by subclasses; returns ``''`` here.

        :param mode: ``'css'`` or ``'xpath'`` (the values used by ``css_path``
            and ``xpath``)
        """
        return ''
class BasePage(BaseParser):
    """Base class of the page classes."""

    def __init__(self, timeout: float = 10):
        """Set up the default page state.

        :param timeout: seconds to wait when looking up elements; stored
            through the ``timeout`` property
        """
        self._url = None
        self.timeout = timeout  # goes through the property setter below
        self.retry_times = 3
        self.retry_interval = 2
        self._url_available = None

    @property
    def title(self) -> Union[str, None]:
        """Text of the ``/html/head/title`` element, or ``None`` if the lookup is falsy."""
        title_ele = self.ele('xpath:/html/head/title')
        if not title_ele:
            return None
        return title_ele.text

    @property
    def timeout(self) -> float:
        """Seconds to wait when looking up elements."""
        return self._timeout

    @timeout.setter
    def timeout(self, second: float) -> None:
        """Set the seconds to wait when looking up elements."""
        self._timeout = second

    @property
    def cookies(self) -> dict:
        """Cookies as returned by ``get_cookies(True)``."""
        return self.get_cookies(True)

    @property
    def url_available(self) -> bool:
        """Stored availability flag of the current url (``None`` until it is set)."""
        return self._url_available

    # -------- attributes and methods below are implemented by subclasses --------
    @property
    def url(self):
        """Placeholder: returns ``None`` in this base class."""
        return None

    @property
    def json(self):
        """Placeholder: returns ``None`` in this base class."""
        return None

    @abstractmethod
    def get_cookies(self, as_dict: bool = False):
        """Placeholder to be implemented by subclasses; returns ``{}`` here.

        :param as_dict: whether the cookies should be returned as a dict
        """
        return {}

    @abstractmethod
    def get(self,
            url: str,
            go_anyway: bool = False,
            show_errmsg: bool = False,
            retry: int = None,
            interval: float = None):
        """Placeholder to be implemented by subclasses; does nothing here.

        :param url: target url
        :param go_anyway: flag passed by callers; meaning defined by subclasses
        :param show_errmsg: flag passed by callers; meaning defined by subclasses
        :param retry: retry count; defaults to ``None``
        :param interval: retry interval; defaults to ``None``
        """
        pass

    @abstractmethod
    def _try_to_connect(self,
                        to_url: str,
                        times: int = 0,
                        interval: float = 1,
                        show_errmsg: bool = False, ):
        """Placeholder to be implemented by subclasses; does nothing here.

        :param to_url: target url
        :param times: retry count
        :param interval: retry interval
        :param show_errmsg: flag passed by callers; meaning defined by subclasses
        """
        pass