添加s_ele(),未完成

This commit is contained in:
g1879 2021-11-19 21:48:00 +08:00
parent 4241ac000d
commit 391d042635
9 changed files with 99 additions and 80 deletions

View File

@ -6,7 +6,7 @@
""" """
from abc import abstractmethod from abc import abstractmethod
from re import sub from re import sub
from typing import Union from typing import Union, Tuple
from lxml.html import HtmlElement from lxml.html import HtmlElement
from selenium.webdriver.remote.webelement import WebElement from selenium.webdriver.remote.webelement import WebElement
@ -21,14 +21,26 @@ class BaseParser(object):
timeout: float = None): timeout: float = None):
return self.ele(loc_or_str, mode, timeout) return self.ele(loc_or_str, mode, timeout)
def eles(self, loc_or_str, timeout): def eles(self,
loc_or_str: Union[Tuple[str, str], str],
timeout: float = None):
return self.ele(loc_or_str, mode='all', timeout=timeout) return self.ele(loc_or_str, mode='all', timeout=timeout)
def s_eles(self,
loc_or_str: Union[Tuple[str, str], str],
timeout: float = None):
"""查找并以SessionElement方式返回元素"""
return self.s_ele(loc_or_str, mode='all', timeout=timeout)
# ----------------以下属性或方法待后代实现---------------- # ----------------以下属性或方法待后代实现----------------
@property @property
def html(self): def html(self):
return return
@abstractmethod
def s_ele(self, loc_or_ele, mode='single', timeout=None):
pass
@abstractmethod @abstractmethod
def ele(self, loc_or_ele, mode='single', timeout=None): def ele(self, loc_or_ele, mode='single', timeout=None):
pass pass
@ -50,8 +62,8 @@ class BaseElement(BaseParser):
"""返回后一个兄弟元素""" """返回后一个兄弟元素"""
return self.nexts() return self.nexts()
def eles(self, loc_or_str, timeout): # def eles(self, loc_or_str, timeout):
return super().eles(loc_or_str, timeout) # return super().eles(loc_or_str, timeout)
# ----------------以下属性或方法由后代实现---------------- # ----------------以下属性或方法由后代实现----------------
@property @property
@ -251,8 +263,8 @@ class BasePage(BaseParser):
"""返回当前访问的url有效性""" """返回当前访问的url有效性"""
return self._url_available return self._url_available
def eles(self, loc_or_str, timeout): # def eles(self, loc_or_str, timeout):
return super().eles(loc_or_str, timeout) # return super().eles(loc_or_str, timeout)
# ----------------以下属性或方法由后代实现---------------- # ----------------以下属性或方法由后代实现----------------
@property @property

View File

@ -158,7 +158,11 @@ def _make_search_str(search_str: str) -> str:
def format_html(text: str, trans: bool = True) -> str: def format_html(text: str, trans: bool = True) -> str:
"""处理html编码字符""" """处理html编码字符 \n
:param text: html文本
:param trans: 是否转码
:return: 格式化后的html文本
"""
if not text: if not text:
return text return text
@ -173,8 +177,10 @@ def translate_loc(loc: tuple) -> tuple:
:param loc: By类型的loc元组 :param loc: By类型的loc元组
:return: css selector或xpath类型的loc元组 :return: css selector或xpath类型的loc元组
""" """
if len(loc) != 2:
raise ValueError('定位符长度必须为2。')
loc_by = 'xpath' loc_by = 'xpath'
loc_str = None
if loc[0] == 'xpath': if loc[0] == 'xpath':
loc_str = loc[1] loc_str = loc[1]
@ -201,6 +207,9 @@ def translate_loc(loc: tuple) -> tuple:
elif loc[0] == 'partial link text': elif loc[0] == 'partial link text':
loc_str = f'//a[contains(text(),"{loc[1]}")]' loc_str = f'//a[contains(text(),"{loc[1]}")]'
else:
raise ValueError('无法识别的定位符。')
return loc_by, loc_str return loc_by, loc_str

View File

@ -1,19 +1,19 @@
[paths] [paths]
chromedriver_path = chromedriver_path = D:\python\Google Chrome\Chrome\chromedriver75.exe
tmp_path = tmp_path = D:\python\projects\DrissionPage\DrissionPage\tmp
[chrome_options] [chrome_options]
debugger_address = debugger_address = 127.0.0.1:9222
binary_location = binary_location = D:\python\Google Chrome\Chrome\chrome.exe
arguments = ['--no-sandbox', '--disable-gpu', '--ignore-certificate-errors', '--disable-infobars'] arguments = ['--no-sandbox', '--disable-gpu', '--ignore-certificate-errors', '--disable-infobars']
extensions = [] extensions = []
experimental_options = {'prefs': {'profile.default_content_settings.popups': 0, 'profile.default_content_setting_values': {'notifications': 2}, 'plugins.plugins_list': [{'enabled': False, 'name': 'Chrome PDF Viewer'}]}, 'useAutomationExtension': False, 'excludeSwitches': ['enable-automation']} experimental_options = {'prefs': {'profile.default_content_settings.popups': 0, 'profile.default_content_setting_values': {'notifications': 2}, 'plugins.plugins_list': [{'enabled': False, 'name': 'Chrome PDF Viewer'}]}, 'useAutomationExtension': False, 'excludeSwitches': ['enable-automation']}
[session_options] [session_options]
headers = { headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Connection": "keep-alive", "Connection": "keep-alive",
"Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7" "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7"
} }

View File

@ -18,6 +18,7 @@ from selenium.webdriver.support.wait import WebDriverWait
from .base import DrissionElement, BaseElement from .base import DrissionElement, BaseElement
from .common import str_to_loc, get_available_file_name, translate_loc, format_html from .common import str_to_loc, get_available_file_name, translate_loc, format_html
from .session_element import make_session_ele
class DriverElement(DrissionElement): class DriverElement(DrissionElement):
@ -123,16 +124,7 @@ class DriverElement(DrissionElement):
:param timeout: 查找元素超时时间 :param timeout: 查找元素超时时间
:return: DriverElement对象 :return: DriverElement对象
""" """
if isinstance(loc_or_str, (str, tuple)): loc_or_str = str_to_loc(loc_or_str) if isinstance(loc_or_str, str) else translate_loc(loc_or_str)
if isinstance(loc_or_str, str):
loc_or_str = str_to_loc(loc_or_str)
else:
if len(loc_or_str) != 2:
raise ValueError("Len of loc_or_str must be 2 when it's a tuple.")
loc_or_str = translate_loc(loc_or_str)
else:
raise ValueError('Argument loc_or_str can only be tuple or str.')
loc_str = loc_or_str[1] loc_str = loc_or_str[1]
if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'): if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'):
@ -142,9 +134,11 @@ class DriverElement(DrissionElement):
loc_str = f'{self.css_path}{loc_or_str[1]}' loc_str = f'{self.css_path}{loc_or_str[1]}'
loc_or_str = loc_or_str[0], loc_str loc_or_str = loc_or_str[0], loc_str
return execute_driver_find(self, loc_or_str, mode, timeout) return execute_driver_find(self, loc_or_str, mode, timeout)
def s_ele(self, loc_or_ele, mode='single', timeout=None):
return make_session_ele(self, loc_or_ele, mode)
def eles(self, def eles(self,
loc_or_str: Union[Tuple[str, str], str], loc_or_str: Union[Tuple[str, str], str],
timeout: float = None): timeout: float = None):

View File

@ -19,6 +19,7 @@ from selenium.webdriver.support.wait import WebDriverWait
from .base import BasePage from .base import BasePage
from .common import str_to_loc, get_available_file_name, translate_loc, format_html from .common import str_to_loc, get_available_file_name, translate_loc, format_html
from .driver_element import DriverElement, execute_driver_find, _wait_ele from .driver_element import DriverElement, execute_driver_find, _wait_ele
from .session_element import make_session_ele
class DriverPage(BasePage): class DriverPage(BasePage):
@ -127,6 +128,9 @@ class DriverPage(BasePage):
return execute_driver_find(self, loc_or_ele, mode, timeout) return execute_driver_find(self, loc_or_ele, mode, timeout)
def s_ele(self, loc_or_ele, mode='single', timeout=None):
return make_session_ele(self, loc_or_ele, mode)
def eles(self, def eles(self,
loc_or_str: Union[Tuple[str, str], str], loc_or_str: Union[Tuple[str, str], str],
timeout: float = None) -> List[DriverElement]: timeout: float = None) -> List[DriverElement]:
@ -135,9 +139,6 @@ class DriverPage(BasePage):
:param timeout: 查找元素超时时间 :param timeout: 查找元素超时时间
:return: DriverElement对象组成的列表 :return: DriverElement对象组成的列表
""" """
if not isinstance(loc_or_str, (tuple, str)):
raise TypeError('Type of loc_or_str can only be tuple or str.')
return super().eles(loc_or_str, timeout) return super().eles(loc_or_str, timeout)
def get_cookies(self, as_dict: bool = False) -> Union[list, dict]: def get_cookies(self, as_dict: bool = False) -> Union[list, dict]:

View File

@ -133,6 +133,12 @@ class MixPage(SessionPage, DriverPage, BasePage):
elif self._mode == 'd': elif self._mode == 'd':
return super(SessionPage, self).ele(loc_or_ele, mode=mode, timeout=timeout) return super(SessionPage, self).ele(loc_or_ele, mode=mode, timeout=timeout)
def s_ele(self, loc_or_ele, mode='single', timeout=None):
if self._mode == 's':
return super().s_ele(loc_or_ele, mode=mode)
elif self._mode == 'd':
return super(SessionPage, self).s_ele(loc_or_ele, mode=mode, timeout=timeout)
def eles(self, def eles(self,
loc_or_str: Union[Tuple[str, str], str], loc_or_str: Union[Tuple[str, str], str],
timeout: float = None) -> Union[List[DriverElement], List[SessionElement]]: timeout: float = None) -> Union[List[DriverElement], List[SessionElement]]:

View File

@ -145,16 +145,7 @@ class SessionElement(DrissionElement):
:param timeout: 不起实际作用用于和父类对应 :param timeout: 不起实际作用用于和父类对应
:return: SessionElement对象 :return: SessionElement对象
""" """
if isinstance(loc_or_str, (str, tuple)): loc_or_str = str_to_loc(loc_or_str) if isinstance(loc_or_str, str) else translate_loc(loc_or_str)
if isinstance(loc_or_str, str):
loc_or_str = str_to_loc(loc_or_str)
else:
if len(loc_or_str) != 2:
raise ValueError("Len of loc_or_str must be 2 when it's a tuple.")
loc_or_str = translate_loc(loc_or_str)
else:
raise ValueError('Argument loc_or_str can only be tuple or str.')
element = self element = self
loc_str = loc_or_str[1] loc_str = loc_or_str[1]
@ -168,7 +159,7 @@ class SessionElement(DrissionElement):
loc_or_str = loc_or_str[0], loc_str loc_or_str = loc_or_str[0], loc_str
return execute_session_find(element, loc_or_str, mode) return make_session_ele(element, loc_or_str, mode)
def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout=None): def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout=None):
"""返回当前元素下级所有符合条件的子元素、属性或节点文本 \n """返回当前元素下级所有符合条件的子元素、属性或节点文本 \n
@ -178,6 +169,9 @@ class SessionElement(DrissionElement):
""" """
return self.ele(loc_or_str, mode='all') return self.ele(loc_or_str, mode='all')
def s_ele(self, loc_or_str: Union[Tuple[str, str], str], mode: str = None, timeout=None):
return self.ele(loc_or_str, mode=mode, timeout=timeout)
def _get_ele_path(self, mode) -> str: def _get_ele_path(self, mode) -> str:
"""获取css路径或xpath路径 """获取css路径或xpath路径
:param mode: 'css' 'xpath' :param mode: 'css' 'xpath'
@ -223,9 +217,9 @@ class SessionElement(DrissionElement):
return link return link
def execute_session_find(page_or_ele, def make_session_ele(page_or_ele,
loc: Tuple[str, str], loc: Union[str, Tuple[str, str]],
mode: str = 'single', ) -> Union[SessionElement, List[SessionElement], str, None]: mode: str = 'single', ) -> Union[SessionElement, List[SessionElement], str, None]:
"""执行session模式元素的查找 \n """执行session模式元素的查找 \n
页面查找元素及元素查找下级元素皆使用此方法 \n 页面查找元素及元素查找下级元素皆使用此方法 \n
:param page_or_ele: SessionPage对象或SessionElement对象 :param page_or_ele: SessionPage对象或SessionElement对象
@ -238,30 +232,44 @@ def execute_session_find(page_or_ele,
raise ValueError(f"Argument mode can only be 'single' or 'all', not '{mode}'.") raise ValueError(f"Argument mode can only be 'single' or 'all', not '{mode}'.")
# 根据传入对象类型获取页面对象和lxml元素对象 # 根据传入对象类型获取页面对象和lxml元素对象
if isinstance(page_or_ele, SessionElement): type_str = str(type(page_or_ele))
if isinstance(page_or_ele, str): # 直接传入html文本
page = None
page_or_ele = fromstring(page_or_ele)
elif type_str.endswith("SessionElement'>"): # SessionElement
page = page_or_ele.page page = page_or_ele.page
page_or_ele = page_or_ele.inner_ele page_or_ele = page_or_ele.inner_ele
else: # 传入的是SessionPage对象 elif "Page" in type_str: # MixPage, DriverPage 或 SessionPage
page = page_or_ele page = page_or_ele
page_or_ele = fromstring(sub(r'&nbsp;?', '&nbsp;', page_or_ele.response.text)) page_or_ele = fromstring(page_or_ele.html)
else: # DrissionElement 或 ShadowRootElement
page = page_or_ele.page
page_or_ele = fromstring(page_or_ele.html)
# else: # 传入的是SessionPage对象
# page = page_or_ele
# page_or_ele = fromstring(sub(r'&nbsp;?', '&nbsp;', page_or_ele.response.text))
# ---------------处理定位符---------------
if isinstance(loc, str):
loc = str_to_loc(loc)
elif isinstance(loc, tuple):
loc = translate_loc(loc)
else:
raise ValueError("定位符必须为str或长度为2的tuple。")
# ---------------执行搜索-----------------
try: try:
# 用lxml内置方法获取lxml的元素对象列表 if loc[0] == 'xpath': # 用lxml内置方法获取lxml的元素对象列表
if loc[0] == 'xpath':
ele = page_or_ele.xpath(loc[1]) ele = page_or_ele.xpath(loc[1])
else: # 用css selector获取元素对象列表
# 用css selector获取元素对象列表
else:
ele = page_or_ele.cssselect(loc[1]) ele = page_or_ele.cssselect(loc[1])
# 结果不是列表,如数字 if not isinstance(ele, list): # 结果不是列表,如数字
if not isinstance(ele, list):
return ele return ele
# 把lxml元素对象包装成SessionElement对象并按需要返回第一个或全部 # 把lxml元素对象包装成SessionElement对象并按需要返回第一个或全部
if mode == 'single': if mode == 'single':
ele = ele[0] if ele else None ele = ele[0] if ele else None
if isinstance(ele, HtmlElement): if isinstance(ele, HtmlElement):
return SessionElement(ele, page) return SessionElement(ele, page)
elif isinstance(ele, str): elif isinstance(ele, str):
@ -273,7 +281,6 @@ def execute_session_find(page_or_ele,
return [SessionElement(e, page) if isinstance(e, HtmlElement) else e for e in ele if e != '\n'] return [SessionElement(e, page) if isinstance(e, HtmlElement) else e for e in ele if e != '\n']
except Exception as e: except Exception as e:
if 'Invalid expression' in str(e): if 'Invalid expression' in str(e):
raise SyntaxError(f'Invalid xpath syntax. {loc}') raise SyntaxError(f'Invalid xpath syntax. {loc}')
elif 'Expected selector' in str(e): elif 'Expected selector' in str(e):

View File

@ -16,9 +16,9 @@ from requests import Session, Response
from tldextract import extract from tldextract import extract
from .base import BasePage from .base import BasePage
from .common import str_to_loc, translate_loc, get_available_file_name, format_html from .common import get_available_file_name, format_html
from .config import _cookie_to_dict from .config import _cookie_to_dict
from .session_element import SessionElement, execute_session_find from .session_element import SessionElement, make_session_ele
class SessionPage(BasePage): class SessionPage(BasePage):
@ -102,41 +102,27 @@ class SessionPage(BasePage):
def ele(self, def ele(self,
loc_or_ele: Union[Tuple[str, str], str, SessionElement], loc_or_ele: Union[Tuple[str, str], str, SessionElement],
mode: str = None, timeout=None) -> Union[SessionElement, List[SessionElement], str, None]: mode: str = None,
timeout=None) -> Union[SessionElement, List[SessionElement], str, None]:
"""返回页面中符合条件的元素、属性或节点文本,默认返回第一个 \n """返回页面中符合条件的元素、属性或节点文本,默认返回第一个 \n
:param loc_or_ele: 元素的定位信息可以是元素对象loc元组或查询字符串 :param loc_or_ele: 元素的定位信息可以是元素对象loc元组或查询字符串
:param mode: 'single' 'all对应查找一个或全部 :param mode: 'single' 'all对应查找一个或全部
:param timeout: 不起实际作用用于和父类对应 :param timeout: 不起实际作用用于和父类对应
:return: SessionElement对象 :return: SessionElement对象
""" """
if isinstance(loc_or_ele, (str, tuple)): return loc_or_ele if isinstance(loc_or_ele, SessionElement) else make_session_ele(self, loc_or_ele, mode)
if isinstance(loc_or_ele, str):
loc_or_ele = str_to_loc(loc_or_ele)
else:
if len(loc_or_ele) != 2:
raise ValueError("Len of loc_or_ele must be 2 when it's a tuple.")
loc_or_ele = translate_loc(loc_or_ele)
elif isinstance(loc_or_ele, SessionElement): def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout=None) -> List[SessionElement]:
return loc_or_ele
else:
raise ValueError('Argument loc_or_str can only be tuple, str, SessionElement, Element.')
return execute_session_find(self, loc_or_ele, mode)
def eles(self,
loc_or_str: Union[Tuple[str, str], str], timeout=None) -> List[SessionElement]:
"""返回页面中所有符合条件的元素、属性或节点文本 \n """返回页面中所有符合条件的元素、属性或节点文本 \n
:param loc_or_str: 元素的定位信息可以是loc元组或查询字符串 :param loc_or_str: 元素的定位信息可以是loc元组或查询字符串
:param timeout: 不起实际作用用于和父类对应 :param timeout: 不起实际作用用于和父类对应
:return: SessionElement对象组成的列表 :return: SessionElement对象组成的列表
""" """
if not isinstance(loc_or_str, (tuple, str)):
raise TypeError('Type of loc_or_str can only be tuple or str.')
return super().eles(loc_or_str, timeout) return super().eles(loc_or_str, timeout)
def s_ele(self, loc_or_str: Union[Tuple[str, str], str], mode: str = None, timeout=None):
return self.ele(loc_or_str, mode=mode, timeout=timeout)
def get_cookies(self, as_dict: bool = False, all_domains: bool = False) -> Union[dict, list]: def get_cookies(self, as_dict: bool = False, all_domains: bool = False) -> Union[dict, list]:
"""返回cookies \n """返回cookies \n
:param as_dict: 是否以字典方式返回 :param as_dict: 是否以字典方式返回

View File

@ -12,6 +12,7 @@ from selenium.webdriver.remote.webelement import WebElement
from .base import BaseElement from .base import BaseElement
from .common import format_html from .common import format_html
from .driver_element import execute_driver_find, DriverElement from .driver_element import execute_driver_find, DriverElement
from .session_element import make_session_ele
class ShadowRootElement(BaseElement): class ShadowRootElement(BaseElement):
@ -89,6 +90,9 @@ class ShadowRootElement(BaseElement):
elif loc_or_str[0] == 'text': elif loc_or_str[0] == 'text':
return self._find_eles_by_text(loc_or_str[1], loc_or_str[2], loc_or_str[3], mode) return self._find_eles_by_text(loc_or_str[1], loc_or_str[2], loc_or_str[3], mode)
def s_ele(self, loc_or_ele, mode='single', timeout=None):
return make_session_ele(self, loc_or_ele, mode)
def eles(self, def eles(self,
loc_or_str: Union[Tuple[str, str], str], loc_or_str: Union[Tuple[str, str], str],
timeout: float = None) -> List[DriverElement]: timeout: float = None) -> List[DriverElement]: