From 391d0426355dfa141b381411cd7c94e65edc68eb Mon Sep 17 00:00:00 2001 From: g1879 Date: Fri, 19 Nov 2021 21:48:00 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0s=5Fele()=EF=BC=8C=E6=9C=AA?= =?UTF-8?q?=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/base.py | 24 +++++++++--- DrissionPage/common.py | 13 ++++++- DrissionPage/configs.ini | 18 ++++----- DrissionPage/driver_element.py | 16 +++----- DrissionPage/driver_page.py | 7 ++-- DrissionPage/mix_page.py | 6 +++ DrissionPage/session_element.py | 59 ++++++++++++++++------------- DrissionPage/session_page.py | 32 +++++----------- DrissionPage/shadow_root_element.py | 4 ++ 9 files changed, 99 insertions(+), 80 deletions(-) diff --git a/DrissionPage/base.py b/DrissionPage/base.py index 5090b60..f22fde5 100644 --- a/DrissionPage/base.py +++ b/DrissionPage/base.py @@ -6,7 +6,7 @@ """ from abc import abstractmethod from re import sub -from typing import Union +from typing import Union, Tuple from lxml.html import HtmlElement from selenium.webdriver.remote.webelement import WebElement @@ -21,14 +21,26 @@ class BaseParser(object): timeout: float = None): return self.ele(loc_or_str, mode, timeout) - def eles(self, loc_or_str, timeout): + def eles(self, + loc_or_str: Union[Tuple[str, str], str], + timeout: float = None): return self.ele(loc_or_str, mode='all', timeout=timeout) + def s_eles(self, + loc_or_str: Union[Tuple[str, str], str], + timeout: float = None): + """查找并以SessionElement方式返回元素""" + return self.s_ele(loc_or_str, mode='all', timeout=timeout) + # ----------------以下属性或方法待后代实现---------------- @property def html(self): return + @abstractmethod + def s_ele(self, loc_or_ele, mode='single', timeout=None): + pass + @abstractmethod def ele(self, loc_or_ele, mode='single', timeout=None): pass @@ -50,8 +62,8 @@ class BaseElement(BaseParser): """返回后一个兄弟元素""" return self.nexts() - def eles(self, loc_or_str, timeout): - return super().eles(loc_or_str, timeout) + # def eles(self, loc_or_str, timeout): + # return super().eles(loc_or_str, timeout) # ----------------以下属性或方法由后代实现---------------- @property @@ -251,8 +263,8 @@ class BasePage(BaseParser): """返回当前访问的url有效性""" return self._url_available - def eles(self, loc_or_str, timeout): - return super().eles(loc_or_str, timeout) + # def eles(self, loc_or_str, timeout): + # return super().eles(loc_or_str, timeout) # ----------------以下属性或方法由后代实现---------------- @property diff --git a/DrissionPage/common.py b/DrissionPage/common.py index 9ca1779..1920f08 100644 --- a/DrissionPage/common.py +++ b/DrissionPage/common.py @@ -158,7 +158,11 @@ def _make_search_str(search_str: str) -> str: def format_html(text: str, trans: bool = True) -> str: - """处理html编码字符""" + """处理html编码字符 \n + :param text: html文本 + :param trans: 是否转码 + :return: 格式化后的html文本 + """ if not text: return text @@ -173,8 +177,10 @@ def translate_loc(loc: tuple) -> tuple: :param loc: By类型的loc元组 :return: css selector或xpath类型的loc元组 """ + if len(loc) != 2: + raise ValueError('定位符长度必须为2。') + loc_by = 'xpath' - loc_str = None if loc[0] == 'xpath': loc_str = loc[1] @@ -201,6 +207,9 @@ def translate_loc(loc: tuple) -> tuple: elif loc[0] == 'partial link text': loc_str = f'//a[contains(text(),"{loc[1]}")]' + else: + raise ValueError('无法识别的定位符。') + return loc_by, loc_str diff --git a/DrissionPage/configs.ini b/DrissionPage/configs.ini index 21eee64..b8411b3 100644 --- a/DrissionPage/configs.ini +++ b/DrissionPage/configs.ini @@ -1,19 +1,19 @@ [paths] -chromedriver_path = -tmp_path = +chromedriver_path = D:\python\Google Chrome\Chrome\chromedriver75.exe +tmp_path = D:\python\projects\DrissionPage\DrissionPage\tmp [chrome_options] -debugger_address = -binary_location = +debugger_address = 127.0.0.1:9222 +binary_location = D:\python\Google Chrome\Chrome\chrome.exe arguments = ['--no-sandbox', '--disable-gpu', '--ignore-certificate-errors', '--disable-infobars'] extensions = [] experimental_options = {'prefs': {'profile.default_content_settings.popups': 0, 'profile.default_content_setting_values': {'notifications': 2}, 'plugins.plugins_list': [{'enabled': False, 'name': 'Chrome PDF Viewer'}]}, 'useAutomationExtension': False, 'excludeSwitches': ['enable-automation']} [session_options] headers = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Connection": "keep-alive", - "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7" - } + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Connection": "keep-alive", + "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7" + } diff --git a/DrissionPage/driver_element.py b/DrissionPage/driver_element.py index dac2f68..53daca4 100644 --- a/DrissionPage/driver_element.py +++ b/DrissionPage/driver_element.py @@ -18,6 +18,7 @@ from selenium.webdriver.support.wait import WebDriverWait from .base import DrissionElement, BaseElement from .common import str_to_loc, get_available_file_name, translate_loc, format_html +from .session_element import make_session_ele class DriverElement(DrissionElement): @@ -123,16 +124,7 @@ class DriverElement(DrissionElement): :param timeout: 查找元素超时时间 :return: DriverElement对象 """ - if isinstance(loc_or_str, (str, tuple)): - if isinstance(loc_or_str, str): - loc_or_str = str_to_loc(loc_or_str) - else: - if len(loc_or_str) != 2: - raise ValueError("Len of loc_or_str must be 2 when it's a tuple.") - loc_or_str = translate_loc(loc_or_str) - else: - raise ValueError('Argument loc_or_str can only be tuple or str.') - + loc_or_str = str_to_loc(loc_or_str) if isinstance(loc_or_str, str) else translate_loc(loc_or_str) loc_str = loc_or_str[1] if loc_or_str[0] == 'xpath' and loc_or_str[1].lstrip().startswith('/'): @@ -142,9 +134,11 @@ class DriverElement(DrissionElement): loc_str = f'{self.css_path}{loc_or_str[1]}' loc_or_str = loc_or_str[0], loc_str - return execute_driver_find(self, loc_or_str, mode, timeout) + def s_ele(self, loc_or_ele, mode='single', timeout=None): + return make_session_ele(self, loc_or_ele, mode) + def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout: float = None): diff --git a/DrissionPage/driver_page.py b/DrissionPage/driver_page.py index 4b63f19..c28bdd1 100644 --- a/DrissionPage/driver_page.py +++ b/DrissionPage/driver_page.py @@ -19,6 +19,7 @@ from selenium.webdriver.support.wait import WebDriverWait from .base import BasePage from .common import str_to_loc, get_available_file_name, translate_loc, format_html from .driver_element import DriverElement, execute_driver_find, _wait_ele +from .session_element import make_session_ele class DriverPage(BasePage): @@ -127,6 +128,9 @@ class DriverPage(BasePage): return execute_driver_find(self, loc_or_ele, mode, timeout) + def s_ele(self, loc_or_ele, mode='single', timeout=None): + return make_session_ele(self, loc_or_ele, mode) + def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout: float = None) -> List[DriverElement]: @@ -135,9 +139,6 @@ class DriverPage(BasePage): :param timeout: 查找元素超时时间 :return: DriverElement对象组成的列表 """ - if not isinstance(loc_or_str, (tuple, str)): - raise TypeError('Type of loc_or_str can only be tuple or str.') - return super().eles(loc_or_str, timeout) def get_cookies(self, as_dict: bool = False) -> Union[list, dict]: diff --git a/DrissionPage/mix_page.py b/DrissionPage/mix_page.py index dfced89..513558e 100644 --- a/DrissionPage/mix_page.py +++ b/DrissionPage/mix_page.py @@ -133,6 +133,12 @@ class MixPage(SessionPage, DriverPage, BasePage): elif self._mode == 'd': return super(SessionPage, self).ele(loc_or_ele, mode=mode, timeout=timeout) + def s_ele(self, loc_or_ele, mode='single', timeout=None): + if self._mode == 's': + return super().s_ele(loc_or_ele, mode=mode) + elif self._mode == 'd': + return super(SessionPage, self).s_ele(loc_or_ele, mode=mode, timeout=timeout) + def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout: float = None) -> Union[List[DriverElement], List[SessionElement]]: diff --git a/DrissionPage/session_element.py b/DrissionPage/session_element.py index f3eafa6..0d977f0 100644 --- a/DrissionPage/session_element.py +++ b/DrissionPage/session_element.py @@ -145,16 +145,7 @@ class SessionElement(DrissionElement): :param timeout: 不起实际作用,用于和父类对应 :return: SessionElement对象 """ - if isinstance(loc_or_str, (str, tuple)): - if isinstance(loc_or_str, str): - loc_or_str = str_to_loc(loc_or_str) - else: - if len(loc_or_str) != 2: - raise ValueError("Len of loc_or_str must be 2 when it's a tuple.") - loc_or_str = translate_loc(loc_or_str) - else: - raise ValueError('Argument loc_or_str can only be tuple or str.') - + loc_or_str = str_to_loc(loc_or_str) if isinstance(loc_or_str, str) else translate_loc(loc_or_str) element = self loc_str = loc_or_str[1] @@ -168,7 +159,7 @@ class SessionElement(DrissionElement): loc_or_str = loc_or_str[0], loc_str - return execute_session_find(element, loc_or_str, mode) + return make_session_ele(element, loc_or_str, mode) def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout=None): """返回当前元素下级所有符合条件的子元素、属性或节点文本 \n @@ -178,6 +169,9 @@ class SessionElement(DrissionElement): """ return self.ele(loc_or_str, mode='all') + def s_ele(self, loc_or_str: Union[Tuple[str, str], str], mode: str = None, timeout=None): + return self.ele(loc_or_str, mode=mode, timeout=timeout) + def _get_ele_path(self, mode) -> str: """获取css路径或xpath路径 :param mode: 'css' 或 'xpath' @@ -223,9 +217,9 @@ class SessionElement(DrissionElement): return link -def execute_session_find(page_or_ele, - loc: Tuple[str, str], - mode: str = 'single', ) -> Union[SessionElement, List[SessionElement], str, None]: +def make_session_ele(page_or_ele, + loc: Union[str, Tuple[str, str]], + mode: str = 'single', ) -> Union[SessionElement, List[SessionElement], str, None]: """执行session模式元素的查找 \n 页面查找元素及元素查找下级元素皆使用此方法 \n :param page_or_ele: SessionPage对象或SessionElement对象 @@ -238,30 +232,44 @@ def execute_session_find(page_or_ele, raise ValueError(f"Argument mode can only be 'single' or 'all', not '{mode}'.") # 根据传入对象类型获取页面对象和lxml元素对象 - if isinstance(page_or_ele, SessionElement): + type_str = str(type(page_or_ele)) + if isinstance(page_or_ele, str): # 直接传入html文本 + page = None + page_or_ele = fromstring(page_or_ele) + elif type_str.endswith("SessionElement'>"): # SessionElement page = page_or_ele.page page_or_ele = page_or_ele.inner_ele - else: # 传入的是SessionPage对象 + elif "Page" in type_str: # MixPage, DriverPage 或 SessionPage page = page_or_ele - page_or_ele = fromstring(sub(r' ?', ' ', page_or_ele.response.text)) + page_or_ele = fromstring(page_or_ele.html) + else: # DrissionElement 或 ShadowRootElement + page = page_or_ele.page + page_or_ele = fromstring(page_or_ele.html) + # else: # 传入的是SessionPage对象 + # page = page_or_ele + # page_or_ele = fromstring(sub(r' ?', ' ', page_or_ele.response.text)) + # ---------------处理定位符--------------- + if isinstance(loc, str): + loc = str_to_loc(loc) + elif isinstance(loc, tuple): + loc = translate_loc(loc) + else: + raise ValueError("定位符必须为str或长度为2的tuple。") + + # ---------------执行搜索----------------- try: - # 用lxml内置方法获取lxml的元素对象列表 - if loc[0] == 'xpath': + if loc[0] == 'xpath': # 用lxml内置方法获取lxml的元素对象列表 ele = page_or_ele.xpath(loc[1]) - - # 用css selector获取元素对象列表 - else: + else: # 用css selector获取元素对象列表 ele = page_or_ele.cssselect(loc[1]) - # 结果不是列表,如数字 - if not isinstance(ele, list): + if not isinstance(ele, list): # 结果不是列表,如数字 return ele # 把lxml元素对象包装成SessionElement对象并按需要返回第一个或全部 if mode == 'single': ele = ele[0] if ele else None - if isinstance(ele, HtmlElement): return SessionElement(ele, page) elif isinstance(ele, str): @@ -273,7 +281,6 @@ def execute_session_find(page_or_ele, return [SessionElement(e, page) if isinstance(e, HtmlElement) else e for e in ele if e != '\n'] except Exception as e: - if 'Invalid expression' in str(e): raise SyntaxError(f'Invalid xpath syntax. {loc}') elif 'Expected selector' in str(e): diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index fcfe4d0..b4c2b8c 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -16,9 +16,9 @@ from requests import Session, Response from tldextract import extract from .base import BasePage -from .common import str_to_loc, translate_loc, get_available_file_name, format_html +from .common import get_available_file_name, format_html from .config import _cookie_to_dict -from .session_element import SessionElement, execute_session_find +from .session_element import SessionElement, make_session_ele class SessionPage(BasePage): @@ -102,41 +102,27 @@ class SessionPage(BasePage): def ele(self, loc_or_ele: Union[Tuple[str, str], str, SessionElement], - mode: str = None, timeout=None) -> Union[SessionElement, List[SessionElement], str, None]: + mode: str = None, + timeout=None) -> Union[SessionElement, List[SessionElement], str, None]: """返回页面中符合条件的元素、属性或节点文本,默认返回第一个 \n :param loc_or_ele: 元素的定位信息,可以是元素对象,loc元组,或查询字符串 :param mode: 'single' 或 'all‘,对应查找一个或全部 :param timeout: 不起实际作用,用于和父类对应 :return: SessionElement对象 """ - if isinstance(loc_or_ele, (str, tuple)): - if isinstance(loc_or_ele, str): - loc_or_ele = str_to_loc(loc_or_ele) - else: - if len(loc_or_ele) != 2: - raise ValueError("Len of loc_or_ele must be 2 when it's a tuple.") - loc_or_ele = translate_loc(loc_or_ele) + return loc_or_ele if isinstance(loc_or_ele, SessionElement) else make_session_ele(self, loc_or_ele, mode) - elif isinstance(loc_or_ele, SessionElement): - return loc_or_ele - - else: - raise ValueError('Argument loc_or_str can only be tuple, str, SessionElement, Element.') - - return execute_session_find(self, loc_or_ele, mode) - - def eles(self, - loc_or_str: Union[Tuple[str, str], str], timeout=None) -> List[SessionElement]: + def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout=None) -> List[SessionElement]: """返回页面中所有符合条件的元素、属性或节点文本 \n :param loc_or_str: 元素的定位信息,可以是loc元组,或查询字符串 :param timeout: 不起实际作用,用于和父类对应 :return: SessionElement对象组成的列表 """ - if not isinstance(loc_or_str, (tuple, str)): - raise TypeError('Type of loc_or_str can only be tuple or str.') - return super().eles(loc_or_str, timeout) + def s_ele(self, loc_or_str: Union[Tuple[str, str], str], mode: str = None, timeout=None): + return self.ele(loc_or_str, mode=mode, timeout=timeout) + def get_cookies(self, as_dict: bool = False, all_domains: bool = False) -> Union[dict, list]: """返回cookies \n :param as_dict: 是否以字典方式返回 diff --git a/DrissionPage/shadow_root_element.py b/DrissionPage/shadow_root_element.py index d06224d..39e68ee 100644 --- a/DrissionPage/shadow_root_element.py +++ b/DrissionPage/shadow_root_element.py @@ -12,6 +12,7 @@ from selenium.webdriver.remote.webelement import WebElement from .base import BaseElement from .common import format_html from .driver_element import execute_driver_find, DriverElement +from .session_element import make_session_ele class ShadowRootElement(BaseElement): @@ -89,6 +90,9 @@ class ShadowRootElement(BaseElement): elif loc_or_str[0] == 'text': return self._find_eles_by_text(loc_or_str[1], loc_or_str[2], loc_or_str[3], mode) + def s_ele(self, loc_or_ele, mode='single', timeout=None): + return make_session_ele(self, loc_or_ele, mode) + def eles(self, loc_or_str: Union[Tuple[str, str], str], timeout: float = None) -> List[DriverElement]: