From 56683468a62557b9e0a97e3095c0297f9cc2ff5c Mon Sep 17 00:00:00 2001 From: g1879 Date: Fri, 21 Aug 2020 00:55:56 +0800 Subject: [PATCH 01/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=B9shadow-dom?= =?UTF-8?q?=E6=94=AF=E6=8C=81=EF=BC=8C=E6=9C=AA=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/driver_element.py | 36 ++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/DrissionPage/driver_element.py b/DrissionPage/driver_element.py index 1c16104..8561344 100644 --- a/DrissionPage/driver_element.py +++ b/DrissionPage/driver_element.py @@ -123,6 +123,11 @@ class DriverElement(DrissionElement): ''' return self.run_script(js) + @property + def shadow_root(self): + e = self.run_script('return arguments[0].shadowRoot') + return ShadowRootElement(e, self) if e else None + @property def parent(self): """返回父级元素""" @@ -445,6 +450,37 @@ class DriverElement(DrissionElement): ActionChains(self._driver).move_to_element(self.inner_ele).perform() +class ShadowRootElement(DrissionElement): + def __init__(self, inner_ele: WebElement, parent_ele: DriverElement): + super().__init__(inner_ele) + self.parent_ele = parent_ele + + def ele(self, loc: Union[tuple, str], mode: str = None, show_errmsg: bool = True): + pass + + def eles(self, loc: Union[tuple, str], show_errmsg: bool = True): + pass + + def attr(self, attr: str): + return self.html if attr == 'innerHTML' else None + + def run_script(self, script: str, *args) -> Any: + """执行js代码,传入自己为第一个参数 \n + :param script: js文本 + :param args: 传入的参数 + :return: js执行结果 + """ + return self.inner_ele.parent.execute_script(script, self.inner_ele, *args) + + @property + def html(self): + return unescape(self.attr('innerHTML')).replace('\xa0', ' ') + + @property + def parent(self) -> DriverElement: + return self.parent_ele + + def execute_driver_find(page_or_ele: Union[WebElement, WebDriver], loc: tuple, mode: str = 'single', From 
dc47dfcecc158cfa2090add4fcf6c99f49a3f712 Mon Sep 17 00:00:00 2001 From: g1879 Date: Mon, 24 Aug 2020 21:56:27 +0800 Subject: [PATCH 02/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=B9shadow-dom?= =?UTF-8?q?=E6=94=AF=E6=8C=81=EF=BC=8C=E6=9C=AA=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/driver_element.py | 37 +++----------------- DrissionPage/shadow_root_element.py | 54 +++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 32 deletions(-) create mode 100644 DrissionPage/shadow_root_element.py diff --git a/DrissionPage/driver_element.py b/DrissionPage/driver_element.py index 8561344..f442705 100644 --- a/DrissionPage/driver_element.py +++ b/DrissionPage/driver_element.py @@ -126,7 +126,11 @@ class DriverElement(DrissionElement): @property def shadow_root(self): e = self.run_script('return arguments[0].shadowRoot') - return ShadowRootElement(e, self) if e else None + if e: + from shadow_root_element import ShadowRootElement + return ShadowRootElement(e, self) + else: + return None @property def parent(self): @@ -450,37 +454,6 @@ class DriverElement(DrissionElement): ActionChains(self._driver).move_to_element(self.inner_ele).perform() -class ShadowRootElement(DrissionElement): - def __init__(self, inner_ele: WebElement, parent_ele: DriverElement): - super().__init__(inner_ele) - self.parent_ele = parent_ele - - def ele(self, loc: Union[tuple, str], mode: str = None, show_errmsg: bool = True): - pass - - def eles(self, loc: Union[tuple, str], show_errmsg: bool = True): - pass - - def attr(self, attr: str): - return self.html if attr == 'innerHTML' else None - - def run_script(self, script: str, *args) -> Any: - """执行js代码,传入自己为第一个参数 \n - :param script: js文本 - :param args: 传入的参数 - :return: js执行结果 - """ - return self.inner_ele.parent.execute_script(script, self.inner_ele, *args) - - @property - def html(self): - return unescape(self.attr('innerHTML')).replace('\xa0', ' ') - - @property - 
def parent(self) -> DriverElement: - return self.parent_ele - - def execute_driver_find(page_or_ele: Union[WebElement, WebDriver], loc: tuple, mode: str = 'single', diff --git a/DrissionPage/shadow_root_element.py b/DrissionPage/shadow_root_element.py new file mode 100644 index 0000000..ca9f847 --- /dev/null +++ b/DrissionPage/shadow_root_element.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +from html import unescape +from typing import Union, Any + +from selenium.webdriver.remote.webelement import WebElement + +from common import DrissionElement + + +# from driver_element import DriverElement + + +class ShadowRootElement(DrissionElement): + def __init__(self, inner_ele: WebElement, parent_ele): + super().__init__(inner_ele) + self.parent_ele = parent_ele + + def ele(self, loc: Union[tuple, str], mode: str = None, show_errmsg: bool = True): + pass + + def eles(self, loc: Union[tuple, str], show_errmsg: bool = True): + pass + + def attr(self, attr: str): + return self.html if attr == 'innerHTML' else None + + def run_script(self, script: str, *args) -> Any: + """执行js代码,传入自己为第一个参数 \n + :param script: js文本 + :param args: 传入的参数 + :return: js执行结果 + """ + return self.inner_ele.parent.execute_script(script, self.inner_ele, *args) + + @property + def html(self): + return unescape(self.attr('innerHTML')).replace('\xa0', ' ') + + @property + def parent(self): + return self.parent_ele + + def is_enabled(self) -> bool: + """是否可用""" + return self.inner_ele.is_enabled() + + def is_valid(self) -> bool: + """用于判断元素是否还能用,应对页面跳转元素不能用的情况""" + try: + self.is_enabled() + return True + except: + return False From 65077efd17f51864c18d2597f29be7a8e64bd14a Mon Sep 17 00:00:00 2001 From: kkjj828 <18022238500@189.cn> Date: Fri, 28 Aug 2020 14:25:04 +0800 Subject: [PATCH 03/13] =?UTF-8?q?MixPage=E7=B1=BB=E5=8F=AF=E6=8E=A5?= =?UTF-8?q?=E6=94=B6=E6=B5=8F=E8=A7=88=E5=99=A8=E5=92=8Crequests=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E8=87=AA=E5=8A=A8=E5=88=9B=E5=BB=BADrission=E5=AF=B9?= 
=?UTF-8?q?=E8=B1=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/mix_page.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/DrissionPage/mix_page.py b/DrissionPage/mix_page.py index 9d27e43..b6cab5e 100644 --- a/DrissionPage/mix_page.py +++ b/DrissionPage/mix_page.py @@ -11,6 +11,7 @@ from requests_html import HTMLSession, Element from selenium.webdriver.chrome.webdriver import WebDriver from selenium.webdriver.remote.webelement import WebElement +from .config import DriverOptions from .drission import Drission from .driver_element import DriverElement from .driver_page import DriverPage @@ -32,16 +33,23 @@ class MixPage(Null, SessionPage, DriverPage): 这些功能由DriverPage和SessionPage类实现。 """ - def __init__(self, drission: Union[Drission, str] = None, mode: str = 'd', timeout: float = 10): - """初始化函数 \n + def __init__(self, + drission: Union[Drission, str] = None, + mode: str = 'd', + timeout: float = 10, + driver_options: Union[dict, DriverOptions] = None, + session_options: dict = None): + """初始化函数 \n :param drission: 整合了driver和session的类,传入's'或'd'时快速配置相应模式 :param mode: 默认使用selenium的d模式 + :param driver_options: 浏览器设置,没有传入drission参数时会用这个设置新建Drission对象 + :param session_options: requests设置,没有传入drission参数时会用这个设置新建Drission对象 """ super().__init__() if drission in ['s', 'd', 'S', 'D']: mode = drission.lower() drission = None - self._drission = drission or Drission() + self._drission = drission or Drission(driver_options, session_options) self._session = None self._driver = None self._url = None From 7186620c970bc72ce7efc937e2da5e6b083ab84c Mon Sep 17 00:00:00 2001 From: g1879 Date: Fri, 28 Aug 2020 19:44:19 +0800 Subject: [PATCH 04/13] =?UTF-8?q?=E5=B0=9D=E8=AF=95=E6=94=AF=E6=8C=81shado?= =?UTF-8?q?w-dom=EF=BC=8C=E6=9C=AA=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/common.py | 26 ++++----- 
DrissionPage/driver_element.py | 2 +- DrissionPage/shadow_root_element.py | 91 +++++++++++++++++++++++------ 3 files changed, 87 insertions(+), 32 deletions(-) diff --git a/DrissionPage/common.py b/DrissionPage/common.py index 3a7f594..d2a1f66 100644 --- a/DrissionPage/common.py +++ b/DrissionPage/common.py @@ -28,9 +28,9 @@ class DrissionElement(object): def is_valid(self): return True - @property - def text(self): - return + # @property + # def text(self): + # return @property def html(self): @@ -52,13 +52,13 @@ class DrissionElement(object): def prev(self): return - @property - def css_path(self): - return - - @property - def xpath(self): - return + # @property + # def css_path(self): + # return + # + # @property + # def xpath(self): + # return @abstractmethod def ele(self, loc: Union[tuple, str], mode: str = None, show_errmsg: bool = True): @@ -68,9 +68,9 @@ class DrissionElement(object): def eles(self, loc: Union[tuple, str], show_errmsg: bool = True): pass - @abstractmethod - def attr(self, attr: str): - pass + # @abstractmethod + # def attr(self, attr: str): + # pass def get_loc_from_str(loc: str) -> tuple: diff --git a/DrissionPage/driver_element.py b/DrissionPage/driver_element.py index f442705..b94497d 100644 --- a/DrissionPage/driver_element.py +++ b/DrissionPage/driver_element.py @@ -127,7 +127,7 @@ class DriverElement(DrissionElement): def shadow_root(self): e = self.run_script('return arguments[0].shadowRoot') if e: - from shadow_root_element import ShadowRootElement + from .shadow_root_element import ShadowRootElement return ShadowRootElement(e, self) else: return None diff --git a/DrissionPage/shadow_root_element.py b/DrissionPage/shadow_root_element.py index ca9f847..f75cf79 100644 --- a/DrissionPage/shadow_root_element.py +++ b/DrissionPage/shadow_root_element.py @@ -5,26 +5,89 @@ from typing import Union, Any from selenium.webdriver.remote.webelement import WebElement -from common import DrissionElement - - -# from driver_element import 
DriverElement +from .common import DrissionElement, get_loc_from_str +from .driver_element import execute_driver_find class ShadowRootElement(DrissionElement): - def __init__(self, inner_ele: WebElement, parent_ele): + def __init__(self, inner_ele: WebElement, parent_ele, timeout: float = 10): super().__init__(inner_ele) self.parent_ele = parent_ele + self.timeout = timeout + self._driver = inner_ele.parent - def ele(self, loc: Union[tuple, str], mode: str = None, show_errmsg: bool = True): - pass + @property + def driver(self): + """返回控制元素的WebDriver对象""" + return self._driver + + @property + def tag(self): + return 'shadow-root' + + @property + def html(self): + return unescape(self.inner_ele.get_attribute('innerHTML')).replace('\xa0', ' ') + + @property + def parent(self): + return self.parent_ele + + def parents(self, num: int = 1): + """返回上面第num级父元素 \n + :param num: 第几级父元素 + :return: DriverElement对象 + """ + loc = 'xpath', f'.{"/.." * (num - 1)}' + return self.parent_ele.ele(loc, timeout=0.01, show_errmsg=False) + + # @property + # def next(self): + # """返回后一个兄弟元素""" + # return + # + # def nexts(self, num: int = 1): + # """返回后面第num个兄弟元素 \n + # :param num: 后面第几个兄弟元素 + # :return: DriverElement对象 + # """ + # # loc = 'xpath', f'./following-sibling::*[{num}]' + # return + + def ele(self, + loc_or_str: Union[tuple, str], + mode: str = None, + timeout: float = None, + show_errmsg: bool = False): + if isinstance(loc_or_str, str): + loc_or_str = get_loc_from_str(loc_or_str) + elif isinstance(loc_or_str, tuple) and len(loc_or_str) == 2: + pass + else: + raise ValueError('Argument loc_or_str can only be tuple or str.') + + if loc_or_str[0] == 'xpath': + # 确保查询语句最前面是. 
+ # loc_str = loc_or_str[1] if loc_or_str[1].startswith(('.', '/')) else f'.//{loc_or_str[1]}' + # loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}' + loc_str = loc_or_str[1] + # print(self.inner_ele) + # print(loc_str) + js = f'''return document.evaluate('{loc_str}', arguments[0]).iterateNext()''' # + print(js) + return self.inner_ele.parent.execute_script(js, self.inner_ele) + # return self.run_script(js) + # else: + # if loc_or_str[1].lstrip().startswith('>'): + # loc_or_str = loc_or_str[0], f'{self.css_path}{loc_or_str[1]}' + + timeout = timeout or self.timeout + + return execute_driver_find(self.inner_ele, loc_or_str, mode, show_errmsg, timeout) def eles(self, loc: Union[tuple, str], show_errmsg: bool = True): pass - def attr(self, attr: str): - return self.html if attr == 'innerHTML' else None - def run_script(self, script: str, *args) -> Any: """执行js代码,传入自己为第一个参数 \n :param script: js文本 @@ -33,14 +96,6 @@ class ShadowRootElement(DrissionElement): """ return self.inner_ele.parent.execute_script(script, self.inner_ele, *args) - @property - def html(self): - return unescape(self.attr('innerHTML')).replace('\xa0', ' ') - - @property - def parent(self): - return self.parent_ele - def is_enabled(self) -> bool: """是否可用""" return self.inner_ele.is_enabled() From ca9d2124d93363f52b24733123bbfeb40144c2f7 Mon Sep 17 00:00:00 2001 From: g1879 Date: Sun, 30 Aug 2020 20:09:48 +0800 Subject: [PATCH 05/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=B9shadow-dom?= =?UTF-8?q?=E6=94=AF=E6=8C=81=EF=BC=8C=E5=88=9D=E6=AD=A5=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/shadow_root_element.py | 100 +++++++++++++++++++--------- 1 file changed, 68 insertions(+), 32 deletions(-) diff --git a/DrissionPage/shadow_root_element.py b/DrissionPage/shadow_root_element.py index f75cf79..36c5a84 100644 --- a/DrissionPage/shadow_root_element.py +++ b/DrissionPage/shadow_root_element.py @@ -1,11 +1,12 @@ 
#!/usr/bin/env python # -*- coding:utf-8 -*- from html import unescape +from re import split as re_SPLIT from typing import Union, Any from selenium.webdriver.remote.webelement import WebElement -from .common import DrissionElement, get_loc_from_str +from .common import DrissionElement from .driver_element import execute_driver_find @@ -16,6 +17,9 @@ class ShadowRootElement(DrissionElement): self.timeout = timeout self._driver = inner_ele.parent + def __repr__(self): + return f'' + @property def driver(self): """返回控制元素的WebDriver对象""" @@ -41,18 +45,18 @@ class ShadowRootElement(DrissionElement): loc = 'xpath', f'.{"/.." * (num - 1)}' return self.parent_ele.ele(loc, timeout=0.01, show_errmsg=False) - # @property - # def next(self): - # """返回后一个兄弟元素""" - # return - # - # def nexts(self, num: int = 1): - # """返回后面第num个兄弟元素 \n - # :param num: 后面第几个兄弟元素 - # :return: DriverElement对象 - # """ - # # loc = 'xpath', f'./following-sibling::*[{num}]' - # return + @property + def next(self): + """返回后一个兄弟元素""" + return self.nexts() + + def nexts(self, num: int = 1): + """返回后面第num个兄弟元素 \n + :param num: 后面第几个兄弟元素 + :return: DriverElement对象 + """ + loc = 'css selector', f':nth-child({num})' + return self.parent_ele.ele(loc) def ele(self, loc_or_str: Union[tuple, str], @@ -60,33 +64,21 @@ class ShadowRootElement(DrissionElement): timeout: float = None, show_errmsg: bool = False): if isinstance(loc_or_str, str): - loc_or_str = get_loc_from_str(loc_or_str) + loc_or_str = get_css_from_str(loc_or_str) elif isinstance(loc_or_str, tuple) and len(loc_or_str) == 2: pass else: raise ValueError('Argument loc_or_str can only be tuple or str.') - if loc_or_str[0] == 'xpath': - # 确保查询语句最前面是. 
- # loc_str = loc_or_str[1] if loc_or_str[1].startswith(('.', '/')) else f'.//{loc_or_str[1]}' - # loc_str = loc_str if loc_str.startswith('.') else f'.{loc_str}' - loc_str = loc_or_str[1] - # print(self.inner_ele) - # print(loc_str) - js = f'''return document.evaluate('{loc_str}', arguments[0]).iterateNext()''' # - print(js) - return self.inner_ele.parent.execute_script(js, self.inner_ele) - # return self.run_script(js) - # else: - # if loc_or_str[1].lstrip().startswith('>'): - # loc_or_str = loc_or_str[0], f'{self.css_path}{loc_or_str[1]}' - + raise ValueError('不支持xpath') timeout = timeout or self.timeout - return execute_driver_find(self.inner_ele, loc_or_str, mode, show_errmsg, timeout) - def eles(self, loc: Union[tuple, str], show_errmsg: bool = True): - pass + def eles(self, + loc_or_str: Union[tuple, str], + timeout: float = None, + show_errmsg: bool = False): + return self.ele(loc_or_str, mode='all', show_errmsg=show_errmsg, timeout=timeout) def run_script(self, script: str, *args) -> Any: """执行js代码,传入自己为第一个参数 \n @@ -107,3 +99,47 @@ class ShadowRootElement(DrissionElement): return True except: return False + + +def get_css_from_str(loc: str) -> tuple: + """处理元素查找语句 \n + 查找方式:属性、tag name及属性、css selector \n + =表示精确匹配,:表示模糊匹配,无控制字符串时默认搜索该字符串 \n + 示例: \n + @class:ele_class - class含有ele_class的元素 \n + @class=ele_class - class等于ele_class的元素 \n + @class - 带class属性的元素 \n + tag:div - div元素 \n + tag:div@class:ele_class - class含有ele_class的div元素 \n + tag:div@class=ele_class - class等于ele_class的div元素 \n + css:div.ele_class \n + """ + if loc.startswith('@'): # 根据属性查找 + r = re_SPLIT(r'([:=])', loc[1:], maxsplit=1) + if len(r) == 3: + mode = '=' if r[1] == '=' else '*=' + loc_str = f'*[{r[0]}{mode}{r[2]}]' + else: + loc_str = f'*[{loc[1:]}]' + elif loc.startswith(('tag=', 'tag:')): # 根据tag name查找 + if '@' not in loc[4:]: + loc_str = f'{loc[4:]}' + else: + at_lst = loc[4:].split('@', maxsplit=1) + r = re_SPLIT(r'([:=])', at_lst[1], maxsplit=1) + if len(r) == 3: + if r[0] == 
'text()': + raise ValueError('不支持按文本查找') + mode = '=' if r[1] == '=' else '*=' + loc_str = f'{at_lst[0]}[{r[0]}{mode}"{r[2]}"]' + else: + loc_str = f'{at_lst[0]}[{r[0]}]' + elif loc.startswith(('css=', 'css:')): # 用css selector查找 + loc_str = loc[4:] + elif loc.startswith(('text=', 'text:')): # 根据文本查找 + raise ValueError('不支持按文本查找') + elif loc.startswith(('xpath=', 'xpath:')): # 用xpath查找 + raise ValueError('不支持xpath') + else: + raise ValueError('不支持的查询语句') + return 'css selector', loc_str From 159f1bf55a4c7df41507c4e3688b7e9f8740e297 Mon Sep 17 00:00:00 2001 From: g1879 Date: Tue, 1 Sep 2020 17:19:07 +0800 Subject: [PATCH 06/13] =?UTF-8?q?=E5=BE=AE=E8=B0=83download()=E9=87=8D?= =?UTF-8?q?=E5=91=BD=E5=90=8D=E8=A7=84=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/session_page.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index cc80246..aa20cc9 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -247,31 +247,29 @@ class SessionPage(object): raise ConnectionError(f'Status code: {r.status_code}.') return False, f'Status code: {r.status_code}.' 
# -------------------获取文件名------------------- - # header里有文件名,则使用它,否则在url里截取,但不能保证url包含文件名 - if 'Content-disposition' in r.headers: + if 'Content-disposition' in r.headers: # header里有文件名,则使用它 file_name = r.headers['Content-disposition'].split('"')[1].encode('ISO-8859-1').decode('utf-8') - elif os_PATH.basename(file_url): + elif os_PATH.basename(file_url): # 在url里获取文件名 file_name = os_PATH.basename(file_url).split("?")[0] - else: + else: # 找不到则用时间和随机数生成文件名 file_name = f'untitled_{time()}_{randint(0, 100)}' - - file_name = re_SUB(r'[\\/*:|<>?"]', '', file_name).strip() + file_name = re_SUB(r'[\\/*:|<>?"]', '', file_name).strip() # 去除非法字符 + # -------------------重命名文件名------------------- if rename: # 重命名文件,不改变扩展名 rename = re_SUB(r'[\\/*:|<>?"]', '', rename).strip() ext_name = file_name.split('.')[-1] - if rename.lower().endswith(f'.{ext_name}'.lower()) or ext_name == file_name: + if '.' in rename or ext_name == file_name: full_name = rename else: full_name = f'{rename}.{ext_name}' else: full_name = file_name - + # -------------------生成路径------------------- goal_Path = Path(goal_path) goal_path = '' for key, i in enumerate(goal_Path.parts): # 去除路径中的非法字符 goal_path += goal_Path.drive if key == 0 and goal_Path.drive else re_SUB(r'[*:|<>?"]', '', i).strip() goal_path += '\\' if i != '\\' and key < len(goal_Path.parts) - 1 else '' - goal_Path = Path(goal_path) goal_Path.mkdir(parents=True, exist_ok=True) goal_path = goal_Path.absolute() @@ -287,8 +285,8 @@ class SessionPage(object): full_path = Path(f'{goal_path}\\{full_name}') else: raise ValueError("Argument file_exists can only be 'skip', 'overwrite', 'rename'.") - - if show_msg: # 打印要下载的文件 + # -------------------打印要下载的文件------------------- + if show_msg: print(full_name if file_name == full_name else f'{file_name} -> {full_name}') print(f'Downloading to: {goal_path}') @@ -317,9 +315,8 @@ class SessionPage(object): else: download_status, info = True, 'Success.' 
finally: - # 删除下载出错文件 if not download_status and full_path.exists(): - full_path.unlink() + full_path.unlink() # 删除下载出错文件 r.close() # -------------------显示并返回值------------------- if show_msg: From 0dcfcf5cd8232472e6b50b8536ebc73a664fe56a Mon Sep 17 00:00:00 2001 From: g1879 Date: Tue, 1 Sep 2020 22:44:34 +0800 Subject: [PATCH 07/13] =?UTF-8?q?get()=E5=A2=9E=E5=8A=A0=E9=87=8D=E8=AF=95?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/driver_page.py | 40 ++++++++++++++++++++++++++++++------ DrissionPage/mix_page.py | 14 ++++++++++--- DrissionPage/session_page.py | 30 +++++++++++++++++++++++++-- 3 files changed, 73 insertions(+), 11 deletions(-) diff --git a/DrissionPage/driver_page.py b/DrissionPage/driver_page.py index ee8b7f0..4be52e0 100644 --- a/DrissionPage/driver_page.py +++ b/DrissionPage/driver_page.py @@ -6,7 +6,7 @@ """ from glob import glob from pathlib import Path -from time import time +from time import time, sleep from typing import Union, List, Any from urllib.parse import quote @@ -60,21 +60,49 @@ class DriverPage(object): """返回网页title""" return self.driver.title - def get(self, url: str, go_anyway: bool = False, show_errmsg: bool = False) -> Union[None, bool]: + def _try_to_get(self, + to_url: str, + times: int = 0, + interval: float = 1, + show_errmsg: bool = False, ): + """ + :param to_url: 要访问的url + :param times: 重试次数 + :param interval: 重试间隔(秒) + :param show_errmsg: 是否抛出异常 + :return: + """ + self.driver.get(to_url) + is_ok = self.check_page() + while times and is_ok is False: + sleep(interval) + self.driver.get(to_url) + is_ok = self.check_page() + times -= 1 + if is_ok is False and show_errmsg: + raise ConnectionError('Connect error.') + return is_ok + + def get(self, + url: str, + go_anyway: bool = False, + show_errmsg: bool = False, + retry: int = 0, + interval: float = 1, + ) -> Union[None, bool]: """访问url \n :param url: 目标url :param go_anyway: 
若目标url与当前url一致,是否强制跳转 :param show_errmsg: 是否显示和抛出异常 + :param retry: 重试次数 + :param interval: 重试间隔(秒) :return: 目标url是否可用 """ to_url = quote(url, safe='/:&?=%;#@') if not url or (not go_anyway and self.url == to_url): return self._url = to_url - self.driver.get(to_url) - self._url_available = self.check_page() - if self._url_available is False and show_errmsg: - raise ConnectionError('Connect error.') + self._url_available = self._try_to_get(to_url, times=retry, interval=interval, show_errmsg=show_errmsg) return self._url_available def ele(self, diff --git a/DrissionPage/mix_page.py b/DrissionPage/mix_page.py index b6cab5e..3153b56 100644 --- a/DrissionPage/mix_page.py +++ b/DrissionPage/mix_page.py @@ -236,19 +236,27 @@ class MixPage(Null, SessionPage, DriverPage): # ----------------以下为共用函数----------------------- - def get(self, url: str, go_anyway=False, show_errmsg: bool = False, **kwargs) -> Union[bool, None]: + def get(self, + url: str, + go_anyway=False, + show_errmsg: bool = False, + retry: int = 0, + interval: float = 1, + **kwargs) -> Union[bool, None]: """跳转到一个url \n 跳转前先同步cookies,跳转后判断目标url是否可用 :param url: 目标url :param go_anyway: 若目标url与当前url一致,是否强制跳转 :param show_errmsg: 是否显示和抛出异常 + :param retry: 重试次数 + :param interval: 重试间隔(秒) :param kwargs: 连接参数,s模式专用 :return: url是否可用 """ if self._mode == 'd': - return super(SessionPage, self).get(url, go_anyway, show_errmsg) + return super(SessionPage, self).get(url, go_anyway, show_errmsg, retry, interval) elif self._mode == 's': - return super().get(url, go_anyway, show_errmsg, **kwargs) + return super().get(url, go_anyway, show_errmsg, retry, interval, **kwargs) def ele(self, loc_or_ele: Union[tuple, str, DriverElement, SessionElement, Element, WebElement], diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index aa20cc9..1002f7c 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -9,7 +9,7 @@ from pathlib import Path from random import randint from re import search as 
re_SEARCH from re import sub as re_SUB -from time import time +from time import time, sleep from typing import Union, List from urllib.parse import urlparse, quote @@ -142,15 +142,41 @@ class SessionPage(object): raise TypeError('Type of loc_or_str can only be tuple or str.') return self.ele(loc_or_str, mode='all', show_errmsg=True) + def _try_to_get(self, + to_url: str, + times: int = 0, + interval: float = 1, + show_errmsg: bool = False, + **kwargs) -> HTMLResponse: + """尝试连接,重试若干次 + :param to_url: 要访问的url + :param times: 重试次数 + :param interval: 重试间隔(秒) + :param show_errmsg: 是否抛出异常 + :param kwargs: 连接参数 + :return: HTMLResponse对象 + """ + r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0] + while times and (not r or r.content == b''): + print('重试', to_url) + sleep(interval) + r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0] + times -= 1 + return r + def get(self, url: str, go_anyway: bool = False, show_errmsg: bool = False, + retry: int = 0, + interval: float = 1, **kwargs) -> Union[bool, None]: """用get方式跳转到url \n :param url: 目标url :param go_anyway: 若目标url与当前url一致,是否强制跳转 :param show_errmsg: 是否显示和抛出异常 + :param retry: 重试次数 + :param interval: 重试间隔(秒) :param kwargs: 连接参数 :return: url是否可用 """ @@ -158,7 +184,7 @@ class SessionPage(object): if not url or (not go_anyway and self.url == to_url): return self._url = to_url - self._response = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0] + self._response = self._try_to_get(to_url, times=retry, interval=interval, show_errmsg=show_errmsg, **kwargs) if self._response is None: self._url_available = False else: From 37bf43ac3650960ec12645afb5ea5cafba71807f Mon Sep 17 00:00:00 2001 From: g1879 Date: Tue, 1 Sep 2020 23:27:11 +0800 Subject: [PATCH 08/13] =?UTF-8?q?get()=E5=A2=9E=E5=8A=A0=E9=87=8D=E8=AF=95?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/driver_page.py | 4 ++-- 
DrissionPage/mix_page.py | 20 +++++++++++++++++++- DrissionPage/session_page.py | 2 +- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/DrissionPage/driver_page.py b/DrissionPage/driver_page.py index 4be52e0..21693f5 100644 --- a/DrissionPage/driver_page.py +++ b/DrissionPage/driver_page.py @@ -65,12 +65,12 @@ class DriverPage(object): times: int = 0, interval: float = 1, show_errmsg: bool = False, ): - """ + """尝试连接,重试若干次 \n :param to_url: 要访问的url :param times: 重试次数 :param interval: 重试间隔(秒) :param show_errmsg: 是否抛出异常 - :return: + :return: 是否成功 """ self.driver.get(to_url) is_ok = self.check_page() diff --git a/DrissionPage/mix_page.py b/DrissionPage/mix_page.py index 3153b56..9a30870 100644 --- a/DrissionPage/mix_page.py +++ b/DrissionPage/mix_page.py @@ -235,12 +235,30 @@ class MixPage(Null, SessionPage, DriverPage): return super().chrome_downloading(path) # ----------------以下为共用函数----------------------- + def _try_to_get(self, + to_url: str, + times: int = 0, + interval: float = 1, + show_errmsg: bool = False, + **kwargs): + """尝试连接,重试若干次 \n + :param to_url: 要访问的url + :param times: 重试次数 + :param interval: 重试间隔(秒) + :param show_errmsg: 是否抛出异常 + :param kwargs: 连接参数 + :return: s模式为HTMLResponse对象,d模式为bool + """ + if self._mode == 'd': + return super(SessionPage, self)._try_to_get(to_url, times, interval, show_errmsg) + elif self._mode == 's': + return super()._try_to_get(to_url, times, interval, show_errmsg, **kwargs) def get(self, url: str, go_anyway=False, show_errmsg: bool = False, - retry: int = 0, + retry: int = 2, interval: float = 1, **kwargs) -> Union[bool, None]: """跳转到一个url \n diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index 1002f7c..553474e 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -148,7 +148,7 @@ class SessionPage(object): interval: float = 1, show_errmsg: bool = False, **kwargs) -> HTMLResponse: - """尝试连接,重试若干次 + """尝试连接,重试若干次 \n :param to_url: 要访问的url :param times: 重试次数 
:param interval: 重试间隔(秒) From b3d4cc67f4643d2436c3c71f883586d9d91850e7 Mon Sep 17 00:00:00 2001 From: g1879 Date: Wed, 2 Sep 2020 00:08:12 +0800 Subject: [PATCH 09/13] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E6=8C=89=E6=96=87=E6=9C=AC=E5=8C=B9=E9=85=8D=EF=BC=8C=E6=9C=AA?= =?UTF-8?q?=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/shadow_root_element.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/DrissionPage/shadow_root_element.py b/DrissionPage/shadow_root_element.py index 36c5a84..1e1e036 100644 --- a/DrissionPage/shadow_root_element.py +++ b/DrissionPage/shadow_root_element.py @@ -100,6 +100,27 @@ class ShadowRootElement(DrissionElement): except: return False + def _find_eles_by_text(self, text: str, mode: str = 'single', match: str = 'exact'): + eles = self.run_script('return arguments[0].querySelectorAll("*")') + from .driver_element import DriverElement + results = [] + for ele in eles: + txt = self.driver.execute_script( + 'if(arguments[0].firstChild!=null){return arguments[0].firstChild.nodeValue}', ele) + if match == 'exact': + if text == txt: + if mode == 'single': + return DriverElement(ele) + elif mode == 'all': + results.append(DriverElement(ele)) + elif match == 'fuzzy': + if txt and text in txt: + if mode == 'single': + return DriverElement(ele) + elif mode == 'all': + results.append(DriverElement(ele)) + return None if mode == 'single' else results + def get_css_from_str(loc: str) -> tuple: """处理元素查找语句 \n From 9863aea69cb26e670c45b6843a23b884dbc5c413 Mon Sep 17 00:00:00 2001 From: g1879 Date: Wed, 2 Sep 2020 16:33:55 +0800 Subject: [PATCH 10/13] =?UTF-8?q?=E6=94=AF=E6=8C=81=E7=94=A8=E6=96=87?= =?UTF-8?q?=E6=9C=AC=E6=9F=A5=E6=89=BEshadow-root=E4=B8=AD=E5=85=83?= =?UTF-8?q?=E7=B4=A0=EF=BC=8C=E5=9F=BA=E6=9C=AC=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
DrissionPage/shadow_root_element.py | 124 ++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 27 deletions(-) diff --git a/DrissionPage/shadow_root_element.py b/DrissionPage/shadow_root_element.py index 1e1e036..ab2a098 100644 --- a/DrissionPage/shadow_root_element.py +++ b/DrissionPage/shadow_root_element.py @@ -60,24 +60,76 @@ class ShadowRootElement(DrissionElement): def ele(self, loc_or_str: Union[tuple, str], - mode: str = None, + mode: str = 'single', timeout: float = None, show_errmsg: bool = False): + """返回当前元素下级符合条件的子元素,默认返回第一个 \n + 示例: \n + - 用loc元组查找: \n + ele.ele((By.CLASS_NAME, 'ele_class')) - 返回第一个class为ele_class的子元素 \n + - 用查询字符串查找: \n + 查找方式:属性、tag name和属性、文本、css selector \n + 其中,@表示属性,=表示精确匹配,:表示模糊匹配,无控制字符串时默认搜索该字符串 \n + ele.ele('@class:ele_class') - 返回第一个class含有ele_class的子元素 \n + ele.ele('@name=ele_name') - 返回第一个name等于ele_name的子元素 \n + ele.ele('@placeholder') - 返回第一个带placeholder属性的子元素 \n + ele.ele('tag:p') - 返回第一个

子元素 \n + ele.ele('tag:div@class:ele_class') - 返回第一个class含有ele_class的div子元素 \n + ele.ele('tag:div@class=ele_class') - 返回第一个class等于ele_class的div子元素 \n + ele.ele('tag:div@text():some_text') - 返回第一个文本含有some_text的div子元素 \n + ele.ele('tag:div@text()=some_text') - 返回第一个文本等于some_text的div子元素 \n + ele.ele('text:some_text') - 返回第一个文本含有some_text的子元素 \n + ele.ele('some_text') - 返回第一个文本含有some_text的子元素(等价于上一行) \n + ele.ele('text=some_text') - 返回第一个文本等于some_text的子元素 \n + ele.ele('css:div.ele_class') - 返回第一个符合css selector的子元素 \n + :param loc_or_str: 元素的定位信息,可以是loc元组,或查询字符串 + :param mode: 'single' 或 'all',对应查找一个或全部 + :param timeout: 查找元素超时时间 + :param show_errmsg: 出现异常时是否打印信息 + :return: DriverElement对象 + """ if isinstance(loc_or_str, str): loc_or_str = get_css_from_str(loc_or_str) elif isinstance(loc_or_str, tuple) and len(loc_or_str) == 2: - pass + if loc_or_str[0] == 'xpath': + raise ValueError('不支持xpath') else: raise ValueError('Argument loc_or_str can only be tuple or str.') - if loc_or_str[0] == 'xpath': - raise ValueError('不支持xpath') + timeout = timeout or self.timeout - return execute_driver_find(self.inner_ele, loc_or_str, mode, show_errmsg, timeout) + if loc_or_str[0] == 'css selector': + return execute_driver_find(self.inner_ele, loc_or_str, mode, show_errmsg, timeout) + elif loc_or_str[0] == 'text': + return self._find_eles_by_text(loc_or_str[1], loc_or_str[2], loc_or_str[3], mode) def eles(self, loc_or_str: Union[tuple, str], timeout: float = None, show_errmsg: bool = False): + """返回当前元素下级所有符合条件的子元素 \n + 示例: \n + - 用loc元组查找: \n + ele.eles((By.CLASS_NAME, 'ele_class')) - 返回所有class为ele_class的子元素 \n + - 用查询字符串查找: \n + 查找方式:属性、tag name和属性、文本、css selector \n + 其中,@表示属性,=表示精确匹配,:表示模糊匹配,无控制字符串时默认搜索该字符串 \n + ele.eles('@class:ele_class') - 返回所有class含有ele_class的子元素 \n + ele.eles('@name=ele_name') - 返回所有name等于ele_name的子元素 \n + ele.eles('@placeholder') - 返回所有带placeholder属性的子元素 \n + ele.eles('tag:p') - 返回所有
<p>
子元素 \n + ele.eles('tag:div@class:ele_class') - 返回所有class含有ele_class的div子元素 \n + ele.eles('tag:div@class=ele_class') - 返回所有class等于ele_class的div子元素 \n + ele.eles('tag:div@text():some_text') - 返回所有文本含有some_text的div子元素 \n + ele.eles('tag:div@text()=some_text') - 返回所有文本等于some_text的div子元素 \n + ele.eles('text:some_text') - 返回所有文本含有some_text的子元素 \n + ele.eles('some_text') - 返回所有文本含有some_text的子元素(等价于上一行) \n + ele.eles('text=some_text') - 返回所有文本等于some_text的子元素 \n + ele.eles('css:div.ele_class') - 返回所有符合css selector的子元素 \n + :param loc_or_str: 元素的定位信息,可以是loc元组,或查询字符串 + :param timeout: 查找元素超时时间 + :param show_errmsg: 出现异常时是否打印信息 + :return: DriverElement对象组成的列表 + """ return self.ele(loc_or_str, mode='all', show_errmsg=show_errmsg, timeout=timeout) def run_script(self, script: str, *args) -> Any: @@ -100,21 +152,31 @@ class ShadowRootElement(DrissionElement): except: return False - def _find_eles_by_text(self, text: str, mode: str = 'single', match: str = 'exact'): - eles = self.run_script('return arguments[0].querySelectorAll("*")') + def _find_eles_by_text(self, text: str, tag: str = '', match: str = 'exact', mode: str = 'single'): + """根据文本获取页面元素 \n + :param text: 文本字符串 + :param tag: tag name + :param match: 'exact' 或 'fuzzy',对应精确或模糊匹配 + :param mode: 'single' 或 'all',对应匹配一个或全部 + :return: 返回DriverElement对象或组成的列表 + """ + eles = self.run_script('return arguments[0].querySelectorAll("*")') # 获取所有元素 from .driver_element import DriverElement results = [] - for ele in eles: + for ele in eles: # 遍历所有元素,找到符合条件的 + if tag and tag != ele.tag_name: + continue txt = self.driver.execute_script( 'if(arguments[0].firstChild!=null){return arguments[0].firstChild.nodeValue}', ele) - if match == 'exact': + txt = txt or '' + if text == '' or match == 'exact': # 匹配没有文本的元素或精确匹配 if text == txt: if mode == 'single': return DriverElement(ele) elif mode == 'all': results.append(DriverElement(ele)) - elif match == 'fuzzy': - if txt and text in txt: + elif match == 'fuzzy': # 模糊匹配 + if text in txt: if 
mode == 'single': return DriverElement(ele) elif mode == 'all': @@ -123,18 +185,24 @@ class ShadowRootElement(DrissionElement): def get_css_from_str(loc: str) -> tuple: - """处理元素查找语句 \n - 查找方式:属性、tag name及属性、css selector \n + """处理元素查找语句 \n + 查找方式:属性、tag name及属性、文本、css selector \n =表示精确匹配,:表示模糊匹配,无控制字符串时默认搜索该字符串 \n - 示例: \n - @class:ele_class - class含有ele_class的元素 \n - @class=ele_class - class等于ele_class的元素 \n - @class - 带class属性的元素 \n - tag:div - div元素 \n - tag:div@class:ele_class - class含有ele_class的div元素 \n - tag:div@class=ele_class - class等于ele_class的div元素 \n - css:div.ele_class \n + =表示精确匹配,:表示模糊匹配,无控制字符串时默认搜索该字符串 \n + 示例: \n + @class:ele_class - class含有ele_class的元素 \n + @class=ele_class - class等于ele_class的元素 \n + @class - 带class属性的元素 \n + tag:div - div元素 \n + tag:div@class:ele_class - class含有ele_class的div元素 \n + tag:div@class=ele_class - class等于ele_class的div元素 \n + tag:div@text():search_text - 文本含有search_text的div元素 \n + tag:div@text()=search_text - 文本等于search_text的div元素 \n + text:search_text - 文本含有search_text的元素 \n + text=search_text - 文本等于search_text的元素 \n + css:div.ele_class \n """ + loc_by = 'css selector' if loc.startswith('@'): # 根据属性查找 r = re_SPLIT(r'([:=])', loc[1:], maxsplit=1) if len(r) == 3: @@ -150,17 +218,19 @@ def get_css_from_str(loc: str) -> tuple: r = re_SPLIT(r'([:=])', at_lst[1], maxsplit=1) if len(r) == 3: if r[0] == 'text()': - raise ValueError('不支持按文本查找') + match = 'exact' if r[1] == '=' else 'fuzzy' + return 'text', r[2], at_lst[0], match mode = '=' if r[1] == '=' else '*=' loc_str = f'{at_lst[0]}[{r[0]}{mode}"{r[2]}"]' else: loc_str = f'{at_lst[0]}[{r[0]}]' elif loc.startswith(('css=', 'css:')): # 用css selector查找 loc_str = loc[4:] - elif loc.startswith(('text=', 'text:')): # 根据文本查找 - raise ValueError('不支持按文本查找') elif loc.startswith(('xpath=', 'xpath:')): # 用xpath查找 raise ValueError('不支持xpath') - else: - raise ValueError('不支持的查询语句') - return 'css selector', loc_str + elif loc.startswith(('text=', 'text:')): # 根据文本查找 + match = 'exact' if 
loc[4] == '=' else 'fuzzy' + return 'text', loc[5:], '', match + else: # 根据文本模糊查找 + return 'text', loc, '', 'fuzzy' + return loc_by, loc_str From 470dd02ce7eb1be36e666778a24ef8372af3a2c8 Mon Sep 17 00:00:00 2001 From: g1879 Date: Wed, 2 Sep 2020 16:34:18 +0800 Subject: [PATCH 11/13] =?UTF-8?q?=E5=BE=AE=E8=B0=83=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/driver_element.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DrissionPage/driver_element.py b/DrissionPage/driver_element.py index b94497d..2a4ccee 100644 --- a/DrissionPage/driver_element.py +++ b/DrissionPage/driver_element.py @@ -208,7 +208,7 @@ class DriverElement(DrissionElement): ele.ele('xpath://div[@class="ele_class"]') - 返回第一个符合xpath的子元素 \n ele.ele('css:div.ele_class') - 返回第一个符合css selector的子元素 \n :param loc_or_str: 元素的定位信息,可以是loc元组,或查询字符串 - :param mode: 'single' 或 'all‘,对应查找一个或全部 + :param mode: 'single' 或 'all',对应查找一个或全部 :param timeout: 查找元素超时时间 :param show_errmsg: 出现异常时是否打印信息 :return: DriverElement对象 From f83d72a6fe89139e0c4cd19e4933164954dbad7b Mon Sep 17 00:00:00 2001 From: g1879 Date: Thu, 3 Sep 2020 18:01:59 +0800 Subject: [PATCH 12/13] =?UTF-8?q?=E5=BE=AE=E8=B0=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/session_page.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index 553474e..3fcaaaf 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -180,7 +180,7 @@ class SessionPage(object): :param kwargs: 连接参数 :return: url是否可用 """ - to_url = quote(url, safe='/:&?=%;#@') + to_url = quote(url, safe='/:&?=%;#@+') if not url or (not go_anyway and self.url == to_url): return self._url = to_url @@ -366,7 +366,7 @@ class SessionPage(object): """ if mode not in ['get', 'post']: raise ValueError("Argument mode can only be 'get' or 
'post'.") - url = quote(url, safe='/:&?=%;#@') + url = quote(url, safe='/:&?=%;#@+') # 设置referer和host值 kwargs_set = set(x.lower() for x in kwargs) From 1b286c100e6fef8d7842264f51ce8da4feeee9a8 Mon Sep 17 00:00:00 2001 From: g1879 Date: Mon, 7 Sep 2020 00:15:46 +0800 Subject: [PATCH 13/13] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E5=88=86=E9=83=A8=E5=88=86=E4=B8=8B=E8=BD=BD=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/session_page.py | 42 +++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index 3fcaaaf..35c0b58 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -158,6 +158,8 @@ class SessionPage(object): """ r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0] while times and (not r or r.content == b''): + if r is not None and r.status_code in (403, 404): + break print('重试', to_url) sleep(interval) r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0] @@ -188,10 +190,12 @@ class SessionPage(object): if self._response is None: self._url_available = False else: - try: - self._response.html.encoding = self._response.encoding # 修复requests_html丢失编码方式的bug - except: - pass + stream = tuple(x for x in kwargs if x.lower() == 'stream') + if (not stream or not kwargs[stream[0]]) and not self.session.stream: + try: + self._response.html.encoding = self._response.encoding # 修复requests_html丢失编码方式的bug + except: + pass if self._response.ok: self._url_available = True @@ -397,15 +401,27 @@ class SessionPage(object): return None, e else: headers = dict(r.headers) - if 'Content-Type' not in headers or 'charset' not in headers['Content-Type']: - re_result = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', r.text) - try: - charset = re_result.group(1) - except: - charset = r.apparent_encoding + content_type = tuple(x for x
in headers if x.lower() == 'content-type') + stream = tuple(x for x in kwargs if x.lower() == 'stream') + charset = None + if not content_type or 'charset' not in headers[content_type[0]].lower(): + if (not stream or not kwargs[stream[0]]) and not self.session.stream: + # ======================== + re_result = None + for chunk in r.iter_content(chunk_size=512): + re_result = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', chunk.decode()) + break + # ======================== + # re_result = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', r.text) + try: + charset = re_result.group(1) + except: + charset = r.apparent_encoding else: - charset = headers['Content-Type'].split('=')[1] + charset = headers[content_type[0]].split('=')[1] # 避免存在退格符导致乱码或解析出错 - r._content = r.content if 'stream' in kwargs and kwargs['stream'] else r.content.replace(b'\x08', b'\\b') - r.encoding = charset + if (not stream or not kwargs[stream[0]]) and not self.session.stream: + r._content = r.content.replace(b'\x08', b'\\b') + if charset: + r.encoding = charset return r, 'Success'