From 41d700a3d906a4502a59cfd6d9f5c9a13d3c2892 Mon Sep 17 00:00:00 2001 From: g1879 Date: Wed, 4 Nov 2020 16:34:55 +0800 Subject: [PATCH] =?UTF-8?q?=E5=BC=83=E7=94=A8requests=5Fhtml=EF=BC=8C?= =?UTF-8?q?=E6=9C=AA=E5=AE=8C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/common.py | 8 +- DrissionPage/drission.py | 26 ++-- DrissionPage/mix_page.py | 8 +- DrissionPage/session_element.py | 205 ++++++++++++++++++-------------- DrissionPage/session_page.py | 25 ++-- requirements.txt | 9 +- 6 files changed, 154 insertions(+), 127 deletions(-) diff --git a/DrissionPage/common.py b/DrissionPage/common.py index d2a1f66..1ea1229 100644 --- a/DrissionPage/common.py +++ b/DrissionPage/common.py @@ -10,18 +10,20 @@ from re import split as re_SPLIT from shutil import rmtree from typing import Union -from requests_html import Element +from lxml.etree import _Element +# from lxml.html import HtmlElement +# from requests_html import Element from selenium.webdriver.remote.webelement import WebElement class DrissionElement(object): """SessionElement和DriverElement的基类""" - def __init__(self, ele: Union[Element, WebElement]): + def __init__(self, ele: Union[WebElement, _Element]): self._inner_ele = ele @property - def inner_ele(self) -> Union[WebElement, Element]: + def inner_ele(self) -> Union[WebElement, _Element]: return self._inner_ele @property diff --git a/DrissionPage/drission.py b/DrissionPage/drission.py index 2ab9b54..29de827 100644 --- a/DrissionPage/drission.py +++ b/DrissionPage/drission.py @@ -8,7 +8,7 @@ from typing import Union from urllib.parse import urlparse from requests import Session -from requests_html import HTMLSession +# from requests_html import HTMLSession from selenium import webdriver from selenium.common.exceptions import WebDriverException from selenium.webdriver.chrome.options import Options @@ -23,12 +23,12 @@ class Drission(object): def __init__(self, driver_or_options: Union[WebDriver, dict, Options] = None, - session_or_options: Union[Session, HTMLSession, dict] = None, + session_or_options: Union[Session, dict] = None, ini_path: str = None, proxy: dict = None): """初始化,可接收现成的WebDriver和Session对象,或接收它们的配置信息 \n :param driver_or_options: driver对象或chrome设置,Options类或设置字典 - :param session_or_options: session、HTMLSession对象或session设置 + :param session_or_options: Session对象设置 :param ini_path: ini文件路径 :param proxy: 代理设置 """ @@ -36,15 +36,15 @@ class Drission(object): self._driver = None self._driver_path = 'chromedriver' self._proxy = proxy - if isinstance(session_or_options, HTMLSession): + if isinstance(session_or_options, Session): self._session = session_or_options - elif isinstance(session_or_options, Session): - self._session = HTMLSession() - for key in session_or_options.__dict__: # session对象强制升级为子类HTMLSession对象 - if key != 'hooks': - self._session.__dict__[key] = session_or_options.__dict__[key] - else: - self._session.hooks['response'].extend(session_or_options.hooks['response']) + # elif isinstance(session_or_options, Session): + # self._session = HTMLSession() + # for key in session_or_options.__dict__: # session对象强制升级为子类HTMLSession对象 + # if key != 'hooks': + # self._session.__dict__[key] = session_or_options.__dict__[key] + # else: + # self._session.hooks['response'].extend(session_or_options.hooks['response']) else: if session_or_options is None: self._session_options = OptionsManager(ini_path).get_option('session_options') @@ -64,10 +64,10 @@ class Drission(object): self._driver_path = self._driver_options['driver_path'] @property - def session(self) -> HTMLSession: + def session(self) -> Session: """返回HTMLSession对象,如为None则按配置信息创建""" if self._session is None: - self._session = HTMLSession() + self._session = Session() attrs = ['headers', 'cookies', 'auth', 'proxies', 'hooks', 'params', 'verify', 'cert', 'adapters', 'stream', 'trust_env', 'max_redirects'] for i in attrs: diff --git a/DrissionPage/mix_page.py b/DrissionPage/mix_page.py index 29cff55..453e081 100644 --- a/DrissionPage/mix_page.py +++ b/DrissionPage/mix_page.py @@ -6,8 +6,8 @@ """ from typing import Union, List, Tuple -from requests import Response -from requests_html import HTMLSession, Element +from requests import Response, Session +# from requests_html import HTMLSession, Element from selenium.webdriver.chrome.webdriver import WebDriver from selenium.webdriver.remote.webelement import WebElement @@ -127,7 +127,7 @@ class MixPage(Null, SessionPage, DriverPage): return self._drission.driver @property - def session(self) -> HTMLSession: + def session(self) -> Session: """返回session对象,如没有则创建 \n :return: HTMLSession对象 """ @@ -279,7 +279,7 @@ class MixPage(Null, SessionPage, DriverPage): return super().get(url, go_anyway, show_errmsg, retry, interval, **kwargs) def ele(self, - loc_or_ele: Union[Tuple[str, str], str, DriverElement, SessionElement, Element, WebElement], + loc_or_ele: Union[Tuple[str, str], str, DriverElement, SessionElement, WebElement], mode: str = None, timeout: float = None, show_errmsg: bool = False) -> Union[DriverElement, SessionElement, str]: diff --git a/DrissionPage/session_element.py b/DrissionPage/session_element.py index 4a5ee80..efd51cb 100644 --- a/DrissionPage/session_element.py +++ b/DrissionPage/session_element.py @@ -8,28 +8,33 @@ import re from html import unescape from typing import Union, List, Tuple -from requests_html import Element, BaseParser +from lxml import etree +from lxml.etree import _Element from .common import DrissionElement, get_loc_from_str, translate_loc_to_xpath +# from lxml.html import HtmlElement +# from requests_html import Element, BaseParser + + class SessionElement(DrissionElement): """session模式的元素对象,包装了一个Element对象,并封装了常用功能""" - def __init__(self, ele: Element): + def __init__(self, ele: _Element): super().__init__(ele) - def __repr__(self): - attrs = [f"{attr}='{self.attrs[attr]}'" for attr in self.attrs] - return f'' + # def __repr__(self): + # attrs = [f"{attr}='{self.attrs[attr]}'" for attr in self.attrs] + # return f'' - @property - def attrs(self) -> dict: - """返回元素所有属性及值""" - attrs = dict() - for attr in self.inner_ele.attrs: - attrs[attr] = self.attr(attr) - return attrs + # @property + # def attrs(self) -> dict: + # """返回元素所有属性及值""" + # attrs = dict() + # for attr in self.inner_ele.attrs: + # attrs[attr] = self.attr(attr) + # return attrs @property def text(self) -> str: @@ -45,42 +50,45 @@ class SessionElement(DrissionElement): @property def html(self) -> str: """返回元素innerHTML文本""" - html = unescape(self._inner_ele.html).replace('\xa0', ' ') + # ee=self.ele('xpath:./*') + html = unescape(etree.tostring(self._inner_ele).decode()).replace('\xa0', ' ') + # html = unescape(self._inner_ele.html).replace('\xa0', ' ') r = re.match(r'<.*?>(.*)', html, flags=re.DOTALL) return None if not r else r.group(1) + # return html @property def tag(self) -> str: """返回元素类型""" return self._inner_ele.tag - @property - def css_path(self) -> str: - """返回css path路径""" - return self._get_ele_path('css') + # @property + # def css_path(self) -> str: + # """返回css path路径""" + # return self._get_ele_path('css') - @property - def xpath(self) -> str: - """返回xpath路径""" - return self._get_ele_path('xpath') + # @property + # def xpath(self) -> str: + # """返回xpath路径""" + # return self._get_ele_path('xpath') - def _get_ele_path(self, mode): - """获取css路径或xpath路径""" - path_str = '' - ele = self - while ele: - ele_id = ele.attr('id') - if ele_id: - return f'#{ele_id}{path_str}' if mode == 'css' else f'//{ele.tag}[@id="{ele_id}"]{path_str}' - else: - if mode == 'css': - brothers = len(ele.eles(f'xpath:./preceding-sibling::*')) - path_str = f'>:nth-child({brothers + 1}){path_str}' - else: - brothers = len(ele.eles(f'xpath:./preceding-sibling::{ele.tag}')) - path_str = f'/{ele.tag}[{brothers + 1}]{path_str}' if brothers > 0 else f'/{ele.tag}{path_str}' - ele = ele.parent - return path_str[1:] if mode == 'css' else path_str + # def _get_ele_path(self, mode): + # """获取css路径或xpath路径""" + # path_str = '' + # ele = self + # while ele: + # ele_id = ele.attr('id') + # if ele_id: + # return f'#{ele_id}{path_str}' if mode == 'css' else f'//{ele.tag}[@id="{ele_id}"]{path_str}' + # else: + # if mode == 'css': + # brothers = len(ele.eles(f'xpath:./preceding-sibling::*')) + # path_str = f'>:nth-child({brothers + 1}){path_str}' + # else: + # brothers = len(ele.eles(f'xpath:./preceding-sibling::{ele.tag}')) + # path_str = f'/{ele.tag}[{brothers + 1}]{path_str}' if brothers > 0 else f'/{ele.tag}{path_str}' + # ele = ele.parent + # return path_str[1:] if mode == 'css' else path_str @property def parent(self): @@ -199,47 +207,47 @@ class SessionElement(DrissionElement): """ return self.ele(loc_or_str, mode='all', show_errmsg=show_errmsg) - def attr(self, attr: str) -> Union[str, None]: - """返回属性值 \n - :param attr: 属性名 - :return: 属性值文本,没有该属性返回None - """ - try: - if attr == 'href': - # 如直接获取attr只能获取相对地址 - link = self._inner_ele.attrs['href'] - if link.lower().startswith(('javascript:', 'mailto:')): - return link - elif link.startswith('#'): - if '#' in self.inner_ele.url: - return re.sub(r'#.*', link, self.inner_ele.url) - else: - return f'{self.inner_ele.url}{link}' - elif link.startswith('?'): # 避免当相对URL以?开头时requests-html丢失参数的bug - if '?' in self.inner_ele.url: - return re.sub(r'\?.*', link, self.inner_ele.url) - else: - return f'{self.inner_ele.url}{link}' - else: - for link in self._inner_ele.absolute_links: - return link - elif attr == 'src': - return self._inner_ele._make_absolute(self._inner_ele.attrs['src']) - elif attr == 'class': - return ' '.join(self._inner_ele.attrs['class']) - elif attr == 'text': - return self.text - elif attr == 'outerHTML': - return self.inner_ele.html - elif attr == 'innerHTML': - return self.html - else: - return self._inner_ele.attrs[attr] - except: - return None + # def attr(self, attr: str) -> Union[str, None]: + # """返回属性值 \n + # :param attr: 属性名 + # :return: 属性值文本,没有该属性返回None + # """ + # try: + # if attr == 'href': + # # 如直接获取attr只能获取相对地址 + # link = self._inner_ele.attrs['href'] + # if link.lower().startswith(('javascript:', 'mailto:')): + # return link + # elif link.startswith('#'): + # if '#' in self.inner_ele.url: + # return re.sub(r'#.*', link, self.inner_ele.url) + # else: + # return f'{self.inner_ele.url}{link}' + # elif link.startswith('?'): # 避免当相对URL以?开头时requests-html丢失参数的bug + # if '?' in self.inner_ele.url: + # return re.sub(r'\?.*', link, self.inner_ele.url) + # else: + # return f'{self.inner_ele.url}{link}' + # else: + # for link in self._inner_ele.absolute_links: + # return link + # elif attr == 'src': + # return self._inner_ele._make_absolute(self._inner_ele.attrs['src']) + # elif attr == 'class': + # return ' '.join(self._inner_ele.attrs['class']) + # elif attr == 'text': + # return self.text + # elif attr == 'outerHTML': + # return self.inner_ele.html + # elif attr == 'innerHTML': + # return self.html + # else: + # return self._inner_ele.attrs[attr] + # except: + # return None -def execute_session_find(page_or_ele: BaseParser, +def execute_session_find(page_or_ele: _Element, loc: Tuple[str, str], mode: str = 'single', show_errmsg: bool = False) -> Union[SessionElement, List[SessionElement or str]]: @@ -254,32 +262,47 @@ def execute_session_find(page_or_ele: BaseParser, mode = mode or 'single' if mode not in ['single', 'all']: raise ValueError("Argument mode can only be 'single' or 'all'.") + loc_by, loc_str = loc + # print(loc) + # ele = page_or_ele.xpath(loc_str) + # print(ele) try: - ele = None + # ele = None if loc_by == 'xpath': - if 'PyQuery' in str(type(page_or_ele.element)): - # 从页面查找。 - ele = page_or_ele.xpath(loc_str) - elif 'HtmlElement' in str(type(page_or_ele.element)): - # 从元素查找。这样区分是为了能找到上级元素 - try: - elements = page_or_ele.element.xpath(loc_str) - ele = [Element(element=e, url=page_or_ele.url) for e in elements] - except AttributeError: - ele = page_or_ele.xpath(loc_str) + ele = page_or_ele.xpath(loc_str) + # if 'PyQuery' in str(type(page_or_ele.element)): + # # 从页面查找。 + # ele = page_or_ele.xpath(loc_str) + # elif 'HtmlElement' in str(type(page_or_ele.element)): + # # 从元素查找。这样区分是为了能找到上级元素 + # try: + # elements = page_or_ele.element.xpath(loc_str) + # ele = [Element(element=e, url=page_or_ele.url) for e in elements] + # except AttributeError: + # ele = page_or_ele.xpath(loc_str) else: # 用css selector获取 - ele = page_or_ele.find(loc_str) + ele = page_or_ele.cssselect(loc_str) if mode == 'single': ele = ele[0] if ele else None - return SessionElement(ele) if isinstance(ele, Element) else unescape(ele).replace('\xa0', ' ') + return SessionElement(ele) if isinstance(ele, _Element) else unescape(ele).replace('\xa0', ' ') elif mode == 'all': ele = filter(lambda x: x != '\n', ele) # 去除元素间换行符 ele = map(lambda x: unescape(x).replace('\xa0', ' ') if isinstance(x, str) else x, ele) # 替换空格 - return [SessionElement(e) if isinstance(e, Element) else e for e in ele] + return [SessionElement(e) if isinstance(e, _Element) else e for e in ele] except: if show_errmsg: print('Element(s) not found.', loc) raise return [] if mode == 'all' else None + + +def get_HtmlElement(html: str) -> _Element: + # html = f'{html}' + ele_or_page = etree.HTML(html) + # html = etree.tostring(ele_or_page).decode() + # if str(html).startswith('') and str(html).endswith(''): + # html = etree.tostring(ele_or_page)[12:-14].decode() + # ele_or_page = etree.fromstring(html) + return ele_or_page diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index fbede1c..9762a7d 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -15,17 +15,18 @@ from typing import Union, List, Tuple from urllib import parse from urllib.parse import urlparse, quote -from requests_html import HTMLSession, HTMLResponse, Element +# from requests_html import HTMLSession, HTMLResponse, Element +from requests import Session, Response from .common import get_loc_from_str, translate_loc_to_xpath, get_available_file_name from .config import OptionsManager -from .session_element import SessionElement, execute_session_find +from .session_element import SessionElement, execute_session_find, get_HtmlElement class SessionPage(object): """SessionPage封装了页面操作的常用功能,使用requests_html来获取、解析网页。""" - def __init__(self, session: HTMLSession, timeout: float = 10): + def __init__(self, session: Session, timeout: float = 10): """初始化函数""" self._session = session self.timeout = timeout @@ -34,12 +35,12 @@ class SessionPage(object): self._response = None @property - def session(self) -> HTMLSession: + def session(self) -> Session: """返回session对象""" return self._session @property - def response(self) -> HTMLResponse: + def response(self) -> Response: """返回访问url得到的response对象""" return self._response @@ -66,10 +67,10 @@ class SessionPage(object): @property def html(self) -> str: """返回页面html文本""" - return self.response.html.html + return self.response.text def ele(self, - loc_or_ele: Union[Tuple[str, str], str, SessionElement, Element], + loc_or_ele: Union[Tuple[str, str], str, SessionElement], # , Element mode: str = None, show_errmsg: bool = False) -> Union[SessionElement, List[SessionElement or str], str, None]: """返回页面中符合条件的元素,默认返回第一个 \n @@ -112,13 +113,13 @@ class SessionPage(object): elif isinstance(loc_or_ele, SessionElement): return loc_or_ele - elif isinstance(loc_or_ele, Element): - return SessionElement(loc_or_ele) + # elif isinstance(loc_or_ele, Element): + # return SessionElement(loc_or_ele) else: raise ValueError('Argument loc_or_str can only be tuple, str, SessionElement, Element.') - return execute_session_find(self.response.html, loc_or_ele, mode, show_errmsg) + return execute_session_find(get_HtmlElement(self.response.text), loc_or_ele, mode, show_errmsg) def eles(self, loc_or_str: Union[Tuple[str, str], str], @@ -156,7 +157,7 @@ class SessionPage(object): times: int = 0, interval: float = 1, show_errmsg: bool = False, - **kwargs) -> HTMLResponse: + **kwargs) -> Response: """尝试连接,重试若干次 \n :param to_url: 要访问的url :param times: 重试次数 @@ -434,7 +435,7 @@ class SessionPage(object): if not_stream: # 加载网页时修复编码 r._content = r.content.replace(b'\x08', b'\\b') # 避免存在退格符导致乱码或解析出错 - r.html.encoding = r.encoding # 修复requests_html丢失编码方式的bug + # r.html.encoding = r.encoding # 修复requests_html丢失编码方式的bug if charset: r.encoding = charset return r, 'Success' diff --git a/requirements.txt b/requirements.txt index 934a26b..d019385 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -selenium~=3.141.0 -requests-html~=0.10.0 -requests~=2.23.0 -tldextract~=2.2.2 \ No newline at end of file +selenium +requests +tldextract +DrissionPage +lxml \ No newline at end of file