From a31ed1d3543a5557f2bfba534137e2653870a98b Mon Sep 17 00:00:00 2001 From: g1879 Date: Tue, 2 Jun 2020 00:03:43 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=8E=B7=E5=8F=96href?= =?UTF-8?q?=E7=9A=84=E4=BB=A3=E7=A0=81=EF=BC=8C=E9=81=BF=E5=85=8D=E5=BD=93?= =?UTF-8?q?=E7=9B=B8=E5=AF=B9URL=E4=BB=A5=3F=E5=BC=80=E5=A4=B4=E6=97=B6req?= =?UTF-8?q?uests-html=E4=B8=A2=E5=A4=B1=E5=8F=82=E6=95=B0=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/session_element.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/DrissionPage/session_element.py b/DrissionPage/session_element.py index d04254f..3eeec7a 100644 --- a/DrissionPage/session_element.py +++ b/DrissionPage/session_element.py @@ -11,7 +11,6 @@ from typing import Union, List from requests_html import Element, BaseParser from .common import DrissionElement, get_loc_from_str, translate_loc_to_xpath -from urllib.parse import urlparse, urljoin class SessionElement(DrissionElement): @@ -105,15 +104,16 @@ class SessionElement(DrissionElement): """获取属性值""" try: if attr == 'href': - # TODO: 须测试 # 如直接获取attr只能获取相对地址 link = self._inner_ele.attrs['href'] - parsed = urlparse(link) - if not parsed.netloc: - return urljoin(self._inner_ele.url, link) - if not parsed.scheme: - return urljoin(urlparse(self._inner_ele.url).scheme, link) - return link + if link.startswith('?'): # 避免当相对URL以?开头时requests-html丢失参数的bug + if '?' in self.inner_ele.url: + return re.sub(r'\?.*', link, self.inner_ele.url) + else: + return f'{self.inner_ele.url}{link}' + else: + for link in self._inner_ele.absolute_links: + return link elif attr == 'class': class_str = '' for key, i in enumerate(self._inner_ele.attrs['class']):