From 1b286c100e6fef8d7842264f51ce8da4feeee9a8 Mon Sep 17 00:00:00 2001 From: g1879 Date: Mon, 7 Sep 2020 00:15:46 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A0=E6=B3=95=E5=88=86?= =?UTF-8?q?=E9=83=A8=E5=88=86=E4=B8=8B=E8=BD=BD=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/session_page.py | 42 +++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index 3fcaaaf..35c0b58 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -158,6 +158,8 @@ class SessionPage(object): """ r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0] while times and (not r or r.content == b''): + if r is not None and r.status_code in (403, 404): + break print('重试', to_url) sleep(interval) r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0] @@ -188,10 +190,12 @@ class SessionPage(object): if self._response is None: self._url_available = False else: - try: - self._response.html.encoding = self._response.encoding # 修复requests_html丢失编码方式的bug - except: - pass + stream = tuple(x for x in kwargs if x.lower() == 'stream') + if (not stream or not kwargs[stream[0]]) and not self.session.stream: + try: + self._response.html.encoding = self._response.encoding # 修复requests_html丢失编码方式的bug + except: + pass if self._response.ok: self._url_available = True @@ -397,15 +401,27 @@ class SessionPage(object): return None, e else: headers = dict(r.headers) - if 'Content-Type' not in headers or 'charset' not in headers['Content-Type']: - re_result = re_SEARCH(r']+).*?>', r.text) - try: - charset = re_result.group(1) - except: - charset = r.apparent_encoding + content_type = tuple(x for x in headers if x.lower() == 'content-type') + stream = tuple(x for x in kwargs if x.lower() == 'stream') + charset = None + if not content_type or 'charset' not in headers[content_type[0]].lower(): + if (not stream or not kwargs[stream[0]]) and not self.session.stream: + # ======================== + re_result = None + for chunk in r.iter_content(chunk_size=512): + re_result = re_SEARCH(r']+).*?>', chunk.decode()) + break + # ======================== + # re_result = re_SEARCH(r']+).*?>', r.text) + try: + charset = re_result.group(1) + except: + charset = r.apparent_encoding else: - charset = headers['Content-Type'].split('=')[1] + charset = headers[content_type[0]].split('=')[1] # 避免存在退格符导致乱码或解析出错 - r._content = r.content if 'stream' in kwargs and kwargs['stream'] else r.content.replace(b'\x08', b'\\b') - r.encoding = charset + if (not stream or not kwargs[stream[0]]) and not self.session.stream: + r._content = r.content.replace(b'\x08', b'\\b') + if charset: + r.encoding = charset return r, 'Success'