From 39a8a5236d252b2e4a71bff9a9685e1d1d2743a0 Mon Sep 17 00:00:00 2001 From: g1879 Date: Wed, 18 Nov 2020 21:33:08 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E8=8E=B7=E5=8F=96=E7=BC=96?= =?UTF-8?q?=E7=A0=81=E7=9A=84=E9=80=BB=E8=BE=91=EF=BC=9B=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E4=B8=8B=E8=BD=BD=E4=B8=8D=E8=83=BD=E6=98=BE=E7=A4=BA=E8=BF=9B?= =?UTF-8?q?=E5=BA=A6=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/session_page.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index 79acc91..267a25e 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -491,11 +491,14 @@ class SessionPage(object): else: # ----------------获取并设置编码开始----------------- # 在headers中获取编码 - try: - charset = r.headers.get('Content-type').split('=')[1] + content_type = r.headers.get('content-type') + charset = re.search(r'charset[=: ]*(.*)?[;]', content_type) - # 在headers中获取不到编码 - except IndexError: + if charset: + r.encoding = charset.group(1) + + # 在headers中获取不到编码,且如果是网页 + elif content_type.replace(' ', '').lower().startswith('text/html'): re_result = re_SEARCH(b']+).*?>', r.content) if re_result: @@ -503,7 +506,7 @@ class SessionPage(object): else: charset = r.apparent_encoding - r.encoding = charset + r.encoding = charset # ----------------获取并设置编码结束----------------- return r, 'Success'