当 response 的 header 中没有 charset 时,尝试从页面的 meta 标签中获取编码

This commit is contained in:
g1879 2020-06-02 00:10:55 +08:00
parent a31ed1d354
commit 1c823470bd

View File

@ -5,6 +5,7 @@
@File : session_page.py
"""
import os
import re
from pathlib import Path
from random import random
from time import time
@ -196,7 +197,7 @@ class SessionPage(object):
# 设置referer和host值
if self._url:
if 'headers' in set(x.lower() for x in kwargs):
keys=set(x.lower() for x in kwargs['headers'])
keys = set(x.lower() for x in kwargs['headers'])
if 'referer' not in keys:
kwargs['headers']['Referer'] = self._url
if 'host' not in keys:
@ -216,13 +217,14 @@ class SessionPage(object):
return_value = False
else:
headers = dict(r.headers)
if 'Content-Type' not in headers:
charset = 'utf-8'
else:
if 'charset' not in headers['Content-Type']:
if 'Content-Type' not in headers or 'charset' not in headers['Content-Type']:
re_result = re.search(r'<meta.*?charset=([^"\']+)', r.text)
try:
charset = re_result.group(1)
except:
charset = 'utf-8'
else:
charset = headers['Content-Type'].split('=')[1]
else:
charset = headers['Content-Type'].split('=')[1]
r.encoding = charset
return_value = r
return return_value