当 response 的 header 没有 charset 时，尝试从 meta 获取

This commit is contained in:
g1879 2020-06-02 00:10:55 +08:00
parent a31ed1d354
commit 1c823470bd

View File

@ -5,6 +5,7 @@
@File : session_page.py @File : session_page.py
""" """
import os import os
import re
from pathlib import Path from pathlib import Path
from random import random from random import random
from time import time from time import time
@ -196,7 +197,7 @@ class SessionPage(object):
# 设置referer和host值 # 设置referer和host值
if self._url: if self._url:
if 'headers' in set(x.lower() for x in kwargs): if 'headers' in set(x.lower() for x in kwargs):
keys=set(x.lower() for x in kwargs['headers']) keys = set(x.lower() for x in kwargs['headers'])
if 'referer' not in keys: if 'referer' not in keys:
kwargs['headers']['Referer'] = self._url kwargs['headers']['Referer'] = self._url
if 'host' not in keys: if 'host' not in keys:
@ -216,13 +217,14 @@ class SessionPage(object):
return_value = False return_value = False
else: else:
headers = dict(r.headers) headers = dict(r.headers)
if 'Content-Type' not in headers: if 'Content-Type' not in headers or 'charset' not in headers['Content-Type']:
charset = 'utf-8' re_result = re.search(r'<meta.*?charset=([^"\']+)', r.text)
else: try:
if 'charset' not in headers['Content-Type']: charset = re_result.group(1)
except:
charset = 'utf-8' charset = 'utf-8'
else: else:
charset = headers['Content-Type'].split('=')[1] charset = headers['Content-Type'].split('=')[1]
r.encoding = charset r.encoding = charset
return_value = r return_value = r
return return_value return return_value