修复无法分部分下载的问题

This commit is contained in:
g1879 2020-09-07 00:15:46 +08:00
parent f83d72a6fe
commit 1b286c100e

View File

@ -158,6 +158,8 @@ class SessionPage(object):
""" """
r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0] r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0]
while times and (not r or r.content == b''): while times and (not r or r.content == b''):
if r is not None and r.status_code in (403, 404):
break
print('重试', to_url) print('重试', to_url)
sleep(interval) sleep(interval)
r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0] r = self._make_response(to_url, show_errmsg=show_errmsg, **kwargs)[0]
@ -188,10 +190,12 @@ class SessionPage(object):
if self._response is None: if self._response is None:
self._url_available = False self._url_available = False
else: else:
try: stream = tuple(x for x in kwargs if x.lower() == 'stream')
self._response.html.encoding = self._response.encoding # 修复requests_html丢失编码方式的bug if (not stream or not kwargs[stream[0]]) and not self.session.stream:
except: try:
pass self._response.html.encoding = self._response.encoding # 修复requests_html丢失编码方式的bug
except:
pass
if self._response.ok: if self._response.ok:
self._url_available = True self._url_available = True
@ -397,15 +401,27 @@ class SessionPage(object):
return None, e return None, e
else: else:
headers = dict(r.headers) headers = dict(r.headers)
if 'Content-Type' not in headers or 'charset' not in headers['Content-Type']: content_type = tuple(x for x in headers if x.lower() == 'content-type')
re_result = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', r.text) stream = tuple(x for x in kwargs if x.lower() == 'stream')
try: charset = None
charset = re_result.group(1) if not content_type or 'charset' not in headers[content_type[0]].lower():
except: if (not stream or not kwargs[stream[0]]) and not self.session.stream:
charset = r.apparent_encoding # ========================
re_result = None
for chunk in r.iter_content(chunk_size=512):
re_result = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', chunk.decode())
break
# ========================
# re_result = re_SEARCH(r'<meta.*?charset=[ \'"]*([^"\' />]+).*?>', r.text)
try:
charset = re_result.group(1)
except:
charset = r.apparent_encoding
else: else:
charset = headers['Content-Type'].split('=')[1] charset = headers[content_type[0]].split('=')[1]
# 避免存在退格符导致乱码或解析出错 # 避免存在退格符导致乱码或解析出错
r._content = r.content if 'stream' in kwargs and kwargs['stream'] else r.content.replace(b'\x08', b'\\b') if (not stream or not kwargs[stream[0]]) and not self.session.stream:
r.encoding = charset r._content = r.content.replace(b'\x08', b'\\b')
if charset:
r.encoding = charset
return r, 'Success' return r, 'Success'