From af11bc7f57187e49aec30eb68403bcdd3196ec47 Mon Sep 17 00:00:00 2001 From: g1879 Date: Tue, 30 Nov 2021 00:17:01 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=96=84download()=E4=BB=8Eheaders?= =?UTF-8?q?=E8=8E=B7=E5=8F=96=E6=96=87=E4=BB=B6=E5=90=8D=EF=BC=8C=E5=BE=85?= =?UTF-8?q?=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DrissionPage/session_page.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/DrissionPage/session_page.py b/DrissionPage/session_page.py index 07770ae..7049b4f 100644 --- a/DrissionPage/session_page.py +++ b/DrissionPage/session_page.py @@ -7,7 +7,7 @@ from os import path as os_PATH, sep from pathlib import Path from random import randint -from re import search as re_SEARCH, sub +from re import search, sub from time import time, sleep from typing import Union, List, Tuple from urllib.parse import urlparse, quote, unquote @@ -277,7 +277,7 @@ class SessionPage(BasePage): """ if file_exists == 'skip' and Path(f'{goal_path}{sep}{rename}').exists(): if show_msg: - print(f'{file_url}\n{goal_path}{sep}{rename}\nSkipped.\n') + print(f'{file_url}\n{goal_path}{sep}{rename}\n已跳过。\n') return False, 'Skipped because a file with the same name already exists.' @@ -312,18 +312,22 @@ class SessionPage(BasePage): # -------------------获取文件名------------------- file_name = '' - content_disposition = r.headers.get('content-disposition') + content_disposition = r.headers.get('content-disposition', '').replace(' ', '') # 使用header里的文件名 if content_disposition: - file_name = content_disposition.encode('ISO-8859-1').decode('utf-8') - file_name = re_SEARCH(r'filename *= *"?([^";]+)', file_name) + # TODO: 待测试 + txt = search(r'filename\*="?([^";]+)', content_disposition) + if txt: + charset, file_name = txt.group(1).split("''", 1) + file_name = unquote(content_disposition, charset) + else: + txt = search(r'filename="?([^";]+)', content_disposition) + if txt: + file_name = unquote(txt.group(1)) - if file_name: - file_name = file_name.group(1) - - if file_name[0] == file_name[-1] == "'": - file_name = file_name[1:-1] + if file_name and file_name[0] == file_name[-1] == "'": + file_name = file_name[1:-1] # 在url里获取文件名 if not file_name and os_PATH.basename(url): @@ -375,16 +379,16 @@ class SessionPage(BasePage): pass else: - raise ValueError("file_exists参数只能是'skip'、'overwrite'或'rename'。") + raise ValueError("file_exists参数只能是'skip'、'overwrite' 或 'rename'。") # -------------------打印要下载的文件------------------- if msg: print(file_url) print(full_name if file_name == full_name else f'{file_name} -> {full_name}') - print(f'Downloading to: {goal}') + print(f'正在下载到:{goal}') if skip: - print('Skipped.\n') + print('已跳过。\n') # -------------------开始下载------------------- if skip: @@ -518,14 +522,14 @@ class SessionPage(BasePage): # ----------------获取并设置编码开始----------------- # 在headers中获取编码 content_type = r.headers.get('content-type', '').lower() - charset = re_SEARCH(r'charset[=: ]*(.*)?[;]', content_type) + charset = search(r'charset[=: ]*(.*)?[;]', content_type) if charset: r.encoding = charset.group(1) # 在headers中获取不到编码,且如果是网页 elif content_type.replace(' ', '').startswith('text/html'): - re_result = re_SEARCH(b']+).*?>', r.content) + re_result = search(b']+).*?>', r.content) if re_result: charset = re_result.group(1).decode()