修改_make_response(),未完成

This commit is contained in:
g1879 2022-04-10 19:13:57 +08:00
parent a89c6941c1
commit b39470f922
4 changed files with 86 additions and 121 deletions

View File

@@ -350,11 +350,3 @@ class BasePage(BaseParser):
retry: int = None, retry: int = None,
interval: float = None): interval: float = None):
pass pass
@abstractmethod
def _try_to_connect(self,
to_url: str,
times: int = 0,
interval: float = 1,
show_errmsg: bool = False, ):
pass

View File

@@ -9,6 +9,7 @@ from typing import Union
from requests import Session from requests import Session
from requests.cookies import RequestsCookieJar from requests.cookies import RequestsCookieJar
from requests.structures import CaseInsensitiveDict
from selenium import webdriver from selenium import webdriver
from selenium.common.exceptions import SessionNotCreatedException, WebDriverException from selenium.common.exceptions import SessionNotCreatedException, WebDriverException
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
@@ -316,12 +317,13 @@ class Drission(object):
if self._session is None: if self._session is None:
self._session = Session() self._session = Session()
attrs = ['headers', 'auth', 'proxies', 'hooks', 'params', 'verify', if 'headers' in data:
'cert', 'stream', 'trust_env', 'max_redirects'] # , 'adapters' self._session.headers = CaseInsensitiveDict(data['headers'])
if 'cookies' in data: if 'cookies' in data:
self.set_cookies(data['cookies'], set_session=True) self.set_cookies(data['cookies'], set_session=True)
attrs = ['auth', 'proxies', 'hooks', 'params', 'verify',
'cert', 'stream', 'trust_env', 'max_redirects'] # , 'adapters'
for i in attrs: for i in attrs:
if i in data: if i in data:
self._session.__setattr__(i, data[i]) self._session.__setattr__(i, data[i])

View File

@@ -204,26 +204,26 @@ class MixPage(SessionPage, DriverPage, BasePage):
elif self._mode == 'd': elif self._mode == 'd':
return super(SessionPage, self).get_cookies(as_dict) return super(SessionPage, self).get_cookies(as_dict)
def _try_to_connect(self, # def _try_to_connect(self,
to_url: str, # to_url: str,
times: int = 0, # times: int = 0,
interval: float = 1, # interval: float = 1,
mode: str = 'get', # mode: str = 'get',
data: dict = None, # data: dict = None,
show_errmsg: bool = False, # show_errmsg: bool = False,
**kwargs): # **kwargs):
"""尝试连接,重试若干次 \n # """尝试连接,重试若干次 \n
:param to_url: 要访问的url # :param to_url: 要访问的url
:param times: 重试次数 # :param times: 重试次数
:param interval: 重试间隔 # :param interval: 重试间隔(秒)
:param show_errmsg: 是否抛出异常 # :param show_errmsg: 是否抛出异常
:param kwargs: 连接参数 # :param kwargs: 连接参数
:return: s模式为Response对象d模式为bool或None # :return: s模式为Response对象d模式为bool或None
""" # """
if self._mode == 'd': # if self._mode == 'd':
return super(SessionPage, self)._try_to_connect(to_url, times, interval, show_errmsg) # return super(SessionPage, self)._try_to_connect(to_url, times, interval, show_errmsg)
elif self._mode == 's': # elif self._mode == 's':
return super()._try_to_connect(to_url, times, interval, mode, data, show_errmsg, **kwargs) # return super()._try_to_connect(to_url, times, interval, mode, data, show_errmsg, **kwargs)
# ----------------MixPage独有属性和方法----------------------- # ----------------MixPage独有属性和方法-----------------------
@property @property
@@ -336,7 +336,7 @@ class MixPage(SessionPage, DriverPage, BasePage):
# 使用requests访问url并判断可用性 # 使用requests访问url并判断可用性
if by_requests: if by_requests:
self.cookies_to_session() self.cookies_to_session()
r = self._make_response(self.url, **{'timeout': 3})[0] r = self._make_response(self.url)[0]
return r.ok if r else False return r.ok if r else False
def close_driver(self) -> None: def close_driver(self) -> None:

View File

@@ -163,46 +163,46 @@ class SessionPage(BasePage):
else: else:
return [_cookie_to_dict(cookie) for cookie in cookies] return [_cookie_to_dict(cookie) for cookie in cookies]
def _try_to_connect(self, # def _try_to_connect(self,
to_url: str, # to_url: str,
times: int = 0, # times: int = 0,
interval: float = 1, # interval: float = 1,
mode: str = 'get', # mode: str = 'get',
data: Union[dict, str] = None, # data: Union[dict, str] = None,
show_errmsg: bool = False, # show_errmsg: bool = False,
**kwargs) -> Union[Response, None]: # **kwargs) -> Union[Response, None]:
"""尝试连接,重试若干次 \n # """尝试连接,重试若干次 \n
:param to_url: 要访问的url # :param to_url: 要访问的url
:param times: 重试次数 # :param times: 重试次数
:param interval: 重试间隔 # :param interval: 重试间隔(秒)
:param mode: 连接方式'get' 'post' # :param mode: 连接方式,'get' 或 'post'
:param data: post方式提交的数据 # :param data: post方式提交的数据
:param show_errmsg: 是否抛出异常 # :param show_errmsg: 是否抛出异常
:param kwargs: 连接参数 # :param kwargs: 连接参数
:return: HTMLResponse对象 # :return: HTMLResponse对象
""" # """
err = None # err = None
r = None # r = None
#
for _ in range(times + 1): # for _ in range(times + 1):
try: # try:
r = self._make_response(to_url, mode=mode, data=data, show_errmsg=True, **kwargs)[0] # r = self._make_response(to_url, mode=mode, data=data, show_errmsg=True, **kwargs)[0]
except Exception as e: # except Exception as e:
err = e # err = e
r = None # r = None
#
if r and (r.content != b'' or r.status_code in (403, 404)): # if r and (r.content != b'' or r.status_code in (403, 404)):
break # break
#
if _ < times: # if _ < times:
sleep(interval) # sleep(interval)
if show_errmsg: # if show_errmsg:
print(f'重试 {to_url}') # print(f'重试 {to_url}')
#
if not r and show_errmsg: # if not r and show_errmsg:
raise err if err is not None else ConnectionError(f'连接异常。{r.status_code if r is not None else ""}') # raise err if err is not None else ConnectionError(f'连接异常。{r.status_code if r is not None else ""}')
#
return r # return r
# ----------------session独有属性和方法----------------------- # ----------------session独有属性和方法-----------------------
@property @property
@@ -286,8 +286,9 @@ class SessionPage(BasePage):
kwargs['headers'] = CaseInsensitiveDict(kwargs['headers']) kwargs['headers'] = CaseInsensitiveDict(kwargs['headers'])
# 设置referer和host值 # 设置referer和host值
hostname = urlparse(url).hostname parsed_url = urlparse(url)
scheme = urlparse(url).scheme hostname = parsed_url.hostname
scheme = parsed_url.scheme
if not _check_headers(kwargs, self.session.headers, 'Referer'): if not _check_headers(kwargs, self.session.headers, 'Referer'):
kwargs['headers']['Referer'] = self.url if self.url else f'{scheme}://{hostname}' kwargs['headers']['Referer'] = self.url if self.url else f'{scheme}://{hostname}'
if 'Host' not in kwargs['headers']: if 'Host' not in kwargs['headers']:
@@ -300,71 +301,41 @@ class SessionPage(BasePage):
retry = retry if retry is not None else self.retry_times retry = retry if retry is not None else self.retry_times
interval = interval if interval is not None else self.retry_interval interval = interval if interval is not None else self.retry_interval
for i in range(retry + 1): for i in range(retry + 1):
e = None
try: try:
if mode == 'get': if mode == 'get':
r = self.session.get(url, **kwargs) r = self.session.get(url, **kwargs)
elif mode == 'post': elif mode == 'post':
r = self.session.post(url, data=data, **kwargs) r = self.session.post(url, data=data, **kwargs)
raise ConnectionError
print(r.url)
if r: if r:
print(r.request.headers) return _set_charset(r), 'Success'
e = 'Success'
r = _set_charset(r)
return r, e
except Exception as e: except Exception as e:
if show_errmsg: pass
raise e
# if r and (r.content != b'' or r.status_code in (403, 404)):
# break
if i < retry: if i < retry:
sleep(interval) sleep(interval)
if show_errmsg:
print(f'重试 {url}')
if r is None: if r is None:
return None, '连接失败' if show_errmsg:
if e:
raise e
else:
raise ConnectionError('连接失败')
return None, '连接失败' if e is None else e
if not r.ok: if not r.ok:
if show_errmsg:
raise ConnectionError(f'状态码:{r.status_code}')
return r, f'状态码:{r.status_code}' return r, f'状态码:{r.status_code}'
# try:
# r = None
# if mode == 'get':
# r = self.session.get(url, **kwargs)
# elif mode == 'post':
# r = self.session.post(url, data=data, **kwargs)
#
# if r is None:
# return None, '连接失败'
#
# except Exception as e:
# if show_errmsg:
# raise e
#
# return None, e
#
# else:
# # ----------------获取并设置编码开始-----------------
# # 在headers中获取编码
# content_type = r.headers.get('content-type', '').lower()
# charset = search(r'charset[=: ]*(.*)?[;]', content_type)
#
# if charset:
# r.encoding = charset.group(1)
#
# # 在headers中获取不到编码且如果是网页
# elif content_type.replace(' ', '').startswith('text/html'):
# re_result = search(b'<meta.*?charset=[ \\\'"]*([^"\\\' />]+).*?>', r.content)
#
# if re_result:
# charset = re_result.group(1).decode()
# else:
# charset = r.apparent_encoding
#
# r.encoding = charset
# # ----------------获取并设置编码结束-----------------
#
# return r, 'Success'
def _check_headers(kwargs, headers: Union[dict, CaseInsensitiveDict], arg: str) -> bool: def _check_headers(kwargs, headers: Union[dict, CaseInsensitiveDict], arg: str) -> bool:
"""检查kwargs或headers中是否有arg所示属性""" """检查kwargs或headers中是否有arg所示属性"""