弃用requests_html,未完成

This commit is contained in:
g1879 2020-11-05 17:48:02 +08:00
parent 41d700a3d9
commit c9eae68acb
2 changed files with 73 additions and 47 deletions

View File

@ -113,21 +113,45 @@ class SessionElement(DrissionElement):
""" """
return self.ele(f'xpath:..{"/.." * (num - 1)}') return self.ele(f'xpath:..{"/.." * (num - 1)}')
def nexts(self, num: int = 1, mode: str = 'ele'):
    """Return the num-th sibling that follows this element.

    :param num: 1-based position among the following siblings
    :param mode: 'ele' to match element siblings only, 'node' to match any
                 kind of node (text nodes included)
    :return: whatever ``self.ele`` yields for the sibling lookup; presumably a
             SessionElement, a text string in 'node' mode, or None when no
             such sibling exists -- confirm against ``ele``
    :raises ValueError: if mode is neither 'ele' nor 'node'
    """
    if mode == 'ele':
        node_txt = '*'
    elif mode == 'node':
        node_txt = 'node()'
    else:
        raise ValueError("Argument mode can only be 'node' or 'ele'.")

    e = self.ele(f'xpath:./following-sibling::{node_txt}[{num}]', show_errmsg=False)

    # Text nodes that hold only layout whitespace (line breaks plus the
    # indentation that usually follows them) are not meaningful siblings, so
    # step past them.  Only str results are inspected; the loop ends as soon as
    # the lookup returns a non-blank value or nothing at all.
    while isinstance(e, str) and e and not e.strip(' \t\r\n'):
        num += 1
        e = self.ele(f'xpath:./following-sibling::{node_txt}[{num}]', show_errmsg=False)

    return e
def prevs(self, num: int = 1, mode: str = 'ele'):
    """Return the num-th sibling that precedes this element.

    :param num: 1-based position among the preceding siblings, counted
                backwards from this element
    :param mode: 'ele' to match element siblings only, 'node' to match any
                 kind of node (text nodes included)
    :return: whatever ``self.ele`` yields for the sibling lookup; presumably a
             SessionElement, a text string in 'node' mode, or None when no
             such sibling exists -- confirm against ``ele``
    :raises ValueError: if mode is neither 'ele' nor 'node'
    """
    if mode == 'ele':
        node_txt = '*'
    elif mode == 'node':
        node_txt = 'node()'
    else:
        raise ValueError("Argument mode can only be 'node' or 'ele'.")

    e = self.ele(f'xpath:./preceding-sibling::{node_txt}[{num}]', show_errmsg=False)

    # Text nodes that hold only layout whitespace (line breaks plus the
    # indentation that usually follows them) are not meaningful siblings, so
    # step past them.  Only str results are inspected; the loop ends as soon as
    # the lookup returns a non-blank value or nothing at all.
    while isinstance(e, str) and e and not e.strip(' \t\r\n'):
        num += 1
        e = self.ele(f'xpath:./preceding-sibling::{node_txt}[{num}]', show_errmsg=False)

    return e
def ele(self, loc_or_str: Union[Tuple[str, str], str], mode: str = None, show_errmsg: bool = False): def ele(self, loc_or_str: Union[Tuple[str, str], str], mode: str = None, show_errmsg: bool = False):
"""返回当前元素下级符合条件的子元素,默认返回第一个 \n """返回当前元素下级符合条件的子元素,默认返回第一个 \n
@ -207,44 +231,45 @@ class SessionElement(DrissionElement):
""" """
return self.ele(loc_or_str, mode='all', show_errmsg=show_errmsg) return self.ele(loc_or_str, mode='all', show_errmsg=show_errmsg)
def attr(self, attr: str) -> Union[str, None]:
    """Return the value of one attribute of this element.

    ``href`` and ``src`` values are resolved against ``self.url`` so callers
    receive an absolute address; ``javascript:`` and ``mailto:`` links are
    returned untouched.  Every other attribute is returned exactly as stored
    on the underlying element.

    :param attr: attribute name
    :return: the attribute value as text, or None if the element has no such
             attribute
    """
    # Imported here so this method does not depend on the module's import list.
    from urllib.parse import urljoin

    try:
        value = self.inner_ele.get(attr)
        if value is None:
            return None

        if attr == 'href':
            # Script and mail links are not addresses relative to the page.
            if value.lower().startswith(('javascript:', 'mailto:')):
                return value
            # urljoin covers every relative form: '#fragment' replaces the
            # page's fragment, '?query' replaces its query string, and
            # path-relative or already absolute links resolve as usual.
            return urljoin(self.url, value)

        if attr == 'src':
            return urljoin(self.url, value)

        return value

    except (AttributeError, TypeError, ValueError):
        # Best effort, as before: a failed lookup or an unparsable URL reports
        # "no value" rather than propagating.
        return None
def execute_session_find(page_or_ele: _Element, def execute_session_find(page_or_ele: _Element,

View File

@ -204,7 +204,7 @@ class SessionPage(object):
self._url_available = True self._url_available = True
else: else:
if show_errmsg: if show_errmsg:
raise ConnectionError(f'Status code: {self._response.status_code}.') raise ConnectionError(f'{to_url}\nStatus code: {self._response.status_code}.')
self._url_available = False self._url_available = False
return self._url_available return self._url_available
@ -433,9 +433,10 @@ class SessionPage(object):
else: else:
charset = headers[content_type[0]].split('=')[1] charset = headers[content_type[0]].split('=')[1]
if charset:
r.encoding = charset
if not_stream: # 加载网页时修复编码 if not_stream: # 加载网页时修复编码
r._content = r.content.replace(b'\x08', b'\\b') # 避免存在退格符导致乱码或解析出错 r._content = r.content.replace(b'\x08', b'\\b') # 避免存在退格符导致乱码或解析出错
# r.html.encoding = r.encoding # 修复requests_html丢失编码方式的bug # r.html.encoding = r.encoding # 修复requests_html丢失编码方式的bug
if charset:
r.encoding = charset
return r, 'Success' return r, 'Success'