435 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding:utf-8 -*-
"""
@Author : g1879
@Contact : g1879@qq.com
@File : common.py
"""
from html import unescape
from pathlib import Path
from re import split, search, sub
from shutil import rmtree
from typing import Union
from zipfile import ZipFile
def get_ele_txt(e) -> str:
"""获取元素内所有文本
:param e: 元素对象
:return: 元素内所有文本
"""
# 前面无须换行的元素
nowrap_list = ('br', 'sub', 'sup', 'em', 'strong', 'a', 'font', 'b', 'span', 's', 'i', 'del', 'ins', 'img', 'td',
'th', 'abbr', 'bdi', 'bdo', 'cite', 'code', 'data', 'dfn', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby',
'samp', 'small', 'time', 'u', 'var', 'wbr', 'button', 'slot', 'content')
# 后面添加换行的元素
wrap_after_list = ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'li', 'blockquote', 'header',
'footer', 'address' 'article', 'aside', 'main', 'nav', 'section', 'figcaption', 'summary')
# 不获取文本的元素
noText_list = ('script', 'style', 'video', 'audio', 'iframe', 'embed', 'noscript', 'canvas', 'template')
# 用/t分隔的元素
tab_list = ('td', 'th')
if e.tag in noText_list:
return e.raw_text
def get_node_txt(ele, pre: bool = False):
tag = ele.tag
if tag == 'br':
return [True]
if not pre and tag == 'pre':
pre = True
str_list = []
if tag in noText_list and not pre: # 标签内的文本不返回
return str_list
nodes = ele.eles('xpath:./text() | *')
prev_ele = ''
for el in nodes:
if isinstance(el, str): # 字符节点
if pre:
str_list.append(el)
else:
if sub('[ \n\t\r]', '', el) != '': # 字符除了回车和空格还有其它内容
txt = el
if not pre:
txt = txt.replace('\n', ' ').strip(' ')
txt = sub(r' {2,}', ' ', txt)
str_list.append(txt)
else: # 元素节点
if el.tag not in nowrap_list and str_list and str_list[-1] != '\n': # 元素间换行的情况
str_list.append('\n')
if el.tag in tab_list and prev_ele in tab_list: # 表格的行
str_list.append('\t')
str_list.extend(get_node_txt(el, pre))
prev_ele = el.tag
if tag in wrap_after_list and str_list and str_list[-1] not in ('\n', True): # 有些元素后面要添加回车
str_list.append('\n')
return str_list
re_str = get_node_txt(e)
if re_str and re_str[-1] == '\n':
re_str.pop()
re_str = ''.join([i if i is not True else '\n' for i in re_str])
return format_html(re_str)
def get_loc(loc: Union[tuple, str], translate_css: bool = False) -> tuple:
"""接收selenium定位元组或本库定位语法转换为标准定位元组可翻译css selector为xpath \n
:param loc: selenium定位元组或本库定位语法
:param translate_css: 是否翻译css selector为xpath
:return: DrissionPage定位元组
"""
if isinstance(loc, tuple):
loc = translate_loc(loc)
elif isinstance(loc, str):
loc = str_to_loc(loc)
else:
raise TypeError('loc参数只能是tuple或str。')
if loc[0] == 'css selector' and translate_css:
from lxml.cssselect import CSSSelector, ExpressionError
try:
path = str(CSSSelector(loc[1], translator='html').path)
path = path[20:] if path.startswith('descendant-or-self::') else path
loc = 'xpath', path
except ExpressionError:
pass
return loc
def str_to_loc(loc: str) -> tuple:
"""处理元素查找语句 \n
查找方式属性、tag name及属性、文本、xpath、css selector、id、class \n
@表示属性,.表示class#表示id=表示精确匹配,:表示模糊匹配,无控制字符串时默认搜索该字符串 \n
"""
loc_by = 'xpath'
if loc.startswith('.'):
if loc.startswith(('.=', '.:',)):
loc = loc.replace('.', '@class', 1)
else:
loc = loc.replace('.', '@class=', 1)
elif loc.startswith('#'):
if loc.startswith(('#=', '#:',)):
loc = loc.replace('#', '@id', 1)
else:
loc = loc.replace('#', '@id=', 1)
elif loc.startswith(('t:', 't=')):
loc = f'tag:{loc[2:]}'
elif loc.startswith(('tx:', 'tx=')):
loc = f'text{loc[2:]}'
# ------------------------------------------------------------------
# 多属性查找
if loc.startswith('@@') and loc != '@@':
loc_str = _make_multi_xpath_str('*', loc)
# 单属性查找
elif loc.startswith('@') and loc != '@':
loc_str = _make_single_xpath_str('*', loc)
# 根据tag name查找
elif loc.startswith(('tag:', 'tag=')) and loc not in ('tag:', 'tag='):
at_ind = loc.find('@')
if at_ind == -1:
loc_str = f'//*[name()="{loc[4:]}"]'
else:
if loc[at_ind:].startswith('@@'):
loc_str = _make_multi_xpath_str(loc[4:at_ind], loc[at_ind:])
else:
loc_str = _make_single_xpath_str(loc[4:at_ind], loc[at_ind:])
# 根据文本查找
elif loc.startswith('text='):
loc_str = f'//*[.={_make_search_str(loc[5:])}]'
elif loc.startswith('text:') and loc != 'text:':
loc_str = f'//*/text()[contains(., {_make_search_str(loc[5:])})]/..'
# 用xpath查找
elif loc.startswith(('xpath:', 'xpath=')) and loc not in ('xpath:', 'xpath='):
loc_str = loc[6:]
elif loc.startswith(('x:', 'x=')) and loc not in ('x:', 'x='):
loc_str = loc[2:]
# 用css selector查找
elif loc.startswith(('css:', 'css=')) and loc not in ('css:', 'css='):
loc_by = 'css selector'
loc_str = loc[4:]
elif loc.startswith(('c:', 'c=')) and loc not in ('c:', 'c='):
loc_by = 'css selector'
loc_str = loc[2:]
# 根据文本模糊查找
elif loc:
loc_str = f'//*/text()[contains(., {_make_search_str(loc)})]/..'
else:
loc_str = '//*'
return loc_by, loc_str
def _make_single_xpath_str(tag: str, text: str) -> str:
"""生成xpath语句 \n
:param tag: 标签名
:param text: 待处理的字符串
:return: xpath字符串
"""
arg_list = [] if tag == '*' else [f'name()="{tag}"']
arg_str = txt_str = ''
if text == '@':
arg_str = 'not(@*)'
else:
r = split(r'([:=])', text, maxsplit=1)
len_r = len(r)
len_r0 = len(r[0])
if len_r != 3 and len_r0 > 1:
arg_str = 'normalize-space(text())' if r[0] in ('@text()', '@tx()') else f'{r[0]}'
elif len_r == 3 and len_r0 > 1:
if r[1] == '=': # 精确查找
arg = '.' if r[0] in ('@text()', '@tx()') else r[0]
arg_str = f'{arg}={_make_search_str(r[2])}'
else: # 模糊查找
if r[0] in ('@text()', '@tx()'):
txt_str = f'/text()[contains(., {_make_search_str(r[2])})]/..'
arg_str = ''
else:
arg_str = f"contains({r[0]},{_make_search_str(r[2])})"
if arg_str:
arg_list.append(arg_str)
arg_str = ' and '.join(arg_list)
return f'//*[{arg_str}]{txt_str}' if arg_str else f'//*{txt_str}'
def _make_multi_xpath_str(tag: str, text: str) -> str:
"""生成多属性查找的xpath语句 \n
:param tag: 标签名
:param text: 待处理的字符串
:return: xpath字符串
"""
arg_list = [] if tag == '*' else [f'name()="{tag}"']
args = text.split('@@')
for arg in args[1:]:
r = split(r'([:=])', arg, maxsplit=1)
arg_str = ''
len_r = len(r)
if not r[0]: # 不查询任何属性
arg_str = 'not(@*)'
else:
r[0], ignore = (r[0][1:], True) if r[0][0] == '-' else (r[0], None) # 是否去除某个属性
if len_r != 3: # 只有属性名没有属性内容,查询是否存在该属性
arg_str = 'normalize-space(text())' if r[0] in ('text()', 'tx()') else f'@{r[0]}'
elif len_r == 3: # 属性名和内容都有
arg = '.' if r[0] in ('text()', 'tx()') else f'@{r[0]}'
if r[1] == '=':
arg_str = f'{arg}={_make_search_str(r[2])}'
else:
arg_str = f'contains({arg},{_make_search_str(r[2])})'
if arg_str and ignore:
arg_str = f'not({arg_str})'
if arg_str:
arg_list.append(arg_str)
arg_str = ' and '.join(arg_list)
return f'//*[{arg_str}]' if arg_str else f'//*'
def _make_search_str(search_str: str) -> str:
""""转义,不知何故不能直接用 \ 来转义 \n
:param search_str: 查询字符串
:return: 把"转义后的字符串
"""
parts = search_str.split('"')
parts_num = len(parts)
search_str = 'concat('
for key, i in enumerate(parts):
search_str += f'"{i}"'
search_str += ',' + '\'"\',' if key < parts_num - 1 else ''
search_str += ',"")'
return search_str
def translate_loc(loc: tuple) -> tuple:
"""把By类型的loc元组转换为css selector或xpath类型的 \n
:param loc: By类型的loc元组
:return: css selector或xpath类型的loc元组
"""
if len(loc) != 2:
raise ValueError('定位符长度必须为2。')
loc_by = 'xpath'
if loc[0] == 'xpath':
loc_str = loc[1]
elif loc[0] == 'css selector':
loc_by = 'css selector'
loc_str = loc[1]
elif loc[0] == 'id':
loc_str = f'//*[@id="{loc[1]}"]'
elif loc[0] == 'class name':
loc_str = f'//*[@class="{loc[1]}"]'
elif loc[0] == 'link text':
loc_str = f'//a[text()="{loc[1]}"]'
elif loc[0] == 'name':
loc_str = f'//*[@name="{loc[1]}"]'
elif loc[0] == 'tag name':
loc_str = f'//{loc[1]}'
elif loc[0] == 'partial link text':
loc_str = f'//a[contains(text(),"{loc[1]}")]'
else:
raise ValueError('无法识别的定位符。')
return loc_by, loc_str
def format_html(text: str) -> str:
"""处理html编码字符 \n
:param text: html文本
:return: 格式化后的html文本
"""
return unescape(text).replace('\xa0', ' ') if text else text
def clean_folder(folder_path: str, ignore: list = None) -> None:
"""清空一个文件夹除了ignore里的文件和文件夹 \n
:param folder_path: 要清空的文件夹路径
:param ignore: 忽略列表
:return: None
"""
ignore = [] if not ignore else ignore
p = Path(folder_path)
for f in p.iterdir():
if f.name not in ignore:
if f.is_file():
f.unlink()
elif f.is_dir():
rmtree(f, True)
def unzip(zip_path: str, to_path: str) -> Union[list, None]:
"""解压下载的chromedriver.zip文件"""
if not zip_path:
return
with ZipFile(zip_path, 'r') as f:
return [f.extract(f.namelist()[0], path=to_path)]
def get_exe_path_from_port(port: Union[str, int]) -> Union[str, None]:
"""获取端口号第一条进程的可执行文件路径 \n
:param port: 端口号
:return: 可执行文件的绝对路径
"""
from os import popen
from time import perf_counter
process = popen(f'netstat -ano |findstr {port}').read().split('\n')[0]
t = perf_counter()
while not process and perf_counter() - t < 10:
process = popen(f'netstat -ano |findstr {port}').read().split('\n')[0]
processid = process.split(' ')[-1]
if not processid:
return
else:
file_lst = popen(f'wmic process where processid={processid} get executablepath').read().split('\n')
return file_lst[2].strip() if len(file_lst) > 2 else None
def get_usable_path(path: Union[str, Path]) -> Path:
"""检查文件或文件夹是否有重名,并返回可以使用的路径 \n
:param path: 文件或文件夹路径
:return: 可用的路径Path对象
"""
path = Path(path)
parent = path.parent
path = parent / make_valid_name(path.name)
name = path.stem if path.is_file() else path.name
ext = path.suffix if path.is_file() else ''
first_time = True
while path.exists():
r = search(r'(.*)_(\d+)$', name)
if not r or (r and first_time):
src_name, num = name, '1'
else:
src_name, num = r.group(1), int(r.group(2)) + 1
name = f'{src_name}_{num}'
path = parent / f'{name}{ext}'
first_time = None
return path
def make_valid_name(full_name: str) -> str:
"""获取有效的文件名 \n
:param full_name: 文件名
:return: 可用的文件名
"""
# ----------------去除前后空格----------------
full_name = full_name.strip()
# ----------------使总长度不大于255个字符一个汉字是2个字符----------------
r = search(r'(.*)(\.[^.]+$)', full_name) # 拆分文件名和后缀名
if r:
name, ext = r.group(1), r.group(2)
ext_long = len(ext)
else:
name, ext = full_name, ''
ext_long = 0
while get_long(name) > 255 - ext_long:
name = name[:-1]
full_name = f'{name}{ext}'
# ----------------去除不允许存在的字符----------------
return sub(r'[<>/\\|:*?\n]', '', full_name)
def get_long(txt) -> int:
"""返回字符串中字符个数一个汉字是2个字符 \n
:param txt: 字符串
:return: 字符个数
"""
txt_len = len(txt)
return int((len(txt.encode('utf-8')) - txt_len) / 2 + txt_len)