# GPT-SoVITS/detectors.py — last modified 2025-07-28
"""
detectors.py - Face Detection Algorithms and Validators
Contains different face detection methods and quality validation
"""
import cv2
import numpy as np
import logging
from typing import List, Tuple
logger = logging.getLogger(__name__)
class EnhancedFaceDetector:
    """Enhanced face detector using multiple detection methods.

    Strategy: try MediaPipe first (most accurate); when it is unavailable
    or produces only low-confidence hits, augment with OpenCV Haar
    cascades.  All raw detections are then validated and de-duplicated
    via IoU before being returned.
    """

    def __init__(self):
        # Backend availability flags: backend name -> bool (loaded OK)
        self.detectors = {}
        self.load_detectors()

    def load_detectors(self):
        """Load multiple face detection algorithms, recording which succeeded."""
        # 1. MediaPipe Face Detection (primary)
        try:
            import mediapipe as mp
            self.mp_face_detection = mp.solutions.face_detection
            self.mp_face_detector = self.mp_face_detection.FaceDetection(
                model_selection=1,
                min_detection_confidence=0.5  # Relaxed threshold
            )
            self.detectors['mediapipe'] = True
            logger.info("✅ MediaPipe Face Detector loaded")
        except ImportError:
            logger.warning("⚠️ MediaPipe not available - install with: pip install mediapipe")
            self.detectors['mediapipe'] = False
        # 2. Haar Cascade (fallback)
        try:
            self.face_cascade = cv2.CascadeClassifier(
                cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
            )
            # Bug fix: CascadeClassifier does NOT raise on a missing or
            # corrupt XML file - it silently returns an empty classifier.
            # Check explicitly so 'haar' is not reported as usable.
            if self.face_cascade.empty():
                raise IOError("Failed to load haarcascade_frontalface_default.xml")
            self.detectors['haar'] = True
            logger.info("✅ Haar cascade loaded as fallback")
        except Exception as e:
            logger.error(f"❌ Haar cascade loading error: {e}")
            self.detectors['haar'] = False

    def detect_faces_mediapipe(self, image):
        """Detect faces using MediaPipe (most accurate).

        Args:
            image: BGR image (numpy array, H x W x 3).

        Returns:
            List of (x, y, w, h, confidence) tuples in pixel coordinates;
            empty list when MediaPipe is unavailable or nothing was found.
        """
        if not self.detectors.get('mediapipe', False):
            return []
        try:
            # MediaPipe expects RGB input
            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            results = self.mp_face_detector.process(rgb_image)
            faces = []
            if results.detections:
                h, w = image.shape[:2]
                for detection in results.detections:
                    bbox = detection.location_data.relative_bounding_box
                    confidence = detection.score[0]
                    # Convert relative coordinates to absolute pixels
                    x = max(0, int(bbox.xmin * w))
                    y = max(0, int(bbox.ymin * h))
                    width = int(bbox.width * w)
                    height = int(bbox.height * h)
                    # Clamp the box so it stays within image bounds
                    x = min(x, w - 1)
                    y = min(y, h - 1)
                    width = min(width, w - x)
                    height = min(height, h - y)
                    if width > 30 and height > 30:  # Minimum usable size
                        faces.append((x, y, width, height, confidence))
            return faces
        except Exception as e:
            logger.error(f"MediaPipe detection error: {e}")
            return []

    def detect_faces_haar(self, image):
        """Detect faces using Haar cascades (fallback).

        Returns (x, y, w, h, confidence) tuples; since Haar provides no
        score, confidence is estimated from face area, clamped to [0.5, 0.9].
        """
        if not self.detectors.get('haar', False):
            return []
        try:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            faces = self.face_cascade.detectMultiScale(
                gray,
                scaleFactor=1.1,
                minNeighbors=3,  # Relaxed from 5
                minSize=(30, 30),  # Relaxed from (80, 80)
                maxSize=(500, 500)
            )
            # Attach a size-based confidence estimate to each detection
            return [
                (x, y, w, h, min(0.9, max(0.5, (w * h) / (100 * 100))))
                for (x, y, w, h) in faces
            ]
        except Exception as e:
            logger.error(f"Haar detection error: {e}")
            return []

    def detect_faces_dnn(self, image):
        """Detect faces using OpenCV DNN (optional third method).

        Not implemented: requires pre-trained DNN model files that are
        not shipped with the project.  Always returns an empty list.
        """
        return []

    def detect_faces(self, image):
        """Main face detection using the best available method.

        Returns:
            Filtered, de-duplicated list of (x, y, w, h, confidence,
            method) tuples, ordered by confidence (highest first).
        """
        all_faces = []
        # Try MediaPipe first (most accurate)
        for x, y, w, h, conf in self.detect_faces_mediapipe(image):
            all_faces.append((x, y, w, h, conf, 'mediapipe'))
        # If MediaPipe found nothing, or only low-confidence hits, add Haar
        if not all_faces or max(f[4] for f in all_faces) < 0.7:
            for x, y, w, h, conf in self.detect_faces_haar(image):
                all_faces.append((x, y, w, h, conf, 'haar'))
        # Filter and remove duplicates
        return self.filter_and_deduplicate_faces(all_faces)

    def filter_and_deduplicate_faces(self, faces):
        """Remove duplicate faces and apply quality filters.

        Bug fix: operates on a sorted copy so the caller's list is no
        longer mutated in place.

        Returns:
            New list of surviving detections, highest confidence first.
        """
        if not faces:
            return []
        filtered = []
        # Iterate best-first so duplicates resolve in favor of confidence
        for face in sorted(faces, key=lambda f: f[4], reverse=True):
            x, y, w, h, conf, method = face
            if not self.is_valid_face_detection(x, y, w, h, conf):
                continue
            # Drop faces overlapping an already-accepted (better) face
            if any(self.faces_overlap(face, kept) for kept in filtered):
                continue
            filtered.append(face)
        return filtered

    def is_valid_face_detection(self, x, y, w, h, confidence):
        """Validate a single detection: confidence, size, shape, position."""
        # Minimum confidence threshold (relaxed)
        if confidence < 0.3:
            return False
        # Minimum size check (relaxed)
        if w < 30 or h < 30:
            return False
        # Aspect ratio check (permissive); h >= 30 here, so no div-by-zero
        aspect_ratio = w / h
        if aspect_ratio < 0.4 or aspect_ratio > 2.5:
            return False
        # Coordinates should be positive
        if x < 0 or y < 0:
            return False
        return True

    def faces_overlap(self, face1, face2, threshold=0.5):
        """Return True if the IoU of two detections exceeds `threshold`."""
        x1, y1, w1, h1 = face1[:4]
        x2, y2, w2, h2 = face2[:4]
        # Intersection rectangle
        xi1 = max(x1, x2)
        yi1 = max(y1, y2)
        xi2 = min(x1 + w1, x2 + w2)
        yi2 = min(y1 + h1, y2 + h2)
        if xi2 <= xi1 or yi2 <= yi1:
            return False  # No overlap at all
        intersection = (xi2 - xi1) * (yi2 - yi1)
        # Union area = sum of areas minus the shared part
        union = w1 * h1 + w2 * h2 - intersection
        iou = intersection / union if union > 0 else 0
        return iou > threshold
class FaceQualityValidator:
    """Validate face crops before running analysis on them.

    All checks are deliberately permissive and fail open: if a check
    cannot be computed, it is treated as passed so analysis proceeds.
    Bug fix: bare ``except:`` clauses (which also swallowed
    KeyboardInterrupt/SystemExit) were narrowed to ``except Exception:``.
    """

    @staticmethod
    def is_face_clear(face_img, blur_threshold=30):
        """Check sharpness via variance of the Laplacian (higher = sharper)."""
        try:
            gray = cv2.cvtColor(face_img, cv2.COLOR_BGR2GRAY)
            laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
            return laplacian_var > blur_threshold
        except Exception:
            return True  # Fail open: if check fails, assume it's OK

    @staticmethod
    def has_sufficient_size(face_img, min_size=30):
        """Check if face has sufficient resolution (shortest side >= min_size)."""
        h, w = face_img.shape[:2]
        return min(h, w) >= min_size

    @staticmethod
    def is_properly_aligned(face_img):
        """Basic alignment check via a permissive width/height aspect ratio."""
        try:
            h, w = face_img.shape[:2]
            aspect_ratio = w / h
            return 0.3 <= aspect_ratio <= 3.0
        except Exception:
            return True  # Fail open: if check fails, assume it's OK

    @staticmethod
    def has_good_contrast(face_img, min_std=20):
        """Check contrast via the standard deviation of grayscale intensities."""
        try:
            gray = cv2.cvtColor(face_img, cv2.COLOR_BGR2GRAY)
            std_dev = np.std(gray)
            return std_dev > min_std
        except Exception:
            return True  # Fail open: if check fails, assume it's OK

    @staticmethod
    def is_well_lit(face_img, min_brightness=30, max_brightness=220):
        """Check mean brightness is neither too dark nor overexposed."""
        try:
            gray = cv2.cvtColor(face_img, cv2.COLOR_BGR2GRAY)
            mean_brightness = np.mean(gray)
            return min_brightness < mean_brightness < max_brightness
        except Exception:
            return True  # Fail open: if check fails, assume it's OK

    @staticmethod
    def validate_face(face_img):
        """Complete face validation with relaxed criteria.

        Returns:
            (is_valid, reason) tuple; reason names the first failed
            check, or is "Face valid" on success.
        """
        if face_img is None or face_img.size == 0:
            return False, "Empty face image"
        if not FaceQualityValidator.has_sufficient_size(face_img):
            return False, "Face too small"
        # Stricter checks (is_face_clear, has_good_contrast, is_well_lit)
        # are intentionally skipped - they rejected too many usable faces.
        if not FaceQualityValidator.is_properly_aligned(face_img):
            return False, "Face poorly aligned"
        return True, "Face valid"

    @staticmethod
    def get_face_quality_score(face_img):
        """Get overall quality score for face (0-100).

        Sums four sub-scores of up to 25 points each: size, clarity,
        alignment, and contrast.  Sub-scores that cannot be computed
        contribute a default moderate 15 points.
        """
        if face_img is None or face_img.size == 0:
            return 0
        score = 0
        # Size score (0-25 points)
        h, w = face_img.shape[:2]
        min_dim = min(h, w)
        if min_dim >= 100:
            score += 25
        elif min_dim >= 60:
            score += 20
        elif min_dim >= 30:
            score += 15
        else:
            score += 5
        # Clarity score (0-25 points) via Laplacian variance
        try:
            gray = cv2.cvtColor(face_img, cv2.COLOR_BGR2GRAY)
            laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
            if laplacian_var > 100:
                score += 25
            elif laplacian_var > 50:
                score += 20
            elif laplacian_var > 20:
                score += 15
            else:
                score += 10
        except Exception:
            score += 15  # Default moderate score
        # Alignment score (0-25 points) via aspect ratio
        try:
            aspect_ratio = w / h
            if 0.8 <= aspect_ratio <= 1.2:
                score += 25  # Perfect square-ish
            elif 0.6 <= aspect_ratio <= 1.6:
                score += 20  # Good
            elif 0.4 <= aspect_ratio <= 2.0:
                score += 15  # Acceptable
            else:
                score += 5  # Poor
        except Exception:
            score += 15  # Default moderate score
        # Contrast score (0-25 points) via grayscale std-dev
        try:
            gray = cv2.cvtColor(face_img, cv2.COLOR_BGR2GRAY)
            std_dev = np.std(gray)
            if std_dev > 60:
                score += 25
            elif std_dev > 40:
                score += 20
            elif std_dev > 20:
                score += 15
            else:
                score += 5
        except Exception:
            score += 15  # Default moderate score
        return min(100, score)

    @staticmethod
    def enhance_face_for_analysis(face_img):
        """Apply basic enhancement (CLAHE + light denoise) before analysis.

        Returns the enhanced image, or the original unchanged if
        enhancement fails.
        """
        try:
            # Convert to LAB so brightness (L) can be equalized separately
            lab = cv2.cvtColor(face_img, cv2.COLOR_BGR2LAB)
            l, a, b = cv2.split(lab)
            # Apply CLAHE to the L (brightness) channel
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(4, 4))
            l = clahe.apply(l)
            # Merge back and return to BGR
            enhanced = cv2.merge([l, a, b])
            enhanced = cv2.cvtColor(enhanced, cv2.COLOR_LAB2BGR)
            # Slight Gaussian blur to reduce noise
            enhanced = cv2.GaussianBlur(enhanced, (3, 3), 0)
            return enhanced
        except Exception as e:
            logger.error(f"Face enhancement error: {e}")
            return face_img
class FaceTracker:
    """Track faces across frames for better stability.

    Performs greedy nearest-center matching between existing tracks and
    incoming detections; tracks that go unmatched for more than
    ``max_age`` consecutive updates are discarded.
    """

    def __init__(self, max_distance=50, max_age=30):
        # track_id -> {'center', 'bbox', 'age', 'confidence', 'created_frame'}
        self.tracks = {}
        # Next track id to assign (monotonically increasing)
        self.next_id = 1
        # Max center-to-center distance (pixels) for a detection to match a track
        self.max_distance = max_distance
        # A track survives this many consecutive unmatched updates before removal
        self.max_age = max_age

    def update(self, detections):
        """Update tracks with new detections.

        Args:
            detections: iterable of tuples whose first four items are
                (x, y, w, h); item index 4, if present, is a confidence
                score (defaults to 0.5 otherwise).

        Returns:
            Dict mapping track_id -> detection for every detection
            accounted for this frame (both matched and newly created
            tracks).
        """
        # Greedily match each existing track to its nearest unmatched detection
        matched_tracks = {}
        unmatched_detections = list(detections)
        for track_id, track in self.tracks.items():
            if track['age'] > self.max_age:
                continue  # Skip old tracks
            best_match = None
            best_distance = float('inf')
            for i, detection in enumerate(unmatched_detections):
                x, y, w, h = detection[:4]
                center_x, center_y = x + w//2, y + h//2
                track_x, track_y = track['center']
                # Euclidean distance between detection center and track center
                distance = np.sqrt((center_x - track_x)**2 + (center_y - track_y)**2)
                if distance < self.max_distance and distance < best_distance:
                    best_distance = distance
                    best_match = i
            if best_match is not None:
                detection = unmatched_detections[best_match]
                x, y, w, h = detection[:4]
                # Refresh the matched track in place; age resets to 0
                self.tracks[track_id].update({
                    'center': (x + w//2, y + h//2),
                    'bbox': (x, y, w, h),
                    'age': 0,
                    'confidence': detection[4] if len(detection) > 4 else 0.5
                })
                matched_tracks[track_id] = detection
                unmatched_detections.pop(best_match)
        # Age every track that was not matched this frame; drop expired ones
        for track_id in list(self.tracks.keys()):
            if track_id not in matched_tracks:
                self.tracks[track_id]['age'] += 1
                if self.tracks[track_id]['age'] > self.max_age:
                    del self.tracks[track_id]
        # Create new tracks for the detections no existing track claimed
        for detection in unmatched_detections:
            x, y, w, h = detection[:4]
            self.tracks[self.next_id] = {
                'center': (x + w//2, y + h//2),
                'bbox': (x, y, w, h),
                'age': 0,
                'confidence': detection[4] if len(detection) > 4 else 0.5,
                # NOTE(review): this stores the track id, not an actual
                # frame number - confirm whether a frame index was intended
                'created_frame': self.next_id
            }
            matched_tracks[self.next_id] = detection
            self.next_id += 1
        return matched_tracks

    def get_stable_faces(self, min_track_length=3):
        """Get faces that have been tracked for a minimum number of frames.

        NOTE(review): the filter compares ``age`` (updates since last
        match), not a total track length as the name implies - any
        freshly matched track (age 0) passes regardless of how long it
        has existed. Confirm the intended semantics.
        """
        stable_tracks = {}
        for track_id, track in self.tracks.items():
            if track['age'] < min_track_length:
                stable_tracks[track_id] = track
        return stable_tracks
class MultiScaleDetector:
    """Run a base detector at several image scales and merge the results."""

    def __init__(self, base_detector):
        # Underlying detector providing detect_faces() and
        # filter_and_deduplicate_faces()
        self.base_detector = base_detector
        # Scale factors applied to the input image (1.0 = original size)
        self.scales = [1.0, 0.8, 1.2]

    def detect_faces(self, image):
        """Detect faces at every configured scale.

        Each detection's coordinates are mapped back to the original
        image, the detection method is tagged with its scale, and the
        union is de-duplicated by the base detector.
        """
        orig_h, orig_w = image.shape[:2]
        merged = []
        for factor in self.scales:
            # Resize only when the factor actually changes the image
            if factor != 1.0:
                frame = cv2.resize(image, (int(orig_w * factor), int(orig_h * factor)))
            else:
                frame = image
            for x, y, bw, bh, conf, method in self.base_detector.detect_faces(frame):
                if factor != 1.0:
                    # Map the box back into original-image coordinates
                    x, y = int(x / factor), int(y / factor)
                    bw, bh = int(bw / factor), int(bh / factor)
                merged.append((x, y, bw, bh, conf, f"{method}_scale_{factor}"))
        # Remove duplicates and return best detections
        return self.base_detector.filter_and_deduplicate_faces(merged)
# Factory helper that assembles the best available detection pipeline
def create_face_detector(use_tracking=False, use_multiscale=False):
    """
    Factory function to create the best available face detector

    Args:
        use_tracking: Enable face tracking across frames
        use_multiscale: Enable multi-scale detection

    Returns:
        Configured face detector
    """
    core = EnhancedFaceDetector()
    detector = MultiScaleDetector(core) if use_multiscale else core
    if use_tracking:
        # Tracking itself must be wired into the caller's per-frame loop;
        # here we only announce that it was requested.
        logger.info("Face tracking enabled")
    return detector