Feature Overview
This smart log analysis system provides the following capabilities:
- Multi-format log parsing
- Real-time log monitoring
- Anomaly pattern recognition
- Automatic alert notifications
- Log clustering analysis
- Trend prediction
- Visualization dashboard
- Context correlation
- Root cause analysis
- Custom rule engine
Implementation
import re
import logging
import smtplib
import threading
import time
from collections import defaultdict, Counter
from dataclasses import dataclass, field
from datetime import datetime
from email.mime.text import MIMEText
from enum import Enum, auto
from typing import Callable, Dict, List, Optional

import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('LogAnalyzer')
class LogLevel(Enum):
    DEBUG = auto()
    INFO = auto()
    WARNING = auto()
    ERROR = auto()
    CRITICAL = auto()

class AlertType(Enum):
    EMAIL = auto()
    SLACK = auto()
    WEBHOOK = auto()
@dataclass
class LogEntry:
    timestamp: datetime
    level: LogLevel
    message: str
    source: str
    metadata: Dict = field(default_factory=dict)

@dataclass
class LogPattern:
    regex: str
    name: str
    level: LogLevel
    description: str

@dataclass
class AlertRule:
    name: str
    condition: Callable[[List[LogEntry]], bool]
    action: Callable[[str], None]
    cooldown: int = 300  # seconds between repeated alerts
class SmartLogAnalyzer:
    """Smart log analysis and anomaly detection system."""

    def __init__(self,
                 alert_rules: Optional[List[AlertRule]] = None,
                 patterns: Optional[List[LogPattern]] = None,
                 max_history: int = 10000):
        self.log_history: List[LogEntry] = []
        self.alert_rules = alert_rules or []
        self.patterns = patterns or self._get_default_patterns()
        self.max_history = max_history
        self.stats = {
            'total_logs': 0,
            'level_counts': defaultdict(int),
            'source_counts': defaultdict(int),
            'pattern_matches': defaultdict(int)
        }
        self._lock = threading.Lock()
        self._last_alert_time = {}
        self._vectorizer = TfidfVectorizer(max_features=1000)
        self._clustering_model = DBSCAN(eps=0.5, min_samples=5)
        self._stop_event = threading.Event()
        self._monitor_thread = None
    def start_monitoring(self, interval: float = 5.0):
        """Start the background monitoring thread."""
        self._monitor_thread = threading.Thread(
            target=self._monitor_loop,
            args=(interval,),
            daemon=True
        )
        self._monitor_thread.start()

    def stop_monitoring(self):
        """Stop the monitoring thread."""
        self._stop_event.set()
        if self._monitor_thread:
            self._monitor_thread.join()

    def _monitor_loop(self, interval: float):
        """Monitoring loop: periodically evaluate the alert rules."""
        while not self._stop_event.is_set():
            self._check_alerts()
            time.sleep(interval)
    def add_log(self, log: LogEntry):
        """Add a log entry, update statistics, and run pattern matching."""
        alert_message = None
        with self._lock:
            # Keep the history at a fixed maximum size.
            if len(self.log_history) >= self.max_history:
                self.log_history.pop(0)
            self.log_history.append(log)
            # Update running statistics.
            self.stats['total_logs'] += 1
            self.stats['level_counts'][log.level.name] += 1
            self.stats['source_counts'][log.source] += 1
            # Pattern matching, case-insensitive so "Failed" matches "fail".
            for pattern in self.patterns:
                if re.search(pattern.regex, log.message, re.IGNORECASE):
                    self.stats['pattern_matches'][pattern.name] += 1
                    if pattern.level.value >= LogLevel.ERROR.value:
                        alert_message = (
                            f"Pattern matched: {pattern.name}\n"
                            f"Level: {pattern.level.name}\n"
                            f"Message: {log.message}"
                        )
                    break
        # Fire the alert outside the lock so slow actions (e.g. SMTP) don't block logging.
        if alert_message:
            self._trigger_immediate_alert(alert_message)
    def _trigger_immediate_alert(self, message: str):
        """Send an immediate alert through every configured rule action."""
        for rule in self.alert_rules:
            try:
                rule.action(message)
            except Exception as e:
                logger.error(f"Failed to send alert: {e}")

    def _check_alerts(self):
        """Evaluate alert rules against the most recent logs."""
        with self._lock:
            logs_to_check = self.log_history[-1000:]  # the 1000 most recent entries
        current_time = time.time()
        for rule in self.alert_rules:
            last_alert = self._last_alert_time.get(rule.name, 0)
            if current_time - last_alert < rule.cooldown:
                continue
            if logs_to_check and rule.condition(logs_to_check):
                self._last_alert_time[rule.name] = current_time
                alert_msg = (
                    f"Alert triggered: {rule.name}\n"
                    f"Logs analyzed: {len(logs_to_check)}\n"
                    f"Timestamp: {datetime.now()}"
                )
                try:
                    rule.action(alert_msg)
                except Exception as e:
                    logger.error(f"Failed to send alert: {e}")
    def analyze_trends(self, window: int = 60) -> Dict:
        """Analyze log trends over the most recent `window` entries."""
        with self._lock:
            if len(self.log_history) < window:
                return {}
            recent_logs = self.log_history[-window:]
            error_rate = sum(
                1 for log in recent_logs
                if log.level.value >= LogLevel.ERROR.value
            ) / window
            return {
                'error_rate': error_rate,
                'top_sources': Counter(log.source for log in recent_logs).most_common(3),
                'top_messages': Counter(
                    log.message[:50] for log in recent_logs
                    if log.level.value >= LogLevel.WARNING.value
                ).most_common(3)
            }
    def cluster_logs(self, n_samples: int = 1000) -> Dict:
        """Cluster similar log messages with TF-IDF + DBSCAN."""
        with self._lock:
            if not self.log_history:
                return {}
            sample_logs = self.log_history[-n_samples:]
            messages = [log.message for log in sample_logs]
        # Run the (potentially slow) vectorization and clustering outside the lock.
        try:
            X = self._vectorizer.fit_transform(messages)
            clusters = self._clustering_model.fit_predict(X)
            cluster_info = defaultdict(list)
            for log, cluster_id in zip(sample_logs, clusters):
                if cluster_id != -1:  # DBSCAN labels noise points as -1
                    cluster_info[cluster_id].append(log)
            return {
                'total_clusters': len(cluster_info),
                'cluster_samples': {
                    cid: logs[0].message[:100] + "..."
                    for cid, logs in cluster_info.items()
                }
            }
        except Exception as e:
            logger.error(f"Clustering failed: {e}")
            return {}
    def get_stats(self) -> Dict:
        """Return a snapshot of the collected statistics."""
        with self._lock:
            return {
                'total_logs': self.stats['total_logs'],
                'level_distribution': dict(self.stats['level_counts']),
                'source_distribution': dict(self.stats['source_counts']),
                'pattern_matches': dict(self.stats['pattern_matches'])
            }
    def visualize_stats(self):
        """Plot the collected statistics."""
        stats = self.get_stats()
        plt.figure(figsize=(15, 5))
        # Log level distribution
        plt.subplot(1, 3, 1)
        levels = list(stats['level_distribution'].keys())
        counts = list(stats['level_distribution'].values())
        plt.bar(levels, counts)
        plt.title('Log Level Distribution')
        # Source distribution, sorted by count so the chart really shows the top 10
        plt.subplot(1, 3, 2)
        top_sources = sorted(stats['source_distribution'].items(),
                             key=lambda kv: kv[1], reverse=True)[:10]
        plt.bar([s for s, _ in top_sources], [c for _, c in top_sources])
        plt.xticks(rotation=45)
        plt.title('Top 10 Sources')
        # Pattern matches
        plt.subplot(1, 3, 3)
        patterns = list(stats['pattern_matches'].keys())
        counts = list(stats['pattern_matches'].values())
        plt.bar(patterns, counts)
        plt.xticks(rotation=45)
        plt.title('Pattern Matches')
        plt.tight_layout()
        plt.show()
    def _get_default_patterns(self) -> List[LogPattern]:
        """Return the default log patterns."""
        return [
            LogPattern(
                regex=r'error|exception|fail',
                name='ErrorKeywords',
                level=LogLevel.ERROR,
                description='Logs containing error keywords'
            ),
            LogPattern(
                regex=r'timeout|deadlock',
                name='CriticalIssues',
                level=LogLevel.CRITICAL,
                description='Critical system issues'
            ),
            LogPattern(
                regex=r'warning|deprecated',
                name='WarningKeywords',
                level=LogLevel.WARNING,
                description='Logs containing warning keywords'
            )
        ]
def send_email_alert(message: str):
    """Send an alert email (SMTP host and credentials are placeholders)."""
    msg = MIMEText(message)
    msg['Subject'] = 'Log Alert Notification'
    msg['From'] = 'alerts@example.com'
    msg['To'] = 'admin@example.com'
    with smtplib.SMTP('smtp.example.com', 587) as server:
        server.starttls()  # port 587 servers typically require TLS before login
        server.login('user', 'password')
        server.send_message(msg)
def create_error_rate_rule(threshold: float = 0.1) -> AlertRule:
    """Create an alert rule that fires when the error rate exceeds `threshold`."""
    def condition(logs: List[LogEntry]) -> bool:
        if not logs:  # avoid division by zero on an empty window
            return False
        error_count = sum(1 for log in logs if log.level.value >= LogLevel.ERROR.value)
        return error_count / len(logs) > threshold
    return AlertRule(
        name=f"HighErrorRate_{threshold}",
        condition=condition,
        action=send_email_alert,
        cooldown=600
    )
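The AlertType enum lists EMAIL, SLACK, and WEBHOOK, but only the email action is implemented above. Below is a minimal sketch of a webhook action using only the standard library; the endpoint URL and the JSON payload shape are illustrative assumptions. Because it takes a single message string, it can be plugged into AlertRule.action directly.

import json
import urllib.request

def send_webhook_alert(message: str,
                       url: str = 'https://hooks.example.com/alerts'):
    """POST the alert message as JSON to a webhook endpoint (hypothetical URL)."""
    payload = json.dumps({'text': message}).encode('utf-8')
    req = urllib.request.Request(
        url,
        data=payload,
        headers={'Content-Type': 'application/json'},
        method='POST'
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            logger.info(f"Webhook alert delivered, status {resp.status}")
    except OSError as e:
        logger.error(f"Failed to deliver webhook alert: {e}")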
Usage
- Initialize the analyzer:
analyzer = SmartLogAnalyzer(
    alert_rules=[create_error_rate_rule(0.1)],
    max_history=50000
)
analyzer.start_monitoring()
- Add log entries:
analyzer.add_log(LogEntry(
    timestamp=datetime.now(),
    level=LogLevel.ERROR,
    message="Failed to connect to database: timeout after 30s",
    source="database_service"
))
analyzer.add_log(LogEntry(
    timestamp=datetime.now(),
    level=LogLevel.INFO,
    message="User login successful",
    source="auth_service"
))
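- Parse raw log lines (for the multi-format parsing feature; this is a minimal sketch for one common timestamped format, and the regex, level fallback, and file name app.log are assumptions to adapt to your own layout):

def parse_log_line(line: str, source: str = 'file') -> Optional[LogEntry]:
    """Parse lines like '2024-01-15 10:30:00 ERROR Connection refused'."""
    match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(\w+)\s+(.*)', line)
    if not match:
        return None
    ts, level_name, message = match.groups()
    level = (LogLevel[level_name.upper()]
             if level_name.upper() in LogLevel.__members__
             else LogLevel.INFO)  # fall back to INFO for unknown level names
    return LogEntry(
        timestamp=datetime.strptime(ts, '%Y-%m-%d %H:%M:%S'),
        level=level,
        message=message,
        source=source
    )

# Feed an existing log file (hypothetical path) into the analyzer.
with open('app.log') as f:
    for line in f:
        entry = parse_log_line(line.strip())
        if entry:
            analyzer.add_log(entry)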
- Analyze trends:
trends = analyzer.analyze_trends()
print(f"Current error rate: {trends.get('error_rate', 0):.2%}")
- Cluster logs:
clusters = analyzer.cluster_logs()
print(f"Found {clusters.get('total_clusters', 0)} log clusters")
- Visualize statistics:
analyzer.visualize_stats()
- Stop monitoring:
analyzer.stop_monitoring()
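- Define a custom rule (a sketch of the custom rule engine: any callable over the recent log window works as a condition; the source name database_service and the threshold of 5 errors are illustrative):

def database_burst_condition(logs: List[LogEntry]) -> bool:
    """Fire when more than 5 errors from database_service appear in the window."""
    db_errors = sum(
        1 for log in logs
        if log.source == 'database_service' and log.level.value >= LogLevel.ERROR.value
    )
    return db_errors > 5

analyzer.alert_rules.append(AlertRule(
    name='DatabaseErrorBurst',
    condition=database_burst_condition,
    action=send_webhook_alert,  # or send_email_alert
    cooldown=300
))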
Key Features
- Real-time analysis: logs are processed and analyzed as they arrive
- Smart clustering: similar log patterns are identified automatically
- Multi-level alerting: notifications for different severity levels
- Trend prediction: anomaly detection based on historical data
- Visualization: clear statistical charts
- Custom rules: flexible alert condition definitions
- Performance: efficient handling of large log volumes
- Context correlation: identifies related log events
- Root cause analysis: helps locate the source of problems
- Extensible architecture: easy to integrate new data sources and analysis methods
This log analysis system is well suited to operations monitoring, troubleshooting, and security auditing, and can significantly improve both the value extracted from log data and the speed at which problems are discovered.