26.1 数据管理与治理
数据管理基础
在MLOps中,数据管理是确保机器学习项目成功的关键环节。有效的数据管理包括数据收集、存储、版本控制、质量保证和治理等多个方面。
数据管理的核心挑战:
- **数据质量**
- 数据完整性
- 数据一致性
- 数据准确性
- 数据时效性
- **数据血缘**
- 数据来源追踪
- 处理流程记录
- 依赖关系管理
- 影响分析
- **数据安全**
- 访问控制
- 数据加密
- 隐私保护
- 合规性管理
数据管道架构
import os
import json
import hashlib
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
import sqlite3
from dataclasses import dataclass, asdict
import logging
from abc import ABC, abstractmethod
@dataclass
class DataQualityMetrics:
    """Aggregated data-quality scores; each value is expected in [0, 1]."""
    completeness: float   # fraction of non-missing cells
    consistency: float    # fraction of values passing the declared rules
    accuracy: float       # accuracy estimate (placeholder in this demo)
    validity: float       # validity estimate (placeholder in this demo)
    uniqueness: float     # fraction of unique rows over the key columns
    timeliness: float     # freshness estimate (placeholder in this demo)
    overall_score: float  # weighted combination of the six scores above
@dataclass
class DataLineage:
    """One lineage edge: *source* dataset produced *target* via *transformation*."""
    source_id: str            # upstream dataset id
    target_id: str            # downstream dataset id
    transformation: str       # human-readable description of the transform
    timestamp: str            # ISO-8601 time the transform was recorded
    metadata: Dict[str, Any]  # free-form extra information about the transform
class DataQualityChecker:
    """
    Data quality checker.

    Capabilities:
    - completeness checks (missing cells)
    - consistency validation against declarative rules
    - uniqueness checks over key columns
    - outlier detection (IQR or z-score)
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def check_completeness(self, df: pd.DataFrame) -> float:
        """
        Return the fraction of non-missing cells.

        Args:
            df: data frame to inspect

        Returns:
            Completeness score in [0, 1]. An empty frame is treated as fully
            complete (guards the 0/0 division the original code allowed).
        """
        total_cells = df.shape[0] * df.shape[1]
        if total_cells == 0:
            return 1.0
        missing_cells = df.isnull().sum().sum()
        return float(1 - missing_cells / total_cells)

    def check_consistency(self, df: pd.DataFrame, rules: Dict[str, Any]) -> float:
        """
        Validate columns against declarative rules.

        Args:
            df: data frame to inspect
            rules: mapping column -> rule; rule['type'] is one of
                   'range' (with 'min'/'max'), 'categorical' (with 'values'),
                   'pattern' (with 'pattern' regex). Rules for absent columns
                   are skipped.

        Returns:
            Consistency score in [0, 1]; 1.0 when no checks apply.
        """
        violations = 0
        total_checks = 0
        for column, rule in rules.items():
            if column not in df.columns:
                continue
            total_checks += len(df)
            if rule['type'] == 'range':
                min_val, max_val = rule['min'], rule['max']
                # NaN compares False on both sides, so missing values are not
                # counted as range violations.
                violations += ((df[column] < min_val) | (df[column] > max_val)).sum()
            elif rule['type'] == 'categorical':
                violations += (~df[column].isin(rule['values'])).sum()
            elif rule['type'] == 'pattern':
                violations += (~df[column].str.match(rule['pattern'], na=False)).sum()
        return float(1 - violations / total_checks) if total_checks > 0 else 1.0

    def check_uniqueness(self, df: pd.DataFrame, key_columns: List[str]) -> float:
        """
        Return the fraction of rows that are unique over *key_columns*.

        Args:
            df: data frame to inspect
            key_columns: columns forming the logical key; if empty or any
                         column is absent, the check is skipped (score 1.0)

        Returns:
            Uniqueness score in [0, 1].
        """
        if not key_columns or not all(col in df.columns for col in key_columns):
            return 1.0
        total_rows = len(df)
        unique_rows = len(df.drop_duplicates(subset=key_columns))
        return float(unique_rows / total_rows) if total_rows > 0 else 1.0

    def detect_outliers(self, df: pd.DataFrame, method: str = 'iqr') -> Dict[str, List[int]]:
        """
        Detect outliers in every numeric column.

        Args:
            df: data frame to inspect
            method: 'iqr' (1.5 * IQR fences) or 'zscore' (|z| > 3)

        Returns:
            Mapping column name -> list of outlier row indices.

        Raises:
            ValueError: for an unknown *method* (the original code hit an
            unbound-variable NameError instead).
        """
        if method not in ('iqr', 'zscore'):
            raise ValueError(f"unsupported outlier detection method: {method!r}")
        outliers = {}
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        for column in numeric_columns:
            series = df[column]
            if method == 'iqr':
                q1 = series.quantile(0.25)
                q3 = series.quantile(0.75)
                iqr = q3 - q1
                mask = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)
            else:  # 'zscore'
                std = series.std()
                if std == 0 or pd.isna(std):
                    # Constant (or too-short) column: no meaningful z-score,
                    # and dividing by zero would mark everything NaN.
                    mask = pd.Series(False, index=series.index)
                else:
                    mask = np.abs((series - series.mean()) / std) > 3
            outliers[column] = series.index[mask].tolist()
        return outliers

    def calculate_quality_metrics(self, df: pd.DataFrame,
                                  consistency_rules: Dict[str, Any] = None,
                                  key_columns: List[str] = None) -> "DataQualityMetrics":
        """
        Compute the combined quality metrics for *df*.

        Args:
            df: data frame to inspect
            consistency_rules: optional rules for check_consistency
            key_columns: optional key columns for check_uniqueness

        Returns:
            DataQualityMetrics with a weighted overall score.
        """
        completeness = self.check_completeness(df)
        consistency = self.check_consistency(df, consistency_rules or {})
        uniqueness = self.check_uniqueness(df, key_columns or [])
        # Simplified placeholders: real deployments should derive these from
        # business rules, format validation and record timestamps.
        accuracy = 0.9
        validity = 0.95
        timeliness = 1.0
        # Weighted overall score (weights sum to 1.0).
        weights = [0.2, 0.2, 0.15, 0.15, 0.15, 0.15]
        scores = [completeness, consistency, accuracy, validity, uniqueness, timeliness]
        overall_score = sum(w * s for w, s in zip(weights, scores))
        return DataQualityMetrics(
            completeness=completeness,
            consistency=consistency,
            accuracy=accuracy,
            validity=validity,
            uniqueness=uniqueness,
            timeliness=timeliness,
            overall_score=overall_score
        )
class DataLineageTracker:
    """
    Data lineage tracker backed by a small SQLite database.

    Capabilities:
    - register datasets
    - record dataset-to-dataset transformations
    - build lineage graphs (upstream / downstream / both)
    - impact analysis with per-node hop depth
    """

    def __init__(self, db_path: str = "data_lineage.db"):
        self.db_path = db_path
        self._init_database()

    def _init_database(self):
        """Create the lineage/datasets tables if they do not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS lineage (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    source_id TEXT NOT NULL,
                    target_id TEXT NOT NULL,
                    transformation TEXT NOT NULL,
                    timestamp TEXT NOT NULL,
                    metadata TEXT
                )
            """)
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS datasets (
                    id TEXT PRIMARY KEY,
                    name TEXT NOT NULL,
                    description TEXT,
                    schema_info TEXT,
                    created_at TEXT NOT NULL,
                    updated_at TEXT NOT NULL
                )
            """)
            conn.commit()
        finally:
            # Close even on failure; the original leaked the connection when
            # a statement raised.
            conn.close()

    def register_dataset(self, dataset_id: str, name: str,
                         description: str = "", schema_info: Dict = None):
        """
        Register (or re-register) a dataset.

        Args:
            dataset_id: dataset id
            name: dataset name
            description: free-form description
            schema_info: JSON-serializable schema information
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            now = datetime.now().isoformat()
            cursor.execute("""
                INSERT OR REPLACE INTO datasets
                (id, name, description, schema_info, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (dataset_id, name, description,
                  json.dumps(schema_info or {}), now, now))
            conn.commit()
        finally:
            conn.close()

    def record_transformation(self, source_id: str, target_id: str,
                              transformation: str, metadata: Dict = None):
        """
        Record one transformation edge from *source_id* to *target_id*.

        Args:
            source_id: upstream dataset id
            target_id: downstream dataset id
            transformation: human-readable description
            metadata: JSON-serializable extra information
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("""
                INSERT INTO lineage
                (source_id, target_id, transformation, timestamp, metadata)
                VALUES (?, ?, ?, ?, ?)
            """, (source_id, target_id, transformation,
                  datetime.now().isoformat(), json.dumps(metadata or {})))
            conn.commit()
        finally:
            conn.close()

    def get_lineage_graph(self, dataset_id: str, direction: str = 'both') -> Dict:
        """
        Build the lineage graph around *dataset_id*.

        Args:
            dataset_id: starting dataset id
            direction: 'upstream', 'downstream' or 'both'

        Returns:
            {'nodes': [...], 'edges': [...]}. Nodes appear only for ids that
            were registered via register_dataset.

        Fixes over the original:
        - with direction='both' a shared visited set made the second
          (downstream) traversal return immediately, dropping all downstream
          edges; each direction now uses its own visited set
        - edges are de-duplicated by primary key
        - the connection is closed even when a query raises
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            graph = {'nodes': [], 'edges': []}
            known_nodes = set()
            known_edges = set()

            def add_node(node_id):
                # Append the node entry once; unregistered ids get no node.
                if node_id in known_nodes:
                    return
                known_nodes.add(node_id)
                cursor.execute("SELECT * FROM datasets WHERE id = ?", (node_id,))
                row = cursor.fetchone()
                if row:
                    graph['nodes'].append({
                        'id': row[0],
                        'name': row[1],
                        'description': row[2]
                    })

            def walk(node_id, upstream, visited):
                # Depth-first traversal in one direction only.
                if node_id in visited:
                    return
                visited.add(node_id)
                add_node(node_id)
                if upstream:
                    cursor.execute("SELECT * FROM lineage WHERE target_id = ?", (node_id,))
                else:
                    cursor.execute("SELECT * FROM lineage WHERE source_id = ?", (node_id,))
                for edge in cursor.fetchall():
                    # edge columns: id, source_id, target_id, transformation, timestamp, metadata
                    if edge[0] not in known_edges:
                        known_edges.add(edge[0])
                        graph['edges'].append({
                            'source': edge[1],
                            'target': edge[2],
                            'transformation': edge[3],
                            'timestamp': edge[4]
                        })
                    walk(edge[1] if upstream else edge[2], upstream, visited)

            if direction in ('upstream', 'both'):
                walk(dataset_id, True, set())
            if direction in ('downstream', 'both'):
                walk(dataset_id, False, set())
            return graph
        finally:
            conn.close()

    def analyze_impact(self, dataset_id: str) -> Dict:
        """
        Analyze which downstream datasets a change to *dataset_id* affects.

        Args:
            dataset_id: dataset id being changed

        Returns:
            dict with 'affected_datasets', 'affected_transformations' and
            'impact_levels' (dataset id -> minimum hop distance from the
            source, computed by BFS; the original left depths > 1 as dead code).
        """
        downstream_graph = self.get_lineage_graph(dataset_id, 'downstream')
        # Adjacency list of the downstream sub-graph.
        adjacency = {}
        for edge in downstream_graph['edges']:
            adjacency.setdefault(edge['source'], []).append(edge['target'])
        # BFS: each reachable node gets its shortest hop distance.
        impact_levels = {}
        frontier = [dataset_id]
        depth = 0
        while frontier:
            depth += 1
            next_frontier = []
            for node in frontier:
                for target in adjacency.get(node, []):
                    if target != dataset_id and target not in impact_levels:
                        impact_levels[target] = depth
                        next_frontier.append(target)
            frontier = next_frontier
        return {
            # Exclude the changed dataset itself from the count.
            'affected_datasets': max(0, len(downstream_graph['nodes']) - 1),
            'affected_transformations': len(downstream_graph['edges']),
            'impact_levels': impact_levels
        }
class DataManager:
    """
    Local dataset store with quality checks, metadata and lineage tracking.

    Responsibilities:
    - persist datasets as parquet files under *storage_path*
    - keep a JSON metadata catalogue (hash, shape, quality metrics, ...)
    - run quality checks on store/validate
    - record lineage for dataset-to-dataset transformations
    """

    def __init__(self, storage_path: str):
        self.storage_path = Path(storage_path)
        self.storage_path.mkdir(parents=True, exist_ok=True)
        self.quality_checker = DataQualityChecker()
        self.lineage_tracker = DataLineageTracker(
            str(self.storage_path / "lineage.db")
        )
        # One JSON catalogue file for all datasets.
        self.metadata_file = self.storage_path / "metadata.json"
        self.metadata = self._load_metadata()

    def _load_metadata(self) -> Dict:
        """Load the metadata catalogue, or {} when none exists yet."""
        if self.metadata_file.exists():
            with open(self.metadata_file, 'r') as f:
                return json.load(f)
        return {}

    def _save_metadata(self):
        """Persist the metadata catalogue to disk."""
        with open(self.metadata_file, 'w') as f:
            # default=str covers the odd non-JSON value (e.g. tuples become
            # lists natively; Paths and numpy scalars are stringified).
            json.dump(self.metadata, f, indent=2, default=str)

    def _calculate_hash(self, df: pd.DataFrame) -> str:
        """Content fingerprint of a frame (md5: fine here, not security-related)."""
        return hashlib.md5(pd.util.hash_pandas_object(df).values).hexdigest()

    def store_dataset(self, df: pd.DataFrame, dataset_id: str,
                      name: str, description: str = "",
                      quality_rules: Dict = None) -> Dict:
        """
        Store a dataset, compute its quality metrics and register lineage.

        Args:
            df: data frame to store
            dataset_id: dataset id (also the parquet file stem)
            name: dataset name
            description: free-form description
            quality_rules: consistency rules for the quality checker

        Returns:
            The metadata record that was stored.
        """
        # Quality metrics and content hash.
        quality_metrics = self.quality_checker.calculate_quality_metrics(
            df, quality_rules
        )
        data_hash = self._calculate_hash(df)
        # Persist the data itself.
        data_file = self.storage_path / f"{dataset_id}.parquet"
        df.to_parquet(data_file)
        # Stringify dtypes once: raw numpy dtype objects are not
        # JSON-serializable, which crashed lineage registration below in the
        # original code (json.dumps without default=).
        dtype_map = {column: str(dtype) for column, dtype in df.dtypes.items()}
        # Catalogue entry.
        dataset_metadata = {
            'id': dataset_id,
            'name': name,
            'description': description,
            'file_path': str(data_file),
            'data_hash': data_hash,
            'shape': df.shape,
            'columns': df.columns.tolist(),
            'dtypes': dtype_map,
            'quality_metrics': asdict(quality_metrics),
            'created_at': datetime.now().isoformat(),
            'file_size': data_file.stat().st_size
        }
        self.metadata[dataset_id] = dataset_metadata
        self._save_metadata()
        # Register with the lineage tracker (schema must be JSON-serializable).
        self.lineage_tracker.register_dataset(
            dataset_id, name, description,
            {'columns': df.columns.tolist(), 'dtypes': dtype_map}
        )
        print(f"数据集已存储: {dataset_id}")
        print(f"文件路径: {data_file}")
        print(f"数据形状: {df.shape}")
        print(f"质量评分: {quality_metrics.overall_score:.3f}")
        return dataset_metadata

    def load_dataset(self, dataset_id: str) -> Optional[pd.DataFrame]:
        """
        Load a stored dataset.

        Args:
            dataset_id: dataset id

        Returns:
            The data frame, or None when the dataset or its file is missing.
        """
        if dataset_id not in self.metadata:
            print(f"数据集不存在: {dataset_id}")
            return None
        file_path = self.metadata[dataset_id]['file_path']
        if not Path(file_path).exists():
            print(f"数据文件不存在: {file_path}")
            return None
        return pd.read_parquet(file_path)

    def transform_dataset(self, source_id: str, target_id: str,
                          transformation_func, transformation_name: str,
                          target_name: str, target_description: str = "") -> Optional[pd.DataFrame]:
        """
        Apply *transformation_func* to a stored dataset, store the result and
        record the lineage edge.

        Args:
            source_id: source dataset id
            target_id: target dataset id
            transformation_func: callable DataFrame -> DataFrame
            transformation_name: lineage label for the transform
            target_name: target dataset name
            target_description: target dataset description

        Returns:
            The transformed data frame, or None on failure (best-effort demo
            behavior: errors are printed, not raised).
        """
        source_df = self.load_dataset(source_id)
        if source_df is None:
            return None
        try:
            target_df = transformation_func(source_df)
        except Exception as e:
            print(f"数据转换失败: {e}")
            return None
        self.store_dataset(target_df, target_id, target_name, target_description)
        self.lineage_tracker.record_transformation(
            source_id, target_id, transformation_name,
            {
                'source_shape': source_df.shape,
                'target_shape': target_df.shape,
                'transformation_time': datetime.now().isoformat()
            }
        )
        print(f"数据转换完成: {source_id} -> {target_id}")
        print(f"转换: {transformation_name}")
        print(f"形状变化: {source_df.shape} -> {target_df.shape}")
        return target_df

    def get_dataset_info(self, dataset_id: str) -> Optional[Dict]:
        """Return the catalogue entry for *dataset_id*, or None."""
        return self.metadata.get(dataset_id)

    def list_datasets(self) -> List[Dict]:
        """Return the catalogue entries of all stored datasets."""
        return list(self.metadata.values())

    def validate_dataset(self, dataset_id: str,
                         quality_rules: Dict = None) -> Optional["DataQualityMetrics"]:
        """
        Re-check the quality of a stored dataset.

        Args:
            dataset_id: dataset id
            quality_rules: consistency rules for the quality checker

        Returns:
            DataQualityMetrics, or None when the dataset cannot be loaded
            (annotation fixed: the original claimed a non-optional return).
        """
        df = self.load_dataset(dataset_id)
        if df is None:
            return None
        return self.quality_checker.calculate_quality_metrics(df, quality_rules)

    def get_lineage_info(self, dataset_id: str) -> Dict:
        """Return the full lineage graph ('both' directions) for *dataset_id*."""
        return self.lineage_tracker.get_lineage_graph(dataset_id)
# --- Usage demo -------------------------------------------------------------
print("数据管理与治理演示")
print("=" * 50)

# Create the data manager (parquet files + metadata live under this root).
data_manager = DataManager("/tmp/ml_data_storage")

# 1. Generate sample raw data
print("\n1. 生成原始数据")
np.random.seed(42)
raw_data = pd.DataFrame({
    'customer_id': range(1, 1001),
    'age': np.random.randint(18, 80, 1000),
    'income': np.random.normal(50000, 15000, 1000),
    'purchase_amount': np.random.exponential(100, 1000),
    'category': np.random.choice(['A', 'B', 'C', 'D'], 1000),
    # lowercase 'h': the uppercase 'H' frequency alias is deprecated in pandas >= 2.2
    'timestamp': pd.date_range('2023-01-01', periods=1000, freq='h')
})

# Inject some data-quality problems for the demo.
# Cast 'age' to float first: assigning NaN into an int64 column via .loc
# relies on silent upcasting, which newer pandas versions warn about.
raw_data['age'] = raw_data['age'].astype(float)
raw_data.loc[10:15, 'age'] = np.nan   # missing values
raw_data.loc[20, 'income'] = -5000    # out-of-range value
raw_data.loc[25, 'category'] = 'X'    # invalid category

# Store the raw data together with declarative quality rules.
quality_rules = {
    'age': {'type': 'range', 'min': 18, 'max': 100},
    'income': {'type': 'range', 'min': 0, 'max': 200000},
    'category': {'type': 'categorical', 'values': ['A', 'B', 'C', 'D']}
}
raw_metadata = data_manager.store_dataset(
    raw_data, 'raw_customer_data',
    '原始客户数据', '包含客户基本信息和购买记录',
    quality_rules
)

# 2. Data cleaning
print("\n2. 数据清洗")
def clean_data(df):
    """
    Return a cleaned copy of *df*.

    - missing ages are imputed with the median age
    - negative incomes are replaced with the (pre-replacement) median income
    - categories outside {A, B, C, D} are mapped to 'A'
    """
    cleaned_df = df.copy()
    # Impute missing ages. Reassign instead of fillna(..., inplace=True) on a
    # column selection: that is chained assignment, which pandas deprecates
    # and which silently stops working under copy-on-write.
    cleaned_df['age'] = cleaned_df['age'].fillna(cleaned_df['age'].median())
    # Replace negative incomes with the column median (computed before the
    # replacement, matching the original behavior).
    cleaned_df.loc[cleaned_df['income'] < 0, 'income'] = cleaned_df['income'].median()
    # Collapse unknown categories to the default bucket 'A'.
    valid_categories = ['A', 'B', 'C', 'D']
    cleaned_df.loc[~cleaned_df['category'].isin(valid_categories), 'category'] = 'A'
    return cleaned_df
# Run the cleaning step through the manager so the lineage edge is recorded.
cleaned_data = data_manager.transform_dataset(
    'raw_customer_data', 'cleaned_customer_data',
    clean_data, '数据清洗',
    '清洗后客户数据', '经过质量处理的客户数据'
)
# 3. Feature engineering
print("\n3. 特征工程")
def feature_engineering(df):
    """
    Derive ratio/bucket features from the cleaned customer data and one-hot
    encode the categorical columns. Returns a new frame; *df* is not modified.
    """
    engineered = df.copy()
    # Ratio features.
    engineered['income_per_age'] = engineered['income'] / engineered['age']
    engineered['purchase_ratio'] = engineered['purchase_amount'] / engineered['income']
    # Coarse age buckets.
    bucket_edges = [0, 30, 50, 100]
    bucket_labels = ['Young', 'Middle', 'Senior']
    engineered['age_group'] = pd.cut(engineered['age'], bins=bucket_edges, labels=bucket_labels)
    # Flag the top-20% purchases as high value.
    high_value_threshold = engineered['purchase_amount'].quantile(0.8)
    engineered['high_value'] = (engineered['purchase_amount'] > high_value_threshold).astype(int)
    # One-hot encode the categorical columns.
    return pd.get_dummies(engineered, columns=['category', 'age_group'], prefix=['cat', 'age'])
# Run feature engineering through the manager so the lineage edge is recorded.
feature_data = data_manager.transform_dataset(
    'cleaned_customer_data', 'feature_customer_data',
    feature_engineering, '特征工程',
    '特征工程客户数据', '包含衍生特征的客户数据'
)
# 4. Quality monitoring across all three dataset versions
print("\n4. 数据质量监控")
for dataset_id in ['raw_customer_data', 'cleaned_customer_data', 'feature_customer_data']:
    print(f"\n{dataset_id} 质量指标:")
    quality_metrics = data_manager.validate_dataset(dataset_id, quality_rules)
    if quality_metrics:
        print(f" 完整性: {quality_metrics.completeness:.3f}")
        print(f" 一致性: {quality_metrics.consistency:.3f}")
        print(f" 唯一性: {quality_metrics.uniqueness:.3f}")
        print(f" 总体评分: {quality_metrics.overall_score:.3f}")
# 5. Lineage of the final dataset
print("\n5. 数据血缘追踪")
lineage_info = data_manager.get_lineage_info('feature_customer_data')
print(f"血缘图节点数: {len(lineage_info['nodes'])}")
print(f"血缘图边数: {len(lineage_info['edges'])}")
print("\n数据转换链:")
for edge in lineage_info['edges']:
    print(f" {edge['source']} -> {edge['target']} ({edge['transformation']})")
# 6. Impact analysis from the raw dataset downstream
print("\n6. 影响分析")
impact_analysis = data_manager.lineage_tracker.analyze_impact('raw_customer_data')
print(f"受影响的数据集数量: {impact_analysis['affected_datasets']}")
print(f"受影响的转换数量: {impact_analysis['affected_transformations']}")
# 7. Catalogue overview
print("\n7. 数据集概览")
datasets = data_manager.list_datasets()
for dataset in datasets:
    print(f"\n数据集: {dataset['name']} ({dataset['id']})")
    print(f" 描述: {dataset['description']}")
    print(f" 形状: {dataset['shape']}")
    print(f" 质量评分: {dataset['quality_metrics']['overall_score']:.3f}")
    print(f" 创建时间: {dataset['created_at']}")
    print(f" 文件大小: {dataset['file_size']} bytes")
print("\n数据管理与治理演示完成!")
26.2 实验跟踪与管理
实验管理基础
实验跟踪是MLOps中的核心组件,它帮助数据科学家和机器学习工程师系统地记录、比较和管理机器学习实验。
实验跟踪的关键要素:
- **实验配置**
- 超参数设置
- 模型架构
- 数据版本
- 环境信息
- **实验结果**
- 性能指标
- 训练日志
- 模型文件
- 可视化结果
- **实验比较**
- 指标对比
- 参数分析
- 趋势分析
- 最优选择
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Any, Tuple
import warnings
warnings.filterwarnings('ignore')
class ExperimentTracker:
    """
    Thin convenience wrapper around the MLflow tracking API.

    Capabilities:
    - experiment/run lifecycle management
    - parameter, metric, model and artifact logging
    - run listing, comparison and best-run selection
    """

    def __init__(self, experiment_name: str, tracking_uri: str = None):
        """
        Bind this tracker to an MLflow experiment (creating it if needed).

        Args:
            experiment_name: experiment name
            tracking_uri: optional MLflow tracking URI
        """
        self.experiment_name = experiment_name
        if tracking_uri:
            mlflow.set_tracking_uri(tracking_uri)
        mlflow.set_experiment(experiment_name)
        self.client = MlflowClient()
        # Resolve and cache the experiment id for search queries.
        experiment = self.client.get_experiment_by_name(experiment_name)
        self.experiment_id = experiment.experiment_id

    def start_run(self, run_name: str = None, tags: Dict[str, str] = None):
        """
        Open a new MLflow run, apply optional tags and return the run object
        (usable as a context manager).
        """
        self.run = mlflow.start_run(run_name=run_name)
        for key, value in (tags or {}).items():
            mlflow.set_tag(key, value)
        return self.run

    def log_params(self, params: Dict[str, Any]):
        """Log every entry of *params* to the active run."""
        for item in params.items():
            mlflow.log_param(*item)

    def log_metrics(self, metrics: Dict[str, float], step: int = None):
        """Log every entry of *metrics* to the active run (optionally at *step*)."""
        for name in metrics:
            mlflow.log_metric(name, metrics[name], step=step)

    def log_model(self, model, model_name: str,
                  signature=None, input_example=None):
        """Persist a fitted sklearn model as an MLflow model artifact."""
        mlflow.sklearn.log_model(
            model, model_name,
            signature=signature,
            input_example=input_example
        )

    def log_artifact(self, local_path: str, artifact_path: str = None):
        """Attach a local file to the active run."""
        mlflow.log_artifact(local_path, artifact_path)

    def log_figure(self, figure, filename: str):
        """Attach a matplotlib figure to the active run under *filename*."""
        mlflow.log_figure(figure, filename)

    def end_run(self):
        """Close the active run."""
        mlflow.end_run()

    def get_experiment_runs(self) -> List[Dict]:
        """
        Return all runs of this experiment, newest first, as plain dicts.
        """
        found = self.client.search_runs(
            experiment_ids=[self.experiment_id],
            order_by=["start_time DESC"]
        )
        return [
            {
                'run_id': r.info.run_id,
                'run_name': r.data.tags.get('mlflow.runName', 'Unnamed'),
                'status': r.info.status,
                'start_time': r.info.start_time,
                'end_time': r.info.end_time,
                'params': r.data.params,
                'metrics': r.data.metrics,
                'tags': r.data.tags
            }
            for r in found
        ]

    def compare_runs(self, run_ids: List[str], metrics: List[str] = None) -> pd.DataFrame:
        """
        Build a comparison table over the given runs.

        Args:
            run_ids: ids of the runs to compare
            metrics: metric names to include (None = all)

        Returns:
            DataFrame with one row per run; parameter columns are prefixed
            'param_', metric columns 'metric_'.
        """
        rows = []
        for run_id in run_ids:
            run = self.client.get_run(run_id)
            row = {
                'run_id': run_id,
                'run_name': run.data.tags.get('mlflow.runName', 'Unnamed')
            }
            row.update({f'param_{name}': value
                        for name, value in run.data.params.items()})
            row.update({f'metric_{name}': value
                        for name, value in run.data.metrics.items()
                        if metrics is None or name in metrics})
            rows.append(row)
        return pd.DataFrame(rows)

    def get_best_run(self, metric_name: str, ascending: bool = False) -> Dict:
        """
        Return the run with the best value of *metric_name*, or None when no
        run recorded that metric.

        Args:
            metric_name: metric to rank by
            ascending: True = smaller is better, False = larger is better
        """
        candidates = [run for run in self.get_experiment_runs()
                      if metric_name in run['metrics']]
        if not candidates:
            return None
        pick = min if ascending else max
        return pick(candidates, key=lambda run: run['metrics'][metric_name])
class HyperparameterOptimizer:
    """
    Optuna-backed hyperparameter search that logs every trial to MLflow
    through an ExperimentTracker.
    """

    def __init__(self, experiment_tracker: ExperimentTracker):
        self.tracker = experiment_tracker

    def optimize_hyperparameters(self,
                                 objective_func,
                                 param_space: Dict,
                                 n_trials: int = 100,
                                 direction: str = 'maximize') -> Dict:
        """
        Run an Optuna study over *param_space*.

        Args:
            objective_func: callable params-dict -> float score
            param_space: name -> spec; spec['type'] is 'float'/'int' (with
                         'low'/'high') or 'categorical' (with 'choices');
                         unknown types are skipped
            n_trials: number of trials
            direction: 'maximize' or 'minimize'

        Returns:
            {'best_params', 'best_value', 'n_trials'}
        """
        study = optuna.create_study(direction=direction)

        def sample_params(trial):
            # Draw one concrete parameter set from the declarative space.
            sampled = {}
            for name, spec in param_space.items():
                kind = spec['type']
                if kind == 'float':
                    sampled[name] = trial.suggest_float(name, spec['low'], spec['high'])
                elif kind == 'int':
                    sampled[name] = trial.suggest_int(name, spec['low'], spec['high'])
                elif kind == 'categorical':
                    sampled[name] = trial.suggest_categorical(name, spec['choices'])
            return sampled

        def run_trial(trial):
            params = sample_params(trial)
            # Each trial gets its own MLflow run with params and score logged.
            with mlflow.start_run(run_name=f"trial_{trial.number}"):
                self.tracker.log_params(params)
                score = objective_func(params)
                self.tracker.log_metrics({'objective_score': score})
                return score

        study.optimize(run_trial, n_trials=n_trials)
        return {
            'best_params': study.best_params,
            'best_value': study.best_value,
            'n_trials': len(study.trials)
        }
class MLExperimentPipeline:
    """
    End-to-end machine-learning experiment pipeline.

    Responsibilities:
    - data preparation (train/test split + standardization)
    - baseline runs for several model families
    - hyperparameter optimization for a chosen model
    - report generation over all recorded runs
    """
    def __init__(self, experiment_name: str):
        # The tracker owns the MLflow experiment; the optimizer logs each
        # trial through it.
        self.tracker = ExperimentTracker(experiment_name)
        self.optimizer = HyperparameterOptimizer(self.tracker)
        # Candidate estimator classes keyed by a short model name.
        self.models = {
            'random_forest': RandomForestClassifier,
            'logistic_regression': LogisticRegression,
            'svm': SVC
        }
    def prepare_data(self, X, y, test_size: float = 0.2, random_state: int = 42):
        """
        Split the data and standardize the features.

        Args:
            X: feature matrix
            y: labels
            test_size: fraction held out for testing
            random_state: RNG seed for a reproducible split

        Returns:
            (X_train_scaled, X_test_scaled, y_train, y_test, fitted_scaler)
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        # Fit the scaler on the training split only (avoids test-set leakage).
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        return X_train_scaled, X_test_scaled, y_train, y_test, scaler
    def evaluate_model(self, model, X_train, X_test, y_train, y_test) -> Dict[str, float]:
        """
        Fit *model* on the training split and score it on the test split.

        Args:
            model: unfitted sklearn estimator
            X_train: training features
            X_test: test features
            y_train: training labels
            y_test: test labels

        Returns:
            dict with accuracy/precision/recall/f1 (weighted averages) plus
            5-fold cross-validation mean and std on the training split.
        """
        # Train
        model.fit(X_train, y_train)
        # Predict on the held-out split
        y_pred = model.predict(X_test)
        # Point metrics; weighted averaging handles class imbalance.
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average='weighted'),
            'recall': recall_score(y_test, y_pred, average='weighted'),
            'f1_score': f1_score(y_test, y_pred, average='weighted')
        }
        # Cross-validation on the training split only.
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)
        metrics['cv_mean'] = cv_scores.mean()
        metrics['cv_std'] = cv_scores.std()
        return metrics
    def run_baseline_experiments(self, X, y):
        """
        Train every candidate model with default-ish settings, one logged
        MLflow run per model.

        Args:
            X: feature matrix
            y: labels

        Returns:
            dict: model name -> {'model', 'metrics', 'params'}
        """
        X_train, X_test, y_train, y_test, scaler = self.prepare_data(X, y)
        results = {}
        for model_name, model_class in self.models.items():
            print(f"\n运行 {model_name} 基线实验...")
            with self.tracker.start_run(run_name=f"baseline_{model_name}"):
                # Instantiate with default parameters plus a fixed seed.
                if model_name == 'random_forest':
                    model = model_class(random_state=42)
                    # NOTE(review): n_estimators=100 is logged but relies on
                    # the sklearn default rather than being passed explicitly.
                    params = {'n_estimators': 100, 'random_state': 42}
                elif model_name == 'logistic_regression':
                    model = model_class(random_state=42, max_iter=1000)
                    params = {'random_state': 42, 'max_iter': 1000}
                elif model_name == 'svm':
                    model = model_class(random_state=42)
                    params = {'random_state': 42}
                # Log parameters
                self.tracker.log_params(params)
                # Evaluate
                metrics = self.evaluate_model(model, X_train, X_test, y_train, y_test)
                # Log metrics
                self.tracker.log_metrics(metrics)
                # Persist the fitted model as a run artifact
                self.tracker.log_model(model, f"{model_name}_model")
                results[model_name] = {
                    'model': model,
                    'metrics': metrics,
                    'params': params
                }
                print(f" 准确率: {metrics['accuracy']:.4f}")
                print(f" F1分数: {metrics['f1_score']:.4f}")
        return results
    def run_hyperparameter_optimization(self, X, y, model_name: str, n_trials: int = 50):
        """
        Run an Optuna search for *model_name*, maximizing weighted F1.

        Args:
            X: feature matrix
            y: labels
            model_name: one of the keys of self.models
            n_trials: number of Optuna trials

        Returns:
            The optimizer's result dict, or None for an unsupported model.
        """
        X_train, X_test, y_train, y_test, scaler = self.prepare_data(X, y)
        # Declarative search space per supported model family.
        param_spaces = {
            'random_forest': {
                'n_estimators': {'type': 'int', 'low': 50, 'high': 200},
                'max_depth': {'type': 'int', 'low': 3, 'high': 20},
                'min_samples_split': {'type': 'int', 'low': 2, 'high': 20},
                'min_samples_leaf': {'type': 'int', 'low': 1, 'high': 10}
            },
            'logistic_regression': {
                'C': {'type': 'float', 'low': 0.01, 'high': 100},
                'solver': {'type': 'categorical', 'choices': ['liblinear', 'lbfgs']}
            },
            'svm': {
                'C': {'type': 'float', 'low': 0.1, 'high': 100},
                'gamma': {'type': 'categorical', 'choices': ['scale', 'auto']},
                'kernel': {'type': 'categorical', 'choices': ['rbf', 'linear']}
            }
        }
        param_space = param_spaces.get(model_name)
        if not param_space:
            print(f"不支持的模型: {model_name}")
            return None
        def objective(params):
            # Build the estimator from the sampled parameters, pinning the
            # seed (and iteration budget for logistic regression).
            model_class = self.models[model_name]
            if model_name in ['random_forest', 'logistic_regression', 'svm']:
                params['random_state'] = 42
            if model_name == 'logistic_regression':
                params['max_iter'] = 1000
            model = model_class(**params)
            metrics = self.evaluate_model(model, X_train, X_test, y_train, y_test)
            # The optimizer maximizes weighted F1.
            return metrics['f1_score']
        print(f"\n开始 {model_name} 超参数优化...")
        optimization_result = self.optimizer.optimize_hyperparameters(
            objective, param_space, n_trials, 'maximize'
        )
        print(f"优化完成:")
        print(f" 最佳参数: {optimization_result['best_params']}")
        print(f" 最佳F1分数: {optimization_result['best_value']:.4f}")
        print(f" 试验次数: {optimization_result['n_trials']}")
        return optimization_result
    def generate_experiment_report(self) -> Dict:
        """
        Summarize all recorded runs of this experiment.

        Returns:
            dict with run statistics, the best run (by weighted F1),
            per-model baseline performance and textual recommendations.
        """
        runs = self.tracker.get_experiment_runs()
        if not runs:
            return {'message': '没有找到实验运行记录'}
        # Run statistics
        total_runs = len(runs)
        successful_runs = len([r for r in runs if r['status'] == 'FINISHED'])
        # Best run overall (by weighted F1)
        best_run = self.tracker.get_best_run('f1_score')
        # Collect baseline F1 scores per model; baseline runs are identified
        # by the 'baseline_<model>' run-name convention used above.
        model_performance = {}
        for run in runs:
            if 'f1_score' in run['metrics']:
                run_name = run['run_name']
                if 'baseline' in run_name:
                    model_name = run_name.replace('baseline_', '')
                    if model_name not in model_performance:
                        model_performance[model_name] = []
                    model_performance[model_name].append(run['metrics']['f1_score'])
        # Aggregate per-model statistics
        avg_performance = {}
        for model, scores in model_performance.items():
            avg_performance[model] = {
                'mean_f1': np.mean(scores),
                'max_f1': np.max(scores),
                'runs_count': len(scores)
            }
        report = {
            'experiment_summary': {
                'total_runs': total_runs,
                'successful_runs': successful_runs,
                'success_rate': successful_runs / total_runs if total_runs > 0 else 0
            },
            'best_run': best_run,
            'model_performance': avg_performance,
            'recommendations': self._generate_recommendations(avg_performance, best_run)
        }
        return report
    def _generate_recommendations(self, performance: Dict, best_run: Dict) -> List[str]:
        """
        Turn the aggregated statistics into human-readable recommendations.

        Args:
            performance: per-model statistics from generate_experiment_report
            best_run: best run dict (may be None)

        Returns:
            List of recommendation strings.
        """
        recommendations = []
        if not performance:
            return ['需要运行更多实验以获得建议']
        # Best model by mean F1
        best_model = max(performance.items(), key=lambda x: x[1]['mean_f1'])
        recommendations.append(f"推荐使用 {best_model[0]} 模型,平均F1分数: {best_model[1]['mean_f1']:.4f}")
        # If models score within 0.05 of each other, other criteria dominate.
        f1_scores = [perf['mean_f1'] for perf in performance.values()]
        if max(f1_scores) - min(f1_scores) < 0.05:
            recommendations.append("各模型性能相近,建议考虑模型复杂度和训练时间")
        # Optimization trials are named 'trial_<n>' by the optimizer.
        if best_run and 'trial_' in best_run['run_name']:
            recommendations.append("超参数优化有效,建议在生产环境中使用优化后的参数")
        return recommendations
# --- Usage demo -------------------------------------------------------------
print("实验跟踪与管理演示")
print("=" * 50)
# Reuse the feature data produced by the data-management demo above; skip
# gracefully when that section has not been run in this session.
if 'feature_data' in locals():
    # Assemble the supervised-learning matrix (drop ids, timestamps and the
    # raw target-source column).
    feature_columns = [col for col in feature_data.columns
                       if col not in ['customer_id', 'timestamp', 'purchase_amount']]
    X = feature_data[feature_columns]
    y = feature_data['high_value']
    print(f"特征数量: {X.shape[1]}")
    print(f"样本数量: {X.shape[0]}")
    print(f"正样本比例: {y.mean():.3f}")
    # Build the experiment pipeline.
    pipeline = MLExperimentPipeline("customer_value_prediction")
    # 1. Baseline experiments for every candidate model
    print("\n1. 基线模型实验")
    baseline_results = pipeline.run_baseline_experiments(X, y)
    # 2. Hyperparameter optimization for the best baseline model (by F1)
    best_baseline = max(baseline_results.items(),
                        key=lambda x: x[1]['metrics']['f1_score'])
    best_model_name = best_baseline[0]
    print(f"\n2. {best_model_name} 超参数优化")
    optimization_result = pipeline.run_hyperparameter_optimization(
        X, y, best_model_name, n_trials=20
    )
    # 3. Experiment report over all recorded runs
    print("\n3. 实验报告")
    report = pipeline.generate_experiment_report()
    print(f"\n实验总结:")
    print(f" 总运行次数: {report['experiment_summary']['total_runs']}")
    print(f" 成功率: {report['experiment_summary']['success_rate']:.1%}")
    if report['best_run']:
        print(f"\n最佳运行:")
        print(f" 运行名称: {report['best_run']['run_name']}")
        print(f" F1分数: {report['best_run']['metrics']['f1_score']:.4f}")
        print(f" 准确率: {report['best_run']['metrics']['accuracy']:.4f}")
    print(f"\n模型性能对比:")
    for model, perf in report['model_performance'].items():
        print(f" {model}: 平均F1={perf['mean_f1']:.4f}, 最高F1={perf['max_f1']:.4f}")
    print(f"\n建议:")
    for i, rec in enumerate(report['recommendations'], 1):
        print(f" {i}. {rec}")
    # 4. Side-by-side comparison of the most recent runs
    print("\n4. 实验运行比较")
    runs = pipeline.tracker.get_experiment_runs()
    if len(runs) >= 2:
        run_ids = [run['run_id'] for run in runs[:3]]  # compare the newest three runs
        comparison_df = pipeline.tracker.compare_runs(run_ids, ['accuracy', 'f1_score'])
        print("\n运行对比表:")
        print(comparison_df[['run_name', 'metric_accuracy', 'metric_f1_score']].to_string(index=False))
else:
    print("请先运行数据管理部分以生成特征数据")
print("\n实验跟踪与管理演示完成!")
章节总结
核心知识点
- **数据管理与治理**
- 数据质量:完整性、一致性、准确性、有效性评估
- 数据血缘:转换过程追踪、依赖关系管理、影响分析
- 数据版本:哈希验证、元数据记录、变更历史
- **实验跟踪系统**
- 实验配置:参数记录、环境信息、数据版本
- 结果管理:指标记录、模型保存、可视化结果
- 实验比较:性能对比、参数分析、最优选择
- **自动化实验流程**
- 超参数优化:贝叶斯优化、网格搜索、随机搜索
- 模型比较:多算法评估、交叉验证、统计显著性
- 流水线管理:端到端自动化、结果分析、报告生成
技术要点
- **数据质量保证**
- 质量指标:建立完整的数据质量评估体系
- 异常检测:IQR方法、Z-score方法、业务规则验证
- 自动化监控:实时质量检查、告警机制
- **MLflow实验管理**
- 实验组织:项目、实验、运行的层次结构
- 参数记录:超参数、模型配置、环境信息
- 模型注册:版本管理、阶段标记、部署追踪
- **超参数优化策略**
- Optuna框架:Tree-structured Parzen Estimator算法
- 搜索空间:连续、离散、条件参数定义
- 早停机制:无效试验提前终止、资源优化
应用前景
- **企业级MLOps**
- 数据治理:建立企业级数据管理标准
- 实验标准化:统一的实验流程和评估标准
- 知识管理:实验经验积累和最佳实践分享
- **自动化机器学习**
- AutoML集成:自动特征工程、模型选择、超参数优化
- 持续学习:在线学习、模型更新、性能监控
- 智能推荐:基于历史实验的参数推荐
- **合规与审计**
- 可追溯性:完整的实验历史和决策过程
- 监管要求:满足金融、医疗等行业的合规需求
- 风险管理:模型风险评估、偏差检测、公平性分析
数据管理与实验跟踪是MLOps成功实施的核心基础,通过建立完善的数据治理体系和实验管理流程,可以显著提高机器学习项目的效率、质量和可维护性,为企业的AI转型提供坚实的技术支撑。