(插图:AI 监控面板总览示意图——原文此处为图片)

Vue+Prometheus+Grafana实战:打造全方位AI监控面板开发指南

引言:为什么AI系统需要专业的监控方案?

在现代AI系统中,复杂的模型推理、大规模数据处理和分布式架构使得传统监控方法不再适用。一次模型性能下降、一个特征数据漂移、一次GPU内存泄漏,都可能在不经意间导致严重的业务影响。实践中,相当一部分AI系统故障都与缺乏有效的监控和预警机制有关。

本文将通过实战方式,展示如何构建一个集数据采集、可视化展示、智能告警于一体的AI系统监控中心,帮助您掌握从零搭建企业级AI监控系统的完整方法论。

1. Prometheus指标采集:AI系统的数据脉搏

1.1 Java服务:全方位指标暴露实践

下图(原文为流程图,此处保留其节点标签)描述了Java服务指标采集的层次:

应用启动

Micrometer初始化

JVM指标

业务指标

GPU指标

内存使用

GC统计

线程状态

接口QPS

请求延迟

错误率

自定义指标

显存使用率

GPU利用率

温度监控

Metrics Endpoint

Prometheus抓取

1.1.1 Spring Boot集成Micrometer
import io.micrometer.core.instrument.*;
import io.micrometer.core.instrument.binder.jvm.*;
import io.micrometer.core.instrument.binder.system.*;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import com.sun.management.OperatingSystemMXBean;
import javax.management.MBeanServer;
import java.lang.management.ManagementFactory;

@Configuration
public class AIPrometheusConfig {
    
    /**
     * Builds the Prometheus registry bean and wires JVM, system and GPU
     * metrics into it so they appear on the scrape endpoint.
     *
     * @return the fully configured {@link PrometheusMeterRegistry}
     */
    @Bean
    public PrometheusMeterRegistry prometheusMeterRegistry() {
        PrometheusMeterRegistry meterRegistry = new PrometheusMeterRegistry(
            io.micrometer.prometheus.PrometheusConfig.DEFAULT
        );
        
        // JVM-level metrics: memory pools, GC activity, threads, heap pressure.
        new JvmMemoryMetrics().bindTo(meterRegistry);
        new JvmGcMetrics().bindTo(meterRegistry);
        new JvmThreadMetrics().bindTo(meterRegistry);
        new JvmHeapPressureMetrics().bindTo(meterRegistry);
        
        // Host-level metrics: CPU usage and process uptime.
        new ProcessorMetrics().bindTo(meterRegistry);
        new UptimeMetrics().bindTo(meterRegistry);
        
        // Custom GPU gauges (requires NVIDIA tooling on the host).
        registerGPUMetrics(meterRegistry);
        
        return meterRegistry;
    }
    
    /** Registers gauges sampling GPU memory, utilization and temperature. */
    private void registerGPUMetrics(MeterRegistry registry) {
        GpuMonitor monitor = new GpuMonitor();
        
        // Memory gauges, reported in MB.
        Gauge.builder("ai.gpu.memory.used", monitor, GpuMonitor::getUsedMemory)
            .description("GPU memory used in MB")
            .baseUnit("MB")
            .register(registry);
        Gauge.builder("ai.gpu.memory.total", monitor, GpuMonitor::getTotalMemory)
            .description("Total GPU memory in MB")
            .baseUnit("MB")
            .register(registry);
        
        // Utilization gauge, reported as a percentage.
        Gauge.builder("ai.gpu.utilization", monitor, GpuMonitor::getUtilization)
            .description("GPU utilization percentage")
            .baseUnit("percent")
            .register(registry);
        
        // Temperature gauge, reported in degrees Celsius.
        Gauge.builder("ai.gpu.temperature", monitor, GpuMonitor::getTemperature)
            .description("GPU temperature in Celsius")
            .baseUnit("celsius")
            .register(registry);
    }
}

// AI business metrics: recommendation latency, prediction counts,
// feature-vector sizes, and live model accuracy.
@Component
public class AIMetricsService {
    
    private final MeterRegistry meterRegistry;
    private final Timer recommendationTimer;
    private final Counter predictionCounter;
    private final DistributionSummary featureSizeSummary;
    
    public AIMetricsService(MeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
        
        // Recommendation latency timer publishing P50/P95/P99 percentiles.
        recommendationTimer = Timer.builder("ai.recommendation.latency")
            .description("Recommendation service latency")
            .publishPercentiles(0.5, 0.95, 0.99)  // P50, P95, P99
            .publishPercentileHistogram()
            .register(meterRegistry);
        
        // Total number of predictions served.
        predictionCounter = Counter.builder("ai.prediction.count")
            .description("Total prediction count")
            .tags("model", "recommendation")
            .register(meterRegistry);
        
        // Distribution of feature vector sizes, in bytes.
        featureSizeSummary = DistributionSummary.builder("ai.feature.size")
            .description("Feature vector size")
            .baseUnit("bytes")
            .register(meterRegistry);
        
        // Gauge that samples the model's current accuracy on each scrape.
        Gauge.builder("ai.model.accuracy", this, AIMetricsService::getCurrentAccuracy)
            .description("Current model accuracy")
            .register(meterRegistry);
    }
    
    /** Starts a latency sample; pass it to {@link #recordRecommendationLatency}. */
    public Timer.Sample startRecommendationTimer() {
        return Timer.start(meterRegistry);
    }
    
    /**
     * Stops the latency sample and increments the prediction counters.
     *
     * @param sample sample returned by {@link #startRecommendationTimer}
     * @param userId caller's user id; may be null or empty (counted as "unknown")
     */
    public void recordRecommendationLatency(Timer.Sample sample, String userId) {
        sample.stop(recommendationTimer);
        predictionCounter.increment();
        
        // Only a short prefix is used as a tag value to bound label cardinality.
        // Fix: the original threw NullPointerException for a null userId and
        // emitted an empty tag value for an empty userId.
        String userPrefix = (userId == null || userId.isEmpty())
            ? "unknown"
            : userId.substring(0, Math.min(5, userId.length()));
        meterRegistry.counter("ai.recommendation.by_user", 
            "user_id", userPrefix)
            .increment();
    }
    
    /** Records one feature vector's size in bytes. */
    public void recordFeatureSize(int sizeBytes) {
        featureSizeSummary.record(sizeBytes);
    }
    
    private double getCurrentAccuracy() {
        // Delegates to the model service for the live accuracy value.
        return ModelService.getCurrentAccuracy();
    }
}

// GPU monitoring implementation (Linux; requires nvidia-smi on the PATH).
// The four public accessors previously duplicated the same exec/read/parse
// logic verbatim and leaked the reader; they now share one private helper.
class GpuMonitor {
    
    /**
     * Queries nvidia-smi for a single numeric GPU field.
     *
     * @param field nvidia-smi {@code --query-gpu} field name, e.g. "memory.used"
     * @return the parsed value from the first output line, or 0.0 on any
     *         failure (nvidia-smi missing, empty/non-numeric output, interrupt)
     */
    private double queryGpu(String field) {
        try {
            // ProcessBuilder with an argument list avoids the naive
            // whitespace tokenization of Runtime.exec(String).
            Process process = new ProcessBuilder(
                "nvidia-smi",
                "--query-gpu=" + field,
                "--format=csv,noheader,nounits"
            ).start();
            // try-with-resources closes the stream; the original leaked it.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
                String output = reader.readLine();  // first GPU only, as before
                process.waitFor();
                if (output != null && !output.trim().isEmpty()) {
                    return Double.parseDouble(output.trim());
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();  // restore interrupt status
            System.err.println("Interrupted while querying GPU field " + field);
        } catch (Exception e) {
            System.err.println("Failed to query GPU field " + field + ": " + e.getMessage());
        }
        return 0.0;
    }
    
    /** GPU memory currently in use, in MB (0.0 if unavailable). */
    public double getUsedMemory() {
        return queryGpu("memory.used");
    }
    
    /** Total GPU memory, in MB (0.0 if unavailable). */
    public double getTotalMemory() {
        return queryGpu("memory.total");
    }
    
    /** GPU utilization percentage (0.0 if unavailable). */
    public double getUtilization() {
        return queryGpu("utilization.gpu");
    }
    
    /** GPU core temperature in Celsius (0.0 if unavailable). */
    public double getTemperature() {
        return queryGpu("temperature.gpu");
    }
}

// Spring Boot endpoint exposing Prometheus scrape output plus an ad-hoc
// JSON snapshot of JVM statistics.
@RestController
@RequestMapping("/actuator")
public class MetricsController {
    
    private final PrometheusMeterRegistry meterRegistry;
    
    public MetricsController(PrometheusMeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
    }
    
    /** Prometheus exposition-format dump of every registered meter. */
    @GetMapping("/prometheus")
    public String getPrometheusMetrics() {
        return meterRegistry.scrape();
    }
    
    /** JSON snapshot of JVM memory, thread and GC statistics. */
    @GetMapping("/metrics/custom")
    public Map<String, Object> getCustomMetrics() {
        Map<String, Object> snapshot = new HashMap<>();
        
        // Memory figures straight from the runtime.
        Runtime rt = Runtime.getRuntime();
        snapshot.put("jvm.memory.free", rt.freeMemory());
        snapshot.put("jvm.memory.total", rt.totalMemory());
        snapshot.put("jvm.memory.max", rt.maxMemory());
        snapshot.put("jvm.processors", rt.availableProcessors());
        
        // Thread counts from the platform thread MXBean.
        ThreadMXBean threads = ManagementFactory.getThreadMXBean();
        snapshot.put("jvm.threads.live", threads.getThreadCount());
        snapshot.put("jvm.threads.daemon", threads.getDaemonThreadCount());
        snapshot.put("jvm.threads.peak", threads.getPeakThreadCount());
        
        // Per-collector GC invocation counts and cumulative times.
        Map<String, Object> gcStats = new HashMap<>();
        for (GarbageCollectorMXBean collector : ManagementFactory.getGarbageCollectorMXBeans()) {
            Map<String, Object> collectorInfo = new HashMap<>();
            collectorInfo.put("count", collector.getCollectionCount());
            collectorInfo.put("time", collector.getCollectionTime());
            gcStats.put(collector.getName(), collectorInfo);
        }
        snapshot.put("jvm.gc", gcStats);
        
        return snapshot;
    }
}

1.2 Python服务:模型与数据质量监控

from prometheus_client import start_http_server, Gauge, Counter, Histogram, Summary, Info
from prometheus_client.exposition import push_to_gateway, pushadd_to_gateway
from prometheus_client.registry import CollectorRegistry
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import threading
import time
import logging
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from scipy import stats
import json

logger = logging.getLogger(__name__)

@dataclass
class ModelMetrics:
    """Snapshot of one model evaluation: classification quality metrics,
    inference latency, and optional drift indicators that may be absent
    on runs where drift is not computed."""
    accuracy: float            # overall accuracy
    precision: float           # precision score
    recall: float              # recall score
    f1_score: float            # harmonic mean of precision and recall
    auc: float                 # area under the ROC curve
    inference_time_ms: float   # single-inference latency in milliseconds
    psi_score: Optional[float] = None       # Population Stability Index, if computed
    data_drift: Optional[float] = None      # data drift score, if computed
    concept_drift: Optional[float] = None   # concept drift score, if computed

class AIModelMetricsExporter:
    """Exports AI model, data-quality and business metrics to Prometheus.

    Runs either as a pull target (starts an HTTP /metrics server) or in
    push mode (Push Gateway) when ``push_gateway_url`` is provided.
    """
    
    def __init__(self, prometheus_port: int = 8000, 
                 push_gateway_url: Optional[str] = None):
        """
        Initialize the metrics exporter.
        
        Args:
            prometheus_port: port for the Prometheus HTTP metrics server
            push_gateway_url: Push Gateway address (used by batch jobs); when
                set, no HTTP server is started and metrics must be pushed
                explicitly via ``push_metrics_to_gateway``
        """
        self.port = prometheus_port
        self.push_gateway_url = push_gateway_url
        self.registry = CollectorRegistry()
        
        # Create all metric objects up front.
        self._init_metrics()
        
        # Pull mode: expose an HTTP endpoint for Prometheus to scrape.
        if not push_gateway_url:
            start_http_server(prometheus_port, registry=self.registry)
            logger.info(f"Prometheus metrics server started on port {prometheus_port}")
    
    def _init_metrics(self):
        """Create every exported metric on this exporter's registry."""
        
        # Model performance metrics.
        self.metrics_accuracy = Gauge(
            'ai_model_accuracy',
            'Model accuracy score',
            ['model_name', 'model_version'],
            registry=self.registry
        )
        
        self.metrics_f1_score = Gauge(
            'ai_model_f1_score',
            'Model F1 score',
            ['model_name', 'model_version'],
            registry=self.registry
        )
        
        self.metrics_auc = Gauge(
            'ai_model_auc',
            'Model AUC score',
            ['model_name', 'model_version'],
            registry=self.registry
        )
        
        # Inference latency histogram (seconds).
        self.metrics_inference_time = Histogram(
            'ai_model_inference_time_seconds',
            'Model inference time in seconds',
            ['model_name', 'model_version'],
            buckets=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0],
            registry=self.registry
        )
        
        # PSI (Population Stability Index), per feature.
        self.metrics_psi = Gauge(
            'ai_model_psi_score',
            'Population Stability Index score',
            ['model_name', 'feature_name'],
            registry=self.registry
        )
        
        # Data quality metrics.
        self.metrics_missing_rate = Gauge(
            'ai_data_missing_rate',
            'Missing value rate in dataset',
            ['dataset', 'feature'],
            registry=self.registry
        )
        
        self.metrics_outlier_rate = Gauge(
            'ai_data_outlier_rate',
            'Outlier rate in dataset',
            ['dataset', 'feature'],
            registry=self.registry
        )
        
        self.metrics_data_volume = Gauge(
            'ai_data_volume',
            'Data volume in dataset',
            ['dataset'],
            registry=self.registry
        )
        
        # Business metrics.
        self.metrics_recommendation_ctr = Gauge(
            'ai_recommendation_ctr',
            'Click-through rate of recommendations',
            ['model_name', 'position'],
            registry=self.registry
        )
        
        self.metrics_prediction_count = Counter(
            'ai_prediction_total',
            'Total number of predictions',
            ['model_name', 'status'],
            registry=self.registry
        )
        
        # Model metadata.
        self.metrics_model_info = Info(
            'ai_model_info',
            'Model metadata information',
            registry=self.registry
        )
    
    def update_model_performance(self, model_name: str, 
                                model_version: str, 
                                metrics: "ModelMetrics"):
        """
        Update model performance metrics.
        
        Args:
            model_name: model name
            model_version: model version
            metrics: evaluation snapshot to export
        """
        try:
            # Quality scores.
            self.metrics_accuracy.labels(
                model_name=model_name, 
                model_version=model_version
            ).set(metrics.accuracy)
            
            self.metrics_f1_score.labels(
                model_name=model_name, 
                model_version=model_version
            ).set(metrics.f1_score)
            
            self.metrics_auc.labels(
                model_name=model_name, 
                model_version=model_version
            ).set(metrics.auc)
            
            # Latency arrives in ms; the histogram is in seconds.
            self.metrics_inference_time.labels(
                model_name=model_name,
                model_version=model_version
            ).observe(metrics.inference_time_ms / 1000.0)
            
            # Refresh the model metadata info metric.
            self.metrics_model_info.info({
                'model_name': model_name,
                'version': model_version,
                'update_time': datetime.now().isoformat(),
                'metrics_updated': 'true'
            })
            
            logger.info(f"Updated metrics for model {model_name}:{model_version}")
            
        except Exception as e:
            logger.error(f"Failed to update model metrics: {e}")
    
    def calculate_and_update_psi(self, 
                                model_name: str,
                                current_data: pd.DataFrame,
                                reference_data: pd.DataFrame,
                                features: List[str]):
        """
        Compute and export PSI for each requested feature.
        
        Args:
            model_name: model name
            current_data: current (live) dataset
            reference_data: reference (training-time) dataset
            features: feature columns to evaluate
        """
        for feature in features:
            if feature not in current_data.columns or feature not in reference_data.columns:
                logger.warning(f"Feature {feature} not found in datasets")
                continue
            
            try:
                # Drop NaNs before binning.
                current_clean = current_data[feature].dropna()
                reference_clean = reference_data[feature].dropna()
                
                if len(current_clean) == 0 or len(reference_clean) == 0:
                    logger.warning(f"Feature {feature} has no valid data for PSI calculation")
                    continue
                
                # Compute PSI and export it.
                psi_score = self._calculate_psi(current_clean, reference_clean)
                
                self.metrics_psi.labels(
                    model_name=model_name,
                    feature_name=feature
                ).set(psi_score)
                
                logger.debug(f"PSI for {feature}: {psi_score}")
                
            except Exception as e:
                logger.error(f"Failed to calculate PSI for {feature}: {e}")
    
    def _calculate_psi(self, current: pd.Series, reference: pd.Series, 
                      buckets: int = 10) -> float:
        """
        Compute the Population Stability Index between two distributions.
        
        Args:
            current: current distribution
            reference: reference distribution
            buckets: number of equal-width bins
            
        Returns:
            PSI score (0.0 means identical binned distributions)
        """
        # Shared bin edges spanning both series.
        min_val = min(current.min(), reference.min())
        max_val = max(current.max(), reference.max())
        
        # Guard against a degenerate (constant) range.
        if max_val == min_val:
            max_val = min_val + 1
        
        bins = np.linspace(min_val, max_val, buckets + 1)
        
        # Per-bin proportions for each series.
        current_counts, _ = np.histogram(current, bins=bins)
        reference_counts, _ = np.histogram(reference, bins=bins)
        
        # Additive smoothing avoids log(0) / division by zero on empty bins.
        current_prop = (current_counts + 0.001) / (len(current) + 0.001 * buckets)
        reference_prop = (reference_counts + 0.001) / (len(reference) + 0.001 * buckets)
        
        psi = np.sum((current_prop - reference_prop) * np.log(current_prop / reference_prop))
        
        return float(psi)
    
    def update_data_quality_metrics(self, 
                                   dataset_name: str,
                                   data: pd.DataFrame):
        """
        Export row count, per-feature missing rate and outlier rate.
        
        Args:
            dataset_name: dataset label used on the exported metrics
            data: the dataset to profile
        """
        try:
            # Row count.
            self.metrics_data_volume.labels(
                dataset=dataset_name
            ).set(len(data))
            
            # Missing rate and (for numeric columns) IQR-based outlier rate.
            for column in data.columns:
                column_data = data[column]
                
                missing_rate = column_data.isnull().mean()
                self.metrics_missing_rate.labels(
                    dataset=dataset_name,
                    feature=column
                ).set(missing_rate)
                
                if pd.api.types.is_numeric_dtype(column_data):
                    q1 = column_data.quantile(0.25)
                    q3 = column_data.quantile(0.75)
                    iqr = q3 - q1
                    
                    # Zero IQR (constant column) would flag everything; skip it.
                    if iqr > 0:
                        lower_bound = q1 - 1.5 * iqr
                        upper_bound = q3 + 1.5 * iqr
                        outlier_rate = ((column_data < lower_bound) | 
                                       (column_data > upper_bound)).mean()
                        
                        self.metrics_outlier_rate.labels(
                            dataset=dataset_name,
                            feature=column
                        ).set(outlier_rate)
            
            logger.info(f"Updated data quality metrics for {dataset_name}")
            
        except Exception as e:
            logger.error(f"Failed to update data quality metrics: {e}")
    
    def record_prediction(self, 
                         model_name: str, 
                         success: bool = True,
                         metadata: Optional[Dict] = None):
        """
        Record one prediction request.
        
        Args:
            model_name: model name
            success: whether the prediction succeeded
            metadata: optional extra context (e.g. 'user_id')
        """
        status = "success" if success else "failure"
        
        self.metrics_prediction_count.labels(
            model_name=model_name,
            status=status
        ).inc()
        
        # Fix: the original grabbed the registry's private _lock only to build
        # an unused labels dict and `pass`. Dead code removed; the truncated
        # user prefix (bounds label cardinality) is now just logged.
        if metadata:
            user_id = str(metadata.get('user_id', 'unknown'))
            user_prefix = user_id[:5] if user_id != 'unknown' else 'unknown'
            logger.debug(
                "prediction recorded: model=%s status=%s user_prefix=%s",
                model_name, status, user_prefix
            )
    
    def push_metrics_to_gateway(self, job_name: str = 'ai_model_metrics'):
        """
        Push the registry to the Push Gateway (for batch jobs).
        
        Args:
            job_name: job label under which metrics are grouped
        """
        if not self.push_gateway_url:
            logger.warning("Push Gateway URL not configured")
            return
        
        try:
            push_to_gateway(
                self.push_gateway_url,
                job=job_name,
                registry=self.registry
            )
            logger.info(f"Metrics pushed to gateway {self.push_gateway_url}")
        except Exception as e:
            logger.error(f"Failed to push metrics to gateway: {e}")
    
    def start_periodic_update(self, interval_seconds: int = 60):
        """
        Start a daemon thread that refreshes metrics periodically.
        
        Args:
            interval_seconds: refresh interval in seconds
        """
        def update_task():
            # Daemon loop: never exits; errors are logged and the loop continues.
            while True:
                try:
                    self._periodic_update()
                except Exception as e:
                    logger.error(f"Periodic update failed: {e}")
                
                time.sleep(interval_seconds)
        
        thread = threading.Thread(target=update_task, daemon=True)
        thread.start()
        logger.info(f"Started periodic metrics update every {interval_seconds} seconds")
    
    def _periodic_update(self):
        """Hook for periodic recomputation (PSI refresh, drift detection, ...).

        Intentionally a no-op here; subclasses or deployments fill it in.
        """
        pass

# Usage example
def main():
    """End-to-end demo: export model, PSI and data-quality metrics for
    a synthetic recommendation model, then keep running."""
    
    # Create the exporter in push mode (no local HTTP server is started
    # because push_gateway_url is set).
    exporter = AIModelMetricsExporter(
        prometheus_port=8000,
        push_gateway_url='http://localhost:9091'  # Push Gateway address
    )
    
    # Simulated model performance snapshot.
    model_metrics = ModelMetrics(
        accuracy=0.92,
        precision=0.89,
        recall=0.91,
        f1_score=0.90,
        auc=0.95,
        inference_time_ms=45.6,
        psi_score=0.03
    )
    
    exporter.update_model_performance(
        model_name="recommendation_v1",
        model_version="1.2.3",
        metrics=model_metrics
    )
    
    # Synthetic datasets. Fix: np and pd were redundantly re-imported here
    # even though they are already imported at module level.
    np.random.seed(42)
    reference_data = pd.DataFrame({
        'feature1': np.random.normal(0, 1, 1000),
        'feature2': np.random.exponential(1, 1000),
        'feature3': np.random.randint(0, 10, 1000)
    })
    
    current_data = pd.DataFrame({
        'feature1': np.random.normal(0.1, 1.1, 1000),  # slight mean/scale drift
        'feature2': np.random.exponential(1.2, 1000),  # parameter change
        'feature3': np.random.randint(0, 10, 1000)
    })
    
    # Inject some missing values.
    current_data.loc[np.random.choice(1000, 50), 'feature1'] = np.nan
    current_data.loc[np.random.choice(1000, 30), 'feature2'] = np.nan
    
    # Compute and export PSI per feature.
    exporter.calculate_and_update_psi(
        model_name="recommendation_v1",
        current_data=current_data,
        reference_data=reference_data,
        features=['feature1', 'feature2', 'feature3']
    )
    
    # Export data-quality metrics.
    exporter.update_data_quality_metrics(
        dataset_name="training_data",
        data=current_data
    )
    
    # Record some predictions with a ~95% success rate.
    for i in range(100):
        exporter.record_prediction(
            model_name="recommendation_v1",
            success=np.random.random() > 0.05,
            metadata={'user_id': f'user_{i % 10}'}
        )
    
    # Push everything to the gateway (configured above).
    exporter.push_metrics_to_gateway(job_name="ai_model_batch_job")
    
    # Refresh metrics every 5 minutes in the background.
    exporter.start_periodic_update(interval_seconds=300)
    
    # Keep the process alive until interrupted.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("Shutting down...")

if __name__ == "__main__":
    main()

1.3 Prometheus配置详解

# prometheus.yml — main Prometheus configuration for the AI monitoring stack.
global:
  scrape_interval: 15s  # default scrape interval for all jobs
  evaluation_interval: 15s  # how often recording/alerting rules are evaluated
  external_labels:
    environment: 'production'
    region: 'us-east-1'

# Recording and alerting rule files
rule_files:
  - "rules/*.yml"

# Alertmanager endpoints that receive fired alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093

# Scrape configurations
scrape_configs:
  # Java services - Spring Boot Actuator
  - job_name: 'java-ai-services'
    scrape_interval: 15s
    scrape_timeout: 10s
    metrics_path: '/actuator/prometheus'
    honor_labels: true
    
    # Kubernetes service discovery
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - ai-production
            - ai-staging
    
    relabel_configs:
      # Only scrape pods carrying the prometheus.io/scrape=true annotation
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      
      # Override the metrics path from the pod annotation, when present
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      
      # Rewrite the scrape address to use the annotated port
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      
      # Copy every Kubernetes pod label onto the scraped series
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
      
      - source_labels: [__meta_kubernetes_pod_container_name]
        action: replace
        target_label: container_name
  
  # Python services
  - job_name: 'python-ai-services'
    scrape_interval: 20s
    scrape_timeout: 15s
    static_configs:
      - targets:
        - ai-model-service-1:8000
        - ai-model-service-2:8000
        - ai-feature-service:8000
    metrics_path: '/metrics'
    
    # Extra labels for these targets
    relabel_configs:
      # Strip the port from the instance label
      - source_labels: [__address__]
        target_label: instance
        regex: '([^:]+)(?::\d+)?'
        replacement: '$1'
      
      # Constant service_type label on every series from this job
      - source_labels: []
        target_label: service_type
        replacement: 'python_ai_service'
  
  # GPU node monitoring
  - job_name: 'gpu-nodes'
    scrape_interval: 30s
    scrape_timeout: 25s
    static_configs:
      - targets:
        - gpu-node-1:9100  # node-exporter
        - gpu-node-2:9100
        - gpu-node-3:9100
    
    # node-exporter provides system metrics; GPU metrics additionally
    # require nvidia-dcgm-exporter on each node.
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        regex: '([^:]+):\d+'
        replacement: '$1'
  
  # Push Gateway (for batch jobs)
  - job_name: 'pushgateway'
    honor_labels: true  # important: keep the labels attached at push time
    scrape_interval: 15s
    static_configs:
      - targets: ['pushgateway:9091']
  
  # Blackbox probing (service health checks)
  - job_name: 'blackbox-ai-services'
    scrape_interval: 60s
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
        - http://ai-recommendation-service/health
        - http://ai-model-service/health
        - http://ai-feature-store/health
    relabel_configs:
      # Standard blackbox pattern: target URL becomes a probe parameter,
      # and the actual scrape goes to the blackbox exporter itself.
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

# Remote read/write (optional)
# NOTE(review): Prometheus does not expand ${API_USER}/${API_KEY} environment
# variables in its config file by itself — these must be templated by
# deployment tooling (e.g. envsubst/Helm); verify before relying on them.
remote_write:
  - url: "https://prometheus-cloud.example.com/api/v1/write"
    basic_auth:
      username: "${API_USER}"
      password: "${API_KEY}"
    queue_config:
      max_samples_per_send: 10000
      capacity: 20000
      max_shards: 50

remote_read:
  - url: "https://prometheus-cloud.example.com/api/v1/read"
    basic_auth:
      username: "${API_USER}"
      password: "${API_KEY}"

下图(原文为架构图,此处保留其节点与连线标签)涵盖告警流程、数据存储与查询、以及监控数据流:

抓取指标

抓取指标

抓取指标

抓取指标

抓取指标

注册

注册

提供端点

提供端点

存储

查询

查询

发送告警

发送告警

发送告警

展示

用户访问

处理告警

Kubernetes集群

暴露端口

暴露端口

暴露端口

AI服务Pod

Service

模型服务Pod

Service

特征服务Pod

Service

Prometheus Server

Java服务 /actuator/prometheus

Python服务 /metrics

GPU节点 /metrics

Push Gateway

Blackbox Exporter

Java服务

Micrometer Registry

Python服务

Prometheus Client

Prometheus TSDB

Grafana

Alert Manager

企业微信

邮件

短信

监控面板

运维团队

问题修复

2. Grafana集成与自定义面板

2.1 数据源配置与面板设计

# datasources/prod-prometheus.yaml
apiVersion: 1

datasources:
  # Production Prometheus — the default data source for all dashboards.
  - name: Prometheus-Production
    type: prometheus
    access: proxy
    url: http://prometheus.production.svc.cluster.local:9090
    isDefault: true
    jsonData:
      timeInterval: "15s"  # should match the server's scrape_interval
      queryTimeout: "60s"
      httpMethod: "POST"
      manageAlerts: true
      prometheusType: "Prometheus"
      prometheusVersion: "2.40.0"
      cacheLevel: "High"
      incrementalQueryOverlapWindow: "10m"
    secureJsonData:
      # $$ escapes $ in Grafana provisioning files so the value interpolates
      # the PROMETHEUS_PASSWORD environment variable — verify against the
      # Grafana provisioning docs for your version.
      basicAuthPassword: $${PROMETHEUS_PASSWORD}
    editable: true
  
  # Staging Prometheus — secondary source with a coarser interval.
  - name: Prometheus-Staging
    type: prometheus
    access: proxy
    url: http://prometheus.staging.svc.cluster.local:9090
    jsonData:
      timeInterval: "30s"
      queryTimeout: "30s"
    editable: true

dashboards/ai-system-monitoring.json(注:JSON 文件本身不支持注释,此行仅为文件路径说明):
{
  "dashboard": {
    "title": "AI系统监控中心",
    "description": "全方位监控AI系统性能、模型质量、数据漂移",
    "tags": ["ai", "machine-learning", "monitoring"],
    "style": "dark",
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "系统概览",
        "type": "stat",
        "gridPos": {"h": 3, "w": 12, "x": 0, "y": 0},
        "targets": [
          {
            "expr": "sum(rate(ai_prediction_total[5m]))",
            "legendFormat": "预测QPS",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "color": {"mode": "thresholds"},
            "mappings": [],
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 1000},
                {"color": "red", "value": 5000}
              ]
            },
            "unit": "reqps"
          }
        }
      },
      {
        "id": 2,
        "title": "模型准确率趋势",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 3},
        "targets": [
          {
            "expr": "ai_model_accuracy{model_name=~\"$model\"}",
            "legendFormat": "{{model_name}} - {{model_version}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "color": {"mode": "palette-classic"},
            "custom": {
              "drawStyle": "line",
              "lineInterpolation": "smooth",
              "barAlignment": 0,
              "lineWidth": 2,
              "fillOpacity": 10,
              "gradientMode": "none",
              "spanNulls": false,
              "showPoints": "auto",
              "pointSize": 5
            },
            "mappings": [],
            "thresholds": {
              "steps": [
                {"color": "red", "value": null},
                {"color": "yellow", "value": 0.8},
                {"color": "green", "value": 0.9}
              ]
            },
            "unit": "percentunit"
          }
        },
        "options": {
          "tooltip": {"mode": "single"},
          "legend": {"displayMode": "table", "placement": "bottom"}
        }
      },
      {
        "id": 3,
        "title": "推理延迟分布(P99)",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 3},
        "targets": [
          {
            "expr": "histogram_quantile(0.99, sum(rate(ai_model_inference_time_seconds_bucket[5m])) by (le, model_name))",
            "legendFormat": "{{model_name}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "decimals": 3,
            "color": {"mode": "continuous-BlPu"}
          }
        }
      },
      {
        "id": 4,
        "title": "GPU监控",
        "type": "gauge",
        "gridPos": {"h": 6, "w": 6, "x": 0, "y": 11},
        "targets": [
          {
            "expr": "ai_gpu_utilization",
            "legendFormat": "GPU利用率",
            "refId": "A"
          },
          {
            "expr": "ai_gpu_memory_used / ai_gpu_memory_total * 100",
            "legendFormat": "显存使用率",
            "refId": "B"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 70},
                {"color": "red", "value": 90}
              ]
            },
            "max": 100,
            "min": 0
          }
        }
      },
      {
        "id": 5,
        "title": "数据质量监控",
        "type": "heatmap",
        "gridPos": {"h": 10, "w": 12, "x": 12, "y": 11},
        "targets": [
          {
            "expr": "sum(ai_data_missing_rate) by (dataset, feature)",
            "legendFormat": "{{dataset}} - {{feature}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "color": {"mode": "scheme", "scheme": "RdYlBu"},
            "custom": {
              "hideZero": false,
              "reverseYBuckets": false
            }
          }
        }
      },
      {
        "id": 6,
        "title": "PSI监控",
        "type": "bargauge",
        "gridPos": {"h": 6, "w": 6, "x": 6, "y": 11},
        "targets": [
          {
            "expr": "ai_model_psi_score",
            "legendFormat": "{{feature_name}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "none",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 0.1},
                {"color": "orange", "value": 0.25},
                {"color": "red", "value": 0.5}
              ]
            },
            "mappings": [
              {
                "type": "value",
                "options": {
                  "0": {"text": "无漂移"},
                  "0.1": {"text": "轻度漂移"},
                  "0.25": {"text": "中度漂移"},
                  "0.5": {"text": "严重漂移"}
                }
              }
            ]
          }
        },
        "options": {
          "orientation": "horizontal",
          "displayMode": "gradient",
          "reduceOptions": {
            "values": false,
            "calcs": ["lastNotNull"]
          }
        }
      }
    ],
    "templating": {
      "list": [
        {
          "name": "model",
          "label": "模型选择",
          "type": "query",
          "query": "label_values(ai_model_accuracy, model_name)",
          "multi": true,
          "includeAll": true,
          "allValue": ".*"
        },
        {
          "name": "environment",
          "label": "环境",
          "type": "custom",
          "query": "production,staging,development",
          "multi": false,
          "includeAll": false
        },
        {
          "name": "time_range",
          "label": "时间范围",
          "type": "interval",
          "query": "1h,6h,12h,1d,7d,30d",
          "multi": false,
          "includeAll": false
        }
      ]
    },
    "time": {"from": "now-1h", "to": "now"},
    "refresh": "30s",
    "schemaVersion": 37,
    "version": 1,
    "uid": "ai-system-monitoring"
  }
}

2.2 智能告警规则配置

# rules/ai-alerts.yml
# Prometheus alerting rules for the AI platform.
#
# Fixes vs. the original:
#  * `humanizePercentage` multiplies a 0-1 ratio by 100; applying it to
#    expressions that already scale to 0-100 (GPUHighMemoryUsage,
#    PredictionSuccessRateDrop) would render e.g. "9200%". Those values are
#    now formatted directly with printf.
#  * `query "...{instance='$labels.instance'}"` never expands $labels inside
#    a string literal; the query string is now built with printf.
groups:
  - name: ai-system-alerts
    interval: 30s
    rules:
      # Model accuracy drop alert.
      # NOTE(review): 'and' joins on identical label sets; if
      # ai_prediction_total carries labels that ai_model_accuracy does not
      # (e.g. status), add on(...)/ignoring(...) — verify the series labels.
      - alert: ModelAccuracyDrop
        expr: |
          ai_model_accuracy < 0.85
          and
          ai_prediction_total > 0
        for: 5m
        labels:
          severity: critical
          team: ai-ops
          component: model
        annotations:
          summary: "模型准确率下降: {{ $labels.model_name }}"
          description: |
            模型 {{ $labels.model_name }} 的准确率已降至 {{ $value | humanizePercentage }}。
            当前版本: {{ $labels.model_version }}
            建议检查训练数据和模型性能。
          dashboard: "AI系统监控中心"
          runbook: "https://wiki.example.com/runbooks/model-accuracy-drop"

      # High inference latency alert (P99 above 1 second).
      - alert: HighInferenceLatency
        expr: |
          histogram_quantile(0.99, rate(ai_model_inference_time_seconds_bucket[5m])) > 1.0
        for: 2m
        labels:
          severity: warning
          team: ai-ops
          component: inference
        annotations:
          summary: "推理延迟过高: {{ $labels.model_name }}"
          description: |
            P99推理延迟超过1秒。
            当前延迟: {{ $value | humanizeDuration }}
            模型: {{ $labels.model_name }}
            建议检查模型服务负载和GPU状态。

      # GPU memory alert. $value is already a 0-100 percentage here, so it is
      # formatted with printf rather than humanizePercentage.
      - alert: GPUHighMemoryUsage
        expr: |
          (ai_gpu_memory_used / ai_gpu_memory_total) * 100 > 90
        for: 3m
        labels:
          severity: critical
          team: infrastructure
          component: gpu
        annotations:
          summary: "GPU显存使用率过高: {{ $labels.instance }}"
          description: |
            GPU显存使用率超过90%。
            使用率: {{ printf "%.1f" $value }}%
            GPU实例: {{ $labels.instance }}
            总显存: {{ with printf "ai_gpu_memory_total{instance='%s'}" $labels.instance | query }}{{ . | first | value | humanize }}{{ end }}MB
            已使用: {{ with printf "ai_gpu_memory_used{instance='%s'}" $labels.instance | query }}{{ . | first | value | humanize }}{{ end }}MB

      # Data drift alert (PSI above the 0.25 "significant drift" threshold).
      - alert: DataDriftDetected
        expr: |
          ai_model_psi_score > 0.25
        for: 10m
        labels:
          severity: warning
          team: data-science
          component: data-quality
        annotations:
          summary: "检测到数据漂移: {{ $labels.feature_name }}"
          description: |
            特征 {{ $labels.feature_name }} 的PSI分数超过0.25,表示存在数据漂移。
            当前PSI: {{ $value }}
            模型: {{ $labels.model_name }}
            建议重新训练模型或调整特征处理。

      # Service availability alert (scrape target down).
      - alert: AIServiceDown
        expr: |
          up{job=~"java-ai-services|python-ai-services"} == 0
        for: 1m
        labels:
          severity: critical
          team: ai-ops
          component: service
        annotations:
          summary: "AI服务不可用: {{ $labels.instance }}"
          description: |
            AI服务 {{ $labels.instance }} 无法访问。
            服务类型: {{ $labels.job }}
            Namespace: {{ $labels.kubernetes_namespace }}
            Pod: {{ $labels.kubernetes_pod_name }}

      # Prediction success-rate drop alert. The expression is scaled to
      # 0-100, so $value is formatted with printf (not humanizePercentage).
      - alert: PredictionSuccessRateDrop
        expr: |
          (
            rate(ai_prediction_total{status="success"}[5m])
            /
            rate(ai_prediction_total[5m])
          ) * 100 < 95
        for: 5m
        labels:
          severity: warning
          team: ai-ops
          component: business
        annotations:
          summary: "预测成功率下降: {{ $labels.model_name }}"
          description: |
            预测成功率降至95%以下。
            当前成功率: {{ printf "%.1f" $value }}%
            模型: {{ $labels.model_name }}

      # High data-missing-rate alert ($value is a 0-1 ratio here, so
      # humanizePercentage is correct).
      - alert: HighDataMissingRate
        expr: |
          ai_data_missing_rate > 0.2
        for: 5m
        labels:
          severity: warning
          team: data-engineering
          component: data-quality
        annotations:
          summary: "数据缺失率过高: {{ $labels.dataset }} - {{ $labels.feature }}"
          description: |
            数据集 {{ $labels.dataset }} 的特征 {{ $labels.feature }} 缺失率超过20%。
            当前缺失率: {{ $value | humanizePercentage }}

      # Capacity-planning alert based on a linear 1h extrapolation.
      # $value is a 0-1 ratio (humanizePercentage is correct for it).
      - alert: ResourceUsageTrendingHigh
        expr: |
          predict_linear(ai_gpu_memory_used[1h], 3600) / ai_gpu_memory_total > 0.95
        for: 5m
        labels:
          severity: warning
          team: infrastructure
          component: capacity-planning
        annotations:
          summary: "GPU显存使用趋势过高: {{ $labels.instance }}"
          description: |
            基于过去1小时趋势预测,GPU显存将在1小时内超过95%。
            当前使用率: {{ with printf "(ai_gpu_memory_used{instance='%s'} / ai_gpu_memory_total{instance='%s'}) * 100" $labels.instance $labels.instance | query }}{{ printf "%.1f" (. | first | value) }}%{{ end }}
            预测1小时后使用率: {{ $value | humanizePercentage }}

2.3 告警通知配置

# alertmanager.yml
# Alertmanager routing / notification config for the AI platform.
# NOTE(review): Alertmanager does not expand ${ENV} placeholders natively;
# this file must be rendered (e.g. envsubst) before Alertmanager loads it —
# confirm the deployment pipeline does so.
global:
  resolve_timeout: 5m
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
  wechat_api_secret: '${WECHAT_SECRET}'
  wechat_api_corp_id: '${WECHAT_CORP_ID}'

route:
  # Alerts sharing these labels are grouped into one notification.
  group_by: ['alertname', 'severity', 'component']
  group_wait: 30s        # delay before the first notification of a new group
  group_interval: 5m     # delay between updates for an existing group
  repeat_interval: 12h   # re-notify interval for still-firing alerts
  receiver: 'ai-ops-team'
  
  # Sub-routes: dispatch by severity and owning team.
  # NOTE(review): 'match' is deprecated in favor of 'matchers' in
  # Alertmanager >= 0.22 — kept as-is; confirm the deployed version.
  routes:
    - match:
        severity: critical
      receiver: 'ai-ops-critical'
      group_wait: 10s
      continue: true     # keep evaluating subsequent routes as well
    
    - match:
        team: data-science
      receiver: 'data-science-team'
      continue: false
    
    - match:
        team: infrastructure
      receiver: 'infra-team'
      continue: false

receivers:
  - name: 'ai-ops-team'
    wechat_configs:
      - agent_id: '1000002'
        to_user: '@all'
        message: '{{ template "wechat.default.message" . }}'
        api_secret: '${WECHAT_SECRET}'
        corp_id: '${WECHAT_CORP_ID}'
    
    email_configs:
      - to: 'ai-ops@example.com'
        from: 'alertmanager@example.com'
        smarthost: 'smtp.example.com:587'
        auth_username: '${SMTP_USER}'
        auth_password: '${SMTP_PASSWORD}'
        headers:
          Subject: '[AI监控告警] {{ .GroupLabels.alertname }}'
        html: '{{ template "email.default.html" . }}'
    
    webhook_configs:
      - url: 'http://webhook-receiver:5000/webhook'
        send_resolved: true
        http_config:
          basic_auth:
            username: '${WEBHOOK_USER}'
            password: '${WEBHOOK_PASSWORD}'
  
  - name: 'ai-ops-critical'
    wechat_configs:
      - agent_id: '1000002'
        to_user: 'WangWei|ZhangSan|LiSi'
        message: '{{ template "wechat.critical.message" . }}'
        api_secret: '${WECHAT_SECRET}'
        corp_id: '${WECHAT_CORP_ID}'
    
    # NOTE(review): 'sms_configs' is NOT a built-in Alertmanager receiver
    # type; a stock Alertmanager will reject this config at load time. SMS
    # is usually wired via webhook_configs to an SMS gateway — verify
    # whether a patched/forked Alertmanager is in use here.
    sms_configs:
      - to: '+8613800138000,+8613800138001'
        text: '{{ template "sms.default.text" . }}'
        from: 'AI-Monitor'
        api_url: 'https://sms-api.example.com/send'
        api_secret: '${SMS_SECRET}'
  
  - name: 'data-science-team'
    wechat_configs:
      - agent_id: '1000003'
        to_user: 'DataScienceTeam'
        message: '{{ template "wechat.datascience.message" . }}'
        api_secret: '${WECHAT_SECRET}'
        corp_id: '${WECHAT_CORP_ID}'
  
  - name: 'infra-team'
    wechat_configs:
      - agent_id: '1000004'
        to_user: 'InfraTeam'
        message: '{{ template "wechat.infra.message" . }}'
        api_secret: '${WECHAT_SECRET}'
        corp_id: '${WECHAT_CORP_ID}'

templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Inhibition rules: suppress lower-severity / derived alerts while a
# broader alert is firing, to avoid an alert storm.
# NOTE: 'source_match'/'target_match' are deprecated in favor of
# 'source_matchers'/'target_matchers' in newer Alertmanager versions.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
  
  - source_match:
      alertname: 'AIServiceDown'
    target_match:
      alertname: 'HighInferenceLatency'
    equal: ['instance']

告警抑制机制

告警处理流程

告警处理

通知渠道

路由分发

告警分组

触发告警

critical

warning

data-science

infrastructure

防止告警风暴

抑制相关告警

抑制相关告警

Prometheus规则评估

Alert Manager

按规则分组

等待group_wait

去重与聚合

严重程度?

AI运维-关键

AI运维-常规

涉及团队?

数据科学团队

基础设施团队

企业微信-关键群

短信通知

电话呼叫

企业微信-常规群

邮件通知

企业微信-数据团队

企业微信-基础设施团队

告警确认

问题分析

修复实施

恢复验证

告警关闭

学习与改进

优化监控规则

抑制规则

服务不可用

性能下降

节点宕机

服务异常

3. Vue前端集成与扩展

3.1 Vue监控中心完整实现

<!-- src/views/AIMonitor.vue
     AI monitoring center view: four tabs (system overview / PromQL query /
     alert management / embedded Grafana dashboards), a time-range selector
     and an alert notification sidebar. -->
<template>
  <div class="ai-monitor-container">
    <!-- Top navigation / page header -->
    <div class="monitor-header">
      <el-page-header @back="goBack" title="返回">
        <template #content>
          <div class="header-content">
            <span class="title">AI系统监控中心</span>
            <div class="header-actions">
              <el-button-group>
                <el-button 
                  :type="activeTab === 'overview' ? 'primary' : ''"
                  @click="switchTab('overview')">
                  系统概览
                </el-button>
                <el-button 
                  :type="activeTab === 'metrics' ? 'primary' : ''"
                  @click="switchTab('metrics')">
                  指标查询
                </el-button>
                <el-button 
                  :type="activeTab === 'alerts' ? 'primary' : ''"
                  @click="switchTab('alerts')">
                  告警管理
                </el-button>
                <el-button 
                  :type="activeTab === 'dashboard' ? 'primary' : ''"
                  @click="switchTab('dashboard')">
                  仪表盘
                </el-button>
              </el-button-group>
              
              <!-- Preset time ranges plus an optional custom date-time range -->
              <div class="time-selector">
                <el-select v-model="timeRange" @change="updateTimeRange">
                  <el-option label="最近1小时" value="1h"></el-option>
                  <el-option label="最近6小时" value="6h"></el-option>
                  <el-option label="最近24小时" value="24h"></el-option>
                  <el-option label="最近7天" value="7d"></el-option>
                  <el-option label="最近30天" value="30d"></el-option>
                  <el-option label="自定义" value="custom"></el-option>
                </el-select>
                
                <el-date-picker
                  v-if="timeRange === 'custom'"
                  v-model="customTimeRange"
                  type="datetimerange"
                  range-separator="至"
                  start-placeholder="开始时间"
                  end-placeholder="结束时间"
                  @change="updateCustomTimeRange">
                </el-date-picker>
              </div>
            </div>
          </div>
        </template>
      </el-page-header>
    </div>
    
    <!-- Main content area (one pane per tab) -->
    <div class="monitor-content">
      <!-- System overview tab -->
      <div v-if="activeTab === 'overview'" class="overview-tab">
        <el-row :gutter="20">
          <!-- Key metric cards -->
          <el-col :xs="24" :sm="12" :md="6" v-for="metric in keyMetrics" :key="metric.name">
            <metric-card :metric="metric"></metric-card>
          </el-col>
        </el-row>
        
        <el-row :gutter="20" class="charts-row">
          <el-col :xs="24" :lg="12">
            <div class="chart-container">
              <h3>模型准确率趋势</h3>
              <accuracy-chart :data="accuracyData"></accuracy-chart>
            </div>
          </el-col>
          <el-col :xs="24" :lg="12">
            <div class="chart-container">
              <h3>推理延迟分布</h3>
              <latency-chart :data="latencyData"></latency-chart>
            </div>
          </el-col>
        </el-row>
        
        <el-row :gutter="20">
          <el-col :xs="24">
            <div class="chart-container">
              <h3>实时告警状态</h3>
              <alert-status :alerts="activeAlerts"></alert-status>
            </div>
          </el-col>
        </el-row>
      </div>
      
      <!-- PromQL metrics query tab -->
      <div v-if="activeTab === 'metrics'" class="metrics-tab">
        <promql-editor 
          @execute="executePromQL"
          @save-query="saveQuery"
          :saved-queries="savedQueries">
        </promql-editor>
        
        <!-- Query result: chart / table / raw JSON views -->
        <div class="query-results">
          <el-tabs v-model="resultView" class="result-tabs">
            <el-tab-pane label="图表" name="chart">
              <query-chart 
                v-if="queryResult"
                :result="queryResult"
                :query="currentQuery">
              </query-chart>
            </el-tab-pane>
            <el-tab-pane label="表格" name="table">
              <query-table 
                v-if="queryResult"
                :result="queryResult">
              </query-table>
            </el-tab-pane>
            <el-tab-pane label="原始数据" name="raw">
              <pre class="raw-data">{{ queryResult ? JSON.stringify(queryResult, null, 2) : '' }}</pre>
            </el-tab-pane>
          </el-tabs>
        </div>
      </div>
      
      <!-- Alert management tab -->
      <div v-if="activeTab === 'alerts'" class="alerts-tab">
        <alert-manager 
          :alerts="allAlerts"
          @acknowledge="acknowledgeAlert"
          @silence="silenceAlert"
          @resolve="resolveAlert">
        </alert-manager>
      </div>
      
      <!-- Embedded Grafana dashboards tab -->
      <div v-if="activeTab === 'dashboard'" class="dashboard-tab">
        <dashboard-selector 
          :dashboards="availableDashboards"
          @select="loadDashboard">
        </dashboard-selector>
        
        <div v-if="selectedDashboard" class="grafana-iframe-container">
          <iframe 
            :src="grafanaUrl"
            frameborder="0"
            class="grafana-iframe"
            @load="onIframeLoad">
          </iframe>
          
          <!-- Skeleton overlay shown until the iframe fires 'load' -->
          <div v-if="iframeLoading" class="iframe-loading">
            <el-skeleton :rows="5" animated />
          </div>
        </div>
      </div>
    </div>
    
    <!-- Alert notification sidebar (recent alerts, click to jump) -->
    <alert-notification-sidebar 
      :alerts="recentAlerts"
      @click-alert="handleAlertClick">
    </alert-notification-sidebar>
  </div>
</template>

<script>
// Script for the AI monitoring center view.
//
// Bug fixes vs. the original:
//  * setup() returned `...state`, spreading a reactive() object. Spreading
//    copies the current property values, so reassignments such as
//    `state.keyMetrics = metrics` were never reflected in the template.
//    The state is now exposed via toRefs(state), keeping every property
//    reactive (refs auto-unwrap in the template, so usage is unchanged).
//  * getTimeRangeStart/End now guard against customTimeRange being null —
//    el-date-picker sets its model to null (not []) when cleared.
import { ref, reactive, computed, toRefs, onMounted, onUnmounted } from 'vue'
import { useRouter } from 'vue-router'
import { ElMessage, ElMessageBox } from 'element-plus'
import MetricCard from '../components/MetricCard.vue'
import AccuracyChart from '../components/charts/AccuracyChart.vue'
import LatencyChart from '../components/charts/LatencyChart.vue'
import AlertStatus from '../components/AlertStatus.vue'
import PromqlEditor from '../components/PromqlEditor.vue'
import QueryChart from '../components/QueryChart.vue'
import QueryTable from '../components/QueryTable.vue'
import AlertManager from '../components/AlertManager.vue'
import DashboardSelector from '../components/DashboardSelector.vue'
import AlertNotificationSidebar from '../components/AlertNotificationSidebar.vue'
import { 
  fetchKeyMetrics, 
  fetchAccuracyData, 
  fetchLatencyData,
  fetchAlerts,
  executePromQLQuery,
  getGrafanaDashboards,
  acknowledgeGrafanaAlert,
  silenceGrafanaAlert,
  resolveGrafanaAlert
} from '../api/monitor'

export default {
  name: 'AIMonitor',
  components: {
    MetricCard,
    AccuracyChart,
    LatencyChart,
    AlertStatus,
    PromqlEditor,
    QueryChart,
    QueryTable,
    AlertManager,
    DashboardSelector,
    AlertNotificationSidebar
  },
  setup() {
    const router = useRouter()
    
    // UI state.
    const activeTab = ref('overview')    // current tab key
    const timeRange = ref('1h')          // preset time-range key ('custom' enables the date picker)
    const customTimeRange = ref([])      // [startDate, endDate] when timeRange === 'custom'
    const resultView = ref('chart')      // PromQL result view: chart | table | raw
    const queryResult = ref(null)        // last PromQL query result
    const currentQuery = ref('')         // last executed PromQL expression
    const iframeLoading = ref(false)     // Grafana iframe loading overlay flag
    const selectedDashboard = ref(null)  // currently embedded Grafana dashboard
    
    // Data fetched from the monitoring backend.
    const state = reactive({
      keyMetrics: [],
      accuracyData: [],
      latencyData: [],
      activeAlerts: [],
      allAlerts: [],
      recentAlerts: [],
      savedQueries: JSON.parse(localStorage.getItem('savedPromQLQueries') || '[]'),
      availableDashboards: [],
      refreshInterval: null
    })
    
    // Grafana embed URL: dashboard base URL + time window + kiosk mode.
    const grafanaUrl = computed(() => {
      if (!selectedDashboard.value) return ''
      
      const baseUrl = selectedDashboard.value.url
      const from = getTimeRangeStart()
      const to = getTimeRangeEnd()
      const refresh = timeRange.value === '1h' ? '30s' : '1m'
      
      return `${baseUrl}?from=${from}&to=${to}&refresh=${refresh}&kiosk`
    })
    
    // Navigate back to the previous route.
    const goBack = () => {
      router.go(-1)
    }
    
    // Switch tabs and lazily load the data the target tab needs.
    const switchTab = (tab) => {
      activeTab.value = tab
      
      if (tab === 'overview') {
        loadOverviewData()
      } else if (tab === 'alerts') {
        loadAlerts()
      } else if (tab === 'dashboard') {
        loadDashboards()
      }
    }
    
    // Preset range changed — reload unless the user is entering a custom range.
    const updateTimeRange = () => {
      if (timeRange.value !== 'custom') {
        if (activeTab.value === 'overview') {
          loadOverviewData()
        }
      }
    }
    
    // Custom range picked — reload once both endpoints are set.
    const updateCustomTimeRange = () => {
      if (customTimeRange.value && customTimeRange.value.length === 2) {
        if (activeTab.value === 'overview') {
          loadOverviewData()
        }
      }
    }
    
    // Start of the active time window, as epoch milliseconds.
    const getTimeRangeStart = () => {
      // Guard: clearing the date picker sets customTimeRange to null.
      if (timeRange.value === 'custom' && customTimeRange.value && customTimeRange.value[0]) {
        return customTimeRange.value[0].getTime()
      }
      
      const ranges = {
        '1h': Date.now() - 3600000,
        '6h': Date.now() - 21600000,
        '24h': Date.now() - 86400000,
        '7d': Date.now() - 604800000,
        '30d': Date.now() - 2592000000
      }
      
      // Fall back to "last hour" for unknown keys (incl. incomplete custom).
      return ranges[timeRange.value] || Date.now() - 3600000
    }
    
    // End of the active time window, as epoch milliseconds.
    const getTimeRangeEnd = () => {
      if (timeRange.value === 'custom' && customTimeRange.value && customTimeRange.value[1]) {
        return customTimeRange.value[1].getTime()
      }
      return Date.now()
    }
    
    // Fetch all overview-tab data in parallel.
    const loadOverviewData = async () => {
      try {
        const [metrics, accuracy, latency, alerts] = await Promise.all([
          fetchKeyMetrics(getTimeRangeStart(), getTimeRangeEnd()),
          fetchAccuracyData(getTimeRangeStart(), getTimeRangeEnd()),
          fetchLatencyData(getTimeRangeStart(), getTimeRangeEnd()),
          fetchAlerts('active')
        ])
        
        state.keyMetrics = metrics
        state.accuracyData = accuracy
        state.latencyData = latency
        state.activeAlerts = alerts
      } catch (error) {
        ElMessage.error('加载概览数据失败: ' + error.message)
      }
    }
    
    // Fetch active / all / recent alert lists in parallel.
    const loadAlerts = async () => {
      try {
        const [active, all, recent] = await Promise.all([
          fetchAlerts('active'),
          fetchAlerts('all'),
          fetchAlerts('recent')
        ])
        
        state.activeAlerts = active
        state.allAlerts = all
        state.recentAlerts = recent
      } catch (error) {
        ElMessage.error('加载告警数据失败: ' + error.message)
      }
    }
    
    // Fetch the list of embeddable Grafana dashboards.
    const loadDashboards = async () => {
      try {
        state.availableDashboards = await getGrafanaDashboards()
      } catch (error) {
        ElMessage.error('加载仪表盘列表失败: ' + error.message)
      }
    }
    
    // Run a PromQL expression over the active time window.
    const executePromQL = async (query) => {
      currentQuery.value = query
      
      try {
        const result = await executePromQLQuery(query, getTimeRangeStart(), getTimeRangeEnd())
        queryResult.value = result
      } catch (error) {
        ElMessage.error('执行PromQL查询失败: ' + error.message)
      }
    }
    
    // Persist a query to localStorage (deduplicated).
    const saveQuery = (query) => {
      if (!state.savedQueries.includes(query)) {
        state.savedQueries.push(query)
        localStorage.setItem('savedPromQLQueries', JSON.stringify(state.savedQueries))
        ElMessage.success('查询已保存')
      }
    }
    
    // Select a dashboard; the loading overlay stays until the iframe loads.
    const loadDashboard = (dashboard) => {
      selectedDashboard.value = dashboard
      iframeLoading.value = true
    }
    
    const onIframeLoad = () => {
      iframeLoading.value = false
    }
    
    // Alert lifecycle actions — each refreshes the alert lists on success.
    const acknowledgeAlert = async (alertId) => {
      try {
        await acknowledgeGrafanaAlert(alertId)
        ElMessage.success('告警已确认')
        loadAlerts()
      } catch (error) {
        ElMessage.error('确认告警失败: ' + error.message)
      }
    }
    
    const silenceAlert = async (alertId, duration) => {
      try {
        await silenceGrafanaAlert(alertId, duration)
        ElMessage.success('告警已静默')
        loadAlerts()
      } catch (error) {
        ElMessage.error('静默告警失败: ' + error.message)
      }
    }
    
    const resolveAlert = async (alertId) => {
      try {
        await resolveGrafanaAlert(alertId)
        ElMessage.success('告警已解决')
        loadAlerts()
      } catch (error) {
        ElMessage.error('解决告警失败: ' + error.message)
      }
    }
    
    // Sidebar alert clicked: jump to the alerts tab.
    // (Scrolling to the specific alert could be added here.)
    const handleAlertClick = (alert) => {
      activeTab.value = 'alerts'
    }
    
    // Refresh overview data every 30 seconds while that tab is active.
    const startAutoRefresh = () => {
      state.refreshInterval = setInterval(() => {
        if (activeTab.value === 'overview') {
          loadOverviewData()
        }
      }, 30000)
    }
    
    const stopAutoRefresh = () => {
      if (state.refreshInterval) {
        clearInterval(state.refreshInterval)
        state.refreshInterval = null
      }
    }
    
    // Lifecycle: initial load + auto refresh, torn down on unmount.
    onMounted(() => {
      loadOverviewData()
      loadAlerts()
      startAutoRefresh()
    })
    
    onUnmounted(() => {
      stopAutoRefresh()
    })
    
    return {
      activeTab,
      timeRange,
      customTimeRange,
      resultView,
      queryResult,
      currentQuery,
      iframeLoading,
      selectedDashboard,
      grafanaUrl,
      // toRefs (not a plain spread) keeps state properties reactive.
      ...toRefs(state),
      
      goBack,
      switchTab,
      updateTimeRange,
      updateCustomTimeRange,
      executePromQL,
      saveQuery,
      loadDashboard,
      onIframeLoad,
      acknowledgeAlert,
      silenceAlert,
      resolveAlert,
      handleAlertClick
    }
  }
}
</script>

<style scoped>
/* Layout: full-viewport column — fixed header on top, scrollable content. */
.ai-monitor-container {
  height: 100vh;
  display: flex;
  flex-direction: column;
  background: #f5f7fa;
}

.monitor-header {
  background: white;
  box-shadow: 0 2px 12px 0 rgba(0, 0, 0, 0.1);
  padding: 16px 24px;
  z-index: 100;
}

.header-content {
  display: flex;
  justify-content: space-between;
  align-items: center;
  width: 100%;
}

.header-content .title {
  font-size: 20px;
  font-weight: 600;
  color: #303133;
}

.header-actions {
  display: flex;
  align-items: center;
  gap: 16px;
}

.time-selector {
  display: flex;
  align-items: center;
  gap: 8px;
}

/* Content pane fills the remaining height and scrolls independently. */
.monitor-content {
  flex: 1;
  padding: 24px;
  overflow: auto;
}

.overview-tab .charts-row {
  margin-top: 20px;
  margin-bottom: 20px;
}

/* White card wrapper shared by all charts. */
.chart-container {
  background: white;
  border-radius: 8px;
  padding: 20px;
  box-shadow: 0 2px 12px 0 rgba(0, 0, 0, 0.1);
  margin-bottom: 20px;
  height: 400px;
}

.chart-container h3 {
  margin: 0 0 20px 0;
  color: #303133;
  font-size: 16px;
}

/* Metrics tab: editor on top, results below, sized to the viewport. */
.metrics-tab {
  display: flex;
  flex-direction: column;
  height: calc(100vh - 160px);
}

.query-results {
  flex: 1;
  margin-top: 20px;
  background: white;
  border-radius: 8px;
  padding: 20px;
  box-shadow: 0 2px 12px 0 rgba(0, 0, 0, 0.1);
}

.result-tabs {
  height: 100%;
}

/* :deep() pierces scoped styles into Element Plus internals. */
.result-tabs :deep(.el-tabs__content) {
  height: calc(100% - 55px);
  overflow: auto;
}

.raw-data {
  background: #f6f8fa;
  padding: 16px;
  border-radius: 6px;
  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
  font-size: 12px;
  line-height: 1.5;
  overflow: auto;
  max-height: 600px;
}

.dashboard-tab {
  height: calc(100vh - 160px);
}

/* Positioning context for the iframe-loading overlay. */
.grafana-iframe-container {
  position: relative;
  height: calc(100% - 60px);
  margin-top: 20px;
  border-radius: 8px;
  overflow: hidden;
  box-shadow: 0 2px 12px 0 rgba(0, 0, 0, 0.1);
}

.grafana-iframe {
  width: 100%;
  height: 100%;
  border: none;
}

/* Semi-transparent skeleton overlay while the Grafana iframe loads. */
.iframe-loading {
  position: absolute;
  top: 0;
  left: 0;
  right: 0;
  bottom: 0;
  background: rgba(255, 255, 255, 0.8);
  display: flex;
  align-items: center;
  justify-content: center;
  padding: 40px;
}

/* Responsive design: stack the header and tighten spacing on small screens. */
@media (max-width: 768px) {
  .header-content {
    flex-direction: column;
    align-items: flex-start;
    gap: 12px;
  }
  
  .header-actions {
    width: 100%;
    flex-direction: column;
    align-items: flex-start;
  }
  
  .time-selector {
    width: 100%;
  }
  
  .monitor-content {
    padding: 12px;
  }
  
  .chart-container {
    height: 300px;
    padding: 12px;
  }
}
</style>

3.2 PromQL查询编辑器组件

<!-- src/components/PromqlEditor.vue
     PromQL query editor: CodeMirror editing area with a sidebar of saved
     queries, common functions and a filterable metric list, plus query
     templates and a recent-query history strip. -->
<template>
  <div class="promql-editor">
    <div class="editor-header">
      <span class="title">PromQL查询编辑器</span>
      <div class="header-actions">
        <el-button type="primary" @click="executeQuery" :loading="loading">
          执行查询
        </el-button>
        <el-button @click="saveQuery" :disabled="!query.trim()">
          保存查询
        </el-button>
        <el-button @click="clearQuery">
          清空
        </el-button>
        <!-- Predefined query templates dropdown -->
        <el-dropdown @command="handleTemplateCommand">
          <el-button>
            查询模板<el-icon class="el-icon--right"><arrow-down /></el-icon>
          </el-button>
          <template #dropdown>
            <el-dropdown-menu>
              <el-dropdown-item 
                v-for="template in queryTemplates" 
                :key="template.name"
                :command="template">
                {{ template.name }}
              </el-dropdown-item>
            </el-dropdown-menu>
          </template>
        </el-dropdown>
      </div>
    </div>
    
    <div class="editor-container">
      <div class="editor-main">
        <!-- CodeMirror-based PromQL editor mounts into this element -->
        <div ref="editorRef" class="code-editor"></div>
      </div>
      
      <div class="editor-sidebar">
        <!-- Saved queries (click to load, icon to delete) -->
        <div class="sidebar-section">
          <h4>保存的查询</h4>
          <div class="saved-queries">
            <div 
              v-for="(savedQuery, index) in savedQueries"
              :key="index"
              class="saved-query-item"
              @click="loadSavedQuery(savedQuery)">
              <span class="query-text">{{ truncateQuery(savedQuery) }}</span>
              <el-icon class="delete-icon" @click.stop="deleteSavedQuery(index)">
                <Delete />
              </el-icon>
            </div>
            <div v-if="savedQueries.length === 0" class="empty-message">
              暂无保存的查询
            </div>
          </div>
        </div>
        
        <!-- Common PromQL functions (click to insert at cursor) -->
        <div class="sidebar-section">
          <h4>常用函数</h4>
          <div class="function-list">
            <div 
              v-for="func in commonFunctions"
              :key="func.name"
              class="function-item"
              @click="insertFunction(func)">
              <span class="func-name">{{ func.name }}</span>
              <span class="func-desc">{{ func.description }}</span>
            </div>
          </div>
        </div>
        
        <!-- Filterable metric name list (click to insert) -->
        <div class="sidebar-section">
          <h4>指标列表</h4>
          <el-input
            v-model="metricFilter"
            placeholder="过滤指标..."
            size="small"
            clearable>
          </el-input>
          <div class="metric-list">
            <div 
              v-for="metric in filteredMetrics"
              :key="metric"
              class="metric-item"
              @click="insertMetric(metric)">
              {{ metric }}
            </div>
            <div v-if="filteredMetrics.length === 0" class="empty-message">
              未找到匹配的指标
            </div>
          </div>
        </div>
      </div>
    </div>
    
    <!-- Last five executed queries (click to reload) -->
    <div v-if="history.length > 0" class="query-history">
      <h4>查询历史</h4>
      <div class="history-list">
        <div 
          v-for="(item, index) in history.slice(0, 5)"
          :key="index"
          class="history-item"
          @click="loadHistoryQuery(item.query)">
          <span class="query-text">{{ truncateQuery(item.query) }}</span>
          <span class="query-time">{{ formatTime(item.timestamp) }}</span>
        </div>
      </div>
    </div>
  </div>
</template>

<script>
import { ref, reactive, computed, toRefs, onMounted, onUnmounted, nextTick } from 'vue'
import { ElMessage } from 'element-plus'
import { Delete, ArrowDown } from '@element-plus/icons-vue'
import CodeMirror from 'codemirror'
import 'codemirror/lib/codemirror.css'
import 'codemirror/mode/promql/promql'
import 'codemirror/theme/dracula.css'
import 'codemirror/addon/hint/show-hint'
import 'codemirror/addon/hint/show-hint.css'
import 'codemirror/addon/hint/anyword-hint'
import 'codemirror/addon/edit/matchbrackets'
import 'codemirror/addon/edit/closebrackets'
import { getMetricsList } from '../api/monitor'

export default {
  name: 'PromqlEditor',
  components: {
    Delete,
    ArrowDown
  },
  props: {
    // Saved queries are owned by the parent component and persisted under
    // the 'savedPromQLQueries' localStorage key.
    savedQueries: {
      type: Array,
      default: () => []
    }
  },
  // 'delete-query' was added so the parent can refresh the savedQueries
  // prop after a sidebar deletion; 'execute' / 'save-query' carry the raw
  // PromQL string. Existing listeners are unaffected (backward compatible).
  emits: ['execute', 'save-query', 'delete-query'],
  setup(props, { emit }) {
    // DOM mount point for the CodeMirror editor.
    const editorRef = ref(null)
    // Drives the loading state of the execute button.
    const loading = ref(false)
    // Free-text filter applied to the metric list in the sidebar.
    const metricFilter = ref('')

    // CodeMirror instance. Deliberately a plain variable, not a ref:
    // wrapping the editor in a reactive proxy would break its internals.
    let editor = null

    // Component state: current query text, metric names fetched from the
    // backend, persisted query history, and static template/function data.
    const state = reactive({
      query: '',
      metrics: [],
      history: JSON.parse(localStorage.getItem('promqlQueryHistory') || '[]'),
      queryTemplates: [
        {
          name: 'CPU使用率',
          query: '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
        },
        {
          name: '内存使用率',
          query: '(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100'
        },
        {
          name: '模型准确率',
          query: 'ai_model_accuracy'
        },
        {
          name: '推理延迟P99',
          query: 'histogram_quantile(0.99, sum(rate(ai_model_inference_time_seconds_bucket[5m])) by (le, model_name))'
        },
        {
          name: '预测QPS',
          query: 'sum(rate(ai_prediction_total[5m]))'
        }
      ],
      commonFunctions: [
        { name: 'rate()', description: '计算增长率' },
        { name: 'increase()', description: '计算增长量' },
        { name: 'sum()', description: '求和' },
        { name: 'avg()', description: '平均值' },
        { name: 'max()', description: '最大值' },
        { name: 'min()', description: '最小值' },
        { name: 'count()', description: '计数' },
        { name: 'histogram_quantile()', description: '分位数计算' },
        { name: 'predict_linear()', description: '线性预测' },
        { name: 'label_replace()', description: '标签替换' }
      ]
    })

    // Metrics matching the sidebar filter, capped at 50 entries so the DOM
    // stays small even when thousands of metric names are loaded.
    const filteredMetrics = computed(() => {
      if (!metricFilter.value) {
        return state.metrics.slice(0, 50) // 限制显示数量
      }

      return state.metrics
        .filter(metric => metric.toLowerCase().includes(metricFilter.value.toLowerCase()))
        .slice(0, 50)
    })

    // Create the CodeMirror editor inside editorRef and wire it to state.
    const initEditor = () => {
      editor = CodeMirror(editorRef.value, {
        mode: 'promql',
        theme: 'dracula',
        lineNumbers: true,
        lineWrapping: true,
        matchBrackets: true,
        autoCloseBrackets: true,
        extraKeys: {
          'Ctrl-Space': 'autocomplete',
          // NOTE(review): Enter runs the query, which means newlines cannot
          // be typed — confirm the editor is intended to be single-line.
          'Enter': () => executeQuery()
        },
        hintOptions: {
          completeSingle: false,
          alignWithWord: true
        }
      })

      // Mirror editor content into state so other methods can read it.
      editor.on('change', () => {
        state.query = editor.getValue()
      })

      editor.setValue(state.query)
    }

    // Fetch the metric name list used by the sidebar; failures only log,
    // the editor remains usable without autocompletion data.
    const loadMetrics = async () => {
      try {
        state.metrics = await getMetricsList()
      } catch (error) {
        console.error('加载指标列表失败:', error)
      }
    }

    // Validate, record, and emit the current query for execution.
    const executeQuery = () => {
      const query = editor.getValue().trim()
      if (!query) {
        ElMessage.warning('请输入查询语句')
        return
      }

      loading.value = true

      // Record the query before handing it to the parent.
      addToHistory(query)

      emit('execute', query)

      // The parent performs the actual request; reset the spinner after a
      // short delay since no completion callback is provided.
      setTimeout(() => {
        loading.value = false
      }, 500)
    }

    // Ask the parent to persist the current query.
    const saveQuery = () => {
      const query = editor.getValue().trim()
      if (!query) {
        ElMessage.warning('请输入查询语句')
        return
      }

      emit('save-query', query)
    }

    // Empty the editor contents.
    const clearQuery = () => {
      editor.setValue('')
    }

    // Dropdown handler: load a predefined query template into the editor.
    const handleTemplateCommand = (template) => {
      editor.setValue(template.query)
    }

    // Load a previously saved query into the editor.
    const loadSavedQuery = (query) => {
      editor.setValue(query)
    }

    // Remove a saved query. Besides updating localStorage, notify the
    // parent so the savedQueries prop (and thus the sidebar list) refreshes;
    // the localStorage write alone would leave the visible list stale.
    const deleteSavedQuery = (index) => {
      const newSaved = [...props.savedQueries]
      newSaved.splice(index, 1)
      localStorage.setItem('savedPromQLQueries', JSON.stringify(newSaved))
      emit('delete-query', index)
      ElMessage.success('查询已删除')
    }

    // Insert a function name at the cursor position.
    const insertFunction = (func) => {
      const cursor = editor.getCursor()
      editor.replaceRange(func.name, cursor)
      editor.focus()
    }

    // Insert a metric name at the cursor position.
    const insertMetric = (metric) => {
      const cursor = editor.getCursor()
      editor.replaceRange(metric, cursor)
      editor.focus()
    }

    // Re-load a query from the history list into the editor.
    const loadHistoryQuery = (query) => {
      editor.setValue(query)
    }

    // Prepend a query to the de-duplicated, size-capped history and persist it.
    const addToHistory = (query) => {
      const historyItem = {
        query,
        timestamp: Date.now()
      }

      // De-duplicate: an existing identical query moves to the front.
      const existingIndex = state.history.findIndex(item => item.query === query)
      if (existingIndex !== -1) {
        state.history.splice(existingIndex, 1)
      }

      state.history.unshift(historyItem)

      // Cap history at 50 entries.
      if (state.history.length > 50) {
        state.history = state.history.slice(0, 50)
      }

      localStorage.setItem('promqlQueryHistory', JSON.stringify(state.history))
    }

    // Shorten long queries for display in narrow list rows.
    const truncateQuery = (query) => {
      if (query.length > 60) {
        return query.substring(0, 57) + '...'
      }
      return query
    }

    // Human-friendly relative timestamp for history entries.
    const formatTime = (timestamp) => {
      const date = new Date(timestamp)
      const now = new Date()
      const diff = now - date

      if (diff < 60000) { // 1分钟内
        return '刚刚'
      } else if (diff < 3600000) { // 1小时内
        return Math.floor(diff / 60000) + '分钟前'
      } else if (diff < 86400000) { // 1天内
        return Math.floor(diff / 3600000) + '小时前'
      } else {
        return date.toLocaleDateString()
      }
    }

    onMounted(async () => {
      // Wait for the DOM so editorRef.value exists before mounting CodeMirror.
      await nextTick()
      initEditor()
      loadMetrics()
    })

    onUnmounted(() => {
      // The editor was created with CodeMirror(element, options), not
      // CodeMirror.fromTextArea(), so toTextArea() does not exist on this
      // instance and would throw. Detach the editor's DOM node instead.
      if (editor) {
        editor.getWrapperElement().remove()
        editor = null
      }
    })

    return {
      editorRef,
      loading,
      metricFilter,
      // toRefs keeps the template bindings reactive; spreading the raw
      // reactive object would hand the template one-time plain snapshots
      // of history/metrics/templates that never update.
      ...toRefs(state),
      filteredMetrics,

      executeQuery,
      saveQuery,
      clearQuery,
      handleTemplateCommand,
      loadSavedQuery,
      deleteSavedQuery,
      insertFunction,
      insertMetric,
      loadHistoryQuery,
      truncateQuery,
      formatTime
    }
  }
}
</script>

<style scoped>
/* Root card: vertical flex column so the editor area can stretch to fill. */
.promql-editor {
  height: 100%;
  display: flex;
  flex-direction: column;
  background: white;
  border-radius: 8px;
  box-shadow: 0 2px 12px 0 rgba(0, 0, 0, 0.1);
}

/* Header bar: title on the left, action buttons on the right. */
.editor-header {
  padding: 16px 20px;
  border-bottom: 1px solid #ebeef5;
  display: flex;
  justify-content: space-between;
  align-items: center;
}

.editor-header .title {
  font-size: 16px;
  font-weight: 600;
  color: #303133;
}

.header-actions {
  display: flex;
  gap: 8px;
}

/* Main area: editor (3 parts) next to sidebar (1 part). */
.editor-container {
  flex: 1;
  display: flex;
  min-height: 400px;
}

.editor-main {
  flex: 3;
  border-right: 1px solid #ebeef5;
}

.code-editor {
  height: 100%;
}

/* :deep() pierces scoped-CSS isolation to style CodeMirror's own DOM. */
.code-editor :deep(.CodeMirror) {
  height: 100%;
  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
  font-size: 14px;
}

.code-editor :deep(.CodeMirror-scroll) {
  min-height: 300px;
}

/* Sidebar: saved queries, function reference, and metric browser. */
.editor-sidebar {
  flex: 1;
  min-width: 250px;
  max-width: 300px;
  padding: 16px;
  overflow-y: auto;
  border-left: 1px solid #ebeef5;
}

.sidebar-section {
  margin-bottom: 24px;
}

.sidebar-section h4 {
  margin: 0 0 12px 0;
  color: #606266;
  font-size: 14px;
  font-weight: 500;
}

.saved-queries {
  max-height: 200px;
  overflow-y: auto;
}

.saved-query-item {
  padding: 8px 12px;
  border-radius: 4px;
  margin-bottom: 4px;
  background: #f5f7fa;
  cursor: pointer;
  display: flex;
  justify-content: space-between;
  align-items: center;
}

.saved-query-item:hover {
  background: #ebeef5;
}

/* Query text truncates with an ellipsis instead of wrapping. */
.saved-query-item .query-text {
  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
  font-size: 12px;
  color: #303133;
  flex: 1;
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
}

.delete-icon {
  color: #f56c6c;
  cursor: pointer;
  font-size: 14px;
  margin-left: 8px;
}

.delete-icon:hover {
  color: #f78989;
}

.empty-message {
  color: #909399;
  font-size: 12px;
  text-align: center;
  padding: 12px;
}

/* Common-functions reference list. */
.function-list {
  max-height: 300px;
  overflow-y: auto;
}

.function-item {
  padding: 8px 12px;
  border-radius: 4px;
  margin-bottom: 4px;
  background: #f5f7fa;
  cursor: pointer;
}

.function-item:hover {
  background: #ebeef5;
}

.func-name {
  display: block;
  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
  font-size: 12px;
  color: #409eff;
  font-weight: 500;
}

.func-desc {
  display: block;
  font-size: 11px;
  color: #909399;
  margin-top: 2px;
}

/* Filterable metric-name browser. */
.metric-list {
  max-height: 300px;
  overflow-y: auto;
  margin-top: 8px;
}

.metric-item {
  padding: 6px 12px;
  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
  font-size: 12px;
  color: #303133;
  cursor: pointer;
  border-radius: 4px;
  margin-bottom: 2px;
}

.metric-item:hover {
  background: #f5f7fa;
}

/* Recent-query history strip below the editor. */
.query-history {
  padding: 16px 20px;
  border-top: 1px solid #ebeef5;
}

.history-list {
  max-height: 150px;
  overflow-y: auto;
}

.history-item {
  padding: 8px 12px;
  border-radius: 4px;
  margin-bottom: 4px;
  background: #f5f7fa;
  cursor: pointer;
  display: flex;
  justify-content: space-between;
  align-items: center;
}

.history-item:hover {
  background: #ebeef5;
}

.history-item .query-text {
  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
  font-size: 12px;
  color: #303133;
  flex: 1;
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
}

.history-item .query-time {
  font-size: 11px;
  color: #909399;
  margin-left: 8px;
}

/* Mobile layout: stack the editor above the sidebar and wrap header actions. */
@media (max-width: 768px) {
  .editor-container {
    flex-direction: column;
  }
  
  .editor-main {
    border-right: none;
    border-bottom: 1px solid #ebeef5;
    height: 300px;
  }
  
  .editor-sidebar {
    max-width: none;
    border-left: none;
    border-top: 1px solid #ebeef5;
  }
  
  .header-actions {
    flex-wrap: wrap;
    justify-content: flex-end;
  }
}
</style>

(由于篇幅限制,本文只展示了部分核心代码。完整实现还包括:告警管理组件、图表组件、API层封装、响应式设计优化、数据缓存策略等。)

4. 实战:AI系统监控中心完整架构

优化层

通知渠道

可视化层

存储计算层

数据采集层

Micrometer

prometheus-client

node-exporter

Push Gateway

Java AI服务

Prometheus格式

Python AI服务

Prometheus格式

GPU节点

系统指标

模型批处理

批处理指标

Prometheus集群

TSDB存储

PromQL查询引擎

实时计算

Alert Manager

告警规则引擎

Grafana

监控面板

告警配置

数据源管理

Vue监控中心

自定义查询

告警管理

仪表盘嵌入

实时通知

企业微信

运维团队

邮件

管理人员

短信

值班人员

Webhook

自动化系统

Redis缓存

API响应加速

防抖机制

避免重复告警

响应式设计

多端适配

数据压缩

存储优化

5. 性能优化与最佳实践

5.1 监控数据缓存策略

// src/utils/cache.js
import Redis from 'ioredis'

/**
 * Redis-backed cache for monitoring data.
 * All keys are transparently prefixed with 'ai-monitor:' by ioredis.
 */
class MonitorCache {
  constructor() {
    this.redis = new Redis({
      host: process.env.REDIS_HOST || 'localhost',
      port: process.env.REDIS_PORT || 6379,
      password: process.env.REDIS_PASSWORD,
      // ioredis prepends this to every key argument of every command,
      // including KEYS patterns — callers must pass UNprefixed keys.
      keyPrefix: 'ai-monitor:'
    })
    
    // Default TTLs (seconds) keyed by data category.
    this.defaultTTL = {
      metrics: 300,      // 5 minutes
      alerts: 60,        // 1 minute
      dashboards: 1800,  // 30 minutes
      queries: 600       // 10 minutes
    }
  }
  
  /**
   * Read and JSON-decode a cached value.
   * @param {string} key cache key (without the 'ai-monitor:' prefix)
   * @returns {Promise<any>} the value, or null on miss or error
   */
  async get(key) {
    try {
      const data = await this.redis.get(key)
      return data ? JSON.parse(data) : null
    } catch (error) {
      console.error('Cache get error:', error)
      return null
    }
  }
  
  /**
   * JSON-encode and store a value with a TTL.
   * Errors are logged and swallowed: caching is best-effort by design.
   * @param {string} key cache key
   * @param {any} data value (must be JSON-serializable)
   * @param {number} ttl expiry in seconds; defaults by key prefix
   * @returns {Promise<void>}
   */
  async set(key, data, ttl = null) {
    try {
      const ttlValue = ttl || this._getDefaultTTL(key)
      await this.redis.setex(key, ttlValue, JSON.stringify(data))
    } catch (error) {
      console.error('Cache set error:', error)
    }
  }
  
  /**
   * Delete a cached entry.
   * @param {string} key cache key
   * @returns {Promise<void>}
   */
  async del(key) {
    try {
      await this.redis.del(key)
    } catch (error) {
      console.error('Cache delete error:', error)
    }
  }
  
  /**
   * Cache-aside helper: return the cached value, or call fetchFn and
   * cache its result (asynchronously, without blocking the return).
   * @param {string} cacheKey cache key
   * @param {Function} fetchFn async producer invoked on cache miss
   * @param {number} ttl expiry in seconds
   * @returns {Promise<any>}
   */
  async getWithCache(cacheKey, fetchFn, ttl = null) {
    // Compare against null explicitly: get() returns null only on a miss
    // (or error), so legitimately falsy cached values (0, false, '', [])
    // are still treated as hits instead of triggering a refetch.
    const cached = await this.get(cacheKey)
    if (cached !== null) {
      return cached
    }
    
    const freshData = await fetchFn()
    
    // Fire-and-forget write: do not delay the caller on the cache store.
    this.set(cacheKey, freshData, ttl).catch(console.error)
    
    return freshData
  }
  
  /**
   * Fetch several keys in one round trip.
   * @param {string[]} keys cache keys
   * @returns {Promise<Object>} map of key -> decoded value (misses omitted)
   */
  async mget(keys) {
    try {
      const values = await this.redis.mget(keys)
      const result = {}
      
      keys.forEach((key, index) => {
        if (values[index]) {
          result[key] = JSON.parse(values[index])
        }
      })
      
      return result
    } catch (error) {
      console.error('Cache mget error:', error)
      return {}
    }
  }
  
  /**
   * Aggregate cache statistics for an admin/diagnostics view.
   * WARNING: KEYS is O(N) and blocks Redis while it scans; this is
   * acceptable for an occasional stats call but must never run on a hot
   * path — prefer SCAN or DBSIZE there.
   * @returns {Promise<Object>}
   */
  async getStats() {
    try {
      const info = await this.redis.info()
      const keys = await this.redis.keys('*')
      
      return {
        totalKeys: keys.length,
        memoryUsage: info.match(/used_memory_human:(\S+)/)?.[1],
        hitRate: await this._calculateHitRate(),
        uptime: info.match(/uptime_in_seconds:(\d+)/)?.[1]
      }
    } catch (error) {
      console.error('Get cache stats error:', error)
      return {}
    }
  }
  
  /**
   * Server-wide hit rate derived from INFO keyspace_hits/keyspace_misses.
   * Note this covers the whole Redis instance, not only this prefix.
   * @private
   */
  async _calculateHitRate() {
    const info = await this.redis.info()
    // Fall back to '0' (a string) so parseInt always receives a string,
    // and pass an explicit radix.
    const hits = parseInt(info.match(/keyspace_hits:(\d+)/)?.[1] || '0', 10)
    const misses = parseInt(info.match(/keyspace_misses:(\d+)/)?.[1] || '0', 10)
    const total = hits + misses
    
    return total > 0 ? (hits / total * 100).toFixed(2) : 0
  }
  
  /**
   * Resolve the default TTL from the key's category prefix.
   * @private
   */
  _getDefaultTTL(key) {
    if (key.startsWith('metrics:')) return this.defaultTTL.metrics
    if (key.startsWith('alerts:')) return this.defaultTTL.alerts
    if (key.startsWith('dashboards:')) return this.defaultTTL.dashboards
    if (key.startsWith('queries:')) return this.defaultTTL.queries
    return 300 // default: 5 minutes
  }
}

// 监控API封装,带缓存
/**
 * Monitoring API facade with transparent read-through caching.
 * Set CACHE_ENABLED=false in the environment to bypass the cache entirely.
 */
class CachedMonitorAPI {
  constructor() {
    this.cache = new MonitorCache()
    this.cacheEnabled = process.env.CACHE_ENABLED !== 'false'
  }
  
  /**
   * Key business metrics for a time range (cached).
   * @param {number} start range start (ms epoch)
   * @param {number} end range end (ms epoch)
   */
  async getKeyMetrics(start, end) {
    const cacheKey = `metrics:key:${start}:${end}`
    
    if (!this.cacheEnabled) {
      return this._fetchKeyMetrics(start, end)
    }
    
    return this.cache.getWithCache(
      cacheKey,
      () => this._fetchKeyMetrics(start, end)
    )
  }
  
  /**
   * Alert list (cached per minute so active alerts stay reasonably fresh).
   * @param {string} state alert state filter, e.g. 'active'
   */
  async getAlerts(state = 'active') {
    // Embedding the current minute in the key gives a natural 1-minute
    // cache window without needing explicit invalidation.
    const cacheKey = `alerts:${state}:${Math.floor(Date.now() / 60000)}`
    
    if (!this.cacheEnabled) {
      return this._fetchAlerts(state)
    }
    
    return this.cache.getWithCache(
      cacheKey,
      () => this._fetchAlerts(state),
      60 // alert data cached for 1 minute
    )
  }
  
  /**
   * Execute a PromQL range query (cached unless it targets recent data).
   * @param {string} query PromQL expression
   * @param {number} start range start (ms epoch)
   * @param {number} end range end (ms epoch)
   */
  async executeQuery(query, start, end) {
    const queryHash = this._hashString(query)
    const cacheKey = `queries:${queryHash}:${start}:${end}`
    
    // Queries over the last 5 minutes are considered real-time: serving a
    // cached copy there would hide the newest samples.
    const isRealTime = (Date.now() - start) < 300000
    
    if (!this.cacheEnabled || isRealTime) {
      return this._executePromQL(query, start, end)
    }
    
    return this.cache.getWithCache(
      cacheKey,
      () => this._executePromQL(query, start, end),
      600 // query results cached for 10 minutes
    )
  }
  
  /**
   * Resolve several [query, start, end] tuples at once: one MGET for the
   * cached entries, then concurrent fetches for the misses only.
   * @param {Array<[string, number, number]>} queries
   * @returns {Promise<any[]>} results in the same order as the input
   */
  async batchGetMetrics(queries) {
    if (!this.cacheEnabled || queries.length === 0) {
      return Promise.all(queries.map(q => this.executeQuery(...q)))
    }
    
    const cacheKeys = queries.map(([query, start, end]) => {
      const queryHash = this._hashString(query)
      return `queries:${queryHash}:${start}:${end}`
    })
    
    const cachedResults = await this.cache.mget(cacheKeys)
    
    // Split into already-cached results and tuples that need a live query.
    const results = []
    const toFetch = []
    
    queries.forEach((queryParams, index) => {
      const cacheKey = cacheKeys[index]
      const cached = cachedResults[cacheKey]
      
      if (cached) {
        results[index] = cached
      } else {
        results[index] = null
        toFetch.push({ index, queryParams })
      }
    })
    
    if (toFetch.length > 0) {
      const fetchPromises = toFetch.map(({ index, queryParams }) => 
        this._executePromQL(...queryParams)
          .then(data => {
            results[index] = data
            // Fire-and-forget cache write.
            const cacheKey = cacheKeys[index]
            this.cache.set(cacheKey, data, 600).catch(console.error)
          })
      )
      
      await Promise.all(fetchPromises)
    }
    
    return results
  }
  
  /**
   * Invalidate cached query results — all of them, or only those for a
   * specific query string.
   *
   * NOTE: ioredis applies the configured keyPrefix ('ai-monitor:') to the
   * KEYS pattern automatically, and KEYS returns fully-prefixed names.
   * Therefore the pattern must be passed WITHOUT the prefix (otherwise it
   * would be doubled and match nothing), and the prefix must be stripped
   * from the returned names before DEL re-prefixes them.
   */
  async clearQueryCache(query) {
    const prefix = 'ai-monitor:'
    const pattern = query
      ? `queries:${this._hashString(query)}:*`
      : 'queries:*'
    
    const keys = await this.cache.redis.keys(pattern)
    if (keys.length > 0) {
      const unprefixed = keys.map(k =>
        k.startsWith(prefix) ? k.slice(prefix.length) : k
      )
      await this.cache.redis.del(...unprefixed)
    }
  }
  
  /**
   * Expose the underlying cache statistics.
   */
  async getCacheStats() {
    return this.cache.getStats()
  }
  
  // ---- private: raw backend calls ----
  
  async _fetchKeyMetrics(start, end) {
    const response = await fetch(`/api/metrics/key?start=${start}&end=${end}`)
    return response.json()
  }
  
  async _fetchAlerts(state) {
    const response = await fetch(`/api/alerts?state=${state}`)
    return response.json()
  }
  
  async _executePromQL(query, start, end) {
    const response = await fetch('/api/query', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ query, start, end })
    })
    return response.json()
  }
  
  /**
   * Cheap non-cryptographic string hash (djb2-style shift/add) encoded in
   * base 36, used only to build compact cache keys. Collisions are
   * possible but only cost a shared cache entry.
   * @private
   */
  _hashString(str) {
    let hash = 0
    for (let i = 0; i < str.length; i++) {
      const char = str.charCodeAt(i)
      hash = ((hash << 5) - hash) + char
      hash = hash & hash // clamp to a 32-bit integer
    }
    return Math.abs(hash).toString(36)
  }
}

// 导出单例实例
export const cachedAPI = new CachedMonitorAPI()

5.2 告警防抖与降级策略

// src/utils/alertThrottle.js
/**
 * In-memory alert throttler: rate-limits, silences, and escalates alerts
 * so notification channels are not flooded by repeated firings.
 * State is per-process and lost on restart.
 */
class AlertThrottle {
  constructor() {
    // Per-alert send state, keyed by _getAlertKey(alert).
    this.alertStates = new Map()
    
    this.config = {
      // Minimum interval between identical alerts (ms).
      minInterval: 5 * 60 * 1000, // 5 minutes
      
      // Maximum sends per hour for one alert key.
      maxFrequency: 12,
      
      // Silence window applied once maxFrequency is exceeded (ms).
      silencePeriod: 30 * 60 * 1000, // 30 minutes
      
      // Escalation rules, ordered from least to most severe.
      escalationRules: [
        { count: 3, within: 3600000, level: 'warning' },      // 3x in 1h  -> warning
        { count: 5, within: 3600000, level: 'critical' },     // 5x in 1h  -> critical
        { count: 10, within: 86400000, level: 'emergency' }   // 10x in 24h -> emergency
      ]
    }
  }
  
  /**
   * Decide whether an alert should be sent right now, and at which level.
   * Updates internal counters when the answer is yes.
   * @param {Object} alert alert object (name, instance, severity, labels…)
   * @returns {Object} { shouldSend, reason, level, … }
   */
  shouldSendAlert(alert) {
    const alertKey = this._getAlertKey(alert)
    const now = Date.now()
    
    // Lazily create state for first-seen alert keys.
    let state = this.alertStates.get(alertKey)
    if (!state) {
      state = {
        lastSent: 0,
        sentCount: 0,
        firstSent: 0,
        lastLevel: 'normal',
        silencedUntil: 0
      }
      this.alertStates.set(alertKey, state)
    }
    
    // Silenced alerts are suppressed until the window expires.
    if (now < state.silencedUntil) {
      return {
        shouldSend: false,
        reason: 'alert_silenced',
        level: state.lastLevel,
        retryAfter: state.silencedUntil - now
      }
    }
    
    // Enforce the minimum gap between identical alerts.
    const timeSinceLast = now - state.lastSent
    if (timeSinceLast < this.config.minInterval) {
      return {
        shouldSend: false,
        reason: 'min_interval_not_met',
        level: state.lastLevel,
        retryAfter: this.config.minInterval - timeSinceLast
      }
    }
    
    // Enforce the hourly frequency cap.
    const hourAgo = now - 3600000
    const recentCount = this._getRecentCount(alertKey, hourAgo)
    
    if (recentCount >= this.config.maxFrequency) {
      // Over the cap: enter the silence window. shouldSend stays true so
      // that ONE meta-notification ("now silenced") still goes out.
      state.silencedUntil = now + this.config.silencePeriod
      
      return {
        shouldSend: true,
        reason: 'rate_limit_exceeded',
        level: 'silenced',
        message: `告警频率过高,已静默${this.config.silencePeriod / 60000}分钟`
      }
    }
    
    const level = this._determineAlertLevel(alertKey, alert)
    
    // Record this send.
    state.lastSent = now
    state.sentCount++
    state.lastLevel = level
    
    if (state.sentCount === 1) {
      state.firstSent = now
    }
    
    // Opportunistic garbage collection of stale entries.
    this._cleanupOldStates()
    
    return {
      shouldSend: true,
      reason: 'ok',
      level,
      waitTime: timeSinceLast
    }
  }
  
  /**
   * Manually silence an alert key.
   * @param {string} alertKey alert key
   * @param {number} duration silence duration in ms (default 1 hour)
   */
  silenceAlert(alertKey, duration = 3600000) {
    const state = this.alertStates.get(alertKey)
    if (state) {
      state.silencedUntil = Date.now() + duration
    }
  }
  
  /**
   * Drop all throttling state for an alert key.
   * @param {string} alertKey alert key
   */
  resetAlert(alertKey) {
    this.alertStates.delete(alertKey)
  }
  
  /**
   * Summary statistics over all tracked alert keys.
   * @returns {Object} { totalAlerts, activeAlerts, silencedAlerts, memoryUsage }
   */
  getStats() {
    const now = Date.now()
    const hourAgo = now - 3600000
    
    let totalAlerts = 0
    let activeAlerts = 0
    let silencedAlerts = 0
    
    for (const state of this.alertStates.values()) {
      totalAlerts++
      
      if (state.silencedUntil > now) {
        silencedAlerts++
      }
      
      if (state.lastSent > hourAgo) {
        activeAlerts++
      }
    }
    
    return {
      totalAlerts,
      activeAlerts,
      silencedAlerts,
      memoryUsage: this._getMemoryUsage()
    }
  }
  
  // ---- private helpers ----
  
  /**
   * Build a stable identity key from an alert's distinguishing fields.
   * @private
   */
  _getAlertKey(alert) {
    const components = [
      alert.name,
      alert.instance || 'default',
      alert.severity || 'normal',
      alert.fingerprint || this._hashObject(alert.labels || {})
    ]
    
    return components.join(':')
  }
  
  /**
   * Approximate count of sends since `since`.
   * NOTE(review): this returns the lifetime sentCount whenever firstSent is
   * within the window — a true sliding-window count would need per-send
   * timestamps. Kept as the original's documented simplification.
   * @private
   */
  _getRecentCount(alertKey, since) {
    const state = this.alertStates.get(alertKey)
    if (!state || state.firstSent < since) {
      return 0
    }
    
    return state.sentCount
  }
  
  /**
   * Pick the alert level, applying escalation for repeat offenders.
   * @private
   */
  _determineAlertLevel(alertKey, alert) {
    const state = this.alertStates.get(alertKey)
    if (!state) return alert.severity || 'warning'
    
    const now = Date.now()
    
    // Walk the rules from most to least severe so a repeated alert is
    // escalated to the HIGHEST level it qualifies for. Iterating in the
    // declared ascending order would always return the first, mildest
    // matching rule (e.g. 'warning' even after 5 sends in an hour).
    for (let i = this.config.escalationRules.length - 1; i >= 0; i--) {
      const rule = this.config.escalationRules[i]
      if (state.sentCount >= rule.count && state.firstSent >= now - rule.within) {
        return rule.level
      }
    }
    
    return alert.severity || 'warning'
  }
  
  /**
   * Evict entries idle for more than a day and no longer silenced,
   * bounding the map's memory use.
   * @private
   */
  _cleanupOldStates() {
    const now = Date.now()
    const dayAgo = now - 86400000
    
    for (const [key, state] of this.alertStates.entries()) {
      if (state.lastSent < dayAgo && state.silencedUntil < now) {
        this.alertStates.delete(key)
      }
    }
  }
  
  /**
   * Same 32-bit shift/add hash as elsewhere in this module, hex-encoded,
   * used to fingerprint label objects.
   * @private
   */
  _hashObject(obj) {
    const str = JSON.stringify(obj)
    let hash = 0
    
    for (let i = 0; i < str.length; i++) {
      const char = str.charCodeAt(i)
      hash = ((hash << 5) - hash) + char
      hash = hash & hash
    }
    
    return Math.abs(hash).toString(16)
  }
  
  /**
   * Process memory snapshot in MB. Requires a Node.js runtime (`process`
   * global); do not call from browser code.
   * @private
   */
  _getMemoryUsage() {
    const used = process.memoryUsage()
    return {
      rss: `${Math.round(used.rss / 1024 / 1024)} MB`,
      heapTotal: `${Math.round(used.heapTotal / 1024 / 1024)} MB`,
      heapUsed: `${Math.round(used.heapUsed / 1024 / 1024)} MB`,
      external: `${Math.round(used.external / 1024 / 1024)} MB`
    }
  }
}

// Vue组合式API封装
/**
 * Composition-API wrapper exposing throttled alert dispatch to components.
 * Each call creates its own AlertThrottle, so throttling state is scoped
 * to the component that uses the composable.
 */
export function useAlertThrottle() {
  const throttle = new AlertThrottle()
  
  // Channel name -> sender function; dispatch is data-driven rather than
  // an if-chain, and object insertion order preserves send order.
  const senders = {
    wechat: sendWechatAlert,
    email: sendEmailAlert,
    sms: sendSMSAlert
  }
  
  /**
   * Run the alert through the throttle and, if allowed, fan it out to the
   * requested channels concurrently.
   * @param {Object} alert alert object
   * @param {string|string[]} channel 'all' or channel name(s)
   */
  const sendAlert = async (alert, channel = 'all') => {
    const result = throttle.shouldSendAlert(alert)
    
    // Suppressed alerts are logged and returned unsent — except the
    // rate-limit case, which still emits one "silenced" notification.
    if (!result.shouldSend && result.reason !== 'rate_limit_exceeded') {
      console.log(`告警被抑制: ${result.reason}`, alert)
      return result
    }
    
    try {
      const tasks = Object.entries(senders)
        .filter(([name]) => channel === 'all' || channel.includes(name))
        .map(([, send]) => send(alert, result.level))
      
      await Promise.all(tasks)
      
      console.log(`告警发送成功: ${alert.name}`, result)
      return { ...result, sent: true }
    } catch (error) {
      console.error('发送告警失败:', error)
      return { ...result, sent: false, error: error.message }
    }
  }
  
  // Thin pass-throughs to the underlying throttle instance.
  const silenceAlert = (alertKey, duration) => throttle.silenceAlert(alertKey, duration)
  const getStats = () => throttle.getStats()
  
  return {
    sendAlert,
    silenceAlert,
    getStats
  }
}

// Mock notification senders — replace the log-only bodies with real integrations.
async function sendWechatAlert(alert, level) {
  // TODO: call the WeCom (enterprise WeChat) webhook/API here.
  console.log('发送微信告警:', alert.name, level)
}

async function sendEmailAlert(alert, level) {
  // TODO: send a real email (e.g. via an SMTP client or mail service API).
  console.log('发送邮件告警:', alert.name, level)
}

async function sendSMSAlert(alert, level) {
  // TODO: send a real SMS through the provider's gateway API.
  console.log('发送短信告警:', alert.name, level)
}

总结

通过本文的实战指南,我们构建了一个完整的AI系统监控解决方案,具备以下特点:

  1. 全方位数据采集:覆盖Java、Python服务的业务指标、GPU硬件指标、模型性能指标
  2. 智能可视化:Grafana专业面板与Vue自定义界面的完美结合
  3. 实时告警:基于PromQL的智能告警规则,多通道通知
  4. 高性能前端:缓存优化、防抖机制、响应式设计
  5. 可扩展架构:模块化设计,便于添加新的监控维度和通知渠道

关键收获:

  • 指标设计要围绕业务价值,避免监控数据泛滥
  • 告警配置要精细,避免告警疲劳
  • 缓存策略能显著提升用户体验
  • 响应式设计确保多端可用

记住:好的监控系统不仅要能发现问题,更要能帮助快速定位和解决问题。开始构建您的AI监控系统,让智能服务更加可靠!

扩展资源

Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐