TFMA(tensorflow_model_analysis)常用操作(使用简单的dataframe评估模型)
TFMA(tensorflow_model_analysis)是TFX的组成部分,本文包括TFMA的常用操作——使用简单的dataframe评估模型,这种方法与模型使用的框架无关,且可视化操作有交互性。
·
目录
import pandas as pd
from google.protobuf import text_format
import tensorflow_model_analysis as tfma
2023-06-26 21:43:20.786840: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-26 21:43:23.042179: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/TensorRT/lib:/usr/local/cuda-11.7/lib64
2023-06-26 21:43:23.042551: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/TensorRT/lib:/usr/local/cuda-11.7/lib64
2023-06-26 21:43:23.042563: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
使用dataframe进行评估
这里假设每个样本的预测由多个值组成(prediction 列的每个元素是一个列表)。如果预测只有单个值,则更加简单,写法与 label 列一样。
# Toy evaluation data: an integer label, a list-valued prediction (three values
# per row) and an 'iden' column that is used as the slicing feature.
df = pd.DataFrame(
    {
        'label': [0, 1, 2],
        'prediction': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        'iden': ['male', 'female', 'male'],
    }
)
df
| | label | prediction | iden |
|---|---|---|---|
| 0 | 0 | [1, 2, 3] | male |
| 1 | 1 | [4, 5, 6] | female |
| 2 | 2 | [7, 8, 9] | male |
# Build the tfma.EvalConfig by parsing protobuf text format:
#   - model_specs:   names the DataFrame columns holding the label and prediction.
#   - metrics_specs: the metrics and plots to compute.
#   - slicing_specs: one empty spec plus one keyed on the 'iden' feature.
# NOTE: lines starting with '#' inside the string are text-format comments that
# belong to the parsed string; they are not Python comments.
eval_config = text_format.Parse("""
model_specs{
label_key: 'label',
prediction_key: 'prediction'
}
metrics_specs{
#aggregate{micro_average: True} #这里的bool可以大写开头或小写
metrics{class_name: "ExampleCount"}
metrics{
class_name: "FairnessIndicators"
config: '{"thresholds": [0.5]}'
}
metrics{class_name: "MultiClassConfusionMatrixPlot"}
metrics{
class_name: "SparseCategoricalCrossentropy"
config:'{"axis":-1 ,"from_logits":true}' #这里的bool需要小写,否则报错
}
metrics{class_name: "SparseCategoricalAccuracy"}
}
slicing_specs{}
slicing_specs{
feature_keys: 'iden'
}
""",tfma.EvalConfig())
eval_config
model_specs {
label_key: "label"
prediction_key: "prediction"
}
slicing_specs {
}
slicing_specs {
feature_keys: "iden"
}
metrics_specs {
metrics {
class_name: "ExampleCount"
}
metrics {
class_name: "FairnessIndicators"
config: "{\"thresholds\": [0.5]}"
}
metrics {
class_name: "MultiClassConfusionMatrixPlot"
}
metrics {
class_name: "SparseCategoricalCrossentropy"
config: "{\"axis\":-1 ,\"from_logits\":true}"
}
metrics {
class_name: "SparseCategoricalAccuracy"
}
}
# Evaluate the DataFrame directly with the config above; output_path is reused
# later to load the metrics and the validation result back from disk.
output_path = './test4'
eval_result = tfma.analyze_raw_data(data=df, eval_config=eval_config, output_path=output_path)
WARNING:apache_beam.io.filebasedsink:Deleting 1 existing files in target path matching:
WARNING:apache_beam.io.filebasedsink:Deleting 1 existing files in target path matching: -*-of-%(num_shards)05d
WARNING:apache_beam.io.filebasedsink:Deleting 1 existing files in target path matching: -*-of-%(num_shards)05d
WARNING:apache_beam.io.filebasedsink:Deleting 1 existing files in target path matching: -*-of-%(num_shards)05d
WARNING:apache_beam.io.filebasedsink:Deleting 1 existing files in target path matching:
可视化切片——指标
# Interactive metrics view, sliced by the 'iden' column.
tfma.view.render_slicing_metrics(eval_result, slicing_column='iden')
可视化Plot
# Render the plots for the iden == 'male' slice only.
male_slicing_spec = tfma.SlicingSpec(feature_values={'iden': 'male'})
tfma.view.render_plot(eval_result, slicing_spec=male_slicing_spec)
可视化Fairness
# Render the Fairness Indicators widget for the evaluation result
# (backed by the "FairnessIndicators" metric requested in eval_config).
tfma.addons.fairness.view.widget_view.render_fairness_indicator(eval_result)
同时可视化多个评估结果——比较
# output_paths: List[str] -- the output paths of several evaluation runs (untested).
# eval_results_from_disk = tfma.load_eval_results(output_paths)
# tfma.view.render_time_series(eval_results_from_disk)
评估结果获取
# List the slice keys present in the result; the empty tuple () is the overall
# (unsliced) entry.
eval_result.get_slice_names()
[(('iden', 'male'),), (), (('iden', 'female'),)]
# List the names of all metrics that were computed.
eval_result.get_metric_names()
['fairness_indicators_metrics/true_negative_rate@0.5',
'fairness_indicators_metrics/false_positive_rate@0.5',
'fairness_indicators_metrics/recall@0.5',
'fairness_indicators_metrics/false_omission_rate@0.5',
'fairness_indicators_metrics/false_negative_rate@0.5',
'fairness_indicators_metrics/true_positive_rate@0.5',
'example_count',
'fairness_indicators_metrics/precision@0.5',
'fairness_indicators_metrics/false_discovery_rate@0.5',
'sparse_categorical_accuracy',
'sparse_categorical_crossentropy',
'fairness_indicators_metrics/positive_rate@0.5',
'fairness_indicators_metrics/negative_rate@0.5']
# Return a dict that maps every slice key to its {metric name: value} dict.
eval_result.get_metrics_for_all_slices()
{(('iden',
'male'),): {'sparse_categorical_crossentropy': {'doubleValue': 1.4076058864593506},
'sparse_categorical_accuracy': {'doubleValue': 0.5},
'example_count': {'doubleValue': 2.0},
'fairness_indicators_metrics/false_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/false_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/true_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/true_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/false_discovery_rate@0.5': {'doubleValue': 0.6666666666666666},
'fairness_indicators_metrics/false_omission_rate@0.5': {'doubleValue': 'NaN'},
'fairness_indicators_metrics/precision@0.5': {'doubleValue': 0.3333333333333333},
'fairness_indicators_metrics/recall@0.5': {'doubleValue': 1.0}},
(): {'sparse_categorical_crossentropy': {'doubleValue': 1.4076060056686401},
'sparse_categorical_accuracy': {'doubleValue': 0.3333333432674408},
'example_count': {'doubleValue': 3.0},
'fairness_indicators_metrics/false_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/false_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/true_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/true_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/false_discovery_rate@0.5': {'doubleValue': 0.6666666666666666},
'fairness_indicators_metrics/false_omission_rate@0.5': {'doubleValue': 'NaN'},
'fairness_indicators_metrics/precision@0.5': {'doubleValue': 0.3333333333333333},
'fairness_indicators_metrics/recall@0.5': {'doubleValue': 1.0}},
(('iden',
'female'),): {'sparse_categorical_crossentropy': {'doubleValue': 1.4076058864593506},
'sparse_categorical_accuracy': {'doubleValue': 0.0},
'example_count': {'doubleValue': 1.0},
'fairness_indicators_metrics/false_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/false_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/true_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/true_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/false_discovery_rate@0.5': {'doubleValue': 0.6666666666666666},
'fairness_indicators_metrics/false_omission_rate@0.5': {'doubleValue': 'NaN'},
'fairness_indicators_metrics/precision@0.5': {'doubleValue': 0.3333333333333333},
'fairness_indicators_metrics/recall@0.5': {'doubleValue': 1.0}}}
# A slice key is a tuple of (feature, value) pairs; fetch the metrics of the
# iden == 'male' slice only.
male_slice = (('iden', 'male'),)
eval_result.get_metrics_for_slice(male_slice)
{'sparse_categorical_crossentropy': {'doubleValue': 1.4076058864593506},
'sparse_categorical_accuracy': {'doubleValue': 0.5},
'example_count': {'doubleValue': 2.0},
'fairness_indicators_metrics/false_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/false_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/true_positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/true_negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/positive_rate@0.5': {'doubleValue': 1.0},
'fairness_indicators_metrics/negative_rate@0.5': {'doubleValue': 0.0},
'fairness_indicators_metrics/false_discovery_rate@0.5': {'doubleValue': 0.6666666666666666},
'fairness_indicators_metrics/false_omission_rate@0.5': {'doubleValue': 'NaN'},
'fairness_indicators_metrics/precision@0.5': {'doubleValue': 0.3333333333333333},
'fairness_indicators_metrics/recall@0.5': {'doubleValue': 1.0}}
评估结果——dataframe
# Load the metrics written under output_path and convert them to pandas
# DataFrames; double_value holds the scalar (float) metrics.
metrics_records = tfma.load_metrics(output_path)
dfs = tfma.experimental.dataframe.metrics_as_dataframes(metrics_records)
dfs.double_value
slices | metric_keys | metric_values | ||||||
---|---|---|---|---|---|---|---|---|
iden | Overall | name | model_name | output_name | example_weighted | is_diff | double_value | |
0 | b'male' | NaN | sparse_categorical_crossentropy | False | False | 1.407606 | ||
1 | b'male' | NaN | sparse_categorical_accuracy | False | False | 0.500000 | ||
2 | b'male' | NaN | example_count | False | False | 2.000000 | ||
3 | b'male' | NaN | fairness_indicators_metrics/false_positive_rat... | False | False | 1.000000 | ||
4 | b'male' | NaN | fairness_indicators_metrics/false_negative_rat... | False | False | 0.000000 | ||
5 | b'male' | NaN | fairness_indicators_metrics/true_positive_rate... | False | False | 1.000000 | ||
6 | b'male' | NaN | fairness_indicators_metrics/true_negative_rate... | False | False | 0.000000 | ||
7 | b'male' | NaN | fairness_indicators_metrics/positive_rate@0.5 | False | False | 1.000000 | ||
8 | b'male' | NaN | fairness_indicators_metrics/negative_rate@0.5 | False | False | 0.000000 | ||
9 | b'male' | NaN | fairness_indicators_metrics/false_discovery_ra... | False | False | 0.666667 | ||
10 | b'male' | NaN | fairness_indicators_metrics/false_omission_rat... | False | False | NaN | ||
11 | b'male' | NaN | fairness_indicators_metrics/precision@0.5 | False | False | 0.333333 | ||
12 | b'male' | NaN | fairness_indicators_metrics/recall@0.5 | False | False | 1.000000 | ||
13 | NaN | sparse_categorical_crossentropy | False | False | 1.407606 | |||
14 | NaN | sparse_categorical_accuracy | False | False | 0.333333 | |||
15 | NaN | example_count | False | False | 3.000000 | |||
16 | NaN | fairness_indicators_metrics/false_positive_rat... | False | False | 1.000000 | |||
17 | NaN | fairness_indicators_metrics/false_negative_rat... | False | False | 0.000000 | |||
18 | NaN | fairness_indicators_metrics/true_positive_rate... | False | False | 1.000000 | |||
19 | NaN | fairness_indicators_metrics/true_negative_rate... | False | False | 0.000000 | |||
20 | NaN | fairness_indicators_metrics/positive_rate@0.5 | False | False | 1.000000 | |||
21 | NaN | fairness_indicators_metrics/negative_rate@0.5 | False | False | 0.000000 | |||
22 | NaN | fairness_indicators_metrics/false_discovery_ra... | False | False | 0.666667 | |||
23 | NaN | fairness_indicators_metrics/false_omission_rat... | False | False | NaN | |||
24 | NaN | fairness_indicators_metrics/precision@0.5 | False | False | 0.333333 | |||
25 | NaN | fairness_indicators_metrics/recall@0.5 | False | False | 1.000000 | |||
26 | b'female' | NaN | sparse_categorical_crossentropy | False | False | 1.407606 | ||
27 | b'female' | NaN | sparse_categorical_accuracy | False | False | 0.000000 | ||
28 | b'female' | NaN | example_count | False | False | 1.000000 | ||
29 | b'female' | NaN | fairness_indicators_metrics/false_positive_rat... | False | False | 1.000000 | ||
30 | b'female' | NaN | fairness_indicators_metrics/false_negative_rat... | False | False | 0.000000 | ||
31 | b'female' | NaN | fairness_indicators_metrics/true_positive_rate... | False | False | 1.000000 | ||
32 | b'female' | NaN | fairness_indicators_metrics/true_negative_rate... | False | False | 0.000000 | ||
33 | b'female' | NaN | fairness_indicators_metrics/positive_rate@0.5 | False | False | 1.000000 | ||
34 | b'female' | NaN | fairness_indicators_metrics/negative_rate@0.5 | False | False | 0.000000 | ||
35 | b'female' | NaN | fairness_indicators_metrics/false_discovery_ra... | False | False | 0.666667 | ||
36 | b'female' | NaN | fairness_indicators_metrics/false_omission_rat... | False | False | NaN | ||
37 | b'female' | NaN | fairness_indicators_metrics/precision@0.5 | False | False | 0.333333 | ||
38 | b'female' | NaN | fairness_indicators_metrics/recall@0.5 | False | False | 1.000000 |
# Pivot the long-format metrics table: one row per slice, one column per metric.
tfma.experimental.dataframe.auto_pivot(dfs.double_value)
(metric_keys, name) | example_count | fairness_indicators_metrics/false_discovery_rate@0.5 | fairness_indicators_metrics/false_negative_rate@0.5 | fairness_indicators_metrics/false_omission_rate@0.5 | fairness_indicators_metrics/false_positive_rate@0.5 | fairness_indicators_metrics/negative_rate@0.5 | fairness_indicators_metrics/positive_rate@0.5 | fairness_indicators_metrics/precision@0.5 | fairness_indicators_metrics/recall@0.5 | fairness_indicators_metrics/true_negative_rate@0.5 | fairness_indicators_metrics/true_positive_rate@0.5 | sparse_categorical_accuracy | sparse_categorical_crossentropy |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
slices | |||||||||||||
Overall: | 3.0 | 0.666667 | 0.0 | NaN | 1.0 | 0.0 | 1.0 | 0.333333 | 1.0 | 0.0 | 1.0 | 0.333333 | 1.407606 |
iden:b'female' | 1.0 | 0.666667 | 0.0 | NaN | 1.0 | 0.0 | 1.0 | 0.333333 | 1.0 | 0.0 | 1.0 | 0.000000 | 1.407606 |
iden:b'male' | 2.0 | 0.666667 | 0.0 | NaN | 1.0 | 0.0 | 1.0 | 0.333333 | 1.0 | 0.0 | 1.0 | 0.500000 | 1.407606 |
# Filter slices: keep only the rows of the iden == b'male' slice, then pivot.
df_double = dfs.double_value
is_male_slice = df_double.slices.iden == b'male'
df_filtered = df_double.loc[is_male_slice]
tfma.experimental.dataframe.auto_pivot(df_filtered)
(metric_keys, name) | example_count | fairness_indicators_metrics/false_discovery_rate@0.5 | fairness_indicators_metrics/false_negative_rate@0.5 | fairness_indicators_metrics/false_omission_rate@0.5 | fairness_indicators_metrics/false_positive_rate@0.5 | fairness_indicators_metrics/negative_rate@0.5 | fairness_indicators_metrics/positive_rate@0.5 | fairness_indicators_metrics/precision@0.5 | fairness_indicators_metrics/recall@0.5 | fairness_indicators_metrics/true_negative_rate@0.5 | fairness_indicators_metrics/true_positive_rate@0.5 | sparse_categorical_accuracy | sparse_categorical_crossentropy |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
slices | |||||||||||||
iden:b'male' | 2.0 | 0.666667 | 0.0 | NaN | 1.0 | 0.0 | 1.0 | 0.333333 | 1.0 | 0.0 | 1.0 | 0.5 | 1.407606 |
# Load the validation result. A meaningful result requires model validation
# (multiple models); here there is only one model and no change_threshold is
# configured, so there is no validation result.
tfma.load_validation_result(output_path)
missing_thresholds: true
Model Validation(多模型)
由于没有相应的数据和模型,下面的代码无法在本文环境中运行。代码摘录自 https://tensorflow.google.cn/tfx/tutorials/model_analysis/tfma_basic ,未经测试。
# Set up a tfma.EvalConfig for model validation: two model_specs (a "candidate"
# and a "baseline" marked with is_baseline), a metrics list in which AUC carries
# both a value_threshold and a change_threshold, and several slicing specs.
# NOTE: lines starting with '#' inside the string are protobuf text-format
# comments that belong to the parsed string; they are not Python comments.
eval_config_with_thresholds = text_format.Parse("""
## Model information
model_specs {
name: "candidate"
# For keras we need to add a `label_key`.
label_key: "big_tipper"
}
model_specs {
name: "baseline"
# For keras we need to add a `label_key`.
label_key: "big_tipper"
is_baseline: true
}
## Post training metric information
metrics_specs {
metrics { class_name: "ExampleCount" }
metrics { class_name: "BinaryAccuracy" }
metrics { class_name: "BinaryCrossentropy" }
metrics {
class_name: "AUC"
threshold {
# Ensure that AUC is always > 0.9
value_threshold {
lower_bound { value: 0.9 }
}
# Ensure that AUC does not drop by more than a small epsilon
# e.g. (candidate - baseline) > -1e-10 or candidate > baseline - 1e-10
change_threshold {
direction: HIGHER_IS_BETTER
absolute { value: -1e-10 }
}
}
}
metrics { class_name: "AUCPrecisionRecall" }
metrics { class_name: "Precision" }
metrics { class_name: "Recall" }
metrics { class_name: "MeanLabel" }
metrics { class_name: "MeanPrediction" }
metrics { class_name: "Calibration" }
metrics { class_name: "CalibrationPlot" }
metrics { class_name: "ConfusionMatrixPlot" }
# ... add additional metrics and plots ...
}
## Slicing information
slicing_specs {} # overall slice
slicing_specs {
feature_keys: ["trip_start_hour"]
}
slicing_specs {
feature_keys: ["trip_start_day"]
}
slicing_specs {
feature_keys: ["trip_start_month"]
}
slicing_specs {
feature_keys: ["trip_start_hour", "trip_start_day"]
}
""", tfma.EvalConfig())
# NOTE: this snippet is excerpted from the TFMA tutorial and is not runnable on
# its own: MODELS_DIR, OUTPUT_DIR and tfrecord_file are not defined anywhere in
# this article and must be provided by the reader.
import os  # needed by the os.path.join calls below; it was never imported

# Create tfma.EvalSharedModels that point at our keras models.
candidate_model_path = os.path.join(MODELS_DIR, 'keras', '2')
baseline_model_path = os.path.join(MODELS_DIR, 'keras', '1')
eval_shared_models = [
    # Candidate model: the one being validated.
    tfma.default_eval_shared_model(
        model_name=tfma.CANDIDATE_KEY,
        eval_saved_model_path=candidate_model_path,
        eval_config=eval_config_with_thresholds),
    # Baseline model: the reference for the change_threshold comparison.
    tfma.default_eval_shared_model(
        model_name=tfma.BASELINE_KEY,
        eval_saved_model_path=baseline_model_path,
        eval_config=eval_config_with_thresholds),
]
validation_output_path = os.path.join(OUTPUT_DIR, 'validation')
# Run TFMA on both models; results are written to validation_output_path.
eval_result_with_validation = tfma.run_model_analysis(
    eval_shared_models,
    eval_config=eval_config_with_thresholds,
    data_location=tfrecord_file,
    output_path=validation_output_path)
其他评估模型的方法(使用Beam或非Tensorflow模型)
更多推荐
所有评论(0)