今日的示例代码包含2个部分

  1. notebook文件夹内的ipynb文件,介绍下今天的思路
  2. 项目文件夹中其他部分:拆分后的信贷项目,学习下如何拆分的,未来你看到的很多大项目都是类似的拆分方法

知识点回顾

  1. 规范的文件命名
  2. 规范的文件夹管理
  3. 机器学习项目的拆分
  4. 编码格式和类型注解

作业:尝试针对之前的心脏病项目ipynb,将它按照今天的示例项目整理成规范的形式,思考下哪些部分可以未来复用。

import os
import shutil

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Step 1: create the project directory layout.
project_root = '/mnt/heart_disease_prediction'
data_dir = os.path.join(project_root, 'data')
raw_data_dir = os.path.join(data_dir, 'raw')
processed_data_dir = os.path.join(data_dir, 'processed')
src_dir = os.path.join(project_root, 'src')
data_code_dir = os.path.join(src_dir, 'data')
models_code_dir = os.path.join(src_dir, 'models')
visualization_code_dir = os.path.join(src_dir, 'visualization')
notebooks_dir = os.path.join(project_root, 'notebooks')

dirs = [
    project_root, data_dir, raw_data_dir, processed_data_dir,
    src_dir, data_code_dir, models_code_dir, visualization_code_dir,
    notebooks_dir
]
# exist_ok=True makes creation idempotent and avoids the
# check-then-create race of the original `os.path.exists` guard.
for d in dirs:
    os.makedirs(d, exist_ok=True)

# Step 2: stage the raw data.
# Assumes heart.csv lives under /mnt; copy it into data/raw/.
# NOTE: `import shutil` now lives in the top-level import block (PEP 8
# requires imports at the top of the file, not mid-script).
shutil.copy('/mnt/heart.csv', raw_data_dir)

# Step 3: extract the data-preprocessing code into src/data/preprocessing.py.
preprocessing_code = """
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def handle_missing_values(data):
    # 处理缺失值的代码逻辑
    return data.ffill()  # 示例,具体根据实际情况调整

def encode_features(data):
    # 标签编码
    label_encoder = LabelEncoder()
    # 假设某些列需要标签编码,这里以示例说明
    data['categorical_column_1'] = label_encoder.fit_transform(data['categorical_column_1'])
    
    # 独热编码
    onehot_encoder = OneHotEncoder()
    # 假设某些列需要独热编码,这里以示例说明
    encoded = onehot_encoder.fit_transform(data[['categorical_column_2']]).toarray()
    encoded_df = pd.DataFrame(encoded, columns=onehot_encoder.get_feature_names_out(['categorical_column_2']))
    data = pd.concat([data.reset_index(drop=True), encoded_df], axis=1)
    data = data.drop(['categorical_column_2'], axis=1)
    
    return data

def split_dataset(data, target_column):
    X = data.drop(target_column, axis=1)
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
"""
preprocessing_file_path = os.path.join(data_code_dir, 'preprocessing.py')
# Fix 1: `fillna(method='ffill')` is deprecated (removed in recent pandas);
#        the generated module now calls `DataFrame.ffill()` directly.
# Fix 2: explicit encoding — the string contains Chinese comments, and the
#        platform default encoding may not be UTF-8.
with open(preprocessing_file_path, 'w', encoding='utf-8') as file:
    file.write(preprocessing_code)

# Step 4: extract the feature-engineering code into src/data/feature_engineering.py.
feature_engineering_code = """
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def process_continuous_features(data):
    # 连续特征处理逻辑,如标准化等
    for col in data.select_dtypes(include=['float64', 'int64']).columns:
        data[col] = (data[col] - data[col].mean()) / data[col].std()
    return data

def analyze_feature_importance(X_train, y_train):
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
    return feature_importance
"""
feature_engineering_file_path = os.path.join(data_code_dir, 'feature_engineering.py')
# encoding='utf-8' is required: the code string contains non-ASCII (Chinese)
# comments and the platform default encoding may reject them.
with open(feature_engineering_file_path, 'w', encoding='utf-8') as file:
    file.write(feature_engineering_code)

# Step 5: extract the model-training code into src/models/train.py.
train_code = """
import shap
from sklearn.ensemble import RandomForestClassifier

def train_random_forest(X_train, y_train):
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    return model

def explain_model(model, X_train):
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)
    return shap_values
"""
train_file_path = os.path.join(models_code_dir, 'train.py')
# Write as UTF-8 explicitly so the output does not depend on the
# platform's default locale encoding.
with open(train_file_path, 'w', encoding='utf-8') as file:
    file.write(train_code)

# Step 6: extract the model-evaluation code into src/models/evaluate.py.
evaluate_code = """
from sklearn.metrics import accuracy_score, recall_score

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    return accuracy, recall
"""
evaluate_file_path = os.path.join(models_code_dir, 'evaluate.py')
# Write as UTF-8 explicitly so the output does not depend on the
# platform's default locale encoding.
with open(evaluate_file_path, 'w', encoding='utf-8') as file:
    file.write(evaluate_code)

# Step 7: extract the visualization code into src/visualization/plots.py.
plots_code = """
import matplotlib.pyplot as plt
import seaborn as sns
import shap

def plot_feature_importance(feature_importance):
    plt.figure(figsize=(10, 6))
    feature_importance.nlargest(10).plot(kind='barh')
    plt.title('Top 10 Feature Importance')
    plt.show()

def plot_shap_values(shap_values, X_train):
    shap.summary_plot(shap_values, X_train)
"""
plots_file_path = os.path.join(visualization_code_dir, 'plots.py')
# Write as UTF-8 explicitly so the output does not depend on the
# platform's default locale encoding.
with open(plots_file_path, 'w', encoding='utf-8') as file:
    file.write(plots_code)

# Step 8: write the notebook driver code under notebooks/.
# Fixes vs. the original source:
#   - the triple-quoted string was never terminated (SyntaxError) and was
#     never written to disk; both are repaired here;
#   - the embedded snippet used `pd.read_csv` without importing pandas;
#   - a stray markdown ```python fence from the original extraction removed.
notebook_content = """
import pandas as pd

from src.data.preprocessing import handle_missing_values, encode_features, split_dataset
from src.data.feature_engineering import process_continuous_features, analyze_feature_importance
from src.models.train import train_random_forest, explain_model
from src.models.evaluate import evaluate_model
from src.visualization.plots import plot_feature_importance, plot_shap_values

# 加载数据
data = pd.read_csv('../data/raw/heart.csv')

# 数据预处理
data = handle_missing_values(data)
data = encode_features(data)

# 特征工程
data = process_continuous_features(data)
X_train, X_test, y_train, y_test = split_dataset(data, 'target_column')  # 替换 target_column 为实际列名
feature_importance = analyze_feature_importance(X_train, y_train)

# 模型训练
model = train_random_forest(X_train, y_train)
shap_values = explain_model(model, X_train)

# 模型评估
accuracy, recall = evaluate_model(model, X_test, y_test)
print(f"Accuracy: {accuracy}, Recall: {recall}")

# 可视化
plot_feature_importance(feature_importance)
plot_shap_values(shap_values, X_train)
"""
# NOTE(review): the content is plain Python, not ipynb JSON, so it is saved
# as a .py script; convert it to a real notebook (e.g. with jupytext) if an
# .ipynb file is actually required.
notebook_file_path = os.path.join(notebooks_dir, 'model_development.py')
with open(notebook_file_path, 'w', encoding='utf-8') as file:
    file.write(notebook_content)

@浙大疏锦行

Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐