Part 1: Baseline code walkthrough

1. Import the necessary packages

import os
import random
import cv2
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, GroupKFold
import numpy as np
import pandas as pd
from sklearn import *
import glob
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from os import path
from pathlib import Path
from seglearn.feature_functions import base_features, emg_features
from tsflex.features import FeatureCollection, MultipleFeatureDescriptors
from tsflex.features.integrations import seglearn_feature_dict_wrapper
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.base import clone
from sklearn.metrics import average_precision_score
import warnings
warnings.filterwarnings('ignore')

2. Load the data

The csvs in the train/defog folder contain data collected at patients' homes, and each csv is named with the patient's Id. So when reading the data we loop over these csvs and add the file name as a column of the dataframe:

DATA_ROOT_DEFOG = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog/'
defog = pd.DataFrame()
for root, dirs, files in os.walk(DATA_ROOT_DEFOG):
    for name in files:       
        f = os.path.join(root, name)
        df_file = pd.read_csv(f)
        df_file['file'] = name.split('.')[0]  # file name (without extension) identifies the recording
        defog = pd.concat([defog, df_file], axis=0)

defog

Use the reduce_memory_usage function from the earlier Optiver blog post to cut memory usage:

defog = reduce_memory_usage(defog)
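
For reference, here is a minimal sketch of such a function, assuming the usual approach of downcasting numeric columns (the exact version in the Optiver post may differ):

def reduce_memory_usage(df):
    # downcast each numeric column to the smallest dtype that can hold it
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    print(f'memory usage: {start_mem:.2f} MB -> {end_mem:.2f} MB')
    return df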

Keep only the valid data:

defog = defog[(defog['Task']==1)&(defog['Valid']==1)]
defog

Load the subject metadata and merge it into the main train table on the subject Id:

defog_metadata = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv")
defog_m = defog_metadata.merge(defog, how='inner', left_on='Id', right_on='file')
defog_m.drop(['file','Valid','Task'], axis=1, inplace=True)
defog_m

A very handy, reusable summary/describe function:

# summary table function
def summary(df):
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values
    summ['%missing'] = df.isnull().sum().values * 100 / len(df)
    summ['#unique'] = df.nunique().values
    desc = pd.DataFrame(df.describe(include='all').transpose())
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
    summ['first value'] = df.iloc[0].values
    summ['second value'] = df.iloc[1].values
    summ['third value'] = df.iloc[2].values
    
    return summ

Print the summary:

summary(defog_m)
# garbage collection for memory
import gc
gc.collect()

Load the lab-collected train data (tdcsfog) in exactly the same way:

DATA_ROOT_TDCSFOG = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/'
tdcsfog = pd.DataFrame()
for root, dirs, files in os.walk(DATA_ROOT_TDCSFOG):
    for name in files:       
        f = os.path.join(root, name)
        df_file = pd.read_csv(f)
        df_file['file'] = name.split('.')[0]  # file name (without extension) identifies the recording
        tdcsfog = pd.concat([tdcsfog, df_file], axis=0)

tdcsfog = reduce_memory_usage(tdcsfog)
tdcsfog_metadata = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv")

tdcsfog_m = tdcsfog_metadata.merge(tdcsfog, how='inner', left_on='Id', right_on='file')
tdcsfog_m.drop(['file'], axis=1, inplace=True)

# garbage collection for memory
import gc
gc.collect()

3. Feature engineering

Use an event column to label the current state: Normal, StartHesitation, Turn, or Walking. The four classes are mutually exclusive, so each row is in exactly one state; np.select is very handy here:

conditions = [
    (defog_m['StartHesitation'] == 1),
    (defog_m['Turn'] == 1),
    (defog_m['Walking'] == 1)]
choices = ['StartHesitation', 'Turn', 'Walking']
defog_m['event'] = np.select(conditions, choices, default='Normal')
defog_m['event'].value_counts().to_frame().style.background_gradient()

Build the simplest possible baseline model, using only the three most informative feature columns plus the event column to be predicted:

train_df = defog_m[['AccV','AccML','AccAP','event']].copy()  # copy so the target column can be added without SettingWithCopyWarning

Encode the event column into the four classes 0, 1, 2, 3:

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

train_df['target'] = le.fit_transform(train_df['event'])
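
LabelEncoder sorts the labels alphabetically, so the fitted encoder can be inspected to see which integer each class gets (Normal=0, StartHesitation=1, Turn=2, Walking=3, which the test-set step below relies on):

dict(zip(le.classes_, le.transform(le.classes_)))
# {'Normal': 0, 'StartHesitation': 1, 'Turn': 2, 'Walking': 3}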

Split the data into X and y to prepare for training:

X = train_df.drop(['event','target'], axis=1)
y = train_df['target']

4. Train the model

Train a multi-class (four-class) model:

import lightgbm as lgb


# split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1004)

# convert the dataset into LightGBM's Dataset format
d_train = lgb.Dataset(X_train, label=y_train)
# set up the parameters
params = {
    'learning_rate': 0.03,
    'boosting_type': 'gbdt',      # gradient-boosted decision trees
    'objective': 'multiclass',    # multi-class target
    'metric': 'multi_logloss',    # metric for multi-class
    'max_depth': 7,
    'num_class': 4,               # number of classes in the target
    'verbose': -1,
}
# train the model for 1,000 boosting rounds
clf = lgb.train(params, d_train, 1000)
#prediction on the test dataset
y_pred_1=clf.predict(X_test)

The prediction is a probability for each of the four classes; taking the class with the highest probability gives the predicted class:

y_pred_1[:1]

Following the competition description, evaluate the model with the precision averaged over the four classes:

# 'macro' option is to calculate metrics for each label, and find their unweighted mean. 
# This does not take label imbalance into account.
from sklearn.metrics import precision_score
precision_score(y_test, np.argmax(y_pred_1, axis=-1), average='macro')
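
To see how each class contributes to the macro average, average=None returns the per-class precision in label order (0 to 3):

precision_score(y_test, np.argmax(y_pred_1, axis=-1), average=None)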

5. Predict on the test set

Read the test set and build the same features as for train:

test_defog_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/02ab235146.csv'
test_defog = pd.read_csv(test_defog_path)
name = os.path.basename(test_defog_path)
id_value = name.split('.')[0]
test_defog['Id_value'] = id_value
test_defog['Id'] = test_defog['Id_value'].astype(str) + '_' + test_defog['Time'].astype(str)
test_defog = test_defog[['Id','AccV','AccML','AccAP']]
test_defog.set_index('Id',inplace=True)

Make predictions with the trained model:

# predict event probability
test_defog_pred=clf.predict(test_defog)
test_defog['event'] = np.argmax(test_defog_pred, axis=-1)

Use np.where to expand the resulting event column into the same three columns as in sample_submission:

# expand the event column into three columns
test_defog['StartHesitation'] = np.where(test_defog['event']==1, 1, 0)
test_defog['Turn'] = np.where(test_defog['event']==2, 1, 0)
test_defog['Walking'] = np.where(test_defog['event']==3, 1, 0)
test_defog.reset_index('Id', inplace=True)

For the lab-collected tdcsfog data, train a model and make predictions in the same way (the original post omits that code; a sketch is given below), then concatenate the resulting dataframe with test_defog:
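
A minimal sketch of the omitted tdcsfog step, assuming a model clf_tdcs was trained on the tdcsfog data exactly as clf was on defog; the tdcsfog test file name below is an assumption:

test_tdcsfog_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog/003f117e14.csv'
test_tdcsfog = pd.read_csv(test_tdcsfog_path)
id_value = os.path.basename(test_tdcsfog_path).split('.')[0]
test_tdcsfog['Id'] = id_value + '_' + test_tdcsfog['Time'].astype(str)
test_tdcsfog = test_tdcsfog[['Id','AccV','AccML','AccAP']].set_index('Id')

# predict event probabilities and expand to the three submission columns
test_tdcsfog_pred = clf_tdcs.predict(test_tdcsfog)
test_tdcsfog['event'] = np.argmax(test_tdcsfog_pred, axis=-1)
test_tdcsfog['StartHesitation'] = np.where(test_tdcsfog['event']==1, 1, 0)
test_tdcsfog['Turn'] = np.where(test_tdcsfog['event']==2, 1, 0)
test_tdcsfog['Walking'] = np.where(test_tdcsfog['event']==3, 1, 0)
test_tdcsfog.reset_index('Id', inplace=True)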

submit = pd.concat([test_tdcsfog,test_defog])
submit = submit[['Id', 'StartHesitation', 'Turn','Walking']]

The code above runs end to end, but it lacks feature engineering and model tuning, so the result is not great. It serves as a baseline to iterate on; the sections below add the feature engineering and submission pipeline for comparison.

Part 2: Full pipeline for data loading, feature engineering, and submission

1. Data loading and feature engineering

Reference: Gait Prediction | Kaggle

Install the time-series feature extraction packages:

# Install tsflex and seglearn
!pip install tsflex --no-index --find-links=file:///kaggle/input/time-series-tools
!pip install seglearn --no-index --find-links=file:///kaggle/input/time-series-tools

Read the necessary csv files; first build full_metadata, the concatenation of the lab (tdcsfog) and home (defog) metadata tables:

root = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/'

train = glob.glob(path.join(root, 'train/**/**'))
test = glob.glob(path.join(root, 'test/**/**'))

subjects = pd.read_csv(path.join(root, 'subjects.csv'))
tasks = pd.read_csv(path.join(root, 'tasks.csv'))
events = pd.read_csv(path.join(root, 'events.csv'))

tdcsfog_metadata = pd.read_csv(path.join(root, 'tdcsfog_metadata.csv'))
defog_metadata = pd.read_csv(path.join(root, 'defog_metadata.csv')) 

tdcsfog_metadata['Module'] = 'tdcsfog'
defog_metadata['Module'] = 'defog'

full_metadata = pd.concat([tdcsfog_metadata, defog_metadata])

Process the data in the subjects and tasks csvs with the necessary feature engineering (mainly KMeans clustering into groups):

seed = 100
cluster_size = 8
subjects['Sex'] = subjects['Sex'].factorize()[0]
subjects = subjects.fillna(0).groupby('Subject').median()
subjects['s_group'] = cluster.KMeans(n_clusters = cluster_size, random_state = seed).fit_predict(subjects[subjects.columns[1:]])
new_names = {'Visit':'s_visit','Age':'s_age','YearsSinceDx':'s_years','UPDRSIII_On':'s_on','UPDRSIII_Off':'s_off','NFOGQ':'s_NFOGQ', 'Sex': 's_sex'}
subjects = subjects.rename(columns = new_names)
subjects
tasks['Duration'] = tasks['End'] - tasks['Begin']
tasks = pd.pivot_table(tasks, values=['Duration'], index=['Id'], columns=['Task'], aggfunc='sum', fill_value=0)
tasks.columns = [c[1] for c in tasks.columns]
tasks = tasks.reset_index()
tasks['t_group'] = cluster.KMeans(n_clusters = cluster_size, random_state = seed).fit_predict(tasks[tasks.columns[1:]])
tasks

Merge the metadata and subjects tables:

# merge the subjects with the metadata
metadata_w_subjects = full_metadata.merge(subjects, how='left', on='Subject').copy()
metadata_w_subjects['Medication'] = metadata_w_subjects['Medication'].factorize()[0]
#features = metadata_w_subjects.columns
metadata_w_subjects

Use the time-series feature extraction packages to define fc, which will generate the time-window features; both descriptor sets use windows and strides of 5,000 samples over the three acceleration series:

basic_feats = MultipleFeatureDescriptors(
    functions=seglearn_feature_dict_wrapper(base_features()),
    series_names=['AccV', 'AccML', 'AccAP'],
    windows=[5000],
    strides=[5000],
)

emg_feats = emg_features()
del emg_feats['simple square integral'] # is same as abs_energy (which is in base_features)

emg_feats = MultipleFeatureDescriptors(
    functions=seglearn_feature_dict_wrapper(emg_feats),
    series_names=['AccV', 'AccML', 'AccAP'],
    windows=[5000],
    strides=[5000],
)

fc = FeatureCollection([basic_feats, emg_feats])

Define a reader function to read the train csvs one at a time (keeping only Time, the three key features, and the three labels). It adds an Id column holding the Id from the file name, a Module column recording whether the file is home (defog) or lab (tdcsfog) data, and a Time_frac column for the fraction of elapsed time. It then merges in the t_group cluster column from tasks on Id, merges in the important columns of metadata_w_subjects on Id, runs fc on the merged df to generate the windowed time-series features, joins those features back onto the df by the index (Time), and finally forward-fills any remaining missing values:

def reader(file):
    try:
        df = pd.read_csv(file, index_col='Time', usecols=['Time', 'AccV', 'AccML', 'AccAP', 'StartHesitation', 'Turn' , 'Walking'])

        path_split = file.split('/')
        df['Id'] = path_split[-1].split('.')[0]
        dataset = Path(file).parts[-2]
        df['Module'] = dataset
        
        # this is done because the speeds are at different rates for the datasets
#         if dataset == 'tdcsfog':
#             df.AccV = df.AccV / 9.80665
#             df.AccML = df.AccML / 9.80665
#             df.AccAP = df.AccAP / 9.80665

        df['Time_frac']=(df.index/df.index.max()).values
        
        df = pd.merge(df, tasks[['Id','t_group']], how='left', on='Id').fillna(-1)
        
        df = pd.merge(df, metadata_w_subjects[['Id','Subject', 'Visit','Test','Medication','s_group']], how='left', on='Id').fillna(-1)
        
        df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin").astype(np.float32)
        df = df.merge(df_feats, how="left", left_index=True, right_index=True)
        
#         # stride
#         df["Stride"] = df["AccV"] + df["AccML"] + df["AccAP"]

#         # step
#         df["Step"] = np.sqrt(abs(df["Stride"]))
    
        df.fillna(method="ffill", inplace=True)
        
        return df
    except Exception:
        # skip files that fail to parse; pd.concat ignores the resulting None
        return None

Call the function above, and define the feature columns cols, the label columns pcols, and the submission columns scols:

train = pd.concat([reader(f) for f in tqdm(train)]).fillna(0); print(train.shape)
cols = [c for c in train.columns if c not in ['Id','Subject','Module', 'Time', 'StartHesitation', 'Turn' , 'Walking', 'Valid', 'Task','Event']]
pcols = ['StartHesitation', 'Turn' , 'Walking']
scols = ['Id', 'StartHesitation', 'Turn' , 'Walking']
train = train.reset_index(drop=True)

2. Build the model

Define the best hyperparameters and the custom evaluation metric function custom_average_precision:

best_params_ = {'colsample_bytree': 0.5282057895135501,
 'learning_rate': 0.22659963168004743,
 'max_depth': 8,
 'min_child_weight': 3.1233911067827616,
 'n_estimators': 291,
 'subsample': 0.9961057796456088,
 }

def custom_average_precision(y_true, y_pred):
    # LightGBM custom eval functions return (eval_name, value, is_higher_better)
    score = average_precision_score(y_true, y_pred)
    return 'average_precision', score, True
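
A quick sanity check with the toy example from the sklearn docs confirms the (name, value, is_higher_better) tuple that LightGBM expects:

y_true_demo = np.array([0, 0, 1, 1])
y_score_demo = np.array([0.1, 0.4, 0.35, 0.8])
custom_average_precision(y_true_demo, y_score_demo)  # ('average_precision', 0.8333..., True)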

Build a multi-label output wrapper for the LGBM model: define one regressor per label and fit each label separately:

class LGBMMultiOutputRegressor(MultiOutputRegressor):
    def fit(self, X, y, eval_set=None, **fit_params):
        self.estimators_ = [clone(self.estimator) for _ in range(y.shape[1])]
        
        for i, estimator in enumerate(self.estimators_):
            if eval_set:
                fit_params['eval_set'] = [(eval_set[0], eval_set[1][:, i])]
            estimator.fit(X, y[:, i], **fit_params)
        
        return self

Run five-fold cross-validation with GroupKFold, treating rows with the same subject as one group so that a subject is never split between the training and validation sets of a fold, which protects the generalization estimate (an optional leak check is sketched right after this paragraph). Store the five models in regs and the five validation average-precision scores in cvs:
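
A minimal one-off check, under the same split, that no Subject leaks across a fold:

for tr_idx, te_idx in GroupKFold(5).split(train, groups=train.Subject):
    # every subject must appear on exactly one side of the split
    assert set(train.Subject.iloc[tr_idx]).isdisjoint(set(train.Subject.iloc[te_idx]))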

kfold = GroupKFold(5)
groups = kfold.split(train, groups=train.Subject)

regs = []
cvs = []

for tr_idx, te_idx in tqdm(groups, total=5, desc="Folds"):

    # subsample 2,000,000 training rows per fold to keep memory manageable
    tr_idx = pd.Series(tr_idx).sample(n=2000000, random_state=42).values

    multioutput_regressor = LGBMMultiOutputRegressor(lgb.LGBMRegressor(**best_params_))

    x_train = train.loc[tr_idx, cols].to_numpy()
    y_train = train.loc[tr_idx, pcols].to_numpy()
    
    x_test = train.loc[te_idx, cols].to_numpy()
    y_test = train.loc[te_idx, pcols].to_numpy()

    multioutput_regressor.fit(
        x_train, y_train,
        eval_set=(x_test, y_test),
        eval_metric=custom_average_precision,
        early_stopping_rounds=15,
        verbose = 0,
    )
    
    regs.append(multioutput_regressor)
    
    cv = metrics.average_precision_score(y_test, multioutput_regressor.predict(x_test).clip(0.0,1.0))
    
    cvs.append(cv)
    
print(cvs)
print(np.mean(cvs))

3. Predict and save the submission

Read the sample_submission file, then read each test file one by one and process it exactly as the train files, so the same features are available for prediction. Each of the five CV models predicts on the test set; the results are stacked and averaged to get one prediction dataframe per test file. The labels are then concatenated horizontally with the test features, an Id column matching sample_submission is created, and the dataframe of Id plus label columns for each test file is appended to a list:

sub = pd.read_csv(path.join(root, 'sample_submission.csv'))
submission = []

for f in test:
    df = pd.read_csv(f)
    df.set_index('Time', drop=True, inplace=True)

    df['Id'] = f.split('/')[-1].split('.')[0]
            
    df['Time_frac']=(df.index/df.index.max()).values
    df = pd.merge(df, tasks[['Id','t_group']], how='left', on='Id').fillna(-1)

    df = pd.merge(df, metadata_w_subjects[['Id','Subject', 'Visit','Test','Medication','s_group']], how='left', on='Id').fillna(-1)
    df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin")
    df = df.merge(df_feats, how="left", left_index=True, right_index=True)
    df.fillna(method="ffill", inplace=True)

    res_vals = []
    
    for i_fold in range(5):
        
        pred = regs[i_fold].predict(df[cols]).clip(0.0,1.0)
        res_vals.append(np.expand_dims(np.round(pred, 3), axis = 2))
        
    res_vals = np.mean(np.concatenate(res_vals, axis = 2), axis = 2)
    res = pd.DataFrame(res_vals, columns=pcols)
    
    df = pd.concat([df,res], axis=1)
    df['Id'] = df['Id'].astype(str) + '_' + df.index.astype(str)
    submission.append(df[scols])

Concatenate the per-file results (vertically by default), merge them into the sample_submission file on Id, fill the NaN values, and write out the csv for submission:

submission = pd.concat(submission)
submission = pd.merge(sub[['Id']], submission, how='left', on='Id').fillna(0.0)
submission[scols].to_csv('submission.csv', index=False)
                       Id  StartHesitation   Turn  Walking
0            003f117e14_0           0.0124  0.008   0.0126
1            003f117e14_1           0.0124  0.008   0.0126
2            003f117e14_2           0.0124  0.008   0.0126
3            003f117e14_3           0.0124  0.008   0.0126
4            003f117e14_4           0.0124  0.008   0.0126
...                   ...              ...    ...      ...
286365  02ab235146_281683           0.0112  0.058   0.0120
286366  02ab235146_281684           0.0112  0.058   0.0120
286367  02ab235146_281685           0.0112  0.058   0.0120
286368  02ab235146_281686           0.0112  0.058   0.0120
286369  02ab235146_281687           0.0112  0.058   0.0120

The table above is a sample of the submission; the actual leaderboard score is around 0.237. I will keep improving it and updating this post, so likes, bookmarks, and follows are welcome. The next post will walk through the Kaggle M5 time-series competition.
