1:报名地址

     https://aistudio.baidu.com/aistudio/competition/detail/51

        

2:排名分数

        

 3:证书登场

 

位置先空着

4: 模型源码

废话不多说,直接上源码

import pandas as pd
import numpy as np
import pickle

# Load the raw order-detail rows.
# Rows up to and including TRAIN_END form the feature window; rows after it
# are only used to derive the label.
TRAIN_END = '2013-07-31 23:59:59'

raw = pd.read_csv('./train.csv')
# Sort chronologically BEFORE slicing: the slice below is a copy, so sorting
# `raw` afterwards would leave `train_raw` in file order and every
# groupby(...).last() feature would no longer mean "most recent purchase".
# mergesort is stable, so rows sharing a pay time keep their file order.
raw.sort_values('order_pay_time', ascending=True, inplace=True, kind='mergesort')
train_raw = raw[raw['order_pay_time'] <= TRAIN_END]

# Customers with at least one payment after the cut-off (the original notes
# describe this as the following month, August).
label_raw = set(raw[raw['order_pay_time'] > TRAIN_END]['customer_id'].dropna())

# Data preprocessing
def _add_last_values(data, grouped, columns):
    """Add ``<column>_last`` features: each customer's last non-null value."""
    # A list (not a tuple) of keys is required to select several columns.
    last_values = grouped[list(columns)].last()
    for column in columns:
        data[column + '_last'] = last_values[column]


def _add_stats(data, grouped, column, stats):
    """Add ``<column>_<stat>`` features for each aggregation name in ``stats``."""
    # The result is indexed by customer_id, so assignment aligns with `data`.
    aggregated = grouped[column].agg(list(stats))
    for stat in stats:
        data[column + '_' + stat] = aggregated[stat]


def preprocess(raw, train='train'):
    """Aggregate order-detail rows into one feature row per customer.

    Args:
        raw: DataFrame of order-detail rows. It must contain ``customer_id``
            and the goods/order/member columns referenced below. The
            ``*_last`` features take the last non-null value per customer in
            the row order of ``raw``, so pass rows sorted by
            ``order_pay_time`` if "last" should mean "most recent".
        train: Unused; kept so existing call sites keep working.

    Returns:
        DataFrame indexed by ``customer_id`` with one column per feature.
        The three ``*_time_last`` columns are returned as datetimes.
    """
    grouped = raw.groupby('customer_id')

    # Gender: last known value, 0 when missing.
    data = pd.DataFrame(grouped['customer_gender'].last().fillna(0))

    # Customer x goods interaction (last behaviour).
    _add_last_values(data, grouped, [
        'goods_id', 'goods_status', 'goods_price', 'goods_has_discount',
        'goods_list_time', 'goods_delist_time',
    ])

    # Customer x order interaction (last behaviour).
    _add_last_values(data, grouped, [
        'order_total_num', 'order_amount', 'order_total_payment',
        'order_total_discount', 'order_pay_time', 'order_status',
        'order_count', 'is_customer_rate', 'order_detail_status',
        'order_detail_goods_num', 'order_detail_amount',
        'order_detail_payment', 'order_detail_discount',
    ])

    # Customer x membership interaction (last behaviour).
    _add_last_values(data, grouped, [
        'member_id', 'member_status', 'is_member_actived',
    ])

    # Original goods price and order payment: min / max / mean / std.
    _add_stats(data, grouped, 'goods_price', ['min', 'max', 'mean', 'std'])
    _add_stats(data, grouped, 'order_total_payment',
               ['min', 'max', 'mean', 'std'])

    # Number of (non-null) order ids and goods ids per customer.
    data['order_count'] = grouped['order_id'].count()
    data['goods_count'] = grouped['goods_id'].count()

    # Last known province and city.
    data['customer_province'] = grouped['customer_province'].last()
    data['customer_city'] = grouped['customer_city'].last()

    # Whether the customer rated orders: mean and sum.
    _add_stats(data, grouped, 'is_customer_rate', ['mean', 'sum'])

    # Amount due divided by amount paid for the last order detail.
    # NOTE: a zero payment yields inf here, as in the original formula.
    data['discount'] = (
        data['order_detail_amount_last'] / data['order_detail_payment_last']
    )

    # Member status, order-detail discount, stock status, member activation
    # and order status: mean and sum of each.
    for column in ('member_status', 'order_detail_discount', 'goods_status',
                   'is_member_actived', 'order_status'):
        _add_stats(data, grouped, column, ['mean', 'sum'])

    # Number of order-detail rows per customer.
    data['order_detail_count'] = grouped.size()

    # Goods discount flag: mean and sum.
    _add_stats(data, grouped, 'goods_has_discount', ['mean', 'sum'])

    # Order payment total (the mean is already present from the stats above).
    data['order_total_payment_sum'] = grouped['order_total_payment'].sum()

    # Number of goods per order: mean and sum.
    _add_stats(data, grouped, 'order_total_num', ['mean', 'sum'])

    # Calendar parts of the last payment time.
    data['order_pay_time_last'] = pd.to_datetime(data['order_pay_time_last'])
    pay_time = data['order_pay_time_last'].dt
    data['order_pay_time_last_m'] = pay_time.month
    data['order_pay_time_last_d'] = pay_time.day
    data['order_pay_time_last_h'] = pay_time.hour
    data['order_pay_time_last_min'] = pay_time.minute
    data['order_pay_time_last_s'] = pay_time.second
    data['order_pay_time_last_weekday'] = pay_time.weekday

    # Day offsets are measured from an assumed start time of 2012-10-11.
    t_min = pd.to_datetime('2012-10-11 00:00:00')
    data['order_pay_time_last_diff'] = (
        data['order_pay_time_last'] - t_min
    ).dt.days

    # Latest listing time of the goods, in days since t_min.
    data['goods_list_time_last'] = pd.to_datetime(data['goods_list_time_last'])
    data['goods_list_time_diff'] = (
        data['goods_list_time_last'] - t_min
    ).dt.days

    # Latest delisting time of the goods, in days since t_min.
    data['goods_delist_time_last'] = pd.to_datetime(
        data['goods_delist_time_last'])
    data['goods_delist_time_diff'] = (
        data['goods_delist_time_last'] - t_min
    ).dt.days

    # Days the goods were on display (delisting minus listing).
    data['goods_time_diff'] = (
        data['goods_delist_time_diff'] - data['goods_list_time_diff']
    )
    return data

# Raw datetime columns are dropped once the derived numeric features exist.
time_cols = ['goods_list_time_last', 'goods_delist_time_last', 'order_pay_time_last']

# Training set: features from the training window, label = 1 when the
# customer appears in label_raw.
train_raw2 = preprocess(train_raw)
train_raw2['label'] = [int(cid in label_raw) for cid in train_raw2.index]
train_raw2 = train_raw2.drop(columns=time_cols)

# Test set: features from the full data.
test = preprocess(raw)
test = test.drop(columns=time_cols)

# Label-encode province and city for the training and test frames.
# Each encoder is fitted ONCE on the union of both frames so that a given
# province/city string gets the same integer code in train and test.
# Fitting separately per frame gives the two frames unrelated codes.
from sklearn.preprocessing import LabelEncoder

test['customer_province'] = test['customer_province'].astype('str')
test['customer_city'] = test['customer_city'].astype('str')
train_raw2['customer_province'] = train_raw2['customer_province'].astype('str')
train_raw2['customer_city'] = train_raw2['customer_city'].astype('str')

lel = LabelEncoder()
lel.fit(pd.concat([train_raw2['customer_province'], test['customer_province']]))
test['customer_province'] = lel.transform(test['customer_province'])
train_raw2['customer_province'] = lel.transform(train_raw2['customer_province'])

le2 = LabelEncoder()
le2.fit(pd.concat([train_raw2['customer_city'], test['customer_city']]))
test['customer_city'] = le2.transform(test['customer_city'])
train_raw2['customer_city'] = le2.transform(train_raw2['customer_city'])


# Cache the preprocessed frames on disk.
train_raw2.to_pickle('./train_raw.pkl')
test.to_pickle('./test.pkl')


# Reload the cached frames. These files were written just above by this
# script; do not point this at pickle files from an untrusted source.
train_raw2 = pd.read_pickle('./train_raw.pkl')
test = pd.read_pickle('./test.pkl')

# Turn the customer_id index into a column, then stack both frames so they
# share one column layout. Test rows have no label, so after the concat
# their 'label' is NaN, which is what separates the two parts again.
train_raw2 = train_raw2.reset_index()
test = test.reset_index()
all_df = pd.concat([train_raw2, test], axis=0)
train_raw2 = all_df[all_df['label'].notnull()]
test = all_df[all_df['label'].isnull()]

# LightGBM model
import lightgbm as lgb

# Hand-picked LGBMClassifier hyper-parameters.
lgb_params = dict(
    num_leaves=2**5-1,
    reg_alpha=0.25,
    reg_lambda=0.25,
    objective='binary',
    max_depth=-1,
    learning_rate=0.005,
    min_child_samples=3,
    random_state=2021,
    n_estimators=2500,
    subsample=1,
    colsample_bytree=1,
)
clf = lgb.LGBMClassifier(**lgb_params)

# Everything except the label and the customer id is a feature.
train_features = train_raw2.drop(columns=['label', 'customer_id'])
train_labels = train_raw2['label']
clf.fit(train_features, train_labels)


# Post-process the predictions into a submission file.
# Scores observed for different buy_num values:
#   0.70457  300000
#   0.7139   400000
#   0.71512  500000
#   0.70902  600000
#   0.71555  450000
cols = train_raw2.columns.tolist()
cols.remove('label')
cols.remove('customer_id')

# Select features by name so the column order matches the training frame.
y_pred = clf.predict_proba(test[cols])[:, 1]

# Attach scores by customer_id rather than by row position, so the result
# does not depend on submission.csv having the same row order as `test`.
scores = pd.Series(y_pred, index=test['customer_id'].values)
result = pd.read_csv('./submission.csv')
# Customers without a prediction get score 0 and therefore rank last.
result['result'] = result['customer_id'].map(scores).fillna(0.0)

# Mark exactly the buy_num highest-scoring customers as buyers (1), the rest 0.
buy_num = 450000
result2 = result.sort_values('result', ascending=False).reset_index(drop=True)
result2['result'] = np.where(np.arange(len(result2)) < buy_num, 1.0, 0.0)
result2.sort_values('customer_id', ascending=True, inplace=True)
result2.to_csv('./baseline_0.7155.csv', index=False)

5:提分要领

1:结果集buy_num参数的调整对结果集的影响比较大

        刚开始buy_num设置为10000左右时,分数比较低,只有0.5左右

        我们在调整这类参数时可以采用成倍递增的方式,例如第一次buy_num=10000,后面依次是20000,40000,80000....每次都以指数的方式增加;当分数出现下降的趋势时,再缩小范围,一步步地逼近当前模型的最高分

2:特征值的处理永远都是提分的关键

        如何抓取到准确的特征值,更多的是靠个人对当前领域场景的把控。例如本次是购买预测,可以结合自己在购买东西时候的关注点,再进行分析。

        例如折扣费用,库存,优惠金额等等都是作为一个购买者考虑的问题。   

6:相关知识

LGBMClassifier经验参数

import lightgbm as lgb

# Example LGBMClassifier configuration, one argument per line.
clf = lgb.LGBMClassifier(
    num_leaves=2**5-1,
    reg_alpha=0.25,
    reg_lambda=0.25,
    objective='binary',
    max_depth=-1,
    learning_rate=0.005,
    min_child_samples=3,
    random_state=2021,
    n_estimators=2000,
    subsample=1,
    colsample_bytree=1,
)

num_leaves=2**5-1,树的最大叶子数,对比XGBoost一般为2^(max_depth)

reg_alpha,L1正则化系数

reg_lambda,L2正则化系数

max_depth,最大树的深度

n_estimators,树的个数,相当于训练的轮数

subsample,训练样本采样率(行采样)

colsample_bytree,训练特征采样率(列采样)

Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐