数据挖掘竞赛常用代码段

摘要:
本文整理数据挖掘竞赛常用的 Python 代码段:常用库的导入(gc、os、csv、time、math、datetime、collections、pandas、numpy、tqdm、sklearn、lightgbm、xgboost、catboost、matplotlib、seaborn)、降低内存的工具函数、一阶/二阶统计特征、相关性热力图等画图代码,以及 LightGBM 与 XGBoost 的交叉验证训练模板。

常用库

import gc
import os
import csv
import time
import math
import datetime
import collections
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook, trange
from sklearn import preprocessing

import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

降低内存

def reduce_mem_usage(df):
    """Downcast each column of *df* in place to the smallest dtype that holds its values.

    Integer columns are narrowed to int8/int16/int32/int64; float columns to
    float16/float32/float64; object columns are converted to ``category``.
    Prints memory usage before and after.

    NOTE: float16 keeps only ~3 decimal digits of precision — values may be
    rounded. Object columns become ``category`` unconditionally, which only
    saves memory when cardinality is low.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to optimize (modified in place).

    Returns
    -------
    pd.DataFrame
        The same frame, with downcast dtypes.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    # BUG FIX: the original used a strict-bounds `elif` here, so a
                    # column touching the int64 limits was silently skipped.
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

常用统计特征

一阶

 tmp_df = use_Mb_info_n.groupby("user_id").agg({
        "x1": ["sum","max","min","mean"],
        "x2": ["count","nunique"],
        "x3":["nunique"],
        "x4":["sum"]
    })
tmp_df.columns = ['_'.join(str(xx) for xx in x) for x in tmp_df.columns.ravel()]
tmp_df = tmp_df.reset_index()
feature_df = pd.merge(feature_df, tmp_df, how='left', on='user_id')

二阶

# Second-order aggregation features: statistics per (user_id, date), then the
# `date` level is pivoted into columns so each user gets one wide row.
tmp_df = train_data.groupby(["user_id", "date"]).agg({
        "x1": ["sum", "mean", "max", "skew", pd.DataFrame.kurt],
        "x2": ["nunique"],
        "x3": ["sum"],
        "x4": ["sum"]
    })
tmp_df.columns = ["_".join((str(xx) for xx in x)) for x in tmp_df.columns.ravel()]
# Move `date` from the row index to the columns (one column per date value).
tmp_df = tmp_df.unstack(level=-1)
tmp_df.columns = ["_".join((str(xx) for xx in x)) for x in tmp_df.columns.ravel()]
tmp_df = tmp_df.reset_index()
# BUG FIX: the original merged on 'uid', but the groupby/reset_index key here
# is 'user_id' (and the first-order block also merges on 'user_id') — merging
# on 'uid' would raise a KeyError.
feature_df = pd.merge(feature_df, tmp_df, how='left', on='user_id')

画图

协方差

def correlation_heatmap(df):
    """Draw an annotated Pearson-correlation heatmap of df's numeric columns."""
    fig, axis = plt.subplots(figsize=(50, 50))
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(
        df.corr(),
        cmap=palette,
        square=True,
        cbar_kws={'shrink': .9},
        ax=axis,
        annot=True,          # print the coefficient inside each cell
        linewidths=0.1,
        vmax=1.0,
        linecolor='white',
        annot_kws={'fontsize': 12},
    )

    plt.title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(your_df)

正常显示中文

plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so Chinese labels render instead of boxes
plt.rcParams['axes.unicode_minus']=False #  render the minus sign correctly when a CJK font is active

大小

plt.rcParams['figure.figsize'] = (10,5)  # default figure size in inches (width, height)
plt.rcParams['figure.dpi'] = 200         # default resolution

count条状图

sns.countplot(y="店铺名称", data=df, color="c")

训练

lgb

n_splits = 5        # number of CV folds
seed = 19950115     # random seed so the fold split is reproducible
gbm = None
# LightGBM hyper-parameters
lgb_params = {
    "learning_rate": 0.005,
    "lambda_l2": 0.15,
    "max_depth": 9,
    "objective": "binary",
    "verbose": -1,
    # 'feature_fraction': 0.9,
    # "min_split_gain": 0.1,
    "boosting_type": "gbdt",
    "subsample": 0.75,
    "colsample_bytree": 0.75,
    # "colsample_bylevel": 0.9,
    "scale_pos_weight": 16,   # up-weight the positive class (imbalanced data)
    'metric': ['auc'],        # evaluation metric
}

df_train_columns = [c for c in data.columns if c not in ["label", "uid", "user_id"]]
label = data['label']

# Out-of-fold predictions: every row of `data` gets exactly one score, from the
# fold in which it was held out.
# BUG FIX: the original did `predictions += y_pred.T[0] / skf.n_splits`, but
# LightGBM's binary predict() returns a 1-D array, so `.T[0]` is its *first
# element* (a scalar) — it averaged one scalar per fold instead of keeping the
# per-row scores.
predictions = np.zeros(len(data))
feature_importance_df = pd.DataFrame()
skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)

for fold_, (trn_idx, val_idx) in enumerate(skf.split(data, label.values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(data.iloc[trn_idx][df_train_columns], label=label.iloc[trn_idx])
    val_data = lgb.Dataset(data.iloc[val_idx][df_train_columns], label=label.iloc[val_idx])

    # Train with early stopping on the validation fold.
    gbm = lgb.train(lgb_params,
                    trn_data,
                    # init_model=gbm,
                    num_boost_round=150000,
                    valid_sets=[trn_data, val_data],
                    early_stopping_rounds=200,
                    verbose_eval=200)
    # clf = joblib.load("model/lgb_{}.m".format(index))     # load model
    # joblib.dump(clf, "model/lgb_{}.m".format(index))      # save model
    # gbm.save_model(MODEL_PATH+'/lgb_more_fea.model', num_iteration=gbm.best_iteration)
    y_pred = gbm.predict(data.iloc[val_idx][df_train_columns], num_iteration=gbm.best_iteration)
    # qauc_score = qauc(y_pred, data.iloc[val_idx][df_train_columns], label.iloc[val_idx])
    # print("qauc: ", qauc_score)
    # y_score.append(qauc_score)

    # Collect per-fold feature importances for later plotting.
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = df_train_columns
    fold_importance_df["importance"] = gbm.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    predictions[val_idx] = y_pred

画特征重要性

# Average each feature's importance across folds and keep the top 1000 names.
mean_importance = (feature_importance_df[["Feature", "importance"]]
                   .groupby("Feature")
                   .mean())
cols = mean_importance.sort_values(by="importance", ascending=False)[:1000].index

# Per-fold rows for the selected features (so the bar plot shows fold spread).
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

plt.figure(figsize=(14, 26))
sns.barplot(x="importance",
            y="Feature",
            data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()

画树

# Render one of the trained booster's trees on a very large canvas.
fig, ax = plt.subplots(figsize=(100, 100))
lgb.plot_tree(gbm, ax=ax)

xgb

# XGBoost classifier mirroring the LightGBM settings above.
# BUG FIX: the original passed `sub_sample=0.75`; the XGBoost parameter is
# spelled `subsample`, so the typo was silently ignored and no row subsampling
# happened.
xgb1 = xgb.XGBClassifier(max_depth=9,
                         learning_rate=0.005,
                         n_estimators=10000,
                         colsample_bytree=0.75,
                         subsample=0.75,
                         reg_lambda=0.15,
                         n_jobs=4,
                         random_state=3,
                         scale_pos_weight=16)

df_train_columns = [c for c in data.columns if c not in ["label", "uid", "user_id"]]
label = data['label']

n_splits = 5        # number of CV folds
seed = 19950115     # random seed for reproducible fold splits
skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)

for fold_, (trn_idx, val_idx) in enumerate(skf.split(data, label.values)):
    print("fold {}".format(fold_))
    X_train = data.iloc[trn_idx][df_train_columns]
    y_train = label.iloc[trn_idx]
    X_valid = data.iloc[val_idx][df_train_columns]
    y_valid = label.iloc[val_idx]

    # Evaluation sets monitored during training (train + held-out fold).
    watchlist = [(X_train, y_train), (X_valid, y_valid)]

    # Train with early stopping on the validation fold.
    # NOTE(review): xbm is overwritten each fold — only the last fold's model
    # survives the loop, matching the original snippet's behavior.
    xbm = xgb1.fit(X=X_train,
                   y=y_train,
                   eval_set=watchlist,
                   early_stopping_rounds=200,
                   verbose=100,
                   eval_metric='auc')

画特征重要性

# Bar chart of feature importances from the fitted XGBoost model.
fig, ax = plt.subplots(figsize=(14, 26))
xgb.plot_importance(xbm, ax=ax, height=0.3)

画树

# Render the first tree of the fitted XGBoost model.
# BUG FIX: the original passed `clf`, which is never defined in this snippet;
# the model trained above is `xbm`.
# NOTE(review): 'xgb.fmap' must be a feature-map file on disk — confirm it
# exists, or drop the fmap argument to use default feature names.
xgb.plot_tree(xbm, num_trees=0, fmap='xgb.fmap')
fig = plt.gcf()
fig.set_size_inches(150, 100)
plt.show()

免责声明:文章转载自《数据挖掘竞赛常用代码段》仅用于学习参考。如对内容有疑问,请及时联系本站处理。

上篇@font-face在vue中的使用curl在windows下和linux中使用的一个区别下篇

宿迁高防,2C2G15M,22元/月;香港BGP,2C5G5M,25元/月 雨云优惠码:MjYwNzM=

随便看看

数据不平衡的相关

大多数常见的机器学习算法不能很好地处理不平衡的数据集。例如,搜索引擎的点击预测(点击页面往往占很小的比例)、电子商务中的产品推荐(正在购买的推荐产品的比例很低)、信用卡欺诈检测、网络攻击识别、癌症检测等。处理数据不平衡的方法主要有以下几种。2.数据级别2.1重新采样2.1.1欠采样(下采样)欠采样通过减少丰富类的大小来平衡数据集。它试图通过增加稀有样本的数量...

电脑不识别USB blaster驱动问题

电脑不识别USB blaster,如下图: 解决办法:手动更新...

kafka命令

启动kafka:./kafka-server-start.sh../config/server.properties&查看topic./kafka-topics.sh--zookeeper192.168.8.56:2181,192.168.8.70:2181,192.168.8.147:2181--describe--topicliuhangjun....

sqlserver 计算 百分比

selectltrim+'%'As百分比NUMERIC(P,S)P的默认值是:38S的默认值是:-84~127numeric(a,b)函数有两个参数,前面一个为总的位数,后面一个参数是小数点后的位数,例如numeric(5,2)是总位数为5,小数点后为2位的数,也就是说这个字段的整数位最大是3位。...

adb

ADB(AndroidDebugBridge)ANR(ApplicationNoResponding)ADB实际上是Android调试桥AndroidDebugBridge的缩写。adb是C/S体系结构的命令行工具。这里我们介绍一些常用的命令:adbdevices,获取设备列表和设备状态[xuxu:~]$adbdevicesList-devicesattac...

mysql 视图

如果更新的值不在视图范围内,则不允许更新。如果创建视图时未使用withcheck选项,则MySQL在更新视图中的某些数据时不会执行有效性检查。对于上面的团队视图,MySQL将使用视图的公式来替换它,视图公式将合并到select中。也就是说,它最终被提交给MySQL来处理SQL语句。具体来说,MySQL首先获得视图执行结果,该结果形成中间结果,并临时存储在内存...