System Authentication Risk Prediction - Baseline

Build a user authentication behavior feature model and a risk assessment model, then use the risk model to judge whether a given authentication attempt is risky.

Competition Task

In this competition, teams build a user authentication behavior feature model and a risk assessment model from user authentication behavior data and risk anomaly labels, then use the risk model to judge whether a given authentication attempt is risky.

Use the user authentication data to build a per-user behavior baseline.
Use a supervised learning model on the authentication behavior features to build a risk assessment model that judges whether a given authentication attempt is risky.

Competition Dataset

https://www.heywhale.com/mw/dataset/6189288bebdfaf0017562059/file

Competition baseline:

Import packages

```python
import warnings
warnings.simplefilter('ignore')

import os
import re
import gc
import json

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
```

Merge the datasets

```python
# Paths are from the original author's machine; adjust to your local copy.
train = pd.read_csv('E://data//DF//CCK-系统认证风险预测/train_dataset.csv', sep='\t')
test = pd.read_csv('E://data//DF//CCK-系统认证风险预测/test_dataset.csv', sep='\t')
# Concatenate so that encoders and engineered features stay consistent across
# train and test; the two are split apart again later via risk_label.
data = pd.concat([train, test])
```

Feature transformation

```python
# Expand the JSON-encoded location field into its three hierarchy levels.
data['location_first_lvl'] = data['location'].astype(str).apply(lambda x: json.loads(x)['first_lvl'])
data['location_sec_lvl'] = data['location'].astype(str).apply(lambda x: json.loads(x)['sec_lvl'])
data['location_third_lvl'] = data['location'].astype(str).apply(lambda x: json.loads(x)['third_lvl'])

# auth_type has missing values; give them an explicit category.
data['auth_type'].fillna('__NaN__', inplace=True)

# Label-encode all categorical columns on the combined train+test frame.
for col in tqdm(['user_name', 'action', 'auth_type', 'ip',
                 'ip_location_type_keyword', 'ip_risk_level', 'location', 'device_model',
                 'os_type', 'os_version', 'browser_type', 'browser_version',
                 'bus_system_code', 'op_target', 'location_first_lvl', 'location_sec_lvl',
                 'location_third_lvl']):
    lbl = LabelEncoder()
    data[col] = lbl.fit_transform(data[col])
```
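The three location columns above parse the same JSON string three times. A minimal single-parse sketch, usable in place of those three lines (before the label-encoding loop), assuming every location value is a valid JSON object with these three keys:

```python
# Parse each location string once, then pull out the three levels.
parsed = data['location'].astype(str).apply(json.loads)
for key, new_col in [('first_lvl', 'location_first_lvl'),
                     ('sec_lvl', 'location_sec_lvl'),
                     ('third_lvl', 'location_third_lvl')]:
    data[new_col] = parsed.apply(lambda d: d[key])
```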

Time handling

```python
data['op_date'] = pd.to_datetime(data['op_date'])
# Unix timestamp in seconds.
data['op_ts'] = data['op_date'].values.astype(np.int64) // 10 ** 9
data = data.sort_values(by=['user_name', 'op_ts', 'action']).reset_index(drop=True)
# Per-user gap (in seconds) since the previous authentication event.
data['last_ts'] = data.groupby(['user_name'])['op_ts'].shift(1)
data['ts_diff1'] = data['op_ts'] - data['last_ts']
data['weekday'] = data['op_date'].dt.dayofweek + 1
data['year'] = data['op_date'].dt.year
data['year'] = data['year'].map({2018: 0, 2019: 1, 2020: 2})
data['month'] = data['op_date'].dt.month
data['day'] = data['op_date'].dt.day
```

Feature construction:

```python
# Log transforms of the time gap (NaN for each user's first event).
data['ts_diff1_log'] = data['ts_diff1'].apply(np.log)
data['ts_diff1_log_log'] = data['ts_diff1'].apply(np.log).apply(np.log)

# How many distinct ips/locations/devices/OS versions each
# (user, action) pair has used.
for f in ['ip', 'location', 'device_model', 'os_version']:
    data[f'user_{f}_nunique'] = data.groupby(['user_name', 'action'])[f].transform('nunique')

# Per (user, action) statistics of the time gap.
for method in ['mean', 'max', 'min', 'std', 'prod']:
    data[f'ts_diff1_{method}'] = data.groupby(['user_name', 'action'])['ts_diff1'].transform(method)
```
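`transform` is used here rather than `agg` because it broadcasts each group statistic back to every row of the group, keeping the frame's length unchanged. A toy illustration with made-up values:

```python
toy = pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1.0, 3.0, 5.0]})
toy['v_mean'] = toy.groupby('g')['v'].transform('mean')
# v_mean is [2.0, 2.0, 5.0]: one value per row, not one per group
```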

Features correlated with the label (but with no business meaning)

```python
# Ratios of label-encoded codes; they correlate with the label but carry
# no real business meaning.
data['auth_type//ip_risk_level'] = data['auth_type'] / data['ip_risk_level']
data['ip_risk_level//auth_type'] = data['ip_risk_level'] / data['auth_type']
data['browser_type//auth_type'] = data['browser_type'] / data['auth_type']
data['browser_version//auth_type'] = data['browser_version'] / data['auth_type']
```

Check correlations

```python
data.corr()['risk_label']
```

Correlation with the label:

```
ip_risk_level//auth_type     -0.032
auth_type//ip_risk_level     -0.028
browser_type//auth_type      -0.025
browser_version//auth_type   -0.021
op_ts                        -0.021
last_ts                      -0.021
year                         -0.017
browser_version              -0.009
browser_type                 -0.009
weekday                      -0.009
day                          -0.008
month                        -0.008
ip_risk_level                -0.007
auth_type                    -0.006
```

Drop unused features

```python
data.drop(['client_type', 'browser_source', 'user_name', 'bus_system_code'], axis=1, inplace=True)
# Split the combined frame back into train/test by label presence.
train = data[data['risk_label'].notna()]
test = data[data['risk_label'].isna()]
```

Correlation heatmap
(The original post displays a correlation heatmap of the features here.)
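A heatmap like the original figure can be drawn with seaborn; this is a sketch assuming seaborn and matplotlib are installed, not part of the original code:

```python
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(16, 12))
sns.heatmap(train.corr(), cmap='coolwarm', center=0)
plt.show()
```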
Select feature_names

```python
ycol = 'risk_label'
feature_names = list(
    filter(lambda x: x not in [ycol, 'session_id', 'op_date', 'last_ts'], train.columns))
```

Train the model

```python
model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=32,
                           max_depth=6,
                           learning_rate=0.05,
                           n_estimators=3000,
                           subsample=0.8,
                           feature_fraction=0.6,
                           reg_alpha=0.,
                           reg_lambda=0.,
                           random_state=1983,
                           is_unbalance=True,
                           metric='auc')


oof = []
prediction = test[['session_id']].copy()  # .copy() avoids a SettingWithCopyWarning
prediction[ycol] = 0
df_importance_list = []

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1983)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
    X_train = train.iloc[trn_idx][feature_names]
    Y_train = train.iloc[trn_idx][ycol]

    X_val = train.iloc[val_idx][feature_names]
    Y_val = train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id + 1))

    # Note: verbose/early_stopping_rounds as fit() arguments require
    # lightgbm < 4.0; newer versions expect callbacks instead.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    # Out-of-fold predictions, used below for the CV score.
    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = train.iloc[val_idx][['session_id', ycol]].copy()
    df_oof['pred'] = pred_val[:, 1]
    oof.append(df_oof)

    # Test predictions averaged across the folds.
    pred_test = lgb_model.predict_proba(
        test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction[ycol] += pred_test[:, 1] / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()


df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_oof = pd.concat(oof)
print('roc_auc_score', roc_auc_score(df_oof[ycol], df_oof['pred']))
```

Out-of-fold score: roc_auc_score 0.5131761316649879 (barely above random, so there is plenty of room to improve on this baseline).

```python
# Build the submission frame: a 1-based sequential id plus the prediction.
prediction['id'] = range(len(prediction))
prediction['id'] = prediction['id'] + 1
prediction = prediction[['id', 'risk_label']].copy()
prediction.columns = ['id', 'ret']
prediction.head()
```

```
        id    ret
6147     1  0.378
6148     2  0.488
6149     3  0.502
6150     4  0.509
6151     5  0.480
```
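The baseline stops at head(); to actually submit, write the frame to CSV. The file name below is a placeholder, not specified in the original post; check the competition page for the required format:

```python
# 'submission.csv' is a hypothetical name; adjust to the required format.
prediction.to_csv('submission.csv', index=False)
```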