ML之DT:基于DT决策树算法(交叉验证FS+for遍历最佳FS)对Titanic(泰坦尼克号)数据集进行二分类预测
ML之DT:基于DT决策树算法(交叉验证FS+for遍历最佳FS)对Titanic(泰坦尼克号)数据集进行二分类预测
输出结果
设计思路
核心代码
# Keep only the top `i` percent of features, ranked by their chi-squared score
# against the labels (presumably `i` is the percentile under trial in an
# enclosing loop that is not shown here -- confirm against the full script).
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile = i)
# Fit the selector on the training data and reduce X_train to the kept columns.
# NOTE(review): the selector is fitted on all of X_train before cross-validation,
# so each validation fold has already influenced which features were chosen.
X_train_fs = fs.fit_transform(X_train, y_train)
# Score `dt` on the reduced feature set with cv=5 (presumably `dt` is the
# decision-tree classifier built earlier -- confirm against the full script).
scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
class SelectPercentile(_BaseFilter):
    """Select features according to a percentile of the highest scores.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
        Default is f_classif (see below "See also"). The default function only
        works with classification tasks.

    percentile : int, optional, default=10
        Percent of features to keep.

    Attributes
    ----------
    scores_ : array-like, shape=(n_features,)
        Scores of features.

    pvalues_ : array-like, shape=(n_features,)
        p-values of feature scores, None if `score_func` returned only scores.

    Notes
    -----
    Ties between features with equal scores will be broken in an unspecified
    way.

    See also
    --------
    f_classif: ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif: Mutual information for a discrete target.
    chi2: Chi-squared stats of non-negative features for classification tasks.
    f_regression: F-value between label/feature for regression tasks.
    mutual_info_regression: Mutual information for a continuous target.
    SelectKBest: Select features based on the k highest scores.
    SelectFpr: Select features based on a false positive rate test.
    SelectFdr: Select features based on an estimated false discovery rate.
    SelectFwe: Select features based on family-wise error rate.
    GenericUnivariateSelect: Univariate feature selector with configurable mode.
    """

    def __init__(self, score_func=f_classif, percentile=10):
        super(SelectPercentile, self).__init__(score_func)
        self.percentile = percentile

    def _check_params(self, X, y):
        """Validate ``percentile``.

        Raises
        ------
        ValueError
            If ``percentile`` is outside the closed interval [0, 100].
        """
        if not 0 <= self.percentile <= 100:
            raise ValueError(
                "percentile should be >=0, <=100; got %r" % self.percentile)

    def _get_support_mask(self):
        """Return a boolean mask over ``scores_`` marking the kept features.

        Returns
        -------
        mask : ndarray of bool, shape=(len(scores_),)
            True for each feature that is kept.
        """
        check_is_fitted(self, 'scores_')

        # Cater for NaNs: the two extreme percentiles are answered directly,
        # before any score is cleaned or compared.
        # The builtin ``bool`` is used as dtype because the ``np.bool`` alias
        # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
        if self.percentile == 100:
            return np.ones(len(self.scores_), dtype=bool)
        elif self.percentile == 0:
            return np.zeros(len(self.scores_), dtype=bool)

        # presumably _clean_nans makes NaN scores comparable -- helper is
        # defined outside this block; confirm its exact behaviour there.
        scores = _clean_nans(self.scores_)
        threshold = stats.scoreatpercentile(scores,
                                            100 - self.percentile)
        mask = scores > threshold

        # Features scoring exactly at the threshold are admitted in index
        # order, only until the target feature count is reached.
        ties = np.where(scores == threshold)[0]
        if len(ties):
            max_feats = int(len(scores) * self.percentile / 100)
            kept_ties = ties[:max_feats - mask.sum()]
            mask[kept_ties] = True
        return mask
赞 (0)