训练集、测试集(train_test_split)

如果拿所有原始数据来训练,存在的问题:

  1. 如果模型效果很差,没有机会在投入使用之前发现并调整;

  2. 真实环境难以拿到真实 label;

所以将数据区分为 训练数据 和 测试数据(train test split);
用训练数据来训练模型;然后用测试数据来测试模型;

使用这种方式也存在问题;


python 原生分离 iris 数据集

import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

# Load the iris data set: X holds the feature matrix, y the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Bare expressions below are notebook-style displays; the recorded output
# is kept as a comment underneath each one.
X
# array([[5.1, 3.5, 1.4, 0.2],
#        [4.9, 3. , 1.4, 0.2],
#        ...
#        [6.2, 3.4, 5.4, 2.3],
#        [5.9, 3. , 5.1, 1.8]])

y
# array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
#        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
#        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

X.shape, y.shape
# ((150, 4), (150,))

# The labels in y are stored in sorted order (see the output above), so the
# rows must be shuffled before slicing; otherwise the test set would contain
# a single class only.
shuffle_indexes = np.random.permutation(len(X))  # random permutation of 0 .. len(X) - 1
shuffle_indexes
# array([ 22,   4, 142,  24,   7, 146,  ... 9,  95, 130,  29, 124])

test_ratio = 0.2
test_size = int(len(X) * test_ratio)
test_size
# 30

# The first test_size shuffled indexes select the test rows, the rest the
# training rows.
test_indexes = shuffle_indexes[:test_size]
train_indexes = shuffle_indexes[test_size:]

test_indexes
# array([ 22,   4, 142,  24,   7, 146,  70,  77, 144,  14,  40, 119,  46,
#         85,  74,  87,  86,  60,  91, 120,  78,  45,  65, 105, 113,  39,
#         83,  80, 134,  16])

# Index with the integer arrays to pull out the selected rows.
X_train = X[train_indexes]
y_train = y[train_indexes]
X_test = X[test_indexes]
y_test = y[test_indexes]

X_test.shape, X_train.shape
# ((30, 4), (120, 4))

封装 train_test_split 函数

def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split X and y into a training part and a test part.

    The row indexes are shuffled with ``np.random.permutation``; the first
    ``int(len(X) * test_ratio)`` shuffled indexes become the test set and
    the remaining ones the training set.

    Args:
        X: Array indexable by an integer index array and exposing
            ``shape``; one row per sample.
        y: Array of labels with ``y.shape[0] == X.shape[0]``.
        test_ratio: Fraction of the samples to place in the test set,
            between 0.0 and 1.0 inclusive.
        seed: Optional seed passed to ``np.random.seed`` so the split is
            reproducible. ``None`` leaves the global random state untouched.

    Returns:
        The tuple ``(X_train, y_train, X_test, y_test)``. Note the order:
        both training arrays come first, then both test arrays.

    Raises:
        AssertionError: If X and y differ in their number of rows, or if
            ``test_ratio`` is outside ``[0.0, 1.0]``.
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, "test_ratio must be valid"

    # Compare against None explicitly: 0 is a legitimate seed and must not
    # be skipped by a truthiness check.
    if seed is not None:
        np.random.seed(seed)

    shuffle_indexes = np.random.permutation(len(X))
    test_size = int(len(X) * test_ratio)

    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, y_train, X_test, y_test

sklearn 中的 train_test_split

from sklearn.model_selection import train_test_split

# No test_size or random_state is passed here, so the library defaults apply.
# The result is unpacked as X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y)

train_test_split(*arrays, **options)

# Same call with explicit options: test_size=0.33 requests roughly a third of
# the samples for the test set, and a fixed random_state makes the shuffle
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


来源:https://www.icode9.com/content-4-842901.html

(0)

相关推荐