训练集、测试集(train_test_split)
如果拿所有原始数据来训练,存在的问题:
如果模型效果很差,没有机会进行调整;
真实环境难以拿到真实 label;
所以将数据区分为 训练数据 和 测试数据(train test split);
用训练数据来训练模型,然后用测试数据来测试模型;
使用这种方式也存在问题;
python 原生分离 iris 数据集
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

# Load the iris data set: X holds the feature rows, y the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

X
# array([[5.1, 3.5, 1.4, 0.2],
#        [4.9, 3. , 1.4, 0.2],
#        ...
#        [6.2, 3.4, 5.4, 2.3],
#        [5.9, 3. , 5.1, 1.8]])

y
# array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
#        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

X.shape, y.shape  # ((150, 4), (150,))

# Shuffle: the labels are sorted by class (see y above), so draw a random
# ordering of the row indexes before cutting off the test portion.
shuffle_indexes = np.random.permutation(len(X))  # random permutation of 0..len(X)-1
shuffle_indexes
# array([ 22, 4, 142, 24, 7, 146, ... 9, 95, 130, 29, 124])

test_ratio = 0.2
test_size = int(len(X) * test_ratio)
test_size  # 30

# The first test_size shuffled indexes form the test set, the rest the training set.
test_indexes = shuffle_indexes[:test_size]
train_indexes = shuffle_indexes[test_size:]

test_indexes
# array([ 22,   4, 142,  24,   7, 146,  70,  77, 144,  14,
#         40, 119,  46,  85,  74,  87,  86,  60,  91, 120,
#         78,  45,  65, 105, 113,  39,  83,  80, 134,  16])

# Index X and y with the same index arrays so rows and labels stay aligned.
X_train = X[train_indexes]
y_train = y[train_indexes]
X_test = X[test_indexes]
y_test = y[test_indexes]

X_test.shape, X_train.shape  # ((30, 4), (120, 4))
封装 train_test_split 函数
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split X and y into a training part and a test part.

    Args:
        X: NumPy array of samples, indexed along its first axis.
        y: NumPy array of labels with the same first-axis length as X.
        test_ratio: Fraction of the samples, in [0.0, 1.0], placed in the
            test set. The test size is ``int(len(X) * test_ratio)``, i.e.
            it is truncated towards zero.
        seed: Optional seed. When given (including 0) it is passed to
            ``np.random.seed``, which reseeds NumPy's global generator, so
            that the same split is produced on every call.

    Returns:
        The tuple ``(X_train, y_train, X_test, y_test)``. Note the order:
        both training arrays come first, then both test arrays.

    Raises:
        AssertionError: If X and y differ in length or test_ratio is
            outside [0.0, 1.0].
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    # Compare against None rather than truthiness so that seed=0 is honoured.
    if seed is not None:
        np.random.seed(seed)

    # One shared permutation keeps every sample paired with its label.
    shuffle_indexes = np.random.permutation(len(X))

    test_size = int(len(X) * test_ratio)
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, y_train, X_test, y_test
sklearn 中的 train_test_split
from sklearn.model_selection import train_test_split

# scikit-learn's splitter hands back the pieces in the order
# X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y)
train_test_split(*arrays, **options)
# Same call with explicit options: test_size=0.33 asks for a 33% test share
# and random_state=42 pins the shuffle (scikit-learn semantics).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
来源:https://www.icode9.com/content-4-842901.html
赞 (0)