import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,\
                                    LeaveOneOut,\
                                    LeavePOut,\
                                    KFold,\
                                    ShuffleSplit,\
                                    GridSearchCV,\
                                    StratifiedKFold,\
                                    StratifiedShuffleSplit,\
                                    LeaveOneGroupOut,\
                                    LeavePGroupsOut,\
                                    GroupKFold,\
                                    GroupShuffleSplit,\
                                    cross_val_score

from sklearn import datasets
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.collections import LineCollection

# Apply one seaborn theme to every figure drawn below.
sns.set(palette='Set2', font_scale=1.3, style='whitegrid')

# Load the Iris dataset and report the shape of its feature matrix.
data_full = datasets.load_iris()
print("Shape: {}".format(data_full['data'].shape))

Shape: (150, 4)

# Hold-out split of the Iris data without stratification:
# 40% of the rows go to the test part, the rest to the train part.
X_train, X_test, y_train, y_test = train_test_split(
    # *arrays: any indexable objects that share the same shape[0],
    # e.g. list, np.array, pd.DataFrame.
    data_full['data'],
    data_full['target'],
    stratify=None,  # when not None, class proportions are preserved
    shuffle=True,  # shuffle the rows before splitting
    random_state=0,  # fix the seed of the shuffling
    test_size=0.4,  # fraction of the data placed in the test part
)

print("Shape of train data: {} {}".format(np.shape(X_train),
                                          np.shape(y_train)))
print("Shape of test data: {} {}".format(np.shape(X_test),
                                         np.shape(y_test)))

Shape of train data: (90, 4) (90,)
Shape of test data: (60, 4) (60,)

def plot_class_distribution(targets, titles, bar_colors):
    """Draw one bar chart of class counts per label array, side by side.

    Args:
        targets: sequence of label arrays; one subplot is drawn per array.
        titles: subplot titles, in the same order as ``targets``.
        bar_colors: bar colours, in the same order as ``targets``.
    """
    plt.figure(figsize=(17, 5))
    panels = zip(targets, titles, bar_colors)
    for position, (target, title, color) in enumerate(panels, start=1):
        plt.subplot(1, len(targets), position)
        values, counts = np.unique(target, return_counts=True)
        plt.bar(values, counts, width=0.5, color=color)
        # The same fixed y-range and ticks are used on every panel.
        plt.ylim(0, 55)
        plt.xticks([0, 1, 2])
        plt.xlabel('Класс')
        plt.ylabel('Количество объектов')
        plt.title(title)
    plt.show()


colors = ['orange', '#0066FF', '#00CC66']
labels = ['Распределение классов в data_full',
          'Распределение классов в train',
          'Распределение классов в test']

# Class counts for the full data and for the train/test parts that exist
# before this block runs.
split_cases = [data_full.target, y_train, y_test]
plot_class_distribution(split_cases, labels, colors)

# Repeat the hold-out split, this time stratified by the target.
X_train, X_test, y_train, y_test = train_test_split(
    # *arrays: any indexable objects that share the same shape[0],
    # e.g. list, np.array, pd.DataFrame.
    data_full.data, data_full.target,
    test_size=0.4,  # fraction of the data placed in the test part
    random_state=0,  # fix the seed of the shuffling
    shuffle=True,  # shuffle the rows before splitting
    # keep the class proportions of the target in both parts
    stratify=data_full.target
)

# Class counts again, now for the stratified train/test parts.
split_cases = [data_full.target, y_train, y_test]
plot_class_distribution(split_cases, labels, colors)

# Fetch the California housing data: features go into a DataFrame,
# the target stays as returned by the loader.
housing = datasets.fetch_california_housing()

y = housing.target
X = pd.DataFrame(housing.data, columns=housing.feature_names)

# Print the dataset description shipped with the data.
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bureau publishes sample data (a block group typically has a population
of 600 to 3,000 people).

A household is a group of people residing within a home. Since the average
number of rooms and bedrooms in this dataset are provided per household, these
columns may take surprisingly large values for block groups with few households
and many empty houses, such as vacation resorts.

It can be downloaded/loaded using the
:func:`sklearn.datasets.fetch_california_housing` function.

.. topic:: References

    - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
      Statistics and Probability Letters, 33 (1997) 291-297

# Preview the first five rows of the feature table.
X.head(n=5)

# (rows, columns) of the feature table.
X.shape

(20640, 8)

# 5-fold cross-validation of a plain linear regression on X, y.
model = LinearRegression()
scores = cross_val_score(
    model,  # estimator whose quality is being assessed
    X,  # training features (the target is not included)
    y,  # target values
    scoring='neg_mean_squared_error',  # quality metric
    cv=5,  # number of folds
    n_jobs=-1,  # CPU cores to use; -1 means all of them
)
scores

array([-0.48485857, -0.62249739, -0.64621047, -0.5431996 , -0.49468484])

# Histogram of the target values y.
plt.figure(figsize=(6, 4))
plt.hist(y)
plt.title('Распределение цен на квартиры в Калифорнии')
plt.ylabel('Количество')
plt.xlabel('Цена квартиры (сотни тыс. $)')
plt.show()

# Bar chart of the per-fold MSE with a horizontal line at the fold average.
# `scores` holds negated MSE values (scoring='neg_mean_squared_error'),
# so the sign is flipped once here.
mse_per_fold = -scores
# Derive the fold count from the data instead of hard-coding 5, so the plot
# still works when `scores` comes from a different number of folds.
n_folds = len(mse_per_fold)
fold_numbers = np.arange(1, n_folds + 1)

plt.figure(figsize=(6, 4))
plt.bar(fold_numbers, mse_per_fold, width=0.7)
# The mean line spans half a bar slot beyond the first and last fold.
plt.hlines(mse_per_fold.mean(), 0.5, n_folds + 0.5,
           color='#FF6600', lw=3, label='Среднее по фолдам')
plt.xlabel('Номер фолда')
plt.ylabel('Значение MSE')
plt.title('MSE на различных фолдах')
plt.ylim((0, 0.8))
plt.legend()
plt.show()

# K-fold splitter: two folds, rows are not shuffled before splitting.
kf = KFold(shuffle=False, n_splits=2)
kf

KFold(n_splits=2, random_state=None, shuffle=False)

# Ask the splitter how many train/test splits it will produce.
kf.get_n_splits()

2

# split() returns a generator; the (train, test) index pairs are produced
# only when it is iterated.
kf.split(X)

<generator object _BaseKFold.split at 0x7bd78ed95150>

# Small 4x2 integer array used to illustrate how the splitter indexes rows.
data = np.array([
    [81, 27],
    [26, 45],
    [83, 64],
    [25, 98],
])
data

array([[81, 27],
       [26, 45],
       [83, 64],
       [25, 98]])

# Show which row indices land in train and in test on every fold.
for train_idx, test_idx in kf.split(data):
    print("TRAIN:", train_idx, "TEST:", test_idx)

TRAIN: [2 3] TEST: [0 1]
TRAIN: [0 1] TEST: [2 3]

# Same evaluation as before, but the folds come from the explicit `kf`
# splitter instead of an integer fold count.
scores = cross_val_score(
    model, X, y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
scores

array([-0.59144828, -0.54821269])

X = [1, 2, 3, 4]
loo = LeaveOneOut()

# Iterate over the index splits and print the train and test indices.
for train_idx, test_idx in loo.split(X):
    print("%s %s" % (train_idx, test_idx))

[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]

# Leave-P-Out demo: print the train and test indices of every split.
X = [0.76, 0.43, 0.47, 0.82, 0.22]  # toy sample of size 5
lpo = LeavePOut(p=2)  # p is the number of elements in the held-out fold
for train, test in lpo.split(X):
    print(f"{train} {test}")

[2 3 4] [0 1]
[1 3 4] [0 2]
[1 2 4] [0 3]
[1 2 3] [0 4]
[0 3 4] [1 2]
[0 2 4] [1 3]
[0 2 3] [1 4]
[0 1 4] [2 3]
[0 1 3] [2 4]
[0 1 2] [3 4]

X = np.arange(10)  # toy sample of size 10
ss = ShuffleSplit(
    n_splits=5,  # number of shuffle-and-split iterations
    test_size=0.25,  # fraction of objects put into test on each iteration
    random_state=0,
)

# Print the train and test indices of every iteration.
for train_idx, test_idx in ss.split(X):
    print("%s %s" % (train_idx, test_idx))

[9 1 6 7 3 0 5] [2 8 4]
[2 9 8 0 6 7 4] [3 5 1]
[4 5 1 0 6 9 7] [2 3 8]
[2 7 5 8 0 3 4] [6 1 9]
[4 1 0 6 8 9 3] [5 2 7]

# Stratified K-fold demo on Iris: for every fold, plot the class counts of
# the train part (left column) and of the test part (right column).
iris = datasets.load_iris()
X = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])
y = iris['target']

# random_state is fixed because shuffle=True: without it the fold indices
# change on every run, unlike the other seeded splitters in this file.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
n_folds = skf.get_n_splits()

colors = ['orange', '#0066FF', '#00CC66']

plt.figure(figsize=(7, 10))
for i, (train, test) in enumerate(skf.split(X, y)):
    # One grid row per fold; column 0 shows train, column 1 shows test.
    for column, (part_name, indices) in enumerate((('train', train),
                                                   ('test', test))):
        plt.subplot(n_folds, 2, 2*i + column + 1)
        if i == 0:
            # Column titles only on the top row.
            plt.title(part_name)
        values, counts = np.unique(y[indices], return_counts=True)
        plt.bar(values, counts, width=0.5, color=colors[i],
                label='{} итерация'.format(i + 1))
        plt.legend()
        plt.ylim(0, 55)
        plt.xticks([0, 1, 2])
        if i == n_folds - 1:
            # x-axis label only on the bottom row.
            plt.xlabel('Класс')
        if column == 0:
            # y-axis label only on the left column.
            plt.ylabel('Количество объектов')

plt.show()

# Stratified shuffle-split demo on a tiny two-class sample.
sss = StratifiedShuffleSplit(test_size=0.2, n_splits=5, random_state=0)
X = np.array([
    [1, 2],
    [3, 4],
    [1, 2],
    [3, 4],
    [1, 2],
    [3, 4],
])
y = np.array([0, 0, 0, 1, 1, 1])

for train_idx, test_idx in sss.split(X, y):
    print("TRAIN:", train_idx, "TEST:", test_idx)
    # Select the rows and labels of the current split by index.
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]

TRAIN: [2 5 1 3] TEST: [0 4]
TRAIN: [0 4 3 1] TEST: [2 5]
TRAIN: [0 4 3 1] TEST: [2 5]
TRAIN: [5 4 1 2] TEST: [0 3]
TRAIN: [1 5 2 4] TEST: [0 3]

# Group K-fold demo: print the train and test indices of every split made
# with the given group labels.
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]

gkf = GroupKFold(n_splits=3)

for train, test in gkf.split(X, y, groups=groups):
    print(f"{train} {test}")

[0 1 2 3 4 5] [6 7 8 9]
[0 1 2 6 7 8 9] [3 4 5]
[3 4 5 6 7 8 9] [0 1 2]

# Leave-One-Group-Out demo: print the train and test indices of every split
# made with the given group labels.
X = [1, 5, 10, 50, 60, 70, 80]
y = [0, 1, 1, 2, 2, 2, 2]
groups = [1, 1, 2, 2, 3, 3, 3]
logo = LeaveOneGroupOut()
for train, test in logo.split(X, y, groups=groups):
    print(f"{train} {test}")

[2 3 4 5 6] [0 1]
[0 1 4 5 6] [2 3]
[0 1 2 3] [4 5 6]

# Leave-P-Groups-Out demo with n_groups=2: print the train and test
# indices of every split.
X = np.arange(6)
y = [1, 1, 1, 2, 2, 2]
groups = [1, 1, 2, 2, 3, 3]
lpgo = LeavePGroupsOut(n_groups=2)
for train_idx, test_idx in lpgo.split(X, y, groups):
    print("%s %s" % (train_idx, test_idx))

[4 5] [0 1 2 3]
[2 3] [0 1 4 5]
[0 1] [2 3 4 5]

# Group shuffle-split demo: four seeded iterations with test_size=0.5;
# print the train and test indices of every split.
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]
gss = GroupShuffleSplit(test_size=0.5, n_splits=4, random_state=0)
for train_idx, test_idx in gss.split(X, y, groups):
    print("%s %s" % (train_idx, test_idx))

[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]

	MedInc	HouseAge	AveRooms	AveBedrms	Population	AveOccup	Latitude	Longitude
0	8.3252	41.0	6.984127	1.023810	322.0	2.555556	37.88	-122.23
1	8.3014	21.0	6.238137	0.971880	2401.0	2.109842	37.86	-122.22
2	7.2574	52.0	8.288136	1.073446	496.0	2.802260	37.85	-122.24
3	5.6431	52.0	5.817352	1.073059	558.0	2.547945	37.85	-122.25
4	3.8462	52.0	6.281853	1.081081	565.0	2.181467	37.85	-122.25

Phystech@DataScience ¶

Валидация¶

1 Валидация на отложенной выборке (holdout validation)¶

2 Кросс-валидация (cross-validation)¶

2.1. k-Fold Cross Validation¶

2.2. Leave One Out (LOO)¶

2.3. LeavePOut¶

2.4. ShuffleSplit¶

3 Стратифицированная кросс-валидация¶

3.1. Stratified KFold¶

3.2. Stratified Shuffle Split¶

4 Групповая кросс-валидация¶

4.1. Group KFold¶

4.2. Leave One Group Out¶

4.3. Leave P Groups Out¶

4.4. Group Shuffle Split¶

Итоги¶

Phystech@DataScience¶

Валидация¶

1 Валидация на отложенной выборке (holdout validation)¶

2 Кросс-валидация (cross-validation)¶

2.1. k-Fold Cross Validation¶

2.2. Leave One Out (LOO)¶

2.3. LeavePOut¶

2.4. ShuffleSplit¶

3 Стратифицированная кросс-валидация¶

3.1. Stratified KFold¶

3.2. Stratified Shuffle Split¶

4 Групповая кросс-валидация¶

4.1. Group KFold¶

4.2. Leave One Group Out¶

4.3. Leave P Groups Out¶

4.4. Group Shuffle Split¶

Итоги¶

Phystech@DataScience ¶