import numpy as np
import seaborn as sns
from sklearn.datasets import load_diabetes

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
)

# Apply seaborn's plot theme with all font elements scaled by a factor of 1.3.
sns.set(font_scale=1.3)

# Load the diabetes regression dataset that ships with scikit-learn.
data = load_diabetes()
# Notebook-style inspection: display which fields the returned object exposes.
data.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

print(data["DESCR"])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.
(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)

data["data"].shape

(442, 10)

data["target"].shape

(442,)

X, Y = <...>

X_train, X_test, y_train, y_test = <...>

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

model = <...>
<...>

<...>

y_pred = <...>

def MSE(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; must be broadcast-compatible with ``y_true``.

    Returns
    -------
    float
        Mean of ``(y_true - y_pred) ** 2`` over all elements.
    """
    # Convert up front so plain lists/tuples are accepted as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean((y_true - y_pred) ** 2))

def MAE(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; must be broadcast-compatible with ``y_true``.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred|`` over all elements.
    """
    # Convert up front so plain lists/tuples are accepted as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(y_true - y_pred)))

def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction.

    The result is a ratio (``0.1`` means 10 %), not a value multiplied by
    100, so that it is directly comparable with
    ``sklearn.metrics.mean_absolute_percentage_error``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; must be broadcast-compatible with ``y_true``.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / max(|y_true|, eps)`` over all elements.
    """
    # Convert up front so plain lists/tuples are accepted as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Guard the denominator with machine epsilon so a zero target yields a
    # very large finite error instead of a division-by-zero inf/nan.
    eps = np.finfo(np.float64).eps
    relative_errors = np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), eps)
    return float(np.mean(relative_errors))

# Predict on the training split as well, so train and test errors can be
# reported side by side.
y_pred_train = model.predict(X_train)

# Mean squared error on each split, rounded to 3 decimal places.
print(f"MSE_train = {round(MSE(y_train, y_pred_train), 3)}")
print(f"MSE_test = {round(MSE(y_test, y_pred), 3)}")

# Mean absolute error on each split, rounded to 3 decimal places.
print(f"MAE_train = {round(MAE(y_train, y_pred_train), 3)}")
print(f"MAE_test = {round(MAE(y_test, y_pred), 3)}")

# Mean absolute percentage error on each split, rounded to 3 decimal places.
print(f"MAPE_train = {round(MAPE(y_train, y_pred_train), 3)}")
print(f"MAPE_test = {round(MAPE(y_test, y_pred), 3)}")

# Each triple is (our implementation, the sklearn.metrics reference
# implementation, the metric's display name).
metrics_to_check = [
    (MSE, mean_squared_error, "MSE"),
    (MAE, mean_absolute_error, "MAE"),
    (MAPE, mean_absolute_percentage_error, "MAPE"),
]

# Our value and sklearn's must agree on the test split to within 1e-4;
# otherwise fail and name the offending metric.
for your_metrics, sklearn_metrics, name in metrics_to_check:
    ours = your_metrics(y_test, y_pred)
    reference = sklearn_metrics(y_test, y_pred)
    assert np.abs(ours - reference) < 1e-4, f"Ошибка в реализации {name}"

<...>

feature_names = <...>
coefficients = <...>

# Bar chart of the model coefficients; each bar is coloured by the absolute
# value of its coefficient.
# NOTE(review): relies on `plt` (matplotlib.pyplot) and `mcolors`
# (matplotlib.colors) being imported in an earlier cell — confirm.
fig, ax = plt.subplots(figsize=(15, 5))
# Draw on the explicit Axes rather than through pyplot's implicit
# "current axes" state, so the plot always lands on the figure created above.
bars = ax.bar(feature_names, coefficients)

# Colour scale runs from 0 to the largest absolute coefficient.
norm = mcolors.Normalize(vmin=0, vmax=np.abs(coefficients).max())
cmap = plt.cm.viridis

for bar, value in zip(bars, coefficients):
    bar.set_color(cmap(norm(abs(value))))

# This mappable is not attached to any Axes, so the colorbar must be told
# which Axes to take space from; newer matplotlib releases refuse to guess.
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Абсолютное значение')

ax.set_title('Коэффициенты модели');

print(data["DESCR"].split("baseline\n\n")[-1].split("\n\nNote")[0])

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Линейная регрессия

1. Загрузка данных

2. Обучение моделей

3. Интерпретация результатов