import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy.stats as sps
%matplotlib inline

sns.set(palette="pastel", style='whitegrid', font_scale=1.3)

import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Load the customer dataset from the CSV file in the working directory.
data = pd.read_csv('superstore_data.csv')
data

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   int64  
 16  NumWebPurchases      2240 non-null   int64  
 17  NumCatalogPurchases  2240 non-null   int64  
 18  NumStorePurchases    2240 non-null   int64  
 19  NumWebVisitsMonth    2240 non-null   int64  
 20  Response             2240 non-null   int64  
 21  Complain             2240 non-null   int64  
dtypes: float64(1), int64(18), object(3)
memory usage: 385.1+ KB
None

# Discard customers whose income is missing; .copy() yields an independent
# frame so later column assignments do not write into a slice of the original.
data = data.dropna(subset=['Income']).copy()

# Flag rows that repeat an earlier row in every column and report how many exist.
duplicate_rows = data.duplicated(keep='first')
print(f"Полных дубликатов: {duplicate_rows.sum()}")

Полных дубликатов: 0

# Flag rows whose customer Id already occurred in an earlier row.
duplicate_rows = data.duplicated(subset=['Id'], keep='first')
print(f"Дубликаты клиентов: {duplicate_rows.sum()}")

Дубликаты клиентов: 0

# Parse the registration date, which is stored as month/day/year text.
data['Dt_Customer'] = pd.to_datetime(data['Dt_Customer'], format='%m/%d/%Y')
# Age is deliberately not derived at this point; it is added later in the analysis.
# Number of children in the household: small kids plus teenagers.
data['Total_Children'] = data['Kidhome'].add(data['Teenhome'])
# Overall spending: row-wise sum over every product-category amount.
spend_columns = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
data['Total_Spent'] = data[spend_columns].sum(axis=1)

# Columns plotted as categories (count plots).
cat_columns = [
    'Education',
    'Marital_Status',
    'Kidhome',
    'Teenhome',
    'Total_Children',
    'Response',
    'Complain',
]
# Columns plotted as numeric distributions (histograms, box and violin plots).
# NOTE(review): NumWebPurchases is not listed here - confirm the omission is intentional.
num_columns = [
    'Year_Birth',
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
    'NumDealsPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
    'Total_Spent',
    'Income',
    'Recency',
]

# Share of customers (in percent) at each education level.
edu_counts = data['Education'].value_counts(normalize=True) * 100
# Seaborn 0.13 deprecated passing `palette` without `hue`. Mapping hue to the
# same categories keeps one pastel colour per bar, and legend=False avoids a
# legend that would merely repeat the x tick labels.
sns.barplot(x=edu_counts.index, y=edu_counts.values,
            hue=edu_counts.index, palette='pastel', legend=False)
plt.ylabel('% от общего числа')
plt.title('Распределение Education')
plt.xticks(rotation=45)
plt.show()

# Absolute number of customers per education level.
edu_counts = data['Education'].value_counts()

# Square canvas plus an equal aspect ratio so the pie is drawn as a circle.
pie_fig, pie_ax = plt.subplots(figsize=(6, 6))
pie_style = {
    'labels': edu_counts.index,
    'autopct': '%1.1f%%',
    'startangle': 90,
    'colors': sns.color_palette('pastel'),
    'wedgeprops': {'edgecolor': 'black'},
}
pie_ax.pie(edu_counts, **pie_style)
pie_ax.set_title('Распределение Education')
pie_ax.axis('equal')
plt.show()

# Grid of count plots, one per categorical column, three plots per row.
n_cat = len(cat_columns)
cols = 3
rows = math.ceil(n_cat / cols)

# squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid.
fig_cat, axes_cat = plt.subplots(rows, cols, figsize=(cols * 10, rows * 7), squeeze=False)
axes_cat = axes_cat.flatten()

# Denominator for the percentage labels; identical for every column.
total = len(data)

for ax, col in zip(axes_cat, cat_columns):
    # value_counts() sorts by frequency, so a single call provides both the
    # descending bar order and the height of the tallest bar.
    counts = data[col].value_counts()
    order = counts.index

    # Seaborn 0.13 deprecated `palette` without `hue`; hue=col with the same
    # ordering keeps one pastel colour per bar, and legend=False hides the
    # legend that would only repeat the x tick labels.
    sns.countplot(data=data, x=col, hue=col, ax=ax, order=order, hue_order=order,
                  palette="pastel", legend=False)
    ax.set_title(col)
    ax.tick_params(axis='x', rotation=45)

    # Write "count (share%)" just above every bar.
    for p in ax.patches:
        count = int(p.get_height())
        percent = 100 * count / total
        ax.annotate(f'{count} ({percent:.1f}%)',
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='bottom', fontsize=12)
    # Leave 15% headroom so the labels are not clipped.
    ax.set_ylim(0, counts.max() * 1.15)

# Remove the grid cells that received no plot.
for unused_ax in axes_cat[n_cat:]:
    fig_cat.delaxes(unused_ax)

fig_cat.suptitle("Категориальные признаки - барплоты с количеством и процентами", fontsize=24)
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

# Fold the 'Alone' category into 'Single'.
data['Marital_Status'] = data['Marital_Status'].replace('Alone', 'Single')

# Drop the rows whose marital status is one of the irrelevant values.
irrelevant_statuses = ['YOLO', 'Absurd']
data = data.loc[~data['Marital_Status'].isin(irrelevant_statuses)].copy()

# Contingency table of absolute counts: Education rows, Marital_Status columns.
cross_counts = pd.crosstab(index=data['Education'], columns=data['Marital_Status'])

# Long format (one row per Education / Marital_Status pair) for seaborn.
grouped_data = pd.melt(
    cross_counts.reset_index(),
    id_vars='Education',
    var_name='Marital_Status',
    value_name='Count',
)


# Stacked bar chart: marital-status shares (in percent) within each education level.
cross_tab = pd.crosstab(data['Education'], data['Marital_Status'], normalize='index').mul(100)
stacked_ax = cross_tab.plot.bar(stacked=True)
stacked_ax.set_ylabel('% внутри Education')
stacked_ax.set_title('Семейное положение в зависимости от образования')
# Anchor the legend outside the axes so it does not cover the bars.
stacked_ax.legend(title='Marital Status', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Grouped bar chart: absolute counts per education level, one bar per marital status.
grouped_fig, grouped_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Education', y='Count', hue='Marital_Status', data=grouped_data, ax=grouped_ax)
grouped_ax.set_title('Распределение образования по семейному положению (grouped bar)')
plt.xticks(rotation=45)
# Anchor the legend outside the axes so it does not cover the bars.
grouped_ax.legend(title='Marital Status', bbox_to_anchor=(1.05, 1), loc='upper left')
grouped_fig.tight_layout()
plt.show()

# Heat map of the contingency table; every cell is annotated with its integer count.
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cross_counts, annot=True, fmt='d', ax=heat_ax)
heat_ax.set_title('Кросстаблица: Education vs Marital_Status')
heat_ax.set_ylabel('Education')
heat_ax.set_xlabel('Marital Status')
heat_fig.tight_layout()
plt.show()

# Histogram of income with 30 bins.
hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
sns.histplot(x=data['Income'], bins=30, ax=hist_ax)
hist_ax.set_title('Гистограмма дохода')
hist_ax.set_xlabel('Доход')
hist_ax.set_ylabel('Частота')
plt.show()

# Box plot of income.
box_fig, box_ax = plt.subplots(figsize=(6, 4))
sns.boxplot(y=data['Income'], width=0.2, ax=box_ax)
box_ax.set_title('Boxplot дохода')
box_ax.set_ylabel('Доход')
plt.show()

# Violin plot of income with an embedded box plot.
violin_fig, violin_ax = plt.subplots(figsize=(6, 4))
sns.violinplot(y=data['Income'], inner='box', width=0.2, ax=violin_ax)
violin_ax.set_title('Violin plot дохода')
violin_ax.set_ylabel('Доход')
plt.show()

# Compare the empirical income density with a fitted normal distribution.
density_fig, density_ax = plt.subplots(figsize=(8, 5))
income = data['Income']

# Density-scaled histogram with a KDE curve on top.
sns.histplot(income, bins=30, stat='density', kde=True, edgecolor='black', ax=density_ax)

# Rug plot: one short tick per observation along the bottom edge.
sns.rugplot(income, color='black', height=0.05, ax=density_ax)

# Theoretical density: fit a normal distribution and evaluate its pdf on an
# evenly spaced grid spanning the observed income range.
mu, sigma = sps.norm.fit(income)
income_grid = np.linspace(income.min(), income.max(), 500)
density_ax.plot(income_grid, sps.norm.pdf(income_grid, mu, sigma), 'r--',
                label='Нормальное распределение (fit)')

density_ax.set_title('Оценка распределения дохода')
density_ax.set_xlabel('Доход')
density_ax.set_ylabel('Плотность')
density_ax.legend()
plt.show()

# Grid of histograms with a KDE curve, one per numeric column, three per row.
n_num = len(num_columns)
cols = 3
rows = math.ceil(n_num / cols)

fig_hist, axes_hist = plt.subplots(rows, cols, figsize=(cols * 5, rows * 4))
axes_hist = axes_hist.flatten()

for ax, col in zip(axes_hist, num_columns):
    sns.histplot(x=data[col], kde=True, ax=ax, edgecolor='black', bins=30)
    # The KDE curve is the first line on the axes; recolour it so it stands out.
    ax.lines[0].set_color("crimson")

# Remove the grid cells that received no plot.
for unused_ax in axes_hist[n_num:]:
    fig_hist.delaxes(unused_ax)

fig_hist.suptitle("Числовые признаки - Гистограммы с KDE", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

# Histograms for selected count-valued columns, with bins placed around each observed value.
bar_columns = ['NumStorePurchases', 'NumDealsPurchases', 'NumWebVisitsMonth']

n_num = len(bar_columns)
cols = 3
rows = math.ceil(n_num / cols)

fig_hist, axes_hist = plt.subplots(rows, cols, figsize=(cols * 5, rows * 4))
axes_hist = axes_hist.flatten()

for ax, col in zip(axes_hist, bar_columns):
    # Sorted distinct values that actually occur in the column.
    unique_vals = np.sort(data[col].dropna().unique())
    # A bin edge half a unit below every observed value, plus a closing edge
    # half a unit above the largest one.
    bin_edges = np.append(unique_vals - 0.5, unique_vals[-1] + 0.5)
    sns.histplot(x=data[col], kde=True, bins=bin_edges, ax=ax, edgecolor='black')
    # With more than ten distinct values, label only every second one.
    if len(unique_vals) > 10:
        ax.set_xticks(unique_vals[::2])
    # The KDE curve is the first line on the axes; recolour it so it stands out.
    ax.lines[0].set_color("crimson")

# Remove the grid cells that received no plot.
for unused_ax in axes_hist[n_num:]:
    fig_hist.delaxes(unused_ax)

fig_hist.suptitle("Целые числовые признаки - Гистограммы с KDE")
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

# Grid of horizontal box plots, one per numeric column, three per row.
n_num = len(num_columns)
cols = 3
rows = math.ceil(n_num / cols)

fig_num, axes_num = plt.subplots(rows, cols, figsize=(cols * 5, rows * 2))
axes_num = axes_num.flatten()

for box_ax, col in zip(axes_num, num_columns):
    sns.boxplot(x=col, data=data, width=0.2, ax=box_ax)

# Remove the grid cells that received no plot.
for unused_ax in axes_num[n_num:]:
    fig_num.delaxes(unused_ax)

fig_num.suptitle("Количественные признаки - Box Plots")
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

# Show the row(s) with the earliest birth year.
data.loc[data['Year_Birth'] == data['Year_Birth'].min()]

# Grid of violin plots with an embedded box plot, one per numeric column.
n_num = len(num_columns)
cols = 3
rows = math.ceil(n_num / cols)

fig_num, axes_num = plt.subplots(rows, cols, figsize=(cols * 5, rows * 4))
axes_num = axes_num.flatten()

for violin_ax, col in zip(axes_num, num_columns):
    sns.violinplot(x=col, data=data, inner='box', ax=violin_ax)
    violin_ax.set_title(col)

# Remove the grid cells that received no plot.
for unused_ax in axes_num[n_num:]:
    fig_num.delaxes(unused_ax)

fig_num.suptitle("Количественные признаки - Violin Plots")
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

# Numeric columns plus the target, used for the pairwise plots below.
subset_cols = num_columns + ['Response']
clean_data = data[subset_cols]
clean_data

# Pairwise grid coloured by Response: KDE contours below the diagonal,
# scatter plots above it and per-group KDE curves on the diagonal.
g = sns.PairGrid(clean_data, diag_sharey=False, hue='Response')
g.map_lower(sns.kdeplot)
g.map_upper(plt.scatter, alpha=0.5)
g.map_diag(sns.kdeplot, lw=3)
# PairGrid does not draw a legend on its own; without it the hue colours
# cannot be decoded. The trailing semicolon suppresses the notebook repr.
g.add_legend();

# Lower-triangle pair plot of the same columns.
pair_grid = sns.pairplot(clean_data, hue='Response', vars=num_columns, corner=True)
# Title the grid's own figure rather than whatever figure pyplot considers current.
pair_grid.figure.suptitle('Парные отношения между признаками с разделением по Response', y=1.02)
plt.show()

# Pairwise correlations between the numeric columns.
corr = data.corr(numeric_only=True)

# Blank out the weak correlations (absolute value below 0.2).
mask = corr.abs().lt(0.2)
filtered_corr = corr.where(~mask)

# Width grows with the number of columns but is capped at 16 inches.
fig_width = min(2 + filtered_corr.shape[1], 16)
plt.figure(figsize=(fig_width, 10))
# Boolean mask covering the upper triangle including the diagonal.
mask_upper = np.triu(np.ones(filtered_corr.shape, dtype=bool))

# Weak correlations and the upper triangle are left empty.
heatmap_style = {
    'annot': True,
    'fmt': ".2f",
    'cmap': 'coolwarm',
    'center': 0,
    'linewidths': 0.5,
    'cbar_kws': {"shrink": 0.8},
    'annot_kws': {"size": 8},
}
sns.heatmap(filtered_corr, mask=mask_upper, **heatmap_style)

plt.title('Отфильтрованная корреляционная матрица (|r| > 0.2)', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# clustermap builds its own figure with several axes (heat map, dendrograms,
# colour bar), so plt.title() labelled whichever axes happened to be current
# instead of the plot as a whole. A figure-level suptitle puts the heading
# above the entire cluster map.
cluster_grid = sns.clustermap(data.corr(numeric_only=True), cmap='coolwarm',
                              annot=False, figsize=(12, 10))
cluster_grid.figure.suptitle('Clustermap корреляций признаков', y=1.02)
plt.show()

# Filled bivariate KDE of income vs. wine spending; the y axis is limited to 0-1500.
sns.jointplot(data=data, x='Income', y='MntWines',
              kind='kde', fill=True, height=7, space=0, ylim=(0, 1500))

<seaborn.axisgrid.JointGrid at 0x799dfb583d50>

# Joint plot of income vs. wine spending with seaborn's default plot kind.
sns.jointplot(data=data, x='Income', y='MntWines', height=7, space=0)

<seaborn.axisgrid.JointGrid at 0x799e13c8e490>

# Central plot: income vs. wine spending.
graph = sns.jointplot(x=data['Income'], y=data['MntWines'], color="xkcd:dark sea green")

# Top marginal: replace the default plot with a KDE of income.
graph.ax_marg_x.clear()
sns.kdeplot(x=data['Income'], color="xkcd:azure", ax=graph.ax_marg_x)

# Right marginal: replace the default plot with a horizontal count histogram.
# sns.distplot (and its `vertical` flag) is deprecated; histplot with the
# variable assigned to `y` is the supported equivalent. The trailing semicolon
# suppresses the notebook repr.
graph.ax_marg_y.clear()
sns.histplot(y=data['MntWines'], color="xkcd:orange", ax=graph.ax_marg_y);

from matplotlib.lines import Line2D

# Customer age, measured relative to the year 2020.
data['Age'] = 2020 - data['Year_Birth']
fig_age, ax = plt.subplots(figsize=(12, 8))

# One bivariate KDE per response group: (Response value, legend label, colour map).
response_groups = [
    (0, "No Response", 'Blues'),
    (1, "Responded", 'Reds'),
]

legend_handles = []
for response_value, group_label, cmap_name in response_groups:
    subset = data[data['Response'] == response_value]
    sns.kdeplot(x=subset['Age'], y=subset['Total_Spent'],
                cmap=cmap_name, fill=False, thresh=0.0, ax=ax)
    # Contour sets are not reliable legend handles, so build an explicit proxy
    # line in a representative shade of the group's colour map.
    legend_handles.append(
        Line2D([0], [0], color=plt.get_cmap(cmap_name)(0.7), lw=2, label=group_label)
    )

ax.set_title("Density Plot: Age vs Total Spending by Response")
ax.set_xlabel("Age")
ax.set_ylabel("Total Spending")
ax.set_ylim(0, 2500)

# Create the legend once and style that same object. Calling ax.legend() a
# second time would build a new legend and drop the title.
legend = ax.legend(handles=legend_handles, title="Marketing Response")
legend.get_frame().set_facecolor("white")

plt.show()

# Total spending by education level.
edu_ax = sns.boxplot(data=data, x='Education', y='Total_Spent', width=0.6)
edu_ax.set_title('Total Spent vs Education')
plt.xticks(rotation=45)
plt.show()

# Total spending by marital status.
marital_ax = sns.boxplot(data=data, x='Marital_Status', y='Total_Spent')
marital_ax.set_title('Total Spent vs Marital Status')
plt.xticks(rotation=45)
plt.show()

# Total spending by number of children, split by education level.
children_fig, children_ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=data, x='Total_Children', y='Total_Spent', hue='Education', ax=children_ax)
children_ax.set_title('Траты vs Total Children vs Education')
plt.xticks(rotation=45)
plt.show()

# 2x2 figure comparing total spending between customers who responded to the
# offer (Response == 1) and those who did not (Response == 0).
plt.figure(figsize=(18, 12))
colors = sns.color_palette('pastel')
responded_color = colors[0]
not_responded_color = colors[1]
# Colours in Response order (0, 1), shared by the three categorical panels.
response_colors = [not_responded_color, responded_color]
response_labels = ['Did Not Respond', 'Responded']

# KDE Plot: one filled density curve per response group.
plt.subplot(2, 2, 1)
sns.kdeplot(data=data[data['Response'] == 1]['Total_Spent'],
            label='Responded',
            fill=True,
            color=responded_color,
            alpha=0.5)
sns.kdeplot(data=data[data['Response'] == 0]['Total_Spent'],
            label='Did Not Respond',
            fill=True,
            color=not_responded_color,
            alpha=0.5)
plt.title('Density Plot: Total Spending')
plt.xlabel('Total Spending')
plt.ylabel('Density')
plt.legend()
plt.grid(True, alpha=0.3)

# Boxplot
# Seaborn 0.13 deprecated `palette` without `hue`; hue='Response' keeps one
# colour per group, and legend=False hides the legend that would only repeat
# the x tick labels.
plt.subplot(2, 2, 2)
sns.boxplot(x='Response', y='Total_Spent', data=data,
            hue='Response', hue_order=[0, 1], legend=False,
            width=0.5,
            palette=response_colors)
plt.title('Boxplot: Total Spending by Response')
plt.xticks([0, 1], response_labels)
plt.ylabel('Total Spending')
plt.grid(True, alpha=0.3)

# Violin plot with quartile lines drawn inside each violin.
plt.subplot(2, 2, 3)
sns.violinplot(x='Response', y='Total_Spent', data=data,
               hue='Response', hue_order=[0, 1], legend=False,
               inner='quartile',
               palette=response_colors)
plt.title('Violin Plot: Total Spending by Response')
plt.xticks([0, 1], response_labels)
plt.ylabel('Total Spending')
plt.grid(True, alpha=0.3)

# Bar Plot: mean total spending per response group.
plt.subplot(2, 2, 4)
avg_spent = data.groupby('Response')['Total_Spent'].mean()
sns.barplot(x=avg_spent.index, y=avg_spent.values,
            hue=avg_spent.index, hue_order=[0, 1], legend=False,
            palette=response_colors,
            alpha=0.7)
plt.xticks([0, 1], response_labels)
plt.title('Average Spending by Response Group')
plt.ylabel('Average Total Spent')
plt.grid(True, alpha=0.3)

plt.tight_layout(pad=3.0)
plt.show()

# Grid of count plots, one per categorical column, with bars split by Response.
n_cat = len(cat_columns)
cols = 3
rows = math.ceil(n_cat / cols)

# squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid.
fig_cat, axes_cat = plt.subplots(rows, cols, figsize=(cols * 10, rows * 7), squeeze=False)
axes_cat = axes_cat.flatten()

# Denominator for the percentage labels; identical for every column.
total = len(data)

for ax, col in zip(axes_cat, cat_columns):
    # value_counts() sorts by frequency, so a single call provides both the
    # descending category order and the upper bound for the y axis.
    counts = data[col].value_counts()
    order = counts.index

    # The legend stays enabled: the bars are split by Response, and without a
    # legend the two colours cannot be told apart.
    sns.countplot(data=data, x=col, ax=ax, order=order, hue='Response')
    ax.set_title(col)
    ax.tick_params(axis='x', rotation=45)

    # Write "count (share of all rows %)" above every visible bar.
    for p in ax.patches:
        height = p.get_height()
        # Skip empty or placeholder patches: a zero height would add a
        # meaningless "0 (0.0%)" label and a NaN height would break int().
        if not np.isfinite(height) or height == 0:
            continue
        count = int(height)
        percent = 100 * count / total
        ax.annotate(f'{count} ({percent:.1f}%)',
                    (p.get_x() + p.get_width() / 2., height),
                    ha='center', va='bottom', fontsize=12)
    # Leave 15% headroom above the largest category total.
    ax.set_ylim(0, counts.max() * 1.15)

# Remove the grid cells that received no plot.
for unused_ax in axes_cat[n_cat:]:
    fig_cat.delaxes(unused_ax)

fig_cat.suptitle("Категориальные признаки - Барплоты с количеством и процентами", fontsize=24)
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

# Grid of KDE plots, one per numeric column, with a separate curve per Response value.
n_num = len(num_columns)
cols = 3
rows = math.ceil(n_num / cols)

fig_kde, axes_kde = plt.subplots(rows, cols, figsize=(cols * 5, rows * 4))
axes_kde = axes_kde.flatten()

# Colour assigned to each Response class.
response_palette = {0: colors[0], 1: colors[1]}

for kde_ax, col in zip(axes_kde, num_columns):
    sns.kdeplot(
        data=data,
        x=col,
        hue='Response',
        palette=response_palette,
        fill=True,          # shade the area under each curve
        alpha=0.5,          # keep overlapping curves visible
        common_norm=False,  # normalise each Response group separately
        ax=kde_ax,
    )
    kde_ax.set_xlabel(col)
    kde_ax.set_ylabel('Density')
    kde_ax.grid(True, alpha=0.3)

# Remove the grid cells that received no plot.
for unused_ax in axes_kde[n_num:]:
    fig_kde.delaxes(unused_ax)

fig_kde.suptitle("Количественные признаки - KDE в зависимости от отклика", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

# Pairwise correlations between the numeric columns.
corr = data.corr(numeric_only=True)

# Single source of truth for the cut-off. The title used to claim 0.2 while
# the filter actually used 0.05.
corr_threshold = 0.05

# Blank out the correlations whose absolute value is below the threshold.
mask = np.abs(corr) < corr_threshold
filtered_corr = corr.mask(mask)

# Width grows with the number of columns but is capped at 16 inches.
plt.figure(figsize=(min(2 + len(filtered_corr.columns), 16), 10))
# Boolean mask covering the upper triangle including the diagonal.
mask_upper = np.triu(np.ones_like(filtered_corr, dtype=bool))

# Weak correlations and the upper triangle are left empty.
sns.heatmap(filtered_corr, mask=mask_upper, annot=True, fmt=".2f",
            cmap='coolwarm', center=0, linewidths=0.5, cbar_kws={"shrink": 0.8},
            annot_kws={"size": 8})

plt.title(f'Отфильтрованная корреляционная матрица (|r| > {corr_threshold})', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

Название столбца	Описание
Response	Целевая переменная: `1`, если клиент принял предложение, иначе `0`
Id	Уникальный идентификатор клиента
Year_Birth	Год рождения клиента
Complain	`1`, если клиент жаловался за последние 2 года
Dt_Customer	Дата регистрации клиента в компании
Education	Уровень образования клиента
Marital_Status	Семейное положение клиента
Kidhome	Количество маленьких детей в семье
Teenhome	Количество подростков в семье
Income	Годовой доход домохозяйства клиента
MntFishProducts	Потрачено на рыбу за последние 2 года
MntMeatProducts	Потрачено на мясо
MntFruits	Потрачено на фрукты
MntSweetProducts	Потрачено на сладости
MntWines	Потрачено на вино
MntGoldProds	Потрачено на золотые товары
NumDealsPurchases	Количество покупок по скидкам
NumCatalogPurchases	Количество покупок по каталогу (доставка)
NumStorePurchases	Количество покупок в офлайн-магазинах
NumWebPurchases	Количество покупок через сайт
NumWebVisitsMonth	Количество визитов на сайт за последний месяц
Recency	Сколько дней назад была последняя покупка

	Id	Year_Birth	Education	Marital_Status	Income	Kidhome	Teenhome	Dt_Customer	Recency	MntWines	...	MntFishProducts	MntSweetProducts	MntGoldProds	NumDealsPurchases	NumWebPurchases	NumCatalogPurchases	NumStorePurchases	NumWebVisitsMonth	Response	Complain
0	1826	1970	Graduation	Divorced	84835.0	0	0	6/16/2014	0	189	...	111	189	218	1	4	4	6	1	1	0
1	1	1961	Graduation	Single	57091.0	0	0	6/15/2014	0	464	...	7	0	37	1	7	3	7	5	1	0
2	10476	1958	Graduation	Married	67267.0	0	1	5/13/2014	0	134	...	15	2	30	1	3	2	5	2	0	0
3	1386	1967	Graduation	Together	32474.0	1	1	11/5/2014	0	10	...	0	0	0	1	1	0	2	7	0	0
4	5371	1989	Graduation	Single	21474.0	1	0	8/4/2014	0	6	...	11	0	34	2	3	1	2	7	1	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2235	10142	1976	PhD	Divorced	66476.0	0	1	7/3/2013	99	372	...	47	48	78	2	5	2	11	4	0	0
2236	5263	1977	2n Cycle	Married	31056.0	1	0	1/22/2013	99	5	...	3	8	16	1	1	0	3	8	0	0
2237	22	1976	Graduation	Divorced	46310.0	1	0	3/12/2012	99	185	...	15	5	14	2	6	1	5	8	0	0
2238	528	1978	Graduation	Married	65819.0	0	0	11/29/2012	99	267	...	149	165	63	1	5	4	10	3	0	0
2239	4070	1969	PhD	Married	94871.0	0	2	1/9/2012	99	169	...	188	0	144	1	8	5	4	7	1	0

	Year_Birth	MntWines	MntFruits	MntMeatProducts	MntFishProducts	MntSweetProducts	MntGoldProds	NumDealsPurchases	NumCatalogPurchases	NumStorePurchases	NumWebVisitsMonth	Total_Spent	Income	Recency	Response
0	1970	189	104	379	111	189	218	1	4	6	1	1190	84835.0	0	1
1	1961	464	5	64	7	0	37	1	3	7	5	577	57091.0	0	1
2	1958	134	11	59	15	2	30	1	2	5	2	251	67267.0	0	0
3	1967	10	0	1	0	0	0	1	0	2	7	11	32474.0	0	0
4	1989	6	16	24	11	0	34	2	1	2	7	91	21474.0	0	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2235	1976	372	18	126	47	48	78	2	2	11	4	689	66476.0	99	0
2236	1977	5	10	13	3	8	16	1	0	3	8	55	31056.0	99	0
2237	1976	185	2	88	15	5	14	2	1	5	8	309	46310.0	99	0
2238	1978	267	38	701	149	165	63	1	4	10	3	1383	65819.0	99	0
2239	1969	169	24	553	188	0	144	1	5	4	7	1078	94871.0	99	1

Параметр	Описание
`data`	DataFrame с данными
`hue`	Столбец для группировки данных (цветовое кодирование)
`vars`	Список переменных для анализа (по умолчанию все числовые колонки)
`height`	Высота каждого графика в дюймах
`aspect`	Соотношение ширины и высоты графиков

Метод	Описание	Типичные функции
`.map(func)`	Применяет функцию ко всем графикам	`sns.scatterplot`, `plt.plot`
`.map_diag(func)`	Только диагональные графики	`sns.histplot`, `sns.kdeplot`
`.map_offdiag(func)`	Все графики кроме диагональных	`sns.scatterplot`, `sns.regplot`
`.map_lower(func)`	Графики ниже диагонали	`sns.kdeplot`, `hexbin`
`.map_upper(func)`	Графики выше диагонали	`sns.scatterplot`, `sns.regplot`

Введение в анализ данных ¶

Exploratory Data Analysis (EDA)¶

Цели EDA¶

1. Проверить качество и надёжность данных¶

2. Оценить распределение признаков и пригодность данных к анализу¶

3. Сформулировать гипотезы, связанные с целевой переменной¶

Данные retail¶

1. Работа с датасетом¶

2. Работа с категориальными данными¶

3. Работа с числовыми данными¶

4. Взаимодействия признаков друг с другом¶

5. Таргетное рассмотрение¶

6. Попробуем вспомнить, зачем мы это делаем¶

Параметр	Описание
`x`, `y`	Переменные на осях (одномерные данные или имена столбцов из `data`)
`hue`	Группировка по категориям (например, `hue='Gender'`)
`data`	DataFrame, из которого берутся данные
`orient`	`'v'` или `'h'` — ориентация графика
`color`	Цвет графика (один цвет)
`palette`	Цветовая палитра (если `hue` задан)
`width`	Ширина ящиков (по умолчанию `0.8`)
`dodge`	Разносить ли ящики по `hue` (`True/False`)
`fliersize`	Размер точек-выбросов
`linewidth`	Толщина линий
`whis`	Длина "усов" (например, `1.5` означает 1.5×IQR)
`notch`	Показывать ли вырез под медиану (`True/False`)

Введение в анализ данных¶

Exploratory Data Analysis (EDA)¶

Цели EDA¶

1. Проверить качество и надёжность данных¶

2. Оценить распределение признаков и пригодность данных к анализу¶

3. Сформулировать гипотезы, связанные с целевой переменной¶

Данные retail¶

1. Работа с датасетом¶

2. Работа с категориальными данными¶

3. Работа с числовыми данными¶

4. Взаимодействия признаков друг с другом¶

5. Таргетное рассмотрение¶

6. Попробуем вспомнить, зачем мы это делаем¶

Введение в анализ данных ¶