import pandas as pd
import scipy.stats as sps
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns

import random
# Pin the seed of both random number generators so that generated data is reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# NOTE(review): every `<...>` in this section is an unfilled template placeholder and is
# not valid Python; the file cannot be parsed until each one is replaced with real code.
# `size` is presumably the length of the samples generated below -- confirm once filled in.
size=100

sample = <...>
<...>

# sample_1 / sample_2 are re-bound in each of the four stanzas below, so when the file
# runs top to bottom only the last pair remains bound afterwards.
sample_1 = <...>
sample_2 = <...>
<...>

sample_1 = <...>
sample_2 = <...>
<...>

sample_1 = <...>
sample_2 = <...>
<...>

sample_1 = <...>
sample_2 = <...>
<...>

# Load the iris data set through seaborn's dataset helper.
df = sns.load_dataset("iris")

# Grid of pairwise plots with one colour per species: kdeplot on the lower triangle,
# scatter on the upper triangle, kdeplot on the diagonal.
g = sns.PairGrid(data=df, hue="species")
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3);

# First rows of the table.
df.head()

# Distinct species labels.
np.unique(df["species"])

# Two-sample t-test without the equal-variance assumption (equal_var=False):
# sepal length, setosa vs versicolor.
sps.ttest_ind(
    df.loc[df["species"] == "setosa", "sepal_length"],
    df.loc[df["species"] == "versicolor", "sepal_length"],
    equal_var=False,
)

# Same test: sepal length, virginica vs versicolor.
sps.ttest_ind(
    df.loc[df["species"] == "virginica", "sepal_length"],
    df.loc[df["species"] == "versicolor", "sepal_length"],
    equal_var=False,
)

# Same test: sepal width, virginica vs versicolor.
sps.ttest_ind(
    df.loc[df["species"] == "virginica", "sepal_width"],
    df.loc[df["species"] == "versicolor", "sepal_width"],
    equal_var=False,
)

# Simulation settings: `sample_count` is the number of iterations of the test loop
# further down; `sample_size` is presumably the length of each generated sample -- confirm.
sample_size, sample_count = 100, 1000

# Value passed to wald_test as the `theta` argument.
theta = 0.5

def wald_test(sample, theta, estimation_theta, estimation_sigma,
              alternative='two_sided', alpha=0.05):
    """
    Wald test for the hypothesis that the parameter equals `theta`, together with
    the confidence interval that matches the chosen alternative.

    The statistic sqrt(n) * (estimation_theta - theta) / estimation_sigma is compared
    with the standard normal distribution.

    param sample: sample realization; only its length n is used
    param theta: hypothesized value of the parameter
    param estimation_theta: estimate of the parameter
    param estimation_sigma: estimate of the asymptotic scale of the estimator; the
        statistic divides by it directly, so it must be on the standard-deviation
        scale (not the variance)
    param alternative: kind of alternative hypothesis, one of 'two_sided', 'less',
        'greater'
    param alpha: significance level used for the confidence interval
        (confidence level is 1 - alpha); defaults to 0.05

    return statistic: value of the Wald statistic
    return p_value: p-value for the chosen alternative
    return conf_int: (lower, upper) confidence bounds for the parameter, finite
        bounds rounded to 4 decimals; one-sided alternatives give a one-sided
        interval with an infinite bound on the open side

    raises ValueError: if `alternative` is not a supported name or `alpha` is not
        strictly between 0 and 1
    """

    if not 0 < alpha < 1:
        raise ValueError('alpha must be in (0, 1)')

    n = len(sample)
    root_n = np.sqrt(n)
    statistic = root_n * (estimation_theta - theta) / estimation_sigma

    if alternative == 'two_sided':
        # Both tails are equal by symmetry of the normal distribution.
        p_value = 2 * sps.norm.sf(np.abs(statistic))
        margin = sps.norm.ppf(1 - alpha / 2) * estimation_sigma / root_n
        conf_int = (round(estimation_theta - margin, 4),
                    round(estimation_theta + margin, 4))

    elif alternative == 'less':
        p_value = sps.norm.cdf(statistic)
        # A one-sided interval puts all of alpha in one tail, hence the 1 - alpha
        # quantile; this keeps the interval consistent with the one-sided p-value.
        margin = sps.norm.ppf(1 - alpha) * estimation_sigma / root_n
        conf_int = (-np.inf, round(estimation_theta + margin, 4))

    elif alternative == 'greater':
        p_value = sps.norm.sf(statistic)
        margin = sps.norm.ppf(1 - alpha) * estimation_sigma / root_n
        # Lower confidence bound lies BELOW the estimate (minus, not plus).
        conf_int = (round(estimation_theta - margin, 4), np.inf)

    else:
        raise ValueError('alternative name is wrong')

    return statistic, p_value, conf_int

# NOTE(review): every `<...>` below is an unfilled template placeholder, not valid Python;
# each has to be replaced with real code before this section can run.
sample = <...>

estimation_theta = <...>
estimation_sigma = <...>


# Number of iterations in which `is_rejected` was truthy.
counter = 0

# One two-sided Wald test (wald_test's default alternative) per index i. sample,
# estimation_theta and estimation_sigma are all indexed by i, so each presumably holds
# sample_count entries -- TODO confirm once the placeholders are filled in.
for i in range(sample_count):
    _, p_value, conf_int = wald_test(sample[i], theta, estimation_theta[i], estimation_sigma[i])
    is_rejected = <...>
    if is_rejected:
        counter += 1
        
# Share of the sample_count iterations that were counted as rejections.
counter / sample_count

# NOTE(review): every `<...>` below is an unfilled template placeholder, not valid Python;
# each has to be replaced with real code before this section can run.
sample_all = <...>

counter = 0
# The loop variable re-binds the module-level names `sample`, `estimation_theta` and
# `estimation_sigma` on every iteration.
for sample in sample_all:
    estimation_theta = <...>
    estimation_sigma = <...>

    # Up to 100 two-sided Wald tests per element of sample_all; the break leaves the
    # inner loop at the first rejection, so counter grows by at most one per element.
    # NOTE(review): the bound 100 is hard-coded -- presumably the number of samples
    # inside each element; confirm.
    for i in range(100):
        _, p_value, conf_int = wald_test(sample[i], theta, estimation_theta[i], estimation_sigma[i])
        is_rejected = <...>
        if is_rejected:
            counter += 1
            break

# NOTE(review): the divisor 1000 is hard-coded; this is a proportion only if sample_all
# has 1000 elements -- confirm.
counter / 1000

# NOTE(review): every `<...>` below is an unfilled template placeholder, not valid Python;
# each has to be replaced with real code before this section can run.
sample_all = <...>

counter = 0
# The loop variable re-binds the module-level names `sample`, `estimation_theta` and
# `estimation_sigma` on every iteration.
for sample in sample_all:
    estimation_theta = <...>
    estimation_sigma = <...>

    # Up to 100 two-sided Wald tests per element of sample_all; the break leaves the
    # inner loop at the first rejection, so counter grows by at most one per element.
    # NOTE(review): the bound 100 is hard-coded -- presumably the number of samples
    # inside each element; confirm.
    for i in range(100):
        _, p_value, conf_int = wald_test(sample[i], theta, estimation_theta[i], estimation_sigma[i])
        is_rejected = <...>
        if is_rejected:
            counter += 1
            break

# NOTE(review): the divisor 1000 is hard-coded; this is a proportion only if sample_all
# has 1000 elements -- confirm.
counter / 1000

# Критерии вида t-test

# Одновыборочный

# Двухвыборочный

# Независимые выборки

# Связные выборки

# Пример: ирисы Фишера

# Множественная проверка гипотез