File size: 17,826 Bytes

7885a28

import copy

import numpy as np
import pytest
from numpy.testing import assert_allclose

from scipy import stats
from scipy.stats._multicomp import _pvalue_dunnett, DunnettResult


class TestDunnett:
    # For the following tests, p-values were computed using Matlab, e.g.
    #     sample = [18.  15.  18.  16.  17.  15.  14.  14.  14.  15.  15....
    #               14.  15.  14.  22.  18.  21.  21.  10.  10.  11.  9....
    #               25.  26.  17.5 16.  15.5 14.5 22.  22.  24.  22.5 29....
    #               24.5 20.  18.  18.5 17.5 26.5 13.  16.5 13.  13.  13....
    #               28.  27.  34.  31.  29.  27.  24.  23.  38.  36.  25....
    #               38. 26.  22.  36.  27.  27.  32.  28.  31....
    #               24.  27.  33.  32.  28.  19. 37.  31.  36.  36....
    #               34.  38.  32.  38.  32....
    #               26.  24.  26.  25.  29. 29.5 16.5 36.  44....
    #               25.  27.  19....
    #               25.  20....
    #               28.];
    #     j = [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
    #          0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
    #          0 0 0 0...
    #          1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...
    #          2 2 2 2 2 2 2 2 2...
    #          3 3 3...
    #          4 4...
    #          5];
    #     [~, ~, stats] = anova1(sample, j, "off");
    #     [results, ~, ~, gnames] = multcompare(stats, ...
    #     "CriticalValueType", "dunnett", ...
    #     "Approximate", false);
    #     tbl = array2table(results, "VariableNames", ...
    #     ["Group", "Control Group", "Lower Limit", ...
    #     "Difference", "Upper Limit", "P-value"]);
    #     tbl.("Group") = gnames(tbl.("Group"));
    #     tbl.("Control Group") = gnames(tbl.("Control Group"))

    # Matlab doesn't report the statistic, so the statistics were
    # computed using R multcomp `glht`, e.g.:
    #     library(multcomp)
    #     options(digits=16)
    #     control <- c(18.0, 15.0, 18.0, 16.0, 17.0, 15.0, 14.0, 14.0, 14.0,
    #                  15.0, 15.0, 14.0, 15.0, 14.0, 22.0, 18.0, 21.0, 21.0,
    #                  10.0, 10.0, 11.0, 9.0, 25.0, 26.0, 17.5, 16.0, 15.5,
    #                  14.5, 22.0, 22.0, 24.0, 22.5, 29.0, 24.5, 20.0, 18.0,
    #                  18.5, 17.5, 26.5, 13.0, 16.5, 13.0, 13.0, 13.0, 28.0,
    #                  27.0, 34.0, 31.0, 29.0, 27.0, 24.0, 23.0, 38.0, 36.0,
    #                  25.0, 38.0, 26.0, 22.0, 36.0, 27.0, 27.0, 32.0, 28.0,
    #                  31.0)
    #     t <- c(24.0, 27.0, 33.0, 32.0, 28.0, 19.0, 37.0, 31.0, 36.0, 36.0,
    #            34.0, 38.0, 32.0, 38.0, 32.0)
    #     w <- c(26.0, 24.0, 26.0, 25.0, 29.0, 29.5, 16.5, 36.0, 44.0)
    #     x <- c(25.0, 27.0, 19.0)
    #     y <- c(25.0, 20.0)
    #     z <- c(28.0)
    #
    #     groups = factor(rep(c("control", "t", "w", "x", "y", "z"),
    #                         times=c(length(control), length(t), length(w),
    #                                 length(x), length(y), length(z))))
    #     df <- data.frame(response=c(control, t, w, x, y, z),
    #                      group=groups)
    #     model <- aov(response ~ group, data=df)
    #     test <- glht(model=model,
    #                  linfct=mcp(group="Dunnett"),
    #                  alternative="g")
    #     summary(test)
    #     confint(test)
    # p-values agreed with those produced by Matlab to at least atol=1e-3

    # From Matlab's documentation on multcompare
    # `samples_1` holds the five treatment groups and `control_1` the control
    # group; these are the same observations used in the Matlab and R
    # snippets in the comments above.
    samples_1 = [
        [
            24.0, 27.0, 33.0, 32.0, 28.0, 19.0, 37.0, 31.0, 36.0, 36.0,
            34.0, 38.0, 32.0, 38.0, 32.0
        ],
        [26.0, 24.0, 26.0, 25.0, 29.0, 29.5, 16.5, 36.0, 44.0],
        [25.0, 27.0, 19.0],
        [25.0, 20.0],
        [28.0]
    ]
    control_1 = [
        18.0, 15.0, 18.0, 16.0, 17.0, 15.0, 14.0, 14.0, 14.0, 15.0, 15.0,
        14.0, 15.0, 14.0, 22.0, 18.0, 21.0, 21.0, 10.0, 10.0, 11.0, 9.0,
        25.0, 26.0, 17.5, 16.0, 15.5, 14.5, 22.0, 22.0, 24.0, 22.5, 29.0,
        24.5, 20.0, 18.0, 18.5, 17.5, 26.5, 13.0, 16.5, 13.0, 13.0, 13.0,
        28.0, 27.0, 34.0, 31.0, 29.0, 27.0, 24.0, 23.0, 38.0, 36.0, 25.0,
        38.0, 26.0, 22.0, 36.0, 27.0, 27.0, 32.0, 28.0, 31.0
    ]
    # Reference p-values consumed by `test_basic` (one entry per sample).
    pvalue_1 = [4.727e-06, 0.022346, 0.97912, 0.99953, 0.86579]  # Matlab
    # Statistic, alternative p-values, and CIs computed with R multcomp `glht`
    # NOTE(review): the 1e-4 entries presumably stand in for p-values that R
    # prints as "< 1e-4" -- see the tolerance comment in
    # `test_against_R_multicomp_glht`.
    p_1_twosided = [1e-4, 0.02237, 0.97913, 0.99953, 0.86583]
    p_1_greater = [1e-4, 0.011217, 0.768500, 0.896991, 0.577211]
    p_1_less = [1, 1, 0.99660, 0.98398, .99953]
    statistic_1 = [5.27356, 2.91270, 0.60831, 0.27002, 0.96637]
    # Two-sided reference CI is stored as [lower bounds, upper bounds].
    ci_1_twosided = [[5.3633917835622, 0.7296142201217, -8.3879817106607,
                      -11.9090753452911, -11.7655021543469],
                     [15.9709832164378, 13.8936496687672, 13.4556900439941,
                      14.6434503452911, 25.4998771543469]]
    # One-sided reference CIs store only the finite bound: the lower bounds
    # for "greater" and the upper bounds for "less" (the test supplies the
    # infinite side itself).
    ci_1_greater = [5.9036402398526, 1.4000632918725, -7.2754756323636,
                    -10.5567456382391, -9.8675629499576]
    ci_1_less = [15.4306165948619, 13.2230539537359, 12.3429406339544,
                 13.2908248513211, 23.6015228251660]
    # Bundle everything for `test_against_R_multicomp_glht`; the inner dicts
    # are keyed by the alternative name without its hyphen.
    pvalues_1 = dict(twosided=p_1_twosided, less=p_1_less, greater=p_1_greater)
    cis_1 = dict(twosided=ci_1_twosided, less=ci_1_less, greater=ci_1_greater)
    case_1 = dict(samples=samples_1, control=control_1, statistic=statistic_1,
                  pvalues=pvalues_1, cis=cis_1)

    # From Dunnett1955 comparing with R's DescTools: DunnettTest
    # Two treatment groups (`samples_2`) compared against `control_2`.
    samples_2 = [[9.76, 8.80, 7.68, 9.36], [12.80, 9.68, 12.16, 9.20, 10.55]]
    control_2 = [7.40, 8.50, 7.20, 8.24, 9.84, 8.32]
    # Reference p-values consumed by `test_basic` (one entry per sample).
    pvalue_2 = [0.6201, 0.0058]
    # Statistic, alternative p-values, and CIs computed with R multcomp `glht`
    p_2_twosided = [0.6201020, 0.0058254]
    p_2_greater = [0.3249776, 0.0029139]
    p_2_less = [0.91676, 0.99984]
    statistic_2 = [0.85703, 3.69375]
    # Two-sided reference CI is stored as [lower bounds, upper bounds].
    ci_2_twosided = [[-1.2564116462124, 0.8396273539789],
                     [2.5564116462124, 4.4163726460211]]
    # One-sided reference CIs store only the finite bound: the lower bounds
    # for "greater" and the upper bounds for "less".
    ci_2_greater = [-0.9588591188156, 1.1187563667543]
    ci_2_less = [2.2588591188156, 4.1372436332457]
    # Bundle everything for `test_against_R_multicomp_glht`; the inner dicts
    # are keyed by the alternative name without its hyphen.
    pvalues_2 = dict(twosided=p_2_twosided, less=p_2_less, greater=p_2_greater)
    cis_2 = dict(twosided=ci_2_twosided, less=ci_2_less, greater=ci_2_greater)
    case_2 = dict(samples=samples_2, control=control_2, statistic=statistic_2,
                  pvalues=pvalues_2, cis=cis_2)

    # Three treatment groups (`samples_3`) compared against `control_3`.
    # NOTE(review): the origin of this data set and of `pvalue_3` is not
    # stated here -- TODO confirm and cite.
    samples_3 = [[55, 64, 64], [55, 49, 52], [50, 44, 41]]
    control_3 = [55, 47, 48]
    # Reference p-values consumed by `test_basic` (one entry per sample).
    pvalue_3 = [0.0364, 0.8966, 0.4091]
    # Statistic, alternative p-values, and CIs computed with R multcomp `glht`
    p_3_twosided = [0.036407, 0.896539, 0.409295]
    p_3_greater = [0.018277, 0.521109, 0.981892]
    p_3_less = [0.99944, 0.90054, 0.20974]
    statistic_3 = [3.09073, 0.56195, -1.40488]
    # Two-sided reference CI is stored as [lower bounds, upper bounds].
    ci_3_twosided = [[0.7529028025053, -8.2470971974947, -15.2470971974947],
                     [21.2470971974947, 12.2470971974947, 5.2470971974947]]
    # One-sided reference CIs store only the finite bound: the lower bounds
    # for "greater" and the upper bounds for "less".
    ci_3_greater = [2.4023682323149, -6.5976317676851, -13.5976317676851]
    ci_3_less = [19.5984402363662, 10.5984402363662, 3.5984402363662]
    # Bundle everything for `test_against_R_multicomp_glht`; the inner dicts
    # are keyed by the alternative name without its hyphen.
    pvalues_3 = dict(twosided=p_3_twosided, less=p_3_less, greater=p_3_greater)
    cis_3 = dict(twosided=ci_3_twosided, less=ci_3_less, greater=ci_3_greater)
    case_3 = dict(samples=samples_3, control=control_3, statistic=statistic_3,
                  pvalues=pvalues_3, cis=cis_3)

    # From Thomson and Short,
    # Mucociliary function in health, chronic obstructive airway disease,
    # and asbestosis, Journal of Applied Physiology, 1969. Table 1
    # Comparing with R's DescTools: DunnettTest
    # Two treatment groups (`samples_4`) compared against `control_4`.
    samples_4 = [[3.8, 2.7, 4.0, 2.4], [2.8, 3.4, 3.7, 2.2, 2.0]]
    control_4 = [2.9, 3.0, 2.5, 2.6, 3.2]
    # Reference p-values consumed by `test_basic` (one entry per sample).
    pvalue_4 = [0.5832, 0.9982]
    # Statistic, alternative p-values, and CIs computed with R multcomp `glht`
    p_4_twosided = [0.58317, 0.99819]
    p_4_greater = [0.30225, 0.69115]
    p_4_less = [0.91929, 0.65212]
    statistic_4 = [0.90875, -0.05007]
    # Two-sided reference CI is stored as [lower bounds, upper bounds].
    ci_4_twosided = [[-0.6898153448579, -1.0333456251632],
                     [1.4598153448579, 0.9933456251632]]
    # One-sided reference CIs store only the finite bound: the lower bounds
    # for "greater" and the upper bounds for "less".
    ci_4_greater = [-0.5186459268412, -0.8719655502147 ]
    ci_4_less = [1.2886459268412, 0.8319655502147]
    # Bundle everything for `test_against_R_multicomp_glht`; the inner dicts
    # are keyed by the alternative name without its hyphen.
    pvalues_4 = dict(twosided=p_4_twosided, less=p_4_less, greater=p_4_greater)
    cis_4 = dict(twosided=ci_4_twosided, less=ci_4_less, greater=ci_4_greater)
    case_4 = dict(samples=samples_4, control=control_4, statistic=statistic_4,
                  pvalues=pvalues_4, cis=cis_4)

    @pytest.mark.parametrize(
        'rho, n_groups, df, statistic, pvalue, alternative',
        [
            # One-sided critical values from Dunnett1955,
            # Tables 1a and 1b pages 1117-1118
            (0.5, 1, 10, 1.81, 0.05, "greater"),  # differs from two-sided
            (0.5, 3, 10, 2.34, 0.05, "greater"),
            (0.5, 2, 30, 1.99, 0.05, "greater"),
            (0.5, 5, 30, 2.33, 0.05, "greater"),
            (0.5, 4, 12, 3.32, 0.01, "greater"),
            (0.5, 7, 12, 3.56, 0.01, "greater"),
            (0.5, 2, 60, 2.64, 0.01, "greater"),
            (0.5, 4, 60, 2.87, 0.01, "greater"),
            (0.5, 4, 60, [2.87, 2.21], [0.01, 0.05], "greater"),
            # Two-sided critical values from Dunnett1955,
            # Tables 2a and 2b pages 1119-1120
            (0.5, 1, 10, 2.23, 0.05, "two-sided"),
            (0.5, 3, 10, 2.81, 0.05, "two-sided"),
            (0.5, 2, 30, 2.32, 0.05, "two-sided"),
            (0.5, 3, 20, 2.57, 0.05, "two-sided"),
            (0.5, 4, 12, 3.76, 0.01, "two-sided"),
            (0.5, 7, 12, 4.08, 0.01, "two-sided"),
            (0.5, 2, 60, 2.90, 0.01, "two-sided"),
            (0.5, 4, 60, 3.14, 0.01, "two-sided"),
            (0.5, 4, 60, [3.14, 2.55], [0.01, 0.05], "two-sided"),
        ],
    )
    def test_critical_values(
        self, rho, n_groups, df, statistic, pvalue, alternative
    ):
        # Feed tabulated critical values to `_pvalue_dunnett` and check that
        # the significance level they were tabulated for is recovered.
        seed = 165250594791731684851746311027739134893

        # Correlation matrix with `rho` everywhere off the diagonal and ones
        # on the diagonal.
        on_diagonal = np.eye(n_groups, dtype=bool)
        correlation = np.where(on_diagonal, 1.0, rho)

        computed = _pvalue_dunnett(
            rho=correlation,
            df=df,
            statistic=np.asarray(statistic),
            alternative=alternative,
            rng=np.random.default_rng(seed),
        )
        assert_allclose(computed, pvalue, atol=5e-3)

    @pytest.mark.parametrize(
        'samples, control, pvalue, statistic',
        [
            (samples_1, control_1, pvalue_1, statistic_1),
            (samples_2, control_2, pvalue_2, statistic_2),
            (samples_3, control_3, pvalue_3, statistic_3),
            (samples_4, control_4, pvalue_4, statistic_4),
        ]
    )
    def test_basic(self, samples, control, pvalue, statistic):
        # Run `dunnett` without specifying an alternative on each reference
        # data set and compare the outcome with the stored reference values.
        seed = 11681140010308601919115036826969764808
        outcome = stats.dunnett(
            *samples, control=control, rng=np.random.default_rng(seed)
        )

        assert isinstance(outcome, DunnettResult)
        # statistics are held to a much tighter tolerance than the p-values
        assert_allclose(outcome.statistic, statistic, rtol=5e-5)
        assert_allclose(outcome.pvalue, pvalue, rtol=1e-2, atol=1e-4)

    @pytest.mark.parametrize('alternative', ['two-sided', 'less', 'greater'])
    def test_ttest_ind(self, alternative):
        # With a single sample and a control there are only two groups, and
        # `dunnett` must then agree with `ttest_ind`.
        rng = np.random.default_rng(114184017807316971636137493526995620351)
        n_trials = 10
        shape = (10,)

        for _ in range(n_trials):
            # draw order matters for reproducibility: sample, then control
            sample = rng.integers(-100, 100, size=shape)
            control = rng.integers(-100, 100, size=shape)

            # preserve use of old random_state during SPEC 7 transition
            res = stats.dunnett(sample, control=control,
                                alternative=alternative, random_state=rng)
            ref = stats.ttest_ind(sample, control, alternative=alternative)

            for attribute in ('statistic', 'pvalue'):
                assert_allclose(
                    getattr(res, attribute), getattr(ref, attribute),
                    rtol=1e-3, atol=1e-5
                )

    @pytest.mark.parametrize(
        'alternative, pvalue',
        [
            ('less', [0, 1]),
            ('greater', [1, 0]),
            ('two-sided', [0, 0]),
        ]
    )
    def test_alternatives(self, alternative, pvalue):
        # One sample lies far below the control and one far above it, so the
        # p-values are numerically 0 or 1 depending on the alternative.
        rng = np.random.default_rng(114184017807316971636137493526995620351)

        # Each group is drawn from a range of width 20; a sample and the
        # control differ by at least 60 and by at most 100.
        # Draw order (low sample, control, high sample) is kept fixed.
        below = rng.integers(0, 20, size=(10,))
        control = rng.integers(80, 100, size=(10,))
        above = rng.integers(160, 180, size=(10,))

        res = stats.dunnett(below, above, control=control,
                            alternative=alternative, rng=rng)
        assert_allclose(res.pvalue, pvalue, atol=1e-7)

        ci = res.confidence_interval()
        # 'greater' has a finite lower bound only, 'less' a finite upper
        # bound only, and 'two-sided' has both.
        finite_low = alternative in ('greater', 'two-sided')
        finite_high = alternative in ('less', 'two-sided')

        if finite_low:
            assert -100 < ci.low[0] < -60
            assert 60 < ci.low[1] < 100
        else:
            assert np.isneginf(ci.low).all()

        if finite_high:
            assert -100 < ci.high[0] < -60
            assert 60 < ci.high[1] < 100
        else:
            assert np.isposinf(ci.high).all()

    @pytest.mark.parametrize("case", [case_1, case_2, case_3, case_4])
    @pytest.mark.parametrize("alternative", ['less', 'greater', 'two-sided'])
    def test_against_R_multicomp_glht(self, case, alternative):
        # Compare p-values and 95% confidence intervals with the reference
        # values stored in `case`, and check that the interval is cached.
        seed = 189117774084579816190295271136455278291
        # the reference dicts are keyed by the alternative without its hyphen
        key = alternative.replace('-', '')
        expected_pvalue = case['pvalues'][key]

        res = stats.dunnett(
            *case['samples'], control=case['control'],
            alternative=alternative, rng=np.random.default_rng(seed)
        )
        # atol can't be tighter because R reports some pvalues as "< 1e-4"
        assert_allclose(res.pvalue, expected_pvalue, rtol=5e-3, atol=1e-4)

        # One-sided references hold only the finite bound; the two-sided
        # reference holds the pair (lower bounds, upper bounds).
        reference = case['cis'][key]
        if alternative == "greater":
            expected_low, expected_high = reference, np.inf
        elif alternative == "less":
            expected_low, expected_high = -np.inf, reference
        else:
            expected_low, expected_high = reference

        # nothing is cached before a confidence interval is requested
        assert res._ci is None
        assert res._ci_cl is None
        ci = res.confidence_interval(confidence_level=0.95)
        assert_allclose(ci.low, expected_low, rtol=5e-3, atol=1e-5)
        assert_allclose(ci.high, expected_high, rtol=5e-3, atol=1e-5)

        # Asking again at the same confidence level must hand back the
        # cached object itself, hence the identity (`is`) checks.
        assert res._ci is ci
        assert res._ci_cl == 0.95
        assert res.confidence_interval(confidence_level=0.95) is ci

    @pytest.mark.parametrize('alternative', ["two-sided", "less", "greater"])
    def test_str(self, alternative):
        # Spot-check the text produced by `str()` on a result object.
        seed = 189117774084579816190295271136455278291
        res = stats.dunnett(*self.samples_3, control=self.control_3,
                            alternative=alternative,
                            rng=np.random.default_rng(seed))
        rendered = str(res)

        # fragments expected regardless of the alternative
        for fragment in ('(Sample 2 - Control)', '95.0%'):
            assert fragment in rendered

        # one-sided output shows an infinite bound next to a finite one
        one_sided_fragments = {
            'less': ('-inf', '19.'),
            'greater': ('inf', '-13.'),
        }
        if alternative in one_sided_fragments:
            for fragment in one_sided_fragments[alternative]:
                assert fragment in rendered
        else:
            # two-sided output has no infinite bound at all
            assert 'inf' not in rendered
            assert '21.' in rendered

    def test_warnings(self):
        # Calling the private `_allowance` with tol=1e-5 is expected to emit
        # the non-convergence UserWarning.
        seed = 189117774084579816190295271136455278291
        res = stats.dunnett(*self.samples_3, control=self.control_3,
                            rng=np.random.default_rng(seed))

        with pytest.warns(
            UserWarning,
            match=r"Computation of the confidence interval did not converge",
        ):
            res._allowance(tol=1e-5)

    def test_raises(self):
        # Each kind of invalid input must be rejected with a ValueError
        # whose message matches the expected pattern.
        samples, control = self.samples_3, self.control_3

        def assert_rejected(groups, reference_group, message, **options):
            with pytest.raises(ValueError, match=message):
                stats.dunnett(*groups, control=reference_group, **options)

        # alternative
        assert_rejected(samples, control, "alternative must be",
                        alternative='bob')

        # 2D for a sample (first sample wrapped in an extra list)
        assert_rejected([[samples[0]], *samples[1:]], control,
                        "must be 1D arrays")

        # 2D for control
        assert_rejected(samples, [control], "must be 1D arrays")

        # No obs in a sample (second sample emptied)
        assert_rejected([samples[0], [], *samples[2:]], control,
                        "at least 1 observation")

        # No obs in control
        assert_rejected(samples, [], "at least 1 observation")

        # invalid confidence level on an otherwise valid result
        res = stats.dunnett(*samples, control=control)
        with pytest.raises(ValueError, match="Confidence level must"):
            res.confidence_interval(confidence_level=3)

    @pytest.mark.filterwarnings("ignore:Computation of the confidence")
    @pytest.mark.parametrize('n_samples', [1, 2, 3])
    def test_shapes(self, n_samples):
        # Every output must be 1D with one entry per sample.
        rng = np.random.default_rng(689448934110805334)
        n_obs = 10
        expected_shape = (n_samples,)

        # draw order matters for reproducibility: samples, then control
        samples = rng.normal(size=(n_samples, n_obs))
        control = rng.normal(size=n_obs)
        res = stats.dunnett(*samples, control=control, rng=rng)
        for output in (res.statistic, res.pvalue):
            assert output.shape == expected_shape

        ci = res.confidence_interval()
        for bound in (ci.low, ci.high):
            assert bound.shape == expected_shape