"""
Apply 3 algorithms on 6 problems for 21 runs and compare results with tests.

We apply three algorithms to six problem instances and conduct 31 runs per
instance-algorithm combination. We then apply the two-tailed Mann-Whitney U
test pairwise to the results and print its outcomes in a table.

Such a statistical test can tell us how likely the observed difference between
two sets of results would have occurred if the results would stem from the
same algorithm. If this `p`-value is small, then the two sets are likely to
stem from differently-performing algorithms. In order to decide whether or not
a `p`-value is small enough, it is compared to a significance threshold
`alpha`, which is set to `alpha=0.02` by default. In other words, `alpha` is
the maximum probability of "being wrong" that we will accept. However, since
we are performing more than one test, we need to adjust `alpha`. We therefore
use the Bonferroni correction, which sets `alpha'=alpha/n_texts`, where
`n_tests` is the number of conducted tests.

The first column of the table denotes the problem instances. Each of the other
three columns represents a pair of algorithms. In each cell, the pair is
compared based on the results on the instance of the row. The cell ten holds
the `p`-value of the two-tailed Mann-Whitney U test. If the first algorithm
is significantly better (at `p<alpha'`) than the second algorithm, then the
cell is marked with `<`. If the first algorithm is significantly worse (at
`p<alpha'`) than the second algorithm, then the cell is marked with `>`. If
the observed differences are not significant (`p>=alpha'`), then the cell
is marked with `?`.

Finally, the bottom row sums up the numbers of `<`, `?`, and `>` outcomes for
each algorithm pair.

However, there could also be a situation where a statistical comparison makes
no sense as no difference could reliably be detected anyway. For example, if
one algorithm has a smaller median result but a larger mean result, or if the
medians are the same, or if the means are the same. Regardless of what outcome
a test would have, we could not really claim that any of the algorithms was
better or worse. In such cases, no test is performed and `-` is printed
instead (signified by `&mdash;` in the markdown format).
"""
from pycommons.io.temp import temp_dir

from moptipy.algorithms.so.rls import RLS
from moptipy.api.execution import Execution
from moptipy.api.experiment import run_experiment
from moptipy.evaluation.end_results import from_logs
from moptipy.evaluation.tabulate_result_tests import tabulate_result_tests
from moptipy.examples.bitstrings.leadingones import LeadingOnes
from moptipy.examples.bitstrings.onemax import OneMax
from moptipy.examples.bitstrings.trap import Trap
from moptipy.operators.bitstrings.op0_random import Op0Random
from moptipy.operators.bitstrings.op1_flip1 import Op1Flip1
from moptipy.operators.bitstrings.op1_m_over_n_flip import Op1MoverNflip
from moptipy.spaces.bitstrings import BitStrings

# The six problems we want to try to solve:
problems = [lambda: OneMax(100),  # 100-dimensional OneMax
            lambda: OneMax(200),  # 200-dimensional OneMax
            lambda: Trap(100),  # 100-dimensional Trap
            lambda: Trap(200),  # 200-dimensional Trap
            lambda: LeadingOnes(100),  # 100-dimensional LeadingOnes
            lambda: LeadingOnes(200)]  # 200-dimensional LeadingOnes


def rls(problem, op1) -> Execution:
    """
    Create an RLS Execution.

    :param problem: the problem (OneMax, Trap, or LeadingOnes)
    :param op1: the unary operator to use
    :returns: the execution
    """
    return Execution().set_solution_space(BitStrings(problem.n)) \
        .set_objective(problem).set_algorithm(
        RLS(Op0Random(), op1)).set_max_fes(300)


# We execute the whole experiment in a temp directory.
# For a real experiment, you would put an existing directory path into `td` by
# doing `from pycommons.io.path import Path; td = Path("mydir")` and not use
# the `with` block.
with temp_dir() as td:  # create temporary directory `td`
    run_experiment(
        base_dir=td,  # set the base directory for log files
        instances=problems,  # define the problem instances
        setups=[lambda p: rls(p, Op1Flip1()),  # RLS + flip 1 bit
                lambda p: rls(p, Op1MoverNflip(p.n, 1)),  # flip bits at p=1/n
                lambda p: rls(p, Op1MoverNflip(p.n, 2)),  # flip bits at p=2/n
                ], n_runs=21)  # conduct 21 independent runs per setup

    end_results = list(from_logs(td))  # we will load the data into this list

    # create a markdown table with statistical test results
    file = tabulate_result_tests(end_results, dir_name=td)
    print("\n")
    print(file.read_all_str())  # and print it

# The temp directory is deleted as soon as we leave the `with` block.