import polars as pl
from pathlib import Path
import pytest
from kyd_dataspec_gen.profile_full_data import (
    gini_impurity,
    unalikeability,
    get_stats,
    check_replace_profile_data,
)

# Directory containing this test module; test data paths below are built
# relative to it.
curr_dir = Path(__file__).parent
# Shared input for the gini_impurity / unalikeability tests: two values
# ("a" and "b") listed alongside the counts 1 and 2.
categories = {"values": ["a", "b"], "count": [1, 2]}
# Passed as the second argument together with `categories`; it matches the
# sum of the counts above (1 + 2).
sample_size = 3


def test_gini_impurity():
    """Check gini_impurity on the shared two-value fixture.

    The module-level ``categories`` / ``sample_size`` inputs are expected to
    yield exactly 0.4444.
    """
    expected = 0.4444
    assert gini_impurity(categories, sample_size) == expected


def test_unalikeability():
    """Check unalikeability on the shared two-value fixture.

    The module-level ``categories`` / ``sample_size`` inputs are expected to
    yield exactly 0.6667.
    """
    expected = 0.6667
    assert unalikeability(categories, sample_size) == expected


def test_get_stats():
    """Check get_stats on the character lengths of the ``jurisdiction`` column.

    The column is read from the Panama Papers entities CSV in ``test_data``
    and replaced by the length of each string before being passed to
    ``get_stats``.
    """
    csv_path = curr_dir.joinpath("test_data", "panama_papers_nodes_entities.csv")
    frame = pl.read_csv(csv_path)
    # A single select both narrows the frame to the one column and swaps the
    # strings for their lengths, keeping the original column name.
    jurisdiction_lengths = frame.select(
        pl.col("jurisdiction").str.len_chars().alias("jurisdiction")
    )
    stats = get_stats(jurisdiction_lengths, "jurisdiction")

    # Expected values, checked in this order.
    expected_stats = {
        "null_count": 0,
        "min": 3.0,
        "max": 3.0,
        "stdv": 0.0,
        "q_0": 3.0,
        "q_1": 3.0,
        "q_2": 3.0,
    }
    for stat_name, expected_value in expected_stats.items():
        assert stats[stat_name] == expected_value


@pytest.mark.parametrize(
    ("raw_dataset_path", "profile", "expected_output"),
    [
        pytest.param(
            curr_dir / "test_data" / "test_profile_full_data_entities.csv",
            # ``profile``: the input profile handed to
            # check_replace_profile_data (samples_used == 4).
            {
                "global_stats": {
                    "samples_used": 4,
                    "column_count": 17,
                    "row_count": 8,
                    "row_has_null_ratio": 1.0,
                    "row_is_null_ratio": 0.0,
                    "unique_row_ratio": 1.0,
                    "duplicate_row_count": 0,
                    "file_type": "csv",
                    "encoding": "utf-8",
                    "correlation_matrix": None,
                    "chi2_matrix": "[[1.        , 0.3325939 , 0.09157819, 0.09157819, 0.15623563, 0.15623563,\n  0.3325939 , 0.23810331, 0.23810331, 0.        , 0.3325939 , 0.09157819,\n  0.        , 0.09157819, 0.09157819, 0.09157819, 0.        ], ... , [ 0.,  0., nan, nan,  0.,  0.,  0.,  0.,  0., nan,  0., nan, nan, nan,\n  nan, nan,  1.]]",
                    "profile_schema": {
                        "node_id": [0],
                        "incorporation_date": [1],
                        "closed_date": [2],
                    },
                    "times": {"row_stats": 0.0017},
                },
                "data_stats": [
                    {
                        "column_name": "node_id",
                        "data_type": "int",
                        "categorical": True,
                        "order": "ascending",
                        "samples": "['10000055', '10000001', '10000002', '10000003']",
                        "statistics": {
                            "min": 10000001.0,
                            "max": 10000055.0,
                            "mode": "[10000001.027, 10000001.999, 10000003.025, 10000054.973]",
                            "median": 10000002.512,
                            "sum": 40000061.0,
                            "mean": 10000015.25,
                            "variance": 702.9167,
                            "stddev": 26.5126,
                            "skewness": 1.9943,
                            "kurtosis": 3.981,
                            "histogram": {
                                "bin_edges": "[10000001. , 10000014.5, 10000028. , 10000041.5, 10000055. ]",
                                "bin_counts": "[3., 0., 0., 1.]",
                            },
                            "quantiles": {
                                "0": 10000001.054,
                                "1": 10000002.512,
                                "2": 10000003.052,
                            },
                            "median_abs_deviation": 1.0137,
                            "num_zeros": 0,
                            "num_negatives": 0,
                            "times": {
                                "min": 0.0,
                                "max": 0.0,
                                "sum": 0.0,
                                "variance": 0.0001,
                                "skewness": 0.0001,
                                "kurtosis": 0.0001,
                                "histogram_and_quantiles": 0.0006,
                                "num_zeros": 0.0,
                                "num_negatives": 0.0,
                            },
                            "unique_count": 4,
                            "unique_ratio": 1.0,
                            "categories": "['10000001', '10000002', '10000003', '10000055']",
                            "gini_impurity": 0.75,
                            "unalikeability": 1.0,
                            "categorical_count": {
                                "10000001": 1,
                                "10000002": 1,
                                "10000003": 1,
                                "10000055": 1,
                            },
                            "sample_size": 4,
                            "null_count": 0,
                            "null_types": [],
                            "null_types_index": {},
                            "data_type_representation": {
                                "datetime": 0.0,
                                "int": 1.0,
                                "float": 1.0,
                                "string": 1.0,
                            },
                        },
                    },
                    {
                        "column_name": "incorporation_date",
                        "data_type": "string",
                        "categorical": True,
                        "order": "random",
                        "samples": "['10-JAN-2006', '08-NOV-2006', '27-MAR-2006', '23-MAR-2006']",
                        "statistics": {
                            "min": 11.0,
                            "max": 11.0,
                            "mode": "[11.]",
                            "median": 11.0,
                            "sum": 44.0,
                            "mean": 11.0,
                            "variance": 0.0,
                            "stddev": 0.0,
                            "skewness": 0.0,
                            "kurtosis": -13.5,
                            "histogram": {
                                "bin_counts": "[4]",
                                "bin_edges": "[11., 11.]",
                            },
                            "quantiles": {"0": 11.0, "1": 11.0, "2": 11.0},
                            "median_abs_deviation": 0.0,
                            "times": {
                                "vocab": 0.0,
                                "min": 0.0,
                                "max": 0.0,
                                "sum": 0.0,
                                "variance": 0.0,
                                "skewness": 0.0001,
                                "kurtosis": 0.0001,
                                "histogram_and_quantiles": 0.0002,
                            },
                            "vocab": "['N', '-', '6', '0', '2', ... , 'J', 'R', '7', 'V', 'O']",
                            "unique_count": 4,
                            "unique_ratio": 1.0,
                            "categories": "['23-MAR-2006', '27-MAR-2006', '10-JAN-2006', '08-NOV-2006']",
                            "gini_impurity": 0.75,
                            "unalikeability": 1.0,
                            "categorical_count": {
                                "23-MAR-2006": 1,
                                "27-MAR-2006": 1,
                                "10-JAN-2006": 1,
                                "08-NOV-2006": 1,
                            },
                            "sample_size": 4,
                            "null_count": 0,
                            "null_types": [],
                            "null_types_index": {},
                            "data_type_representation": {
                                "datetime": 0.0,
                                "int": 0.0,
                                "float": 0.0,
                                "string": 1.0,
                            },
                        },
                    },
                    {
                        "column_name": "closed_date",
                        "data_type": None,
                        "categorical": True,
                        "order": None,
                        "samples": [],
                        "statistics": {
                            "unique_count": 0,
                            "unique_ratio": 0,
                            "categories": [],
                            "gini_impurity": None,
                            "unalikeability": None,
                            "categorical_count": {},
                            "sample_size": 4,
                            "null_count": 4,
                            "null_types": "['']",
                            "null_types_index": {"": "[0, 1, 2, 3]"},
                            "num_negatives": 0,
                            "num_zeros": 0,
                        },
                    },
                ],
            },
            # ``expected_output``: the profile the call is expected to return
            # (samples_used == 8).
            {
                "global_stats": {
                    "samples_used": 8,
                    "column_count": 17,
                    "row_count": 8,
                    "row_has_null_ratio": 1.0,
                    "row_is_null_ratio": 0.0,
                    "unique_row_ratio": 1.0,
                    "duplicate_row_count": 0,
                    "file_type": "csv",
                    "encoding": "utf-8",
                    "correlation_matrix": None,
                    "chi2_matrix": "[[1.        , 0.3325939 , 0.09157819, 0.09157819, 0.15623563, 0.15623563,\n  0.3325939 , 0.23810331, 0.23810331, 0.        , 0.3325939 , 0.09157819,\n  0.        , 0.09157819, 0.09157819, 0.09157819, 0.        ], ... , [ 0.,  0., nan, nan,  0.,  0.,  0.,  0.,  0., nan,  0., nan, nan, nan,\n  nan, nan,  1.]]",
                    "profile_schema": {
                        "node_id": [0],
                        "incorporation_date": [1],
                        "closed_date": [2],
                    },
                    "times": {"row_stats": 0.0017},
                },
                "data_stats": [
                    {
                        "column_name": "node_id",
                        "data_type": "int",
                        "categorical": True,
                        "order": "ascending",
                        "samples": "['10000055', '10000001', '10000002', '10000003']",
                        "statistics": {
                            "min": 10000001.0,
                            "max": 10000061.0,
                            "mode": [],
                            "median": 10000056.0,
                            "sum": 90000353,
                            "mean": 10000039.2222,
                            "variance": 782.9444,
                            "stddev": 27.9811,
                            "skewness": -0.6944,
                            "kurtosis": -1.4969,
                            "histogram": {
                                "bin_edges": [
                                    10000007.0,
                                    10000013.0,
                                    10000019.0,
                                    10000025.0,
                                    10000031.0,
                                    10000037.0,
                                    10000043.0,
                                    10000049.0,
                                    10000055.0,
                                    10000061.0,
                                ],
                                "bin_counts": [
                                    3,
                                    0,
                                    0,
                                    0,
                                    0,
                                    0,
                                    0,
                                    0,
                                    1,
                                    5,
                                ],
                            },
                            "quantiles": {
                                "0": 10000003.0,
                                "1": 10000056.0,
                                "2": 10000058.0,
                            },
                            "median_abs_deviation": 4.0,
                            "num_zeros": 0,
                            "num_negatives": 0,
                            "times": {},
                            "unique_count": 9,
                            "unique_ratio": 1.0,
                            "categories": [
                                10000001,
                                10000002,
                                10000003,
                                10000055,
                                10000056,
                                10000057,
                                10000058,
                                10000060,
                                10000061,
                            ],
                            "gini_impurity": 0.8889,
                            "unalikeability": 1.0,
                            "categorical_count": {
                                10000001: 1,
                                10000002: 1,
                                10000003: 1,
                                10000055: 1,
                                10000056: 1,
                                10000057: 1,
                                10000058: 1,
                                10000060: 1,
                                10000061: 1,
                            },
                            "sample_size": 9,
                            "null_count": 0,
                            "null_types": [],
                            "null_types_index": {},
                            "data_type_representation": {
                                "datetime": 0.0,
                                "int": 1.0,
                                "float": 1.0,
                                "string": 1.0,
                            },
                        },
                    },
                    {
                        "column_name": "incorporation_date",
                        "data_type": "string",
                        "categorical": True,
                        "order": "random",
                        "samples": "['10-JAN-2006', '08-NOV-2006', '27-MAR-2006', '23-MAR-2006']",
                        "statistics": {
                            "min": 11.0,
                            "max": 11.0,
                            "mode": [],
                            "median": 11.0,
                            "sum": 99,
                            "mean": 11.0,
                            "variance": 0.0,
                            "stddev": 0.0,
                            "skewness": 0.0,
                            "kurtosis": 0.0,
                            "histogram": {
                                "bin_counts": [
                                    0,
                                    0,
                                    0,
                                    0,
                                    9,
                                    0,
                                    0,
                                    0,
                                    0,
                                    0,
                                ],
                                "bin_edges": [
                                    10.6,
                                    10.7,
                                    10.8,
                                    10.9,
                                    11.0,
                                    11.1,
                                    11.2,
                                    11.3,
                                    11.4,
                                    11.5,
                                ],
                            },
                            "quantiles": {"0": 11.0, "1": 11.0, "2": 11.0},
                            "median_abs_deviation": 0.0,
                            "times": {},
                            "vocab": "['N', '-', '6', '0', '2', ... , 'J', 'R', '7', 'V', 'O']",
                            "unique_count": 9,
                            "unique_ratio": 1.0,
                            "num_negatives": 0,
                            "num_zeros": 0,
                            "categories": [
                                "29-SEP-2005",
                                "16-AUG-2004",
                                "17-OCT-2006",
                                "12-JUN-2006",
                                "27-MAR-2006",
                                "23-MAR-2006",
                                "10-JAN-2006",
                                "08-NOV-2006",
                                "24-NOV-2006",
                            ],
                            "gini_impurity": 0.8889,
                            "unalikeability": 1.0,
                            "categorical_count": {
                                "23-MAR-2006": 1,
                                "27-MAR-2006": 1,
                                "10-JAN-2006": 1,
                                "08-NOV-2006": 1,
                                "12-JUN-2006": 1,
                                "29-SEP-2005": 1,
                                "24-NOV-2006": 1,
                                "16-AUG-2004": 1,
                                "17-OCT-2006": 1,
                            },
                            "sample_size": 9,
                            "null_count": 0,
                            "null_types": [],
                            "null_types_index": {},
                            "data_type_representation": {
                                "datetime": 0.0,
                                "int": 0.0,
                                "float": 0.0,
                                "string": 1.0,
                            },
                        },
                    },
                    {
                        "column_name": "closed_date",
                        "data_type": None,
                        "categorical": True,
                        "order": None,
                        "samples": [],
                        "statistics": {
                            "unique_count": 0,
                            "unique_ratio": 0,
                            "categories": [],
                            "gini_impurity": None,
                            "unalikeability": None,
                            "categorical_count": {},
                            "sample_size": 9,
                            "null_count": 9,
                            "null_types": "['']",
                            "null_types_index": {"": [0, 1, 2, 3, 4, 5, 6, 7, 8]},
                            "num_negatives": 0,
                            "num_zeros": 0,
                        },
                    },
                ],
            },
        )
    ],
)
def test_check_replace_profile_data(raw_dataset_path, profile, expected_output):
    """Check that check_replace_profile_data returns the expected profile.

    Args:
        raw_dataset_path: Path to the CSV file passed to the function.
        profile: Input profile dictionary passed to the function.
        expected_output: Profile dictionary the result is compared against.

    ``mode`` and ``categories`` are compared ignoring element order for every
    column except the last one (the all-null ``closed_date`` column in the
    fixture, which has no ``mode``). Everything else must match exactly.

    The comparison works on filtered copies instead of deleting keys in
    place: pytest passes parametrized values to the test without copying them,
    so mutating ``expected_output`` (or the returned profile) would corrupt
    the fixture for any repeated execution in the same session.
    """
    unordered_keys = ("mode", "categories")

    def without_unordered(columns):
        """Copy *columns*, dropping ``unordered_keys`` from all but the last."""
        trimmed = [
            {
                **column,
                "statistics": {
                    key: value
                    for key, value in column["statistics"].items()
                    if key not in unordered_keys
                },
            }
            for column in columns[:-1]
        ]
        # The last column is kept untouched so it is still compared exactly.
        return [*trimmed, *columns[-1:]]

    profile_output = check_replace_profile_data(raw_dataset_path, profile)
    actual_columns = profile_output["data_stats"]
    expected_columns = expected_output["data_stats"]
    # Fail with a clear message rather than silently zipping unequal lists.
    assert len(actual_columns) == len(expected_columns)

    # Order-insensitive comparison of mode and categories for non-null columns.
    for actual, expected in zip(actual_columns[:-1], expected_columns[:-1]):
        for key in unordered_keys:
            assert sorted(actual["statistics"][key]) == sorted(
                expected["statistics"][key]
            ), f"{key} mismatch for column {expected['column_name']!r}"

    # Exact comparison of everything else.
    assert {**profile_output, "data_stats": without_unordered(actual_columns)} == {
        **expected_output,
        "data_stats": without_unordered(expected_columns),
    }
