import copy
from pathlib import Path
from unittest.mock import MagicMock, patch

import polars as pl
import pytest

from kyd_dataspec_gen.config import Config
from kyd_dataspec_gen.models import (
    CompoundPrimaryKey,
    DataSetList,
    DataSetWithCompoundPrimaryKey,
)
from kyd_dataspec_gen.primary_key_detection import (
    detect_compound_primary_key,
    identify_primary_key,
    read_dataset,
    review_primary_key,
    update_primary_key_wo_verification,
    update_primary_key_w_verification,
    verify_compound_primary_keys,
    verify_primary_key_combination,
)

# Directory containing this test module; fixture paths are built relative to it.
_this_file = Path(__file__)
curr_dir = _this_file.parent


def test_read_dataset():
    """Check that read_dataset loads the relationship fixture with shape (2, 8)."""
    fixture_path = curr_dir / "test_data/test_relationship.csv"
    loaded = read_dataset(fixture_path)
    expected_shape = (2, 8)
    assert loaded.shape == expected_shape


@pytest.mark.parametrize(
    (
        "col_stats",
        "col_name",
        "data_type",
        "primary_key_list",
        "expected_result",
        "expected_primary_key_list",
    ),
    [
        pytest.param(
            {"unique_ratio": 1.0, "null_count": 0},
            "node_id",
            "string",
            [],
            True,
            ["node_id"],
            id="An id column with unique values and no nulls, it should be identified as primary key",
        ),
        pytest.param(
            {"unique_ratio": 0.02312, "null_count": 0},
            "sourceID",
            "string",
            [],
            False,
            [],
            id="A column with non-unique values and no nulls, it should not be identified as primary key",
        ),
        pytest.param(
            {"unique_ratio": 0.00012, "null_count": 263324},
            "countries",
            "string",
            [],
            False,
            [],
            id="A column with non-unique values and nulls, it should not be identified as primary key",
        ),
    ],
)
def test_identify_primary_key(
    col_stats,
    col_name,
    data_type,
    primary_key_list,
    expected_result,
    expected_primary_key_list,
):
    """Check identify_primary_key's verdict and its effect on the key list.

    Each case supplies column statistics, a column name and a data type,
    together with an initially empty ``primary_key_list``. The test compares
    the returned value with ``expected_result`` and the list, as it looks
    after the call, with ``expected_primary_key_list``.
    """
    config = Config(identification_column_keyword="_id")
    verdict = identify_primary_key(
        col_stats, col_name, data_type, config, primary_key_list
    )
    assert verdict == expected_result
    # The list passed in is inspected after the call, so any column the
    # function recorded as a primary key must show up here.
    assert primary_key_list == expected_primary_key_list


def test_review_primary_key():
    """Check review_primary_key on a four-dataset schema with mocked helpers.

    ``generate_response`` and ``verify_compound_primary_keys`` are patched in
    ``kyd_dataspec_gen.primary_key_detection``: the former returns an
    unverified ``node_id_start``/``node_id_end`` compound for
    ``test_relationship``, the latter returns the same compound marked as
    verified.

    Asserted outcome:
    * ``officers`` and ``officers_test``: the first column stays the primary
      key, the second one loses ``primaryKey`` and gains ``alternateKey``.
    * ``test_relationship``: ``node_id_start`` and ``node_id_end`` become
      primary keys, ``sourceID`` does not.
    * ``officer_addresses`` only provides ``officer_key`` as a foreign key
      elsewhere in the data source; nothing is asserted on it directly.
    """

    def column(name, primary_key, foreign_key, unique_ratio):
        # Every fixture column starts with alternateKey False and no nulls.
        return {
            "name": name,
            "primaryKey": primary_key,
            "alternateKey": False,
            "foreignKey": foreign_key,
            "dataProfile": {
                "statistics": {
                    "uniqueRatio": unique_ratio,
                    "nullCount": 0,
                }
            },
        }

    def compound_response(verified):
        # Single compound candidate for test_relationship with the given flag.
        return DataSetList(
            data_sets=[
                DataSetWithCompoundPrimaryKey(
                    data_set_name="test_relationship",
                    compound_primary_key=[
                        CompoundPrimaryKey(
                            combination=["node_id_start", "node_id_end"],
                            verified=verified,
                        )
                    ],
                )
            ]
        )

    schema = {
        "name": "icij",
        "datasets": [
            {
                "name": "officers",
                "type": "csv",
                "columns": [
                    column("officer_id", True, False, 1.0),
                    column("officer_name", True, False, 1.0),
                ],
            },
            {
                "name": "test_relationship",
                "type": "csv",
                "columns": [
                    column("node_id_start", False, True, 0.32453),
                    column("sourceID", False, False, 0.38205),
                    column("node_id_end", False, True, 0.02312),
                ],
            },
            {
                "name": "officers_test",
                "type": "csv",
                "columns": [
                    # First pk in the dataset; it appears as a foreign key in
                    # officer_addresses below.
                    column("officer_key", True, False, 1.0),
                    # Not a foreign key anywhere in the data source: expected
                    # to turn from primary key into alternate key.
                    column("officer_name", True, False, 1.0),
                ],
            },
            # This dataset makes officer_key a foreign key in the data source,
            # so officers_test.officer_key should stay a primary key.
            {
                "name": "officer_addresses",
                "type": "csv",
                "columns": [
                    column("officer_key", True, True, 1.0),
                    column("address_id", True, True, 0.8348),
                ],
            },
        ],
    }

    with (
        patch(
            "kyd_dataspec_gen.primary_key_detection.generate_response",
            return_value=compound_response(False),
        ),
        patch(
            "kyd_dataspec_gen.primary_key_detection.verify_compound_primary_keys",
            return_value=compound_response(True),
        ),
    ):
        reviewed_schema = review_primary_key(
            schema,
            MagicMock(),
            Config(identification_column_keyword="_id"),
            True,  # noqa: FBT003 - only for testing purposes
            str(curr_dir / "test_data/"),
        )

    # (dataset index, column index, flag name, expected value)
    expectations = [
        (0, 0, "primaryKey", True),
        (0, 1, "primaryKey", False),
        (0, 0, "alternateKey", False),
        (0, 1, "alternateKey", True),
        (1, 0, "primaryKey", True),
        (1, 1, "primaryKey", False),
        (1, 2, "primaryKey", True),
        (2, 0, "primaryKey", True),
        (2, 1, "primaryKey", False),
        (2, 0, "alternateKey", False),
        (2, 1, "alternateKey", True),
    ]
    for dataset_idx, column_idx, flag, wanted in expectations:
        reviewed_column = reviewed_schema["datasets"][dataset_idx]["columns"][
            column_idx
        ]
        assert reviewed_column[flag] is wanted


# One csv dataset ("test_relationship") in which no column is flagged as a
# primary key. Each tuple below is (column name, foreignKey, uniqueRatio).
dataset_wo_primary_key = [
    {
        "name": "test_relationship",
        "type": "csv",
        "columns": [
            {
                "name": column_name,
                "primaryKey": False,
                "foreignKey": is_foreign_key,
                "dataProfile": {
                    "statistics": {
                        "uniqueRatio": unique_ratio,
                        "nullCount": 0,
                    }
                },
            }
            for column_name, is_foreign_key, unique_ratio in (
                ("officer_id", True, 0.32453),
                ("sourceID", False, 0.38205),
                ("third_party_id", True, 0.02312),
            )
        ],
    },
]
# One csv dataset ("relationships") with officer_id and third_party_id flagged
# as primary keys. Each tuple is (column name, primaryKey, foreignKey,
# uniqueRatio).
expected_global_schema_test_1 = [
    {
        "name": "relationships",
        "type": "csv",
        "columns": [
            {
                "name": column_name,
                "primaryKey": is_primary_key,
                "foreignKey": is_foreign_key,
                "dataProfile": {
                    "statistics": {
                        "uniqueRatio": unique_ratio,
                        "nullCount": 0,
                    }
                },
            }
            for column_name, is_primary_key, is_foreign_key, unique_ratio in (
                ("officer_id", True, True, 0.32453),
                ("sourceID", False, False, 0.38205),
                ("third_party_id", True, True, 0.02312),
            )
        ],
    },
]

@pytest.mark.parametrize(
    ("ai_client", "dataset_wo_primary_key", "expected_result"),
    [
        pytest.param(
            MagicMock(),
            dataset_wo_primary_key,
            # The expectation is spelled out inline rather than through a
            # module-level ``data_set_list``: that name is also bound further
            # down the file for the update tests, and binding it twice made
            # this case depend on decorator evaluation order.
            DataSetList(
                data_sets=[
                    DataSetWithCompoundPrimaryKey(
                        data_set_name="relationships",
                        compound_primary_key=[
                            CompoundPrimaryKey(
                                combination=["officer_id", "third_party_id"],
                                verified=False,
                                validated=False,
                            )
                        ],
                    )
                ]
            ),
            id="AI client provided",
        ),
        pytest.param(
            None,
            dataset_wo_primary_key,
            DataSetList(data_sets=[]),
            id="No AI client provided",
        ),
    ],
)
def test_detect_compound_primary_key(
    ai_client, dataset_wo_primary_key, expected_result
):
    """Check detect_compound_primary_key with and without an AI client.

    ``generate_response`` is patched in
    ``kyd_dataspec_gen.primary_key_detection`` to return one unverified
    ``officer_id``/``third_party_id`` compound for ``relationships``.

    * With a ``MagicMock`` client the result must equal that mocked response.
    * With ``ai_client=None`` the result must be an empty ``DataSetList``.
    """
    mocked_response = DataSetList(
        data_sets=[
            DataSetWithCompoundPrimaryKey(
                data_set_name="relationships",
                compound_primary_key=[
                    CompoundPrimaryKey(
                        combination=["officer_id", "third_party_id"],
                        verified=False,
                        validated=False,
                    )
                ],
            )
        ]
    )
    with patch(
        "kyd_dataspec_gen.primary_key_detection.generate_response"
    ) as mock_generate_response:
        mock_generate_response.return_value = mocked_response
        result = detect_compound_primary_key(ai_client, dataset_wo_primary_key)
        assert result == expected_result


def test_detect_compound_primary_key_empty_response():
    """Check detect_compound_primary_key when generate_response returns ``{}``.

    With the patched ``generate_response`` yielding an empty dict, the
    function is expected to return an empty ``DataSetList``.
    """
    empty_result = DataSetList(data_sets=[])
    with patch(
        "kyd_dataspec_gen.primary_key_detection.generate_response",
        return_value={},
    ):
        detected = detect_compound_primary_key(MagicMock(), dataset_wo_primary_key)
        assert detected == empty_result


# "icij" schema with a single "relationships" dataset in which no column is a
# primary key yet. Each tuple is (column name, foreignKey, uniqueRatio).
global_schema = {
    "name": "icij",
    "datasets": [
        {
            "name": "relationships",
            "type": "csv",
            "columns": [
                {
                    "name": column_name,
                    "primaryKey": False,
                    "foreignKey": is_foreign_key,
                    "dataProfile": {
                        "statistics": {
                            "uniqueRatio": unique_ratio,
                            "nullCount": 0,
                        }
                    },
                }
                for column_name, is_foreign_key, unique_ratio in (
                    ("officer_id", True, 0.32453),
                    ("sourceID", False, 0.38205),
                    ("third_party_id", True, 0.02312),
                    ("link", False, 0.31235),
                )
            ],
        },
    ],
}
# Expected state of the "relationships" schema for
# test_update_primary_key_w_verification: officer_id and third_party_id are
# primary keys. Each tuple is (column name, primaryKey, foreignKey,
# uniqueRatio).
expected_global_schema_test_2 = {
    "name": "icij",
    "datasets": [
        {
            "name": "relationships",
            "type": "csv",
            "columns": [
                {
                    "name": column_name,
                    "primaryKey": is_primary_key,
                    "foreignKey": is_foreign_key,
                    "dataProfile": {
                        "statistics": {
                            "uniqueRatio": unique_ratio,
                            "nullCount": 0,
                        }
                    },
                }
                for column_name, is_primary_key, is_foreign_key, unique_ratio in (
                    ("officer_id", True, True, 0.32453),
                    ("sourceID", False, False, 0.38205),
                    ("third_party_id", True, True, 0.02312),
                    ("link", False, False, 0.31235),
                )
            ],
        },
    ],
}
# Expected state of the "relationships" schema for
# test_update_primary_key_wo_verification: officer_id and third_party_id are
# primary keys. Each tuple is (column name, primaryKey, foreignKey,
# uniqueRatio).
expected_global_schema_test_3 = {
    "name": "icij",
    "datasets": [
        {
            "name": "relationships",
            "type": "csv",
            "columns": [
                {
                    "name": column_name,
                    "primaryKey": is_primary_key,
                    "foreignKey": is_foreign_key,
                    "dataProfile": {
                        "statistics": {
                            "uniqueRatio": unique_ratio,
                            "nullCount": 0,
                        }
                    },
                }
                for column_name, is_primary_key, is_foreign_key, unique_ratio in (
                    ("officer_id", True, True, 0.32453),
                    ("sourceID", False, False, 0.38205),
                    ("third_party_id", True, True, 0.02312),
                    ("link", False, False, 0.31235),
                )
            ],
        },
    ],
}
# Compound-key fixture passed to update_primary_key_w_verification in
# test_update_primary_key_w_verification.
data_set_list = DataSetList(
    data_sets=[
        DataSetWithCompoundPrimaryKey(
            data_set_name="relationships",
            compound_primary_key=[
                # Verified compound: expected_global_schema_test_2 marks both
                # columns of the combination as primary keys.
                CompoundPrimaryKey(
                    combination=["officer_id", "third_party_id"],
                    verified=True,
                ),
            ],
        )
    ]
)
# Compound-key fixture passed to update_primary_key_wo_verification in
# test_update_primary_key_wo_verification.
data_set_list_2 = DataSetList(
    data_sets=[
        DataSetWithCompoundPrimaryKey(
            data_set_name="relationships",
            compound_primary_key=[
                # Compound that is NOT verified. expected_global_schema_test_3
                # still marks both columns as primary keys.
                # NOTE(review): an earlier comment here said both primary keys
                # should be False, which contradicts that expected schema -
                # confirm the intended behavior of the "wo_verification" path.
                CompoundPrimaryKey(
                    combination=["officer_id", "third_party_id"],
                    verified=False,
                ),
            ],
        )
    ]
)
# Independent deep copy of global_schema, created at import time; this is the
# schema object handed to update_primary_key_wo_verification.
global_schema_2 = copy.deepcopy(global_schema)


def test_update_primary_key_w_verification():
    """Check update_primary_key_w_verification against expected_global_schema_test_2.

    The function is called with the module-level ``global_schema`` and the
    verified compound in ``data_set_list``; the ``primaryKey`` flag of each of
    the four ``relationships`` columns is then read back from that same
    ``global_schema`` object and compared with the expected schema.
    """
    update_primary_key_w_verification(data_set_list, global_schema)
    actual_columns = global_schema["datasets"][0]["columns"]
    expected_columns = expected_global_schema_test_2["datasets"][0]["columns"]
    for position in range(4):
        assert (
            actual_columns[position]["primaryKey"]
            is expected_columns[position]["primaryKey"]
        )


def test_update_primary_key_wo_verification():
    """Check update_primary_key_wo_verification against expected_global_schema_test_3.

    The function is called with ``global_schema_2`` (a deep copy of the
    pristine schema) and the unverified compound in ``data_set_list_2``.

    The assertions read the ``primaryKey`` flags from ``global_schema_2``,
    i.e. the object that was actually passed to the function. Earlier they
    inspected ``global_schema`` instead, which is only modified by
    ``test_update_primary_key_w_verification``; the test therefore never
    checked the result of the call under test and depended on test order.
    """
    update_primary_key_wo_verification(data_set_list_2, global_schema_2)
    updated_columns = global_schema_2["datasets"][0]["columns"]
    expected_columns = expected_global_schema_test_3["datasets"][0]["columns"]
    for position, expected_column in enumerate(expected_columns):
        assert (
            updated_columns[position]["primaryKey"]
            is expected_column["primaryKey"]
        )


# Candidate compound keys for the "test_relationship" dataset.
# test_verify_compound_primary_keys passes this object to
# verify_compound_primary_keys and afterwards checks the `verified` flag of
# each candidate on this same object.
potential_compounds = DataSetList(
    data_sets=[
        DataSetWithCompoundPrimaryKey(
            data_set_name="test_relationship",
            compound_primary_key=[
                # node_id_start and node_id_end are a unique combination
                # (expected to end up with verified=True)
                CompoundPrimaryKey(
                    combination=["node_id_start", "node_id_end"],
                    verified=False,
                ),
                # rel_type and link are not a unique combination
                # (expected to stay verified=False)
                CompoundPrimaryKey(
                    combination=["rel_type", "link"],
                    verified=False,
                ),
            ],
        )
    ]
)


def test_verify_compound_primary_keys():
    """Check the ``verified`` flags after verify_compound_primary_keys runs.

    The function receives the ``test_data`` directory, ``dataset_wo_primary_key``
    and the module-level ``potential_compounds``. Its return value is ignored:
    the flags are read back from ``potential_compounds`` itself, where the
    first candidate must be verified and the second must not.
    """
    data_dir = str(curr_dir / "test_data/")
    verify_compound_primary_keys(data_dir, dataset_wo_primary_key, potential_compounds)
    candidates = potential_compounds["data_sets"][0]["compound_primary_key"]
    assert candidates[0]["verified"] is True
    assert candidates[1]["verified"] is False


@pytest.mark.parametrize(
    ("column_names", "expected_result"),
    [
        pytest.param(
            ["node_id_start", "node_id_end"],
            True,
            id="Valid primary key combination",
        ),
        pytest.param(
            ["rel_type", "link"],
            False,
            id="Invalid primary key combination",
        ),
    ],
)
def test_verify_primary_key_combination(column_names, expected_result):
    """Check verify_primary_key_combination on the relationship CSV fixture.

    The fixture is loaded with polars and the function's result for the given
    column combination is compared with ``expected_result``.
    """
    csv_path = curr_dir / "test_data/test_relationship.csv"
    frame = pl.read_csv(str(csv_path))
    outcome = verify_primary_key_combination(frame, column_names)
    assert outcome == expected_result
