from kyd_dataspec_gen.match_reference_datasets import (
    publish_ref_dataset,
    add_published_reference,
    ReferenceData,
    read_published_ref_dataset,
    match_reference_data_to_published_set,
)
import pytest
from pathlib import Path
from kyd_dataspec_gen.config import Config

# Config built with no arguments; passed to the functions under test that
# take a config parameter.
test_config = Config()
# Directory containing this test module.
curr_dir = Path(__file__).parent
# Fixture directory next to this module; tests read "test_published_ref.csv"
# from here.
test_data_dir = curr_dir / "test_data"

# Module-level schema fixture with two reference datasets (REF-001, REF-002),
# each carrying a description and two sample values. It is the input that
# test_publish_ref_dataset hands to publish_ref_dataset.
global_schema = {
    "referenceDatasets": [
        {
            "refDataCode": "REF-001",
            "description": "Sample reference dataset 1",
            "values": [{"value": "Sample 1A"}, {"value": "Sample 1B"}],
        },
        {
            "refDataCode": "REF-002",
            "description": "Sample reference dataset 2",
            "values": [{"value": "Sample 2A"}, {"value": "Sample 2B"}],
        },
    ]
}


def test_publish_ref_dataset():
    """Test the publish_ref_dataset function.

    Publishes the module-level ``global_schema`` and checks both the CSV
    written to disk and the returned structure.

    The output CSV is removed before the call and again in a ``finally``
    clause. The first removal ensures a leftover file from an earlier failed
    run cannot satisfy the existence check; the second ensures a failing
    assertion does not leave the file behind.
    """
    # NOTE(review): this path is relative to the current working directory,
    # so the test presumably expects pytest to be started from the repository
    # root — confirm.
    output_dir = "tests/test_data/"
    output_csv = Path(output_dir) / "published_reference_dataset.csv"

    # Drop any stale artifact so exists() below only passes if this call
    # actually wrote the file.
    if output_csv.exists():
        output_csv.unlink()

    try:
        published_ref = publish_ref_dataset(
            global_schema, "published_reference_dataset", output_dir, test_config
        )
        assert output_csv.exists()
        assert published_ref["published_ref_name"] == "published_reference_dataset"
        assert len(published_ref["published_ref_dataset"]) == 2
        assert published_ref["published_ref_dataset"][0]["ref_id"] == "PREF-001"
        assert published_ref["published_ref_dataset"][1]["ref_id"] == "PREF-002"
    finally:
        # Clean up the test file even when an assertion above fails.
        if output_csv.exists():
            output_csv.unlink()


@pytest.mark.parametrize(
    ("ref_data", "published_ref"),
    [
        (
            {
                "refDataCode": "REF-001",
                "description": "Sample reference dataset 1",
                "dataMapping": {"dataMappingName": None},
                "values": [{"value": "Sample 1A"}, {"value": "Sample 1B"}],
            },
            ReferenceData(
                ref_id="PREF-001",
                name="REF-001",
                description="Sample reference dataset 1",
                sample_values=["Sample 1A", "Sample 1B"],
            ),
        ),
        (
            {
                "refDataCode": "REF-001",
                "description": "Sample reference dataset 1",
                "values": [{"value": "Sample 1A"}, {"value": "Sample 1B"}],
            },
            ReferenceData(
                ref_id="PREF-001",
                name="REF-001",
                description="Sample reference dataset 1",
                sample_values=["Sample 1A", "Sample 1B"],
            ),
        ),
    ],
    ids=["With existing dataMapping", "Without existing dataMapping"],
)
def test_add_published_reference(ref_data, published_ref):
    """Check that the published ref id is recorded under ``dataMapping``.

    Covers an entry that already carries a ``dataMapping`` key and one that
    does not; in both cases ``dataMapping["referenceDataset"]`` must equal
    the published reference id after the call.
    """
    add_published_reference(ref_data, published_ref)
    mapping = ref_data["dataMapping"]
    assert mapping["referenceDataset"] == "PREF-001"


def test_read_published_ref_dataset():
    """Check the name and ordered ref ids loaded from the fixture CSV."""
    csv_file = test_data_dir / "test_published_ref.csv"
    loaded = read_published_ref_dataset(str(csv_file))

    assert loaded["published_ref_name"] == "test_published_ref"

    entries = loaded["published_ref_dataset"]
    assert len(entries) == 3

    # Order matters: each position must hold the listed ref id.
    expected_ids = ("PREF-Title", "PREF-EmployeeID", "PREF-CategoryList")
    for position, expected_id in enumerate(expected_ids):
        assert entries[position]["ref_id"] == expected_id


def test_match_reference_data_to_published_set():
    """Check which schema entries get linked to the published CSV fixture.

    Three entries are matched against ``test_published_ref.csv``: the first
    must stay unlinked, the second and third must receive the expected
    published ref ids.
    """
    # Fuzzy-match candidate whose values do not overlap; expected to stay
    # without a "referenceDataset" link.
    employee_entry = {
        "refDataCode": "REF-Employee_id",
        "dataMapping": {"dataMappingName": None},
        "description": "Sample reference dataset 1",
        "values": [{"value": "Sample 1A"}, {"value": "Sample 1B"}],
    }
    # Direct-match candidate.
    title_entry = {
        "refDataCode": "REF-Title",
        "description": "Sample reference dataset 2",
        "values": [{"value": "Sample 2A"}, {"value": "Sample 2B"}],
    }
    # Name is deliberately similar but not identical, to exercise fuzzy
    # matching.
    category_entry = {
        "refDataCode": "REF-Category_list",
        "description": "Sample reference dataset Categories",
        "values": [{"value": "Bikes"}, {"value": "Clothing"}],
    }
    schema = {"referenceDatasets": [employee_entry, title_entry, category_entry]}

    published_csv = test_data_dir / "test_published_ref.csv"
    result = match_reference_data_to_published_set(
        schema, str(published_csv), test_config
    )

    matched = result["referenceDatasets"]
    assert "referenceDataset" not in matched[0]["dataMapping"]
    assert matched[1]["dataMapping"]["referenceDataset"] == "PREF-Title"
    assert matched[2]["dataMapping"]["referenceDataset"] == "PREF-CategoryList"
