from pathlib import Path

import pytest
from google import genai
from mock import patch
from unittest.mock import MagicMock

from kyd_dataspec_gen.config import Config
from kyd_dataspec_gen.data_spec_gen import (
    categorical_check,
    check_correlated_columns,
    create_data_profile_shape,
    create_sampled_categories,
    determine_format,
    dict_to_category_list_count,
    generate_data_spec,
    generate_schema_information,
    get_data_type,
    get_times,
    get_unLikeAbility,
    reuse_current_generated_information,
    save_reference_datasets,
    insert_generated_information,
)
from kyd_dataspec_gen.models import (
    Column,
    DataSet,
    DataSetKey,
    DataSource,
    Relationship,
    DataClassification,
    DataDictionaryMatch,
)

# Directory holding this test module; every fixture path below is anchored to it.
curr_dir = Path(__file__).parent
# Data source name used by the parametrized cases.
t_data_source = "icij"
# Fixture locations, converted to plain strings.
t_profile_dir = str(curr_dir.joinpath("test_data/"))
t_schema_path = str(curr_dir.joinpath("test_data", "test_schema.json"))
# One default configuration and one variant that overrides categorical_limit.
test_config = Config()
test_config_2 = Config(categorical_limit=3)


def _mock_column(
    col_name,
    description=None,
    comment="",
    primary_key=False,
    data_classification=None,
):
    """Build one column entry for the mocked ``generate_schema_information`` result.

    Args:
        col_name: Value stored under ``col_name``.
        description: Value stored under ``description``; falls back to
            ``col_name`` when omitted.
        comment: Value stored under ``comment``.
        primary_key: Value stored under ``primary_key``.
        data_classification: ``DataClassification`` member; falls back to
            ``DataClassification.non_classified`` when omitted.

    Returns:
        A new dict; ``anonymised_samples`` is a fresh empty list on every call
        so entries never share mutable state.
    """
    if data_classification is None:
        data_classification = DataClassification.non_classified
    return {
        "col_name": col_name,
        "description": col_name if description is None else description,
        "comment": comment,
        "primary_key": primary_key,
        "foreign_key": False,
        "data_classification": data_classification,
        "anonymised_samples": [],
        "data_dictionary_match": DataDictionaryMatch.new_missing,
        "proposed_dd_match": None,
    }


@pytest.mark.parametrize(
    (
        "data_source",
        "profile_dir",
        "schema_path",
        "existing_dataspec_path",
        "output_path",
        "expected_schema_path",
    ),
    [
        pytest.param(
            t_data_source,
            t_profile_dir,
            "",
            "",
            f"{curr_dir}/data_spec/icij_data_spec.json",
            "https://schema.kyd.ai/v1/datasource.schema.json",
            id="no schema path is given",
        ),
        pytest.param(
            t_data_source,
            t_profile_dir,
            t_schema_path,
            "",
            f"{curr_dir}/data_spec/icij_data_spec.json",
            str(curr_dir / "test_data" / "test_schema.json"),
            id="schema path is given",
        ),
        pytest.param(
            t_data_source,
            t_profile_dir,
            "",
            str(curr_dir / "test_data" / "icij_data_spec_ori.json"),
            f"{curr_dir}/data_spec/icij_data_spec.json",
            "https://schema.kyd.ai/v1/datasource.schema.json",
            id="existing schema path is given, with samples of node_id in a different order, and one less samples in the name column, the output should have the same node_id samples as the existing but with new name samples",
        ),
    ],
)
def test_generate_data_spec(
    data_source,
    profile_dir,
    schema_path,
    existing_dataspec_path,
    output_path,
    expected_schema_path,
):
    """Test the generate_data_spec function.

    ``generate_schema_information`` and ``google.genai.Client`` are patched,
    so the schema information comes from the canned payload built below.
    The test checks that the output file is written and that the returned
    spec has the expected ``$schema``, name, column count and sample counts.
    """
    # Canned payload returned by the patched generate_schema_information.
    mocked_schema_info = {
        "name": "icij",
        "description": "This data source contains information about offshore entities, "
        "intermediaries, officers, and addresses, along with the relationships "
        "between them, derived from various leaks including the Panama Papers, "
        "Paradise Papers, and Offshore Leaks. ",
        "location_coverage": ["United Kingdom", "United States"],
        "relationships": [],
        "data_sets": [
            {
                "data_set_name": "icij",
                "description": "This dataset contains information about "
                "offshore entities such as companies and trusts.",
                "columns": [
                    _mock_column(
                        "node_id",
                        "Unique identifier for the entity node.",
                        "This column contains unique identifiers for each entity.",
                        primary_key=True,
                    ),
                    _mock_column(
                        "name",
                        "Name of the entity.",
                        "Name of the entity.",
                        data_classification=DataClassification.names,
                    ),
                    _mock_column("jurisdiction"),
                    _mock_column("jurisdiction_description"),
                    _mock_column("country_codes"),
                    _mock_column(
                        "countries",
                        data_classification=DataClassification.addresses,
                    ),
                    _mock_column("incorporation_date"),
                    _mock_column("inactivation_date"),
                    _mock_column("struck_off_date"),
                    _mock_column("closed_date"),
                    _mock_column("ibcRUC"),
                    _mock_column("status"),
                    _mock_column("company_type"),
                    _mock_column("service_provider"),
                    _mock_column("sourceID"),
                    _mock_column("valid_until"),
                    _mock_column("note"),
                ],
            },
        ],
    }

    try:
        # Test with a valid data source and output path
        with (
            patch(
                "kyd_dataspec_gen.data_spec_gen.generate_schema_information"
            ) as mock_generate_info,
            patch("google.genai.Client") as mock_client,
        ):
            mock_client.return_value = MagicMock()
            mock_generate_info.return_value = mocked_schema_info
            data_spec = generate_data_spec(
                data_source=data_source,
                profile_dir=profile_dir,
                config=test_config,
                output_dir=str(curr_dir / "data_spec"),
                schema_path=schema_path,
                existing_dataspec_path=existing_dataspec_path,
            )

            assert Path(output_path).exists()

        assert data_spec["$schema"] == expected_schema_path
        assert data_spec["name"] == data_source
        assert len(data_spec["datasets"][0]["columns"]) == 17
        assert len(data_spec["datasets"][0]["columns"][0]["samples"]) == 4
        assert len(data_spec["datasets"][0]["columns"][1]["samples"]) == 4
    finally:
        # Clean up the generated data specification file even when an
        # assertion above fails: every parametrized case writes to the same
        # path, so a leftover file would let the exists() check of a later
        # run pass without anything having been written.
        Path(output_path).unlink(missing_ok=True)


def _generated_column(
    col_name,
    description,
    comment,
    col_type=None,
    data_classification=None,
):
    """Build one column entry of the ``generated_info`` fixture.

    ``type`` is emitted (directly after ``col_name``) only when ``col_type``
    is given; ``data_classification`` falls back to
    ``DataClassification.non_classified``. All remaining fields carry the
    same fixed values for every column.
    """
    entry = {"col_name": col_name}
    if col_type is not None:
        entry["type"] = col_type
    entry["description"] = description
    entry["comment"] = comment
    entry["primary_key"] = False
    entry["foreign_key"] = False
    entry["data_classification"] = (
        DataClassification.non_classified
        if data_classification is None
        else data_classification
    )
    entry["anonymised_samples"] = []
    entry["data_dictionary_match"] = DataDictionaryMatch.new_missing
    entry["proposed_dd_match"] = None
    return entry


# Fixture: schema information for the "icij" data source with a single
# data set of 17 columns. Only the first four columns carry a "type" entry.
generated_info = {
    "name": "icij",
    "description": "This data source contains information about offshore entities, intermediaries, officers, and addresses, along with the relationships between them, derived from various leaks including the Panama Papers, Paradise Papers, and Offshore Leaks. ",
    "location_coverage": ["United Kingdom", "United States"],
    "relationships": [],
    "data_sets": [
        {
            "data_set_name": "icij",
            "description": "This dataset contains information about offshore entities such as companies and trusts.",
            "relationships": [],
            "columns": [
                _generated_column(
                    "node_id",
                    "Unique identifier for the entity node.",
                    "This column contains unique identifiers for each entity.",
                    col_type="int",
                ),
                _generated_column(
                    "name",
                    "Name of the entity.",
                    "This column has 162851 non-null values (99.99%). 6 values are null (0.01%). There are 161469 unique values (99.15%).",
                    col_type="string",
                    data_classification=DataClassification.names,
                ),
                _generated_column(
                    "jurisdiction",
                    "The code for the jurisdiction where the entity is registered.",
                    "This column has 161363 non-null values (99.08%). 1494 values are null (0.92%). There are 81 unique values (0.05%). Top 3 values are BAH (25.75%), BVI (20.43%), MLT (10.34%).",
                    col_type="string",
                ),
                _generated_column(
                    "jurisdiction_description",
                    "The name of the jurisdiction where the entity is registered.",
                    "This column has 161364 non-null values (99.08%). 1493 values are null (0.92%). There are 68 unique values (0.04%). Top 3 values are Bahamas (25.76%), British Virgin Islands (20.99%), Malta (10.34%).",
                    col_type="string",
                ),
                _generated_column(
                    "country_codes",
                    "ISO 3166-1 alpha-3 country codes associated with the entity, potentially multiple separated by semicolons.",
                    "This column has 386 non-null values (12.91%). 2603 values are null (87.09%). There are 63 unique values (16.32%). Top 3 values are IMN (27.2%), CYM (20.47%), GBR (6.74%). Values are separated by ';'.",
                ),
                _generated_column(
                    "countries",
                    "Full country names associated with the entity, potentially multiple separated by semicolons.",
                    "This column has a high number of missing values (37.97% null). It contains a small number of unique values (0.59% unique among non-nulls). Top 3 frequent values are Malta (17.04%), British Virgin Islands (10.85%), and Hong Kong (7.68%).",
                    data_classification=DataClassification.addresses,
                ),
                _generated_column(
                    "incorporation_date",
                    "The incorporation date for the 'other' node, if applicable.",
                    "This column has a high number of missing values (70.3% null). It contains many unique values (93.92% unique among non-nulls). Date format is DD-MMM-YYYY.",
                ),
                _generated_column(
                    "inactivation_date",
                    "The date the entity became inactive.",
                    "This column has a very high number of missing values (82.22% null). It contains many unique values (17.21% unique among non-nulls). Date format is DD-MMM-YYYY.",
                ),
                _generated_column(
                    "struck_off_date",
                    "The date the entity was struck off the registry.",
                    "This column has a high number of missing values (57.92% null). It contains many unique values (6.61% unique among non-nulls). Date format is DD-MMM-YYYY.",
                ),
                _generated_column(
                    "closed_date",
                    "The closed date for the 'other' node, if applicable.",
                    "This column has a very high number of missing values (95.95% null). It contains many unique values (95.73% unique among non-nulls). Date format is DD-MMM-YYYY.",
                ),
                _generated_column(
                    "ibcRUC",
                    "An identifier related to International Business Companies (IBC) or Registro \u00danico de Contribuyentes (RUC).",
                    "This column has a high number of missing values (30.99% null). It contains many unique values (92.61% unique among non-nulls). Top 3 frequent values are not applicable as it's not a categorical column.",
                ),
                _generated_column(
                    "status",
                    "The status of the intermediary.",
                    "This column has a high number of missing values (53.24% null). It contains a very small number of unique values (0.32% unique among non-nulls). Top 3 frequent values are ACTIVE (57.1%), SUSPENDED (36.5%), and UNRECOVERABLE ACCOUNTS (2.83%).",
                ),
                _generated_column(
                    "company_type",
                    "The type of company or entity.",
                    "This column has a very high number of missing values (82.91% null). It contains a small number of unique values (0.16% unique among non-nulls). Top 3 frequent values are Standard International Company (28.89%), Standard Company under IBC Act (19.49%), and Business Company Limited by Shares (18.08%).",
                ),
                _generated_column(
                    "service_provider",
                    "The service provider associated with the entity.",
                    "This column has a high number of missing values (57.86% null). It contains a very small number of unique values (0.01% unique among non-nulls). Top 3 frequent values are Mossack Fonseca (61.81%), Portcullis Trustnet (17.85%), and Commonwealth Trust Limited (13.14%).",
                ),
                _generated_column(
                    "sourceID",
                    "Identifier for the source dataset the record originated from.",
                    "This column has no missing values (0% null). It contains a very small number of unique values (0.01% unique). Top 3 frequent values are Panama Papers (26.15%), Bahamas Leaks (21.47%), and Offshore Leaks (13.12%).",
                ),
                _generated_column(
                    "valid_until",
                    "Indicates the date until which the source data is considered current.",
                    "This column has a very small number of missing values (less than 1% null). It contains a very small number of unique values (0.03% unique among non-nulls). Top 3 frequent values are The Panama Papers data is current through 2015 (26.15%), The Bahamas Leaks data is current through early 2016. (21.47%), and The Offshore Leaks data is current through 2010 (13.12%).",
                ),
                _generated_column(
                    "note",
                    "Additional notes or comments about the entity.",
                    "This column has a very high number of missing values (95.05% null). It contains a small number of unique values (0.95% unique among non-nulls). Top 3 frequent values are Closed date stands for Cancelled date. (1.07%), Closed date stands for dissolved date. (0.32%), and Closed date stands for Liquidation date. (0.28%).",
                ),
            ],
        }
    ],
}
expected_data_spec = {
    "$schema": "https://schema.kyd.ai/v1/datasource.schema.json",
    "name": "icij",
    "description": "This data source contains information about offshore entities, intermediaries, officers, and addresses, along with the relationships between them, derived from various leaks including the Panama Papers, Paradise Papers, and Offshore Leaks. ",
    "licensing": "Open Data Commons",
    "locationCoverage": ["United Kingdom", "United States"],
    "relationships": [],
    "format": "[\"<class 'pandas.core.frame.DataFrame'>\"]",
    "encoding": None,
    "datasets": [
        {
            "name": "icij",
            "type": "<class 'pandas.core.frame.DataFrame'>",
            "description": "This dataset contains information about offshore entities such as companies and trusts.",
            "code": "icij",
            "columns": [
                {
                    "name": "node_id",
                    "type": "int",
                    "description": "Unique identifier for the entity node.",
                    "comment": "This column contains unique identifiers for each entity.",
                    "primaryKey": True,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "name",
                    "type": "string",
                    "description": "Name of the entity.",
                    "comment": "This column has 162851 non-null values (99.99%). 6 values are null (0.01%). There are 161469 unique values (99.15%).",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.names,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "jurisdiction",
                    "type": "string",
                    "description": "The code for the jurisdiction where the entity is registered.",
                    "comment": "This column has 161363 non-null values (99.08%). 1494 values are null (0.92%). There are 81 unique values (0.05%). Top 3 values are BAH (25.75%), BVI (20.43%), MLT (10.34%).",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "jurisdiction_description",
                    "type": "string",
                    "description": "The name of the jurisdiction where the entity is registered.",
                    "comment": "This column has 161364 non-null values (99.08%). 1493 values are null (0.92%). There are 68 unique values (0.04%). Top 3 values are Bahamas (25.76%), British Virgin Islands (20.99%), Malta (10.34%).",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "country_codes",
                    "description": "ISO 3166-1 alpha-3 country codes associated with the entity, potentially multiple separated by semicolons.",
                    "comment": "This column has 386 non-null values (12.91%). 2603 values are null (87.09%). There are 63 unique values (16.32%). Top 3 values are IMN (27.2%), CYM (20.47%), GBR (6.74%). Values are separated by ';'.",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "countries",
                    "description": "Country names associated with the 'other' node.",
                    "comment": "This column has a high number of missing values (31.02% null). It contains a small number of unique values (0.53% unique among non-nulls). Top 3 frequent values are China (11.79%), Hong Kong (10.05%), and United States (8.78%).",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.addresses,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "incorporation_date",
                    "description": "The date the entity was incorporated.",
                    "comment": "This column has a small number of missing values (3.28% null). It contains many unique values (8.31% unique among non-nulls). Date format is DD-MMM-YYYY.",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "inactivation_date",
                    "description": "The date the entity became inactive.",
                    "comment": "This column has a very high number of missing values (82.22% null). It contains many unique values (17.21% unique among non-nulls). Date format is DD-MMM-YYYY.",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "struck_off_date",
                    "description": "The date the entity was struck off the registry.",
                    "comment": "This column has a high number of missing values (57.92% null). It contains many unique values (6.61% unique among non-nulls). Date format is DD-MMM-YYYY.",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "closed_date",
                    "description": "The date the 'other' node was closed, if applicable.",
                    "comment": "This column has a very high number of missing values (95.95% null). It contains many unique values (95.73% unique among non-nulls). Date format is DD-MMM-YYYY.",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "ibcRUC",
                    "description": "An identifier related to International Business Companies or similar registration.",
                    "comment": "This column has a high number of missing values (30.99% null). It contains many unique values (92.61% unique among non-nulls). Top 3 frequent values are not applicable as it's not a categorical column.",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "status",
                    "description": "The current status of the entity.",
                    "comment": "This column has 71407 non-null values (43.85%). 91450 values are null (56.15%). There are 68 unique values (0.1%). Top 3 values are Active (32.43%), Defaulted (27.96%), Dissolved (6.7%).",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "company_type",
                    "description": "The type of company or entity.",
                    "comment": "This column has 27592 non-null values (16.94%). 135265 values are null (83.06%). There are 51 unique values (0.18%). Top 3 values are Standard International Company (28.51%), Standard Company under IBC Act (18.91%), Business Company Limited by Shares (18.56%).",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "service_provider",
                    "description": "The service provider associated with the entity.",
                    "comment": "This column has 68676 non-null values (42.17%). 94181 values are null (57.83%). There are 4 unique values (0.01%). Top 3 values are Mossack Fonseca (62.07%), Portcullis Trustnet (17.69%), Commonwealth Trust Limited (12.96%).",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "sourceID",
                    "description": "Identifier for the source leak or dataset.",
                    "comment": "This column has 162857 non-null values (100%). 0 values are null (0%). There are 21 unique values (0.01%). Top 3 values are Panama Papers (26.17%), Bahamas Leaks (21.61%), Offshore Leaks (12.93%).",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "valid_until",
                    "description": "Indicates the date until which the source data is considered current.",
                    "comment": "This column has 162812 non-null values (99.97%). 45 values are null (0.03%). There are 41 unique values (0.03%). Top 3 values are The Panama Papers data is current through 2015 (26.17%), The Bahamas Leaks data is current through early 2016. (21.61%), The Offshore Leaks data is current through 2010 (12.93%).",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
                {
                    "name": "note",
                    "description": "Additional notes or comments about the entity.",
                    "comment": "This column has a very high number of missing values (95.05% null). It contains a small number of unique values (0.95% unique among non-nulls). Top 3 frequent values are Closed date stands for Cancelled date. (1.07%), Closed date stands for dissolved date. (0.32%), and Closed date stands for Liquidation date. (0.28%).",
                    "primaryKey": False,
                    "foreignKey": False,
                    "dataClassification": DataClassification.non_classified,
                    "dataDictionaryMatch": {
                        "result": DataDictionaryMatch.new_missing,
                        "proposed_dd_match": None,
                    },
                },
            ],
        }
    ],
}


def test_generate_data_spec_missing_info():
    """Check the column descriptions produced by generate_data_spec.

    ``generate_schema_information`` and ``google.genai.Client`` are patched,
    and the patched generator returns the module-level ``generated_info``.
    The spec is generated for the "icij" data source with
    ``icij_data_spec_ori.json`` passed as the existing data spec.

    Every column of the first generated dataset whose name also appears in
    the first dataset of ``expected_data_spec`` must carry the expected
    description. The test fails if no column name matches at all, so it
    cannot pass without comparing anything.
    """
    with (
        patch(
            "kyd_dataspec_gen.data_spec_gen.generate_schema_information"
        ) as mock_generated_info,
        patch("google.genai.Client") as mock_client,
    ):
        mock_client.return_value = MagicMock()
        mock_generated_info.return_value = generated_info
        data_spec = generate_data_spec(
            data_source="icij",
            profile_dir=t_profile_dir,
            config=test_config,
            output_dir=str(curr_dir),
            schema_path=t_schema_path,
            existing_dataspec_path=str(
                curr_dir / "test_data" / "icij_data_spec_ori.json"
            ),
        )

    expected_columns = expected_data_spec["datasets"][0]["columns"]
    compared = 0
    for col in data_spec["datasets"][0]["columns"]:
        for expected_col in expected_columns:
            if col["name"] != expected_col["name"]:
                continue
            assert col["description"] == expected_col["description"], (
                f"unexpected description for column {col['name']!r}"
            )
            compared += 1

    # Guard against a vacuous pass: without any name overlap the loop above
    # would not execute a single assertion.
    assert compared > 0, (
        "no generated column name matched an expected column; "
        "nothing was compared"
    )


# Fixture 1 for test_create_data_profile_shape: four categories over 8
# samples. The expected shape below holds two pattern entries (7 samples and
# 1 sample) whose likelihoods are sampleSize / 8.
category_count_1 = {"10000001A": 1, "10000002": 2, "10000003": 1, "10000055": 4}
sample_size_1 = 8
expected_data_profile_shape_1 = [
    {"value": "99999999", "statistics": {"sampleSize": 7, "likelihood": 0.875}},
    {"value": "99999999X", "statistics": {"sampleSize": 1, "likelihood": 0.125}},
]

# Fixture 2 for test_create_data_profile_shape: six categories over 10
# samples. The test runs with test_config_2 (categorical_limit=3) and expects
# an empty shape for this input.
category_count_2 = {
    "10000001A": 1,
    "10000002": 2,
    "10000003": 1,
    "10000055": 4,
    "A123": 1,
    "AA23w345": 1,
}
sample_size_2 = 10
expected_data_profile_shape_2 = []


@pytest.mark.parametrize(
    ("category_count", "sample_size", "expected_data_profile_shape"),
    [
        pytest.param(
            category_count_1,
            sample_size_1,
            expected_data_profile_shape_1,
            id="with category count",
        ),
        pytest.param({}, 0, [], id="without category count"),
        pytest.param(
            category_count_2,
            sample_size_2,
            expected_data_profile_shape_2,
            id="After converting categorical count to list, the length of list is over limit, it should return an empty list",
        ),
    ],
)
def test_create_data_profile_shape(
    category_count, sample_size, expected_data_profile_shape
):
    """Compare create_data_profile_shape's result with the expected shape.

    All cases run with ``test_config_2``.
    """
    assert (
        create_data_profile_shape(category_count, sample_size, test_config_2)
        == expected_data_profile_shape
    )


@pytest.mark.parametrize(
    ("category_count", "expected_category_count"),
    [
        pytest.param(
            {
                "10000": 271,
                "10001": 130,
                "10002": 273,
                "10003": 214,
                "10004": 64,
                "10005": 75,
                "10006": 255,
                "10007": 116,
                "10008": 28,
                "10009": 236,
                "10010": 113,
                "10011": 127,
                "10012": 183,
                "10013": 209,
                "10014": 95,
                "10015": 244,
                "10016": 42,
                "10017": 35,
                "10018": 188,
                "10019": 279,
                "10020": 250,
                "10021": 57,
                "10022": 2,
                "10023": 285,
                "10024": 278,
                "10025": 110,
                "10026": 247,
                "10027": 253,
                "10028": 207,
                "10029": 291,
                "10030": 296,
                "10031": 231,
                "10032": 240,
                "10033": 235,
                "10034": 170,
                "10035": 202,
                "10036": 39,
                "10037": 159,
                "10038": 97,
                "10039": 62,
                "10040": 249,
                "10041": 194,
                "10042": 211,
                "10043": 139,
                "10044": 122,
                "10045": 78,
                "10046": 39,
                "10047": 57,
                "10048": 1,
                "10049": 216,
                "10050": 9,
                "10051": 135,
                "10052": 8,
                "10053": 210,
                "10054": 184,
                "10055": 74,
                "10056": 247,
                "10057": 299,
                "10058": 137,
                "10059": 132,
                "10060": 216,
                "10061": 268,
                "10062": 33,
                "10063": 61,
                "10064": 221,
                "10065": 67,
                "10066": 231,
                "10067": 140,
                "10068": 57,
                "10069": 115,
                "10070": 291,
                "10071": 245,
                "10072": 24,
                "10073": 227,
                "10074": 39,
                "10075": 172,
                "10076": 231,
                "10077": 73,
                "10078": 208,
                "10079": 17,
                "10080": 153,
                "10081": 289,
                "10082": 294,
                "10083": 295,
                "10084": 218,
                "10085": 56,
                "10086": 229,
                "10087": 48,
                "10088": 266,
                "10089": 23,
                "10090": 151,
                "10091": 131,
                "10092": 267,
                "10093": 152,
                "10094": 60,
                "10095": 254,
                "10096": 220,
                "10097": 178,
                "10098": 144,
                "10099": 169,
                "10100": 291,
                "10101": 34,
                "10102": 300,
                "10103": 244,
                "10104": 194,
                "10105": 185,
                "10106": 232,
                "10107": 14,
                "10108": 200,
                "10109": 197,
                "10110": 274,
                "10111": 21,
                "10112": 9,
                "10113": 36,
                "10114": 125,
                "10115": 8,
                "10116": 151,
                "10117": 74,
                "10118": 52,
                "10119": 76,
                "10120": 120,
                "10121": 24,
                "10122": 187,
                "10123": 42,
                "10124": 185,
                "10125": 273,
                "10126": 285,
                "10127": 191,
                "10128": 169,
                "10129": 163,
                "10130": 137,
                "10131": 184,
                "10132": 129,
                "10133": 296,
                "10134": 232,
                "10135": 232,
                "10136": 117,
                "10137": 185,
                "10138": 19,
                "10139": 225,
                "10140": 45,
                "10141": 223,
                "10142": 28,
                "10143": 34,
                "10144": 151,
                "10145": 216,
                "10146": 227,
                "10147": 282,
                "10148": 196,
                "10149": 36,
                "10150": 299,
                "10151": 37,
                "10152": 237,
                "10153": 236,
                "10154": 170,
                "10155": 75,
                "10156": 8,
                "10157": 136,
                "10158": 141,
                "10159": 270,
                "10160": 133,
                "10161": 211,
                "10162": 297,
                "10163": 200,
                "10164": 82,
                "10165": 259,
                "10166": 46,
                "10167": 1,
                "10168": 214,
                "10169": 2,
                "10170": 48,
                "10171": 275,
                "10172": 1,
                "10173": 9,
                "10174": 84,
                "10175": 255,
                "10176": 166,
                "10177": 177,
                "10178": 217,
                "10179": 56,
                "10180": 179,
                "10181": 213,
                "10182": 103,
                "10183": 64,
                "10184": 21,
                "10185": 156,
                "10186": 193,
                "10187": 275,
                "10188": 289,
                "10189": 221,
                "10190": 169,
                "10191": 189,
                "10192": 33,
                "10193": 284,
                "10194": 76,
                "10195": 275,
                "10196": 197,
                "10197": 217,
                "10198": 236,
                "10199": 150,
                "10200": 84,
                "10201": 193,
                "10202": 282,
                "10203": 105,
                "10204": 41,
                "10205": 246,
                "10206": 150,
                "10207": 197,
                "10208": 67,
                "10209": 25,
                "10210": 294,
                "10211": 257,
                "10212": 176,
                "10213": 94,
                "10214": 82,
                "10215": 88,
                "10216": 290,
                "10217": 20,
                "10218": 24,
                "10219": 27,
                "10220": 169,
                "10221": 55,
                "10222": 287,
                "10223": 14,
                "10224": 208,
                "10225": 124,
                "10226": 238,
                "10227": 296,
                "10228": 43,
                "10229": 231,
                "10230": 47,
                "10231": 34,
                "10232": 289,
                "10233": 121,
                "10234": 181,
                "10235": 25,
                "10236": 92,
                "10237": 163,
                "10238": 162,
                "10239": 46,
                "10240": 190,
                "10241": 125,
                "10242": 158,
                "10243": 75,
                "10244": 262,
                "10245": 252,
                "10246": 162,
                "10247": 262,
                "10248": 264,
                "10249": 298,
                "10250": 150,
            },
            [
                {
                    "statistics": {
                        "sampleSize": 300,
                    },
                    "value": "10102",
                },
                {
                    "statistics": {
                        "sampleSize": 299,
                    },
                    "value": "10057",
                },
                {
                    "statistics": {
                        "sampleSize": 299,
                    },
                    "value": "10150",
                },
                {
                    "statistics": {
                        "sampleSize": 298,
                    },
                    "value": "10249",
                },
                {
                    "statistics": {
                        "sampleSize": 297,
                    },
                    "value": "10162",
                },
                {
                    "statistics": {
                        "sampleSize": 296,
                    },
                    "value": "10030",
                },
                {
                    "statistics": {
                        "sampleSize": 296,
                    },
                    "value": "10133",
                },
                {
                    "statistics": {
                        "sampleSize": 296,
                    },
                    "value": "10227",
                },
                {
                    "statistics": {
                        "sampleSize": 295,
                    },
                    "value": "10083",
                },
                {
                    "statistics": {
                        "sampleSize": 294,
                    },
                    "value": "10082",
                },
                {
                    "statistics": {
                        "sampleSize": 294,
                    },
                    "value": "10210",
                },
                {
                    "statistics": {
                        "sampleSize": 291,
                    },
                    "value": "10029",
                },
                {
                    "statistics": {
                        "sampleSize": 291,
                    },
                    "value": "10070",
                },
                {
                    "statistics": {
                        "sampleSize": 291,
                    },
                    "value": "10100",
                },
                {
                    "statistics": {
                        "sampleSize": 290,
                    },
                    "value": "10216",
                },
                {
                    "statistics": {
                        "sampleSize": 289,
                    },
                    "value": "10081",
                },
                {
                    "statistics": {
                        "sampleSize": 289,
                    },
                    "value": "10188",
                },
                {
                    "statistics": {
                        "sampleSize": 289,
                    },
                    "value": "10232",
                },
                {
                    "statistics": {
                        "sampleSize": 287,
                    },
                    "value": "10222",
                },
                {
                    "statistics": {
                        "sampleSize": 285,
                    },
                    "value": "10023",
                },
                {
                    "statistics": {
                        "sampleSize": 9,
                    },
                    "value": "10112",
                },
                {
                    "statistics": {
                        "sampleSize": 9,
                    },
                    "value": "10173",
                },
                {
                    "statistics": {
                        "sampleSize": 8,
                    },
                    "value": "10052",
                },
                {
                    "statistics": {
                        "sampleSize": 8,
                    },
                    "value": "10115",
                },
                {
                    "statistics": {
                        "sampleSize": 8,
                    },
                    "value": "10156",
                },
                {
                    "statistics": {
                        "sampleSize": 2,
                    },
                    "value": "10022",
                },
                {
                    "statistics": {
                        "sampleSize": 2,
                    },
                    "value": "10169",
                },
                {
                    "statistics": {
                        "sampleSize": 1,
                    },
                    "value": "10048",
                },
                {
                    "statistics": {
                        "sampleSize": 1,
                    },
                    "value": "10167",
                },
                {
                    "statistics": {
                        "sampleSize": 1,
                    },
                    "value": "10172",
                },
            ],
            id="with over 250 category count",
        ),
        pytest.param({}, [], id="without category count"),
        pytest.param(
            {"10000": 3, "10001": 1, "10002": 5},
            [
                {"statistics": {"sampleSize": 5}, "value": "10002"},
                {"statistics": {"sampleSize": 3}, "value": "10000"},
                {"statistics": {"sampleSize": 1}, "value": "10001"},
            ],
            id="with less than 250 category count",
        ),
        pytest.param(
            {"10000\n": 3, "10001": 1, "10002": 5},
            [
                {"statistics": {"sampleSize": 5}, "value": "10002"},
                {"statistics": {"sampleSize": 3}, "value": "100000x0a"},
                {"statistics": {"sampleSize": 1}, "value": "10001"},
            ],
            id="categories with escape characters",
        ),
    ],
)
def test_create_sampled_categories(category_count, expected_category_count):
    """Compare create_sampled_categories' result with the expected entries.

    Each case supplies a category-to-count mapping and the list of
    ``{"statistics": {"sampleSize": ...}, "value": ...}`` entries expected back.
    """
    assert create_sampled_categories(category_count) == expected_category_count


@pytest.mark.parametrize(
    ("col_name", "data_type", "data_profile_shape", "samples", "expected_col_format"),
    [
        pytest.param(
            "incorporation_date",
            "date",
            [
                {
                    "value": "9999-99-99",
                    "statistics": {"sampleSize": 3, "likelihood": 0.6},
                },
                {
                    "value": "99-99-9999",
                    "statistics": {"sampleSize": 1, "likelihood": 0.2},
                },
                {
                    "value": "99.99.9999",
                    "statistics": {"sampleSize": 1, "likelihood": 0.2},
                },
            ],
            [
                "2023-01-01",
                "2023-11-20",
                "2023-05-14",
                "24-12-2023",
                "12.24.2023",
            ],
            {"dateFormat": {"YYYY-MM-DD": 3, "DD-MM-YYYY": 1, "MM.DD.YYYY": 1}},
            id="The column is a date",
        ),
        pytest.param(
            "node_id",
            "str",
            [
                {
                    "value": "XXX_9999",
                    "statistics": {"sampleSize": 2, "likelihood": 0.6667},
                },
                {
                    "value": "XXX_99999",
                    "statistics": {"sampleSize": 1, "likelihood": 0.3333},
                },
            ],
            ["abd_1234", "udi_5678", "xyz_91011"],
            {},
            id="The id column is a string with separator",
        ),
        pytest.param(
            "note",
            "str",
            [
                {
                    "value": "XXXX XX X XXXXXXXX",
                    "statistics": {"sampleSize": 1, "likelihood": 0.5},
                },
                {
                    "value": "X XXXX XXXXXXXX, XXXX X XXXXX",
                    "statistics": {"sampleSize": 1, "likelihood": 0.5},
                },
            ],
            ["This is a sentence", "A test sentence, with a comma"],
            {},
            id="The column is with separator but as it is a sentence, there should not be a separator in the output.",
        ),
        pytest.param(
            "countries",
            "str",
            [
                {
                    "value": "XXXXXXXX;XXXXXX XXXXXXX",
                    "statistics": {"sampleSize": 1, "likelihood": 0.25},
                },
                {
                    "value": "XXXXXX XXXXXXX, XXXXXX XXXXXX",
                    "statistics": {"sampleSize": 1, "likelihood": 0.25},
                },
                {
                    "value": "XXXXX",
                    "statistics": {"sampleSize": 1, "likelihood": 0.25},
                },
            ],
            [
                "CANADA",
                "GUERNSEY;UNITED KINGDOM",
                "UNITED KINGDOM, UNITED STATES",
            ],
            {"separator": ";"},
            id="The column is a string with separator",
        ),
        pytest.param(
            "note",
            "str",
            [],
            [],
            {},
            id="The profile shape and samples are not provided",
        ),
    ],
)
def test_determine_format(
    col_name, data_type, data_profile_shape, samples, expected_col_format
):
    """Compare determine_format's result with the expected format.

    The result must have the same length as the expected format, and the
    items obtained by iterating both in lockstep must be equal. All cases
    run with ``test_config``.
    """
    actual_format = determine_format(
        col_name, data_type, data_profile_shape, samples, test_config
    )
    assert len(actual_format) == len(expected_col_format)
    # NOTE(review): the expected formats are dicts, and iterating a dict
    # yields only its keys. If determine_format also returns a dict, this
    # loop never compares the values (e.g. the per-format counts or the
    # separator character). Confirm whether a plain
    # `actual_format == expected_col_format` was intended.
    for actual_item, expected_item in zip(
        actual_format, expected_col_format, strict=True
    ):
        assert actual_item == expected_item


@pytest.mark.parametrize(
    ("times_obj", "expected_times_list"),
    [
        pytest.param({}, [], id="empty object"),
        pytest.param(
            {
                "min": 0.0,
                "max": 0.0,
                "sum": 0.0,
                "variance": 0.0,
                "skewness": 0.0001,
                "kurtosis": 0.0001,
                "histogram_and_quantiles": 0.0003,
                "num_zeros": 0.0,
                "num_negatives": 0.0,
            },
            [
                0.0,
                0.0,
                0.0,
                0.0,
                0.0001,
                0.0001,
                0.0003,
                0.0,
                0.0,
            ],
            id="valid object",
        ),
    ],
)
def test_get_times(times_obj, expected_times_list):
    """Compare get_times' result for a timing mapping with the expected list.

    In the populated case the expected list holds the mapping's values in
    the order of its keys; an empty mapping is expected to give ``[]``.
    """
    assert get_times(times_obj) == expected_times_list


@pytest.mark.parametrize(
    ("category_count", "expected_unLikeAbility"),
    [
        pytest.param(
            {
                "10000": 271,
                "10001": 130,
                "10002": 273,
                "10003": 214,
                "10004": 64,
                "10005": 75,
                "10006": 255,
                "10007": 116,
                "10008": 28,
            },
            1426,
            id="with category count",
        ),
        pytest.param({}, 0, id="without category count"),
    ],
)
def test_get_unLikeAbility(category_count, expected_unLikeAbility):
    """Compare get_unLikeAbility's result for a category-count mapping.

    An empty mapping is expected to give 0.
    """
    actual = get_unLikeAbility(category_count)
    assert actual == expected_unLikeAbility


@pytest.mark.parametrize(
    ("col_name", "data_profile_shape", "data_type", "expected_data_type"),
    [
        # No shape information: the inferred type is expected to be kept.
        pytest.param("id", [], "str", "str", id="empty data profile shape"),
        # Shapes that do not look like a date: the type is expected to stay "str".
        pytest.param(
            "id",
            [
                {
                    "value": "99999999",
                    "statistics": {"sampleSize": 7, "likelihood": 0.875},
                },
                {
                    "value": "99999999X",
                    "statistics": {"sampleSize": 1, "likelihood": 0.125},
                },
            ],
            "str",
            "str",
            id="data profile shape with string type",
        ),
        # A single date-like shape: the result is expected to be "date".
        pytest.param(
            "incorporation_date",
            [
                {
                    "value": "9999-99-99",
                    "statistics": {"sampleSize": 3, "likelihood": 1.0},
                }
            ],
            "str",
            "date",
            id="inferred data type is a str, but the pattern in the data profile shape shows it is a date",
        ),
    ],
)
def test_get_data_type(col_name, data_profile_shape, data_type, expected_data_type):
    """Check the data type get_data_type reports for each column fixture."""
    resolved_type = get_data_type(col_name, data_profile_shape, data_type)
    assert resolved_type == expected_data_type


@pytest.mark.parametrize(
    (
        "reference_data_sets",
        "col_name",
        "category_count",
        "total_rows",
        "col_format",
        "expected_ref_name",
        "expected_ref_datasets",
    ),
    [
        # Empty starting list: a new "REF-category" entry is expected.
        pytest.param(
            [],
            "category",
            {"10000001A": 1, "10000002": 2, "10000003": 1, "10000055": 4},
            8,
            {},
            "REF-category",
            [
                {
                    "refDataCode": "REF-category",
                    "description": "List of category",
                    "dataMapping": {"dataMappingName": None},
                    "values": [
                        {
                            "value": "10000001A",
                            "statistics": {"likelihood": 0.125, "sampleSize": 1},
                        },
                        {
                            "value": "10000002",
                            "statistics": {"likelihood": 0.25, "sampleSize": 2},
                        },
                        {
                            "value": "10000003",
                            "statistics": {"likelihood": 0.125, "sampleSize": 1},
                        },
                        {
                            "value": "10000055",
                            "statistics": {"likelihood": 0.5, "sampleSize": 4},
                        },
                    ],
                },
            ],
            id="category count is between 0 and 250",
        ),
        # Existing "REF-countries" entry plus values containing the ";" separator.
        pytest.param(
            [
                {
                    "refDataCode": "REF-countries",
                    "description": "List of countries",
                    "dataMapping": {"dataMappingName": None},
                    "values": [
                        {
                            "value": "PANAMA",
                            "statistics": {"likelihood": 0.125, "sampleSize": 1},
                        },
                    ],
                },
            ],
            "countries",
            {"Guernsey": 1, "Jersey": 2, "Guernsey;United Kingdom": 1},
            4,
            {"separator": ";"},
            "REF-countries",
            [
                {
                    "refDataCode": "REF-countries",
                    "description": "List of countries",
                    "dataMapping": {"dataMappingName": None},
                    "values": [
                        {
                            "value": "PANAMA",
                            "statistics": {"likelihood": 0.125, "sampleSize": 1},
                        },
                        {
                            "value": "Guernsey",
                            "statistics": {"likelihood": 0.5, "sampleSize": 2},
                        },
                        {
                            "value": "Jersey",
                            "statistics": {"likelihood": 0.5, "sampleSize": 2},
                        },
                        {
                            "value": "United Kingdom",
                            "statistics": {"likelihood": 0.25, "sampleSize": 1},
                        },
                    ],
                },
            ],
            id="Some category values have separators",
        ),
        # Existing "REF-category" entry: only the missing value is expected to be added.
        pytest.param(
            [
                {
                    "refDataCode": "REF-category",
                    "description": "List of category",
                    "dataMapping": {"dataMappingName": None},
                    "values": [
                        {
                            "value": "10000001A",
                            "statistics": {"likelihood": 0.125, "sampleSize": 1},
                        },
                        {
                            "value": "10000002",
                            "statistics": {"likelihood": 0.25, "sampleSize": 2},
                        },
                        {
                            "value": "10000003",
                            "statistics": {"likelihood": 0.125, "sampleSize": 1},
                        },
                    ],
                },
            ],
            "category",
            {"10000001A": 1, "10000002": 2, "10000003": 1, "10000055": 4},
            8,
            {},
            "REF-category",
            [
                {
                    "refDataCode": "REF-category",
                    "description": "List of category",
                    "dataMapping": {"dataMappingName": None},
                    "values": [
                        {
                            "value": "10000001A",
                            "statistics": {"likelihood": 0.125, "sampleSize": 1},
                        },
                        {
                            "value": "10000002",
                            "statistics": {"likelihood": 0.25, "sampleSize": 2},
                        },
                        {
                            "value": "10000003",
                            "statistics": {"likelihood": 0.125, "sampleSize": 1},
                        },
                        {
                            "value": "10000055",
                            "statistics": {"likelihood": 0.5, "sampleSize": 4},
                        },
                    ],
                },
            ],
            id="refDataCode is already present",
        ),
        # No categories at all: no name and no reference data set expected.
        pytest.param(
            [],
            "note",
            {},
            0,
            {},
            None,
            [],
            id="category count is 0",
        ),
        # 251 distinct categories (keys "10000" to "10250"): nothing is expected
        # to be saved.
        pytest.param(
            [],
            "category",
            {
                "10000": 271,
                "10001": 130,
                "10002": 273,
                "10003": 214,
                "10004": 64,
                "10005": 75,
                "10006": 255,
                "10007": 116,
                "10008": 28,
                "10009": 236,
                "10010": 113,
                "10011": 127,
                "10012": 183,
                "10013": 209,
                "10014": 95,
                "10015": 244,
                "10016": 42,
                "10017": 35,
                "10018": 188,
                "10019": 279,
                "10020": 250,
                "10021": 57,
                "10022": 2,
                "10023": 285,
                "10024": 278,
                "10025": 110,
                "10026": 247,
                "10027": 253,
                "10028": 207,
                "10029": 291,
                "10030": 296,
                "10031": 231,
                "10032": 240,
                "10033": 235,
                "10034": 170,
                "10035": 202,
                "10036": 39,
                "10037": 159,
                "10038": 97,
                "10039": 62,
                "10040": 249,
                "10041": 194,
                "10042": 211,
                "10043": 139,
                "10044": 122,
                "10045": 78,
                "10046": 39,
                "10047": 57,
                "10048": 1,
                "10049": 216,
                "10050": 9,
                "10051": 135,
                "10052": 8,
                "10053": 210,
                "10054": 184,
                "10055": 74,
                "10056": 247,
                "10057": 299,
                "10058": 137,
                "10059": 132,
                "10060": 216,
                "10061": 268,
                "10062": 33,
                "10063": 61,
                "10064": 221,
                "10065": 67,
                "10066": 231,
                "10067": 140,
                "10068": 57,
                "10069": 115,
                "10070": 291,
                "10071": 245,
                "10072": 24,
                "10073": 227,
                "10074": 39,
                "10075": 172,
                "10076": 231,
                "10077": 73,
                "10078": 208,
                "10079": 17,
                "10080": 153,
                "10081": 289,
                "10082": 294,
                "10083": 295,
                "10084": 218,
                "10085": 56,
                "10086": 229,
                "10087": 48,
                "10088": 266,
                "10089": 23,
                "10090": 151,
                "10091": 131,
                "10092": 267,
                "10093": 152,
                "10094": 60,
                "10095": 254,
                "10096": 220,
                "10097": 178,
                "10098": 144,
                "10099": 169,
                "10100": 291,
                "10101": 34,
                "10102": 300,
                "10103": 244,
                "10104": 194,
                "10105": 185,
                "10106": 232,
                "10107": 14,
                "10108": 200,
                "10109": 197,
                "10110": 274,
                "10111": 21,
                "10112": 9,
                "10113": 36,
                "10114": 125,
                "10115": 8,
                "10116": 151,
                "10117": 74,
                "10118": 52,
                "10119": 76,
                "10120": 120,
                "10121": 24,
                "10122": 187,
                "10123": 42,
                "10124": 185,
                "10125": 273,
                "10126": 285,
                "10127": 191,
                "10128": 169,
                "10129": 163,
                "10130": 137,
                "10131": 184,
                "10132": 129,
                "10133": 296,
                "10134": 232,
                "10135": 232,
                "10136": 117,
                "10137": 185,
                "10138": 19,
                "10139": 225,
                "10140": 45,
                "10141": 223,
                "10142": 28,
                "10143": 34,
                "10144": 151,
                "10145": 216,
                "10146": 227,
                "10147": 282,
                "10148": 196,
                "10149": 36,
                "10150": 299,
                "10151": 37,
                "10152": 237,
                "10153": 236,
                "10154": 170,
                "10155": 75,
                "10156": 8,
                "10157": 136,
                "10158": 141,
                "10159": 270,
                "10160": 133,
                "10161": 211,
                "10162": 297,
                "10163": 200,
                "10164": 82,
                "10165": 259,
                "10166": 46,
                "10167": 1,
                "10168": 214,
                "10169": 2,
                "10170": 48,
                "10171": 275,
                "10172": 1,
                "10173": 9,
                "10174": 84,
                "10175": 255,
                "10176": 166,
                "10177": 177,
                "10178": 217,
                "10179": 56,
                "10180": 179,
                "10181": 213,
                "10182": 103,
                "10183": 64,
                "10184": 21,
                "10185": 156,
                "10186": 193,
                "10187": 275,
                "10188": 289,
                "10189": 221,
                "10190": 169,
                "10191": 189,
                "10192": 33,
                "10193": 284,
                "10194": 76,
                "10195": 275,
                "10196": 197,
                "10197": 217,
                "10198": 236,
                "10199": 150,
                "10200": 84,
                "10201": 193,
                "10202": 282,
                "10203": 105,
                "10204": 41,
                "10205": 246,
                "10206": 150,
                "10207": 197,
                "10208": 67,
                "10209": 25,
                "10210": 294,
                "10211": 257,
                "10212": 176,
                "10213": 94,
                "10214": 82,
                "10215": 88,
                "10216": 290,
                "10217": 20,
                "10218": 24,
                "10219": 27,
                "10220": 169,
                "10221": 55,
                "10222": 287,
                "10223": 14,
                "10224": 208,
                "10225": 124,
                "10226": 238,
                "10227": 296,
                "10228": 43,
                "10229": 231,
                "10230": 47,
                "10231": 34,
                "10232": 289,
                "10233": 121,
                "10234": 181,
                "10235": 25,
                "10236": 92,
                "10237": 163,
                "10238": 162,
                "10239": 46,
                "10240": 190,
                "10241": 125,
                "10242": 158,
                "10243": 75,
                "10244": 262,
                "10245": 252,
                "10246": 162,
                "10247": 262,
                "10248": 264,
                "10249": 298,
                "10250": 150,
            },
            39359,
            {},
            None,
            [],
            id="category count is over 250",
        ),
    ],
)
def test_save_reference_datasets(
    reference_data_sets,
    col_name,
    category_count,
    total_rows,
    col_format,
    expected_ref_name,
    expected_ref_datasets,
):
    """Check save_reference_datasets for each reference-data scenario.

    Two things are verified per case: the reference name the call returns,
    and the content of ``reference_data_sets`` after the call. The list is
    compared after the call because the cases expect it to be updated in
    place.
    """
    returned_ref_name = save_reference_datasets(
        reference_data_sets,
        col_name,
        category_count,
        total_rows,
        col_format,
        test_config,
    )
    assert returned_ref_name == expected_ref_name
    assert reference_data_sets == expected_ref_datasets


@pytest.mark.parametrize(
    ("col_format", "category_count", "total_rows", "expected_list"),
    [
        # "10000:A" is expected to contribute to both "10000" and "A", so "A"
        # ends up with 271 + 1 = 272 samples.
        pytest.param(
            {"separator": ":"},
            {
                "10000:A": 271,
                "A": 1,
                "10001": 130,
                "10002": 273,
                "10003": 214,
                "10004": 64,
                "10005": 75,
                "10006": 255,
                "10007": 116,
                "10008": 28,
            },
            1500,
            [
                {
                    "value": "10000",
                    "statistics": {"likelihood": 0.1807, "sampleSize": 271},
                },
                {
                    "value": "A",
                    "statistics": {"likelihood": 0.1813, "sampleSize": 272},
                },
                {
                    "value": "10001",
                    "statistics": {"likelihood": 0.0867, "sampleSize": 130},
                },
                {
                    "value": "10002",
                    "statistics": {"likelihood": 0.182, "sampleSize": 273},
                },
                {
                    "value": "10003",
                    "statistics": {"likelihood": 0.1427, "sampleSize": 214},
                },
                {
                    "value": "10004",
                    "statistics": {"likelihood": 0.0427, "sampleSize": 64},
                },
                {
                    "value": "10005",
                    "statistics": {"likelihood": 0.05, "sampleSize": 75},
                },
                {
                    "value": "10006",
                    "statistics": {"likelihood": 0.17, "sampleSize": 255},
                },
                {
                    "value": "10007",
                    "statistics": {"likelihood": 0.0773, "sampleSize": 116},
                },
                {
                    "value": "10008",
                    "statistics": {"likelihood": 0.0187, "sampleSize": 28},
                },
            ],
            id="separator splits values and merges their counts",
        ),
        # Tab, carriage return and newline are expected to appear as
        # "0x09", "0x0d" and "0x0a" in the output values.
        pytest.param(
            {},
            {
                "10000A": 271,
                "A\t\r": 1,
                "10001\n": 130,
            },
            402,
            [
                {
                    "value": "10000A",
                    "statistics": {"likelihood": 0.6741, "sampleSize": 271},
                },
                {
                    "value": "A0x090x0d",
                    "statistics": {"likelihood": 0.0025, "sampleSize": 1},
                },
                {
                    "value": "100010x0a",
                    "statistics": {"likelihood": 0.3234, "sampleSize": 130},
                },
            ],
            id="control characters are written as hex codes",
        ),
    ],
)
def test_dict_to_category_list_count(
    col_format, category_count, total_rows, expected_list
):
    """Test the dict_to_category_list_count function.

    Each case supplies a column format, a mapping of category value to count
    and the total row count, and checks the resulting list of
    ``{"value", "statistics"}`` entries.
    """
    result = dict_to_category_list_count(col_format, category_count, total_rows)
    assert result == expected_list


def test_check_correlated_columns():
    """Check that check_correlated_columns annotates related columns in place.

    "address" and "address_line_1" are expected to reference each other via
    a ``correlatedColumns`` entry, while "name" and "postcode" are expected
    to be left without one.
    """

    def build_column(name, sampled, correlated=None):
        # `sampled` is a list of (text, sample_size) pairs; `correlated` is
        # only attached when given so untouched columns carry no such key.
        column = {
            "name": name,
            "dataProfile": {
                "statistics": {
                    "sampledCategories": [
                        {"values": text, "statistics": {"sampleSize": size}}
                        for text, size in sampled
                    ],
                }
            },
        }
        if correlated is not None:
            column["correlatedColumns"] = correlated
        return column

    # (column name, sampled categories, expected correlated columns or None)
    fixtures = [
        ("address", [("36 Avenue, London", 2)], ["address_line_1"]),
        ("address_line_1", [("36 Avenue", 2)], ["address"]),
        ("name", [("ABC Limited", 1)], None),
        ("postcode", [("CR3 0AE", 1), ("SW2 9AC", 1)], None),
    ]
    column_list = [build_column(name, sampled) for name, sampled, _ in fixtures]
    expected_column_list = [
        build_column(name, sampled, correlated)
        for name, sampled, correlated in fixtures
    ]

    check_correlated_columns(column_list)
    assert column_list == expected_column_list


@pytest.mark.parametrize(
    (
        "inferred_categorical",
        "col_data_type",
        "col_format",
        "data_profile_shape",
        "unique_count",
        "total_rows",
        "expected_categorical",
    ),
    [
        # Short single-word shapes, 2 unique values out of 90 rows.
        pytest.param(
            True,
            "str",
            {},
            [
                {
                    "value": "XXX",
                    "statistics": {"sampleSize": 143296, "likelihood": 0.8884},
                },
                {
                    "value": "XX",
                    "statistics": {"sampleSize": 15002, "likelihood": 0.093},
                },
            ],
            2,
            90,
            True,
            id="Inferred categorical is True, column is not a date or text, unique count < 50% of total rows, output True.",
        ),
        # Same long shapes as the next case, but the column type is "text".
        pytest.param(
            True,
            "text",
            {},
            [
                {
                    "value": "XXX X X XXX X XX XXX XXXX XXXXX XXX XX",
                    "statistics": {"sampleSize": 1418, "likelihood": 0.7088},
                },
                {
                    "value": "XXX XX X XXX X XX XXX XXXX XXXXX XXX X",
                    "statistics": {"sampleSize": 403, "likelihood": 0.2012},
                },
                {
                    "value": "XXX XX X XXX X XX XXX XXXX XXXXX XXX XXXX",
                    "statistics": {"sampleSize": 180, "likelihood": 0.09},
                },
            ],
            11,
            90,
            False,
            id="Inferred categorical is True, column is text, output False.",
        ),
        # Every shape has eleven words; 11 unique values out of 12 rows.
        pytest.param(
            True,
            "str",
            {},
            [
                {
                    "value": "XXX X X XXX X XX XXX XXXX XXXXX XXX XX",
                    "statistics": {"sampleSize": 1418, "likelihood": 0.7088},
                },
                {
                    "value": "XXX XX X XXX X XX XXX XXXX XXXXX XXX X",
                    "statistics": {"sampleSize": 403, "likelihood": 0.2012},
                },
                {
                    "value": "XXX XX X XXX X XX XXX XXXX XXXXX XXX XXXX",
                    "statistics": {"sampleSize": 180, "likelihood": 0.09},
                },
            ],
            11,
            12,
            False,
            id="Inferred categorical is True, column is str, all word counts > 10, unique count is over 50% of total rows, output False.",
        ),
        # First shape has only six words; 11 unique values out of 20 rows.
        pytest.param(
            True,
            "str",
            {},
            [
                {
                    "value": "XXX X X XXX X XX",
                    "statistics": {"sampleSize": 1418, "likelihood": 0.7088},
                },
                {
                    "value": "XXX XX X XXX X XX XXX XXXX XXXXX XXX X",
                    "statistics": {"sampleSize": 403, "likelihood": 0.2012},
                },
                {
                    "value": "XXX XX X XXX X XX XXX XXXX XXXXX XXX XXXX",
                    "statistics": {"sampleSize": 180, "likelihood": 0.09},
                },
            ],
            11,
            20,
            False,
            id="Inferred categorical is True, column is str, not all word counts > 10, unique count > 50% of total rows, output False.",
        ),
    ],
)
def test_categorical_check(
    inferred_categorical,
    col_data_type,
    col_format,
    data_profile_shape,
    unique_count,
    total_rows,
    expected_categorical,
):
    """Check the categorical flag categorical_check returns for each fixture.

    All cases start from ``inferred_categorical=True`` and vary the column
    data type, the profile shapes and the unique-count to row-count ratio.
    """
    is_categorical = categorical_check(
        inferred_categorical,
        col_data_type,
        col_format,
        data_profile_shape,
        unique_count,
        total_rows,
    )
    assert is_categorical == expected_categorical


@pytest.mark.parametrize(
    ("global_schema, existing_schema, expected_global_schema"),
    [
        pytest.param(
            {
                "name": "icij",
                "description": "TEST ICIJ data source",
                "datasets": [
                    {
                        "name": "icij",
                        "description": "ICIJ dataset",
                        "columns": [
                            {
                                "name": "node_id",
                                "description": "Unique identifier",
                                "comment": "This column contains unique identifiers for each entity.",
                                "samples": [
                                    "10000001",
                                    "10000055",
                                    "10000002",
                                    "10000003",
                                ],
                            }
                        ],
                    },
                ],
            },
            {
                "name": "icij",
                "description": "ICIJ data source",
                "datasets": [
                    {
                        "name": "icij",
                        "description": "ICIJ dataset",
                        "columns": [
                            {
                                "name": "node_id",
                                "description": "Unique identifier",
                                "comment": "This column contains unique identifiers for each entity.",
                                "samples": [
                                    "10000055",
                                    "10000001",
                                    "10000002",
                                    "10000003",
                                ],
                            }
                        ],
                    },
                ],
            },
            {
                "name": "icij",
                "description": "ICIJ data source",
                "datasets": [
                    {
                        "name": "icij",
                        "description": "ICIJ dataset",
                        "columns": [
                            {
                                "name": "node_id",
                                "description": "Unique identifier",
                                "comment": "This column contains unique identifiers for each entity.",
                                "samples": [
                                    "10000055",
                                    "10000001",
                                    "10000002",
                                    "10000003",
                                ],
                            }
                        ],
                    },
                ],
            },
            id="Different data source descriptions and same samples in different orders in global and existing schemas, should reuse current generated information and in same order as the existing global schema",
        ),
        pytest.param(
            {
                "name": "icij",
                "description": "ICIJ data source",
                "datasets": [
                    {
                        "name": "icij",
                        "description": "ICIJ dataset",
                        "columns": [
                            {
                                "name": "node_id",
                                "description": "Unique identifier",
                                "comment": "This column contains unique identifiers for each entity.",
                                "samples": ["10000055", "10000002", "10000003"],
                            }
                        ],
                    },
                ],
            },
            {
                "name": "icij",
                "description": "ICIJ data source",
                "datasets": [
                    {
                        "name": "icij",
                        "description": "ICIJ dataset",
                        "columns": [
                            {
                                "name": "node_id",
                                "description": "Unique identifier",
                                "comment": "This column contains unique identifiers for each entity.",
                                "samples": [
                                    "10000055",
                                    "10000001",
                                    "10000002",
                                    "10000003",
                                ],
                            }
                        ],
                    },
                ],
            },
            {
                "name": "icij",
                "description": "ICIJ data source",
                "datasets": [
                    {
                        "name": "icij",
                        "description": "ICIJ dataset",
                        "columns": [
                            {
                                "name": "node_id",
                                "description": "Unique identifier",
                                "comment": "This column contains unique identifiers for each entity.",
                                "samples": ["10000055", "10000002", "10000003"],
                            }
                        ],
                    },
                ],
            },
            id="Different samples in global and existing schemas, should keep samples in the new global schema",
        ),
    ],
)
def test_reuse_current_generated_information(
    global_schema, existing_schema, expected_global_schema
):
    """Check the samples left in the global schema after reuse.

    ``reuse_current_generated_information`` is called for its effect on
    ``global_schema``; its return value is ignored. Only the ``samples`` of
    the first column of the first dataset are compared with the expectation.
    """
    reuse_current_generated_information(global_schema, existing_schema)

    # Pull both sample lists out first so the assertion reads as a plain
    # list comparison.
    actual_samples = global_schema["datasets"][0]["columns"][0]["samples"]
    wanted_samples = expected_global_schema["datasets"][0]["columns"][0]["samples"]
    assert actual_samples == wanted_samples


def test_generate_schema_information():
    """Check that generate_schema_information hands back the generated response.

    ``kyd_dataspec_gen.data_spec_gen.generate_response`` and
    ``google.genai.Client`` are both replaced with mocks. The mocked
    ``generate_response`` is configured to return a prepared ``DataSource``,
    and the test asserts that ``generate_schema_information`` returns an
    equal object.
    """

    def make_column(name, description, comment, primary_key):
        # A fresh dict (and fresh nested lists) per call, so the two columns
        # never share mutable objects.
        return {
            "name": name,
            "description": description,
            "comment": comment,
            "samples": ["10000001", "10000055", "10000002", "10000003"],
            "primaryKey": primary_key,
            "foreignKey": False,
            "dataProfile": {
                "statistics": {"sampledCategories": [], "samples": []}
            },
        }

    # Input schema: two datasets with one column each.
    address_dataset = {
        "name": "address",
        "description": "",
        "columns": [
            make_column(
                "node_id",
                "Unique identifier",
                "This column contains unique identifiers for each entity.",
                True,
            )
        ],
    }
    relationships_dataset = {
        "name": "relationships",
        "description": "Defines relationships between datasets.",
        "columns": [
            make_column("officer_id", "officer identifier", "70% unique", False)
        ],
    }
    global_schema = {
        "name": "icij",
        "description": "",
        "datasets": [address_dataset, relationships_dataset],
    }

    # Response object the mocked generate_response will return.
    node_id_column = Column(
        col_name="node_id",
        description="Unique identifier",
        comment="100% unique",
        foreign_key=False,
        data_classification=DataClassification.non_classified,
        anonymised_samples=[],
        data_dictionary_match=DataDictionaryMatch.new_missing,
    )
    officer_id_column = Column(
        col_name="officer_id",
        description="",
        comment="",
        foreign_key=True,
        data_classification=DataClassification.individual_identifier_data,
        anonymised_samples=[],
        data_dictionary_match=DataDictionaryMatch.new_missing,
    )
    officer_to_address = Relationship(
        source=DataSetKey(data_set_name="relationships", key="officer_id"),
        target=DataSetKey(data_set_name="address", key="node_id"),
        cardinality="1:M",
        type="foreign_key",
    )
    expected_generated_response = DataSource(
        name="icij",
        description="ICIJ data source",
        location_coverage=["USA", "UK"],
        data_sets=[
            DataSet(
                data_set_name="address",
                description="Address dataset",
                columns=[node_id_column],
            ),
            DataSet(
                data_set_name="relationships",
                description="",
                columns=[officer_id_column],
            ),
        ],
        relationships=[officer_to_address],
    )

    prompt = (
        "generate descriptions for the data source, each data set and column, "
        "then identify the foreign keys and the relationships between the datasets"
    )

    with patch(
        "kyd_dataspec_gen.data_spec_gen.generate_response"
    ) as fake_generate_response:
        with patch("google.genai.Client") as fake_client_cls:
            fake_client_cls.return_value = MagicMock()
            fake_generate_response.return_value = expected_generated_response

            # genai.Client is patched at this point, so this yields the mock.
            result = generate_schema_information(
                global_schema,
                prompt,
                genai.Client(),
            )
            assert result == expected_generated_response


def test_insert_generated_information():
    """Check how insert_generated_information merges generated data into a schema.

    The function is called with anonymised samples enabled and is expected to
    leave ``global_schema`` equal to ``expected_schema``. In that expectation
    the ``ssn`` column carries the generated anonymised samples, while the
    ``name`` column (whose generated ``anonymised_samples`` list is empty)
    keeps its original samples.
    """
    # Immutable masters; every use below takes a fresh list copy so the input
    # schema, the generated info and the expectation never share a list.
    customer_names = (
        "Timothy Sullivan",
        "Lucas Ware",
        "Joseph Brown",
        "Daniel Carter",
        "Nathan Perry",
    )
    real_ssns = (
        "449-70-7628",
        "036-36-8320",
        "456-34-3564",
        "180-23-1808",
    )
    masked_ssns = (
        "683-98-2211",
        "735-30-2476",
        "072-28-1686",
        "894-66-0485",
    )

    def blank_profile():
        # New nested dict per call to avoid aliasing between columns.
        return {
            "statistics": {
                "sampledCategories": [],
                "categories": [],
            }
        }

    def new_missing_match():
        return {
            "result": "New/Missing",
            "potentialElementMatch": None,
        }

    # Schema before generation: descriptions mostly empty, no classification.
    global_schema = {
        "name": "customer",
        "description": "Data specification for customer",
        "locationCoverage": [],
        "referenceDatasets": [],
        "relationships": [],
        "datasets": [
            {
                "name": "customer",
                "description": "",
                "columns": [
                    {
                        "name": "name",
                        "description": "",
                        "comment": "",
                        "samples": list(customer_names),
                        "foreignKey": False,
                        "dataClassification": None,
                        "dataProfile": blank_profile(),
                    },
                    {
                        "name": "ssn",
                        "description": "Social Security Number",
                        "comment": "",
                        "foreignKey": False,
                        "dataClassification": None,
                        "samples": list(real_ssns),
                        "dataProfile": blank_profile(),
                    },
                ],
            }
        ],
    }

    # Generated information to be inserted into the schema.
    generated_name_column = Column(
        col_name="name",
        description="Customer Name",
        comment="This column contains the names of the customers.",
        foreign_key=False,
        data_classification=DataClassification.individual_identifier_data,
        anonymised_samples=[],
        data_dictionary_match=DataDictionaryMatch.new_missing,
        proposed_dd_match=None,
    )
    generated_ssn_column = Column(
        col_name="ssn",
        description="Social Security Number",
        comment="This column contains social security numbers for each entity.",
        foreign_key=True,
        data_classification=DataClassification.sensitive_data,
        anonymised_samples=list(masked_ssns),
        data_dictionary_match=DataDictionaryMatch.new_missing,
        proposed_dd_match=None,
    )
    generated_info = DataSource(
        name="customer",
        description="Data about customers",
        location_coverage=["USA", "UK"],
        data_sets=[
            DataSet(
                data_set_name="customer",
                description="This dataset contains customer information.",
                columns=[generated_name_column, generated_ssn_column],
            )
        ],
        relationships=[],
    )

    # Schema expected after the generated information has been inserted.
    expected_schema = {
        "name": "customer",
        "description": "Data about customers",
        "locationCoverage": ["USA", "UK"],
        "referenceDatasets": [],
        "relationships": [],
        "datasets": [
            {
                "name": "customer",
                "description": "This dataset contains customer information.",
                "columns": [
                    {
                        "name": "name",
                        "description": "Customer Name",
                        "comment": "This column contains the names of the customers.",
                        "foreignKey": False,
                        "dataClassification": DataClassification.individual_identifier_data.value,
                        "dataDictionaryMatching": new_missing_match(),
                        "samples": list(customer_names),
                        "dataProfile": blank_profile(),
                    },
                    {
                        "name": "ssn",
                        "description": "Social Security Number",
                        "comment": "This column contains social security numbers for each entity.",
                        "foreignKey": True,
                        "dataClassification": DataClassification.sensitive_data.value,
                        "dataDictionaryMatching": new_missing_match(),
                        "samples": list(masked_ssns),
                        "dataProfile": blank_profile(),
                    },
                ],
            }
        ],
    }

    # Third positional argument switches anonymised samples on.
    use_anonymised_samples = True
    insert_generated_information(
        global_schema, generated_info, use_anonymised_samples
    )
    assert global_schema == expected_schema
