try:
    from typing import TypedDict
except ImportError:  # Fallback for Python versions < 3.8, where typing has no TypedDict
    from typing_extensions import TypedDict

from rapidfuzz import fuzz
import csv
from pathlib import Path
import logging
from kyd_dataspec_gen.config import Config

logger = logging.getLogger(__name__)


class ReferenceData(TypedDict):
    """A single entry of a published reference dataset.

    The keys match the columns of the published CSV file:
    "ref_id", "name", "description" and "sample_values".
    """

    # Published identifier; publish_ref_dataset builds it as the configured
    # ID prefix followed by the entry name.
    ref_id: str
    # Entry name; publish_ref_dataset derives it from "refDataCode" by
    # dropping the first four characters.
    name: str
    # Description text copied from the source reference dataset.
    description: str
    # Example values; stored in the CSV as one semicolon-separated string.
    sample_values: list[str]


class PublishedReferenceDataset(TypedDict):
    """A named collection of published reference data entries."""

    # Dataset name, i.e. the CSV file name without its extension.
    published_ref_name: str
    # The entries of the dataset, one per CSV row.
    published_ref_dataset: list[ReferenceData]


def publish_ref_dataset(
    global_schema: dict,
    ref_dataset_name: str,
    output_dir: str,
    config: Config,
) -> PublishedReferenceDataset:
    """
    Build a published reference dataset from the global schema and save it as CSV.

    Args:
        global_schema (dict): The global schema containing reference datasets under the "referenceDatasets" key.
            Each entry must provide "refDataCode", "description" and "values" (a list of dicts with a "value" key).
        ref_dataset_name (str): The name of the published reference dataset file (without extension).
        output_dir (str): The directory in which the CSV file is written. It is created if missing and may be
            given with or without a trailing path separator.
        config (Config): Configuration object providing `published_ref_id_prefix` and `sample_values_limit`.

    Returns:
        PublishedReferenceDataset: The dataset name together with one entry per reference dataset, each holding
        its ID, name, description and at most `config.sample_values_limit` sample values.

    Notes:
        - An entry's name is its "refDataCode" without the first four characters; its ID is
          `config.published_ref_id_prefix` followed by that name.
        - The file is written to `<output_dir>/<ref_dataset_name>.csv` with the columns
          "ref_id", "name", "description" and "sample_values".
        - Sample values are joined with ";" in the CSV and are not escaped, so a value that itself
          contains ";" will be split into several values when the file is read back.
    """
    output_directory = Path(output_dir)
    # Join with the path separator rather than concatenating strings, so the
    # file lands inside output_dir even when it has no trailing slash.
    output_path = output_directory / f"{ref_dataset_name}.csv"
    logger.info(
        "Publishing reference dataset '%s' to '%s'", ref_dataset_name, output_path
    )

    # Clamp at zero: a negative slice bound would mean "all but the last N"
    # instead of "no samples".
    sample_limit = max(config.sample_values_limit, 0)

    published_ref_dataset = []
    for ref_data in global_schema.get("referenceDatasets", []):
        ref_name = ref_data["refDataCode"][4:]
        published_ref_dataset.append(
            ReferenceData(
                ref_id=config.published_ref_id_prefix + ref_name,
                name=ref_name,
                description=ref_data["description"],
                sample_values=[
                    v["value"] for v in ref_data["values"][:sample_limit]
                ],
            )
        )

    output_directory.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings; without it rows end
    # in "\r\r\n" on Windows.
    with open(output_path, "w", encoding="utf-8", newline="") as csvfile:
        fieldnames = ["ref_id", "name", "description", "sample_values"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {
                "ref_id": ref["ref_id"],
                "name": ref["name"],
                "description": ref["description"],
                "sample_values": ";".join(ref["sample_values"]),
            }
            for ref in published_ref_dataset
        )

    return PublishedReferenceDataset(
        published_ref_name=ref_dataset_name,
        published_ref_dataset=published_ref_dataset,
    )


def add_published_reference(ref_data: dict, published_ref: ReferenceData) -> None:
    """
    Record which published reference dataset a reference data entry maps to.

    Args:
        ref_data (dict): The reference data dictionary to annotate.
        published_ref (ReferenceData): The published entry whose "ref_id" is stored.

    Returns:
        None

    Side Effects:
        Updates 'ref_data' in place: creates its "dataMapping" sub-dictionary when the key is absent and
        sets (or overwrites) "referenceDataset" inside it.
    """
    # Look the identifier up first so that a published_ref without "ref_id"
    # fails before ref_data is touched.
    published_id = published_ref["ref_id"]
    if "dataMapping" not in ref_data:
        ref_data["dataMapping"] = {}
    data_mapping = ref_data["dataMapping"]
    data_mapping["referenceDataset"] = published_id


def read_published_ref_dataset(
    published_ref_dataset_path: str,
) -> PublishedReferenceDataset:
    """
    Reads a published reference dataset from a CSV file.

    Args:
        published_ref_dataset_path (str): The file path to the published reference dataset CSV file.

    Returns:
        PublishedReferenceDataset: The dataset, named after the file (without its extension), with one entry
        per CSV row holding its ID, name, description and sample values.

    Raises:
        KeyError: If a required column is missing from the CSV header.

    Notes:
        - The CSV file is expected to have columns: "ref_id", "name", "description", and "sample_values".
        - The "sample_values" column should contain semicolon-separated values; an empty cell yields an
          empty list.
    """
    # newline="" is what the csv module expects: it keeps line breaks embedded
    # in quoted fields (e.g. multi-line descriptions) intact.
    with open(
        published_ref_dataset_path, "r", encoding="utf-8", newline=""
    ) as csvfile:
        published_ref_dataset = [
            ReferenceData(
                ref_id=row["ref_id"],
                name=row["name"],
                description=row["description"],
                sample_values=(
                    row["sample_values"].split(";") if row["sample_values"] else []
                ),
            )
            for row in csv.DictReader(csvfile)
        ]
    return PublishedReferenceDataset(
        published_ref_name=Path(published_ref_dataset_path).stem,
        published_ref_dataset=published_ref_dataset,
    )


def match_reference_data_to_published_set(
    global_schema: dict,
    published_ref_dataset_path: str,
    config: Config,
) -> dict:
    """
    Matches reference datasets in the global schema to a published reference dataset and updates the schema with data mappings.

    Args:
        global_schema (dict): The data spec schema containing reference datasets under the "referenceDatasets" key.
        published_ref_dataset_path (str): The file path to the published reference dataset CSV file.
        config (Config): Configuration object providing `ref_data_fuzzy_threshold`.

    Returns:
        dict: The same global schema object, with matched reference datasets annotated in place with
        "dataMapping" where applicable.

    Notes:
        - Names are compared case-insensitively; a reference dataset's name is its "refDataCode" without
          the first four characters.
        - An exact name match is final: once found, no later published entry can replace it.
        - Otherwise a published entry matches when `fuzz.ratio` of the two names is strictly greater than
          `config.ref_data_fuzzy_threshold` AND the reference values share at least one value
          (case-insensitive) with the published sample values.
        - If several published entries qualify as fuzzy matches, the last one in file order is kept.
        - Mappings are written through `add_published_reference`.
    """
    published_ref_dataset = read_published_ref_dataset(published_ref_dataset_path)

    # Normalise every published entry once instead of once per schema entry.
    published_candidates = [
        (
            published_ref,
            published_ref["name"].lower(),
            {value.lower() for value in published_ref["sample_values"]},
        )
        for published_ref in published_ref_dataset["published_ref_dataset"]
    ]

    for ref_data in global_schema.get("referenceDatasets", []):
        ref_name = ref_data["refDataCode"][4:].lower()
        # Built lazily: only needed once a name passes the fuzzy threshold.
        ref_values = None

        for published_ref, published_ref_data_name, published_values in published_candidates:
            if ref_name == published_ref_data_name:
                logger.debug(
                    "Exact match found for reference data '%s' with published reference '%s'.",
                    ref_name,
                    published_ref_data_name,
                )
                add_published_reference(ref_data, published_ref)
                # Stop here so a later fuzzy candidate cannot overwrite the
                # exact match.
                break

            if (
                fuzz.ratio(ref_name, published_ref_data_name)
                <= config.ref_data_fuzzy_threshold
            ):
                continue

            if ref_values is None:
                ref_values = {v["value"].lower() for v in ref_data["values"]}
            # Similar names alone are not enough; require shared values too.
            intersection = ref_values & published_values
            if intersection:
                logger.debug(
                    "Fuzzy match found for reference data '%s' with published reference '%s'. Overlapping values: %s",
                    ref_name,
                    published_ref_data_name,
                    intersection,
                )
                add_published_reference(ref_data, published_ref)
    return global_schema
