import logging
from pathlib import Path

import polars as pl
from google import genai
from google.genai.types import GenerateContentConfigDict

from kyd_dataspec_gen.config import Config
from kyd_dataspec_gen.models import DataSetList
from kyd_dataspec_gen.setup_ai import generate_response
from kyd_dataspec_gen.utils import read_dataset

logger = logging.getLogger(__name__)


def identify_primary_key(
    col_stats: dict,
    col_name: str,
    data_type: str,
    config: Config,
    primary_key_list: list,
) -> bool:
    """
    Decide whether a column qualifies as a primary key and record its name if it does.

    A column qualifies when it has no nulls, its data type is 'string' or 'int', and either
    its 'unique_ratio' statistic equals 1.0 or its lower-cased name contains
    ``config.identification_column_keyword``.

    Args:
        col_stats (dict): Column statistics; 'unique_ratio' and 'null_count' are read, both defaulting to 0.
        col_name (str): The name of the column.
        data_type (str): The data type of the column (e.g., 'string', 'int').
        config (Config): Configuration object providing 'identification_column_keyword'.
        primary_key_list (list): Mutated in place; the column name is appended when it qualifies.

    Returns:
        bool: True if the column was identified as a primary key, False otherwise.
    """
    logger.debug(f"Identifying primary key for column: {col_name}")
    uniqueness = col_stats.get("unique_ratio", 0)
    missing_values = col_stats.get("null_count", 0)
    logger.debug(
        f"col_name: {col_name}, unique_ratio: {uniqueness}, null_count: {missing_values}, data_type: {data_type}"
    )

    # Both detection rules share these two requirements.
    key_capable = missing_values == 0 and data_type in ("string", "int")

    # Rule 1: every value is distinct.
    if uniqueness == 1.0 and key_capable:
        logger.debug(
            f"Column {col_name} is identified as a primary key based on statistics."
        )
        primary_key_list.append(col_name)
        return True

    # Rule 2: the column name carries the identification keyword.
    keyword_in_name = config.identification_column_keyword in col_name.lower()
    if not (keyword_in_name and key_capable):
        return False

    logger.debug(
        f"Column {col_name} is identified as a primary key based on its name."
    )
    primary_key_list.append(col_name)
    return True


def _demote_surplus_primary_keys(
    dataset: dict,
    pk_columns: list,
    foreign_keys: set,
    keyword: str,
) -> None:
    """Demote surplus primary-key columns of one dataset to alternate keys, in place.

    Args:
        dataset (dict): The dataset the columns belong to (only its 'name' is read, for logging).
        pk_columns (list): The column dicts of the dataset currently flagged as primary keys.
        foreign_keys (set): Names of all columns flagged as foreign keys anywhere in the schema.
        keyword (str): The identification-column keyword matched against lower-cased column names.
    """
    # Only primary-key columns are inspected here: an identification column that is not
    # itself a primary key must not cause every real candidate to be demoted.
    id_pk_exists = any(keyword in column["name"].lower() for column in pk_columns)
    fk_pk_kept = False
    for column in pk_columns:
        if id_pk_exists:
            # Identification columns win; every other candidate becomes an alternate key.
            keep = keyword in column["name"].lower()
        else:
            # Keep only the first candidate that is also a foreign key. The state must
            # live outside the per-column iteration, otherwise every foreign-key-backed
            # candidate would be kept.
            keep = not fk_pk_kept and column["name"] in foreign_keys
            fk_pk_kept = fk_pk_kept or keep
        if keep:
            continue
        column["primaryKey"] = False
        column["alternateKey"] = True
        logger.debug(
            f"Column {column['name']} in dataset {dataset['name']} is set as an alternate key."
        )


def review_primary_key(
    global_schema: dict,
    ai_client: genai.Client | None,
    config: Config,
    verify_primary_key: bool = False,  # noqa: FBT001, FBT002 - allows the caller to easily enable or disable verification logic without requiring additional configuration or complex parameter structures
    raw_data_folder: str = "",
) -> dict:
    """
    Reviews and updates the primary key status of columns in each dataset within the global schema.

    For a dataset with more than one column marked as a primary key:
        - If at least one of those primary-key columns has the identification keyword in its name,
          only the keyword-bearing columns stay primary keys.
        - Otherwise only the first primary-key column that is also marked as a foreign key
          somewhere in the schema stays a primary key.
        - Every other primary-key column is switched to an alternate key.

    Datasets with no primary key at all are passed to the AI to suggest compound primary keys,
    which are then written back to the schema, optionally after verification against the raw data.

    NOTE(review): a dataset whose candidates are all demoted ends up without a primary key and
    is not passed to the AI — confirm whether that is intended.

    Args:
        global_schema (dict): The schema containing datasets and their columns. Modified in place.
        ai_client (genai.Client | None): The AI client used for generating suggestions. If None, detection is skipped.
        config (Config): Configuration object providing 'identification_column_keyword'.
        verify_primary_key (bool): If True, the function will verify the generated primary keys against the raw data.
            Defaults to False.
        raw_data_folder (str): The folder containing raw dataset files. Used for verification if verify_primary_key is True.

    Returns:
        dict: The updated global schema with primary key assignments reviewed and modified as necessary.
    """
    # A set gives constant-time membership checks for the foreign-key lookup.
    foreign_keys = {
        column["name"]
        for dataset in global_schema["datasets"]
        for column in dataset["columns"]
        if column.get("foreignKey", False)
    }

    dataset_wo_primary_key = []
    for dataset in global_schema["datasets"]:
        pk_columns = [
            column for column in dataset["columns"] if column.get("primaryKey", False)
        ]
        if not pk_columns:
            logger.debug(
                f"No primary key found in dataset {dataset['name']}. Passing data to AI for review."
            )
            dataset_wo_primary_key.append(dataset)
        elif len(pk_columns) > 1:
            logger.debug(
                f"Multiple primary keys found in dataset {dataset['name']}. Reviewing the primary keys."
            )
            _demote_surplus_primary_keys(
                dataset,
                pk_columns,
                foreign_keys,
                config.identification_column_keyword,
            )

    if not dataset_wo_primary_key:
        return global_schema

    logger.debug(
        f"{[dataset['name'] for dataset in dataset_wo_primary_key]} datasets without primary keys found."
    )
    generated_pk_compounds = detect_compound_primary_key(
        ai_client,
        dataset_wo_primary_key,
    )
    # Nothing to apply when the AI was unavailable or returned no suggestions.
    if not generated_pk_compounds.get("data_sets", []):
        return global_schema

    if verify_primary_key:
        generated_pk_compounds = verify_compound_primary_keys(
            raw_data_folder,
            dataset_wo_primary_key,
            generated_pk_compounds,
        )
        update_primary_key_w_verification(generated_pk_compounds, global_schema)
    else:
        update_primary_key_wo_verification(generated_pk_compounds, global_schema)
    return global_schema


def update_primary_key_wo_verification(
    generated_pk_compounds: DataSetList, global_schema: dict
) -> None:
    """
    Updates the global schema with the generated primary key compounds without validation.

    For every suggested dataset, the first (most likely) combination is taken and each
    schema column named in it is flagged as a primary key. Suggestions without any
    combination are skipped.

    Args:
        generated_pk_compounds (DataSetList): List of dictionaries containing dataset names and their compound primary keys.
        global_schema (dict): The schema containing datasets and their columns. Modified in place.

    Returns:
        None
    """
    logger.info(
        "Updating global schema with primary keys from AI detection without verification."
    )
    for r_dataset in generated_pk_compounds["data_sets"]:
        # The AI is asked for "up to three" combinations, so the list may be empty.
        combinations = r_dataset.get("compound_primary_key") or []
        if not combinations:
            logger.warning(
                f"No compound primary key suggested for dataset {r_dataset['data_set_name']}. Skipping."
            )
            continue
        # The top combination does not depend on the column, so resolve it once.
        top_columns = combinations[0]["combination"]
        for dataset in global_schema["datasets"]:
            if dataset["name"] != r_dataset["data_set_name"]:
                continue
            for col in dataset["columns"]:
                if col["name"] in top_columns:
                    # The column in the combination is confirmed as a primary key
                    col["primaryKey"] = True
                    logger.debug(
                        f"Column {col['name']} in dataset {dataset['name']} is identified as a primary key."
                    )


def update_primary_key_w_verification(
    generated_pk_compounds: DataSetList, global_schema: dict
) -> None:
    """
    Updates the global schema with the generated primary key compounds, honouring verification.

    For every suggested dataset, the first (most likely) combination is taken. Each schema
    column named in it is then handled according to the combination's 'verified' flag:
        - True: the column is flagged as a primary key.
        - False: the column is flagged as not being a primary key.
        - missing or None (verification did not run): the column is left unchanged.

    Suggestions without any combination are skipped.

    Args:
        generated_pk_compounds (DataSetList): List of dictionaries containing dataset names and their compound primary keys.
        global_schema (dict): The schema containing datasets and their columns. Modified in place.

    Returns:
        None
    """
    logger.info("Updating global schema with verified primary keys from AI detection.")
    for r_dataset in generated_pk_compounds["data_sets"]:
        # The AI is asked for "up to three" combinations, so the list may be empty.
        combinations = r_dataset.get("compound_primary_key") or []
        if not combinations:
            logger.warning(
                f"No compound primary key suggested for dataset {r_dataset['data_set_name']}. Skipping."
            )
            continue
        # The top combination does not depend on the column, so resolve it once.
        top_combination = combinations[0]
        # 'verified' may be absent when verification was skipped (e.g. raw file missing).
        verified = top_combination.get("verified")
        for dataset in global_schema["datasets"]:
            if dataset["name"] != r_dataset["data_set_name"]:
                continue
            for col in dataset["columns"]:
                if col["name"] not in top_combination["combination"]:
                    continue
                if verified is None:
                    # Keep the primary key status if no validation and verification are done
                    logger.debug(
                        f"Column {col['name']} in dataset {dataset['name']} was not verified; keeping its current primary key status."
                    )
                elif verified:
                    # The column in the combination is confirmed as a primary key
                    col["primaryKey"] = True
                    logger.debug(
                        f"Column {col['name']} in dataset {dataset['name']} is identified and verified as a primary key."
                    )
                else:
                    # The column in the combination is validated but doesn't seem to be a correct primary key
                    col["primaryKey"] = False
                    logger.debug(
                        f"Column {col['name']} in dataset {dataset['name']} is not verified as a primary key."
                    )


def detect_compound_primary_key(
    ai_client: genai.Client | None,
    dataset_wo_primary_key: list,
) -> DataSetList:
    """
    Ask the AI for candidate compound primary keys for datasets that have no primary key.

    The prompt requests up to three column combinations per dataset, ordered from most to
    least likely, with larger combinations prioritised when the top one is a subset of a
    less likely one. The datasets are appended to the prompt via ``str()``.

    Args:
        ai_client (genai.Client | None): The AI client used to generate suggestions. If falsy, detection is skipped.
        dataset_wo_primary_key (list): The datasets (without primary keys) to analyze.

    Returns:
        DataSetList: The AI response when it contains 'data_sets'; otherwise an empty DataSetList.
    """
    if not ai_client:
        logger.warning(
            "Gemini AI client is not set up, skipping compound primary key detection."
        )
        return DataSetList(data_sets=[])

    # Request a JSON answer shaped like DataSetList.
    request_config: GenerateContentConfigDict = {
        "response_mime_type": "application/json",
        "response_schema": DataSetList,
    }
    # NOTE(review): the second and third sentences are joined without a separating space.
    instructions = (
        "The following datasets lack a unique, non-null column identified as a primary key. "
        "For each dataset, review all columns and suggest up to three likely combinations of columns that could serve as compound primary keys, ordered from most to least likely."
        "Reorder them and prioritise the combination with more columns if the top combination is a subset of the less likely combinations: "
    )

    logger.debug("Detecting compound primary keys using Gemini AI")
    suggestions: DataSetList = generate_response(
        ai_client,
        instructions + str(dataset_wo_primary_key),
        request_config,
    )
    if "data_sets" not in suggestions:
        logger.debug("No compound primary keys detected.")
        return DataSetList(data_sets=[])

    logger.debug(f"Possible compound primary keys: {suggestions}")
    return suggestions


def verify_compound_primary_keys(
    raw_data_folder: str,
    dataset_wo_primary_key: list,
    potential_compounds: DataSetList,
) -> DataSetList:
    """
    Check the AI-suggested compound primary keys against the raw data files.

    For each dataset without a primary key that has a matching entry in ``potential_compounds``,
    the raw file ``<raw_data_folder>/<name>.<type>`` is loaded and every suggested combination
    is tested for uniqueness. The outcome is stored in the combination's 'verified' field.
    Entries whose raw file does not exist are logged and left untouched.

    Args:
        raw_data_folder (str): The folder containing raw dataset files.
        dataset_wo_primary_key (list): List of dataset metadata dictionaries without primary keys.
        potential_compounds (DataSetList): The potential compound primary key combinations. Modified in place.

    Returns:
        DataSetList: The same DataSetList, updated with verification results.
    """
    if not potential_compounds.get("data_sets", None):
        logger.warning(
            "No potential compound primary keys found. Skipping verification."
        )
        return potential_compounds

    for dataset in dataset_wo_primary_key:
        for candidate in potential_compounds["data_sets"]:
            if dataset["name"] != candidate["data_set_name"]:
                continue

            file_name = ".".join((dataset["name"], dataset["type"]))
            raw_dataset_path = Path(raw_data_folder) / file_name
            if not raw_dataset_path.exists():
                logger.error(
                    f"Raw dataset file {raw_dataset_path} does not exist. Skipping verification."
                )
                continue

            frame = read_dataset(raw_dataset_path)
            logger.debug(
                f"Verifying compound primary keys for dataset {dataset['name']}"
            )
            for key_option in candidate["compound_primary_key"]:
                is_unique = verify_primary_key_combination(
                    frame,
                    key_option["combination"],
                )
                # Record the outcome first, then report it.
                key_option["verified"] = bool(is_unique)
                if is_unique:
                    logger.debug(
                        f"Compound primary key verified for dataset {dataset['name']}: {key_option['combination']}"
                    )
                else:
                    logger.debug(
                        f"This compound primary key is not valid for dataset {dataset['name']}: {key_option['combination']}"
                    )
    return potential_compounds


def verify_primary_key_combination(
    data_frame: pl.DataFrame, column_names: list[str]
) -> bool:
    """
    Test whether the given columns together identify every row of the DataFrame uniquely.

    The number of distinct rows over ``column_names`` is compared with the total row count.

    Args:
        data_frame (pl.DataFrame): The DataFrame to check.
        column_names (list[str]): List of column names to test as a primary key.

    Returns:
        bool: True if the distinct-row count over the columns equals the total row count, False otherwise.
    """
    distinct_count = data_frame.n_unique(subset=column_names)
    row_count = data_frame.height
    logger.debug(
        f"Total rows: {row_count}, Unique rows with subset {column_names}: {distinct_count}"
    )
    return distinct_count == row_count
