import argparse
import json
import logging
from pathlib import Path

from kyd_dataspec_gen.config import get_config, Config
from kyd_dataspec_gen.create_dataset_report import render_jinja
from kyd_dataspec_gen.data_profiler import profile_data
from kyd_dataspec_gen.data_spec_gen import generate_data_spec
from kyd_dataspec_gen.logging_config import setup_logging

# Run the project's logging setup once at import time, then create the
# module-level logger used by every function in this file.
setup_logging()
logger = logging.getLogger(__name__)
# Directory containing this file; its parent is where main() looks for the
# default config.toml.
current_dir = Path(__file__).parent


def cli_profile_data(
    data_source: str,
    data_type: str,
    raw_data_folder: str,
    output_dir: str,
) -> bool:
    """
    Validates the profiling inputs and runs the data profiler on a raw data folder.

    Args:
        data_source (str): The source of the data (e.g., 'icij', 'dnb').
        data_type (str): The type of input data (e.g., 'csv', 'json', 'avro', 'txt', 'parquet').
        raw_data_folder (str): The folder containing the raw data files to profile.
        output_dir (str): The directory handed to the profiler as its output location.
            There is no default; callers must always supply it.

    Returns:
        bool: True when profiling ran, False when the inputs were rejected
            (missing data type / raw data folder, or a non-existent folder).
    """
    # Both values come from optional CLI flags, so either may be None or empty.
    if not raw_data_folder or not data_type:
        logger.error(
            "To automatically run data profiling, "
            "--raw-data-folder and --data-type must be provided."
        )
        return False

    if not Path(raw_data_folder).exists():
        logger.error(
            f"The specified raw data folder does not exist: {raw_data_folder}."
        )
        return False

    # Log only after validation so the message never reports missing (None) inputs.
    logger.info(f"Profiling data from {raw_data_folder} of type {data_type}")

    profile_data(
        data_source=data_source,
        data_type=data_type,
        output_dir=output_dir,
        input_dir=raw_data_folder,
    )

    # Report the directory that was actually handed to profile_data. The
    # previous message appended "profile/<data_source>" again, although the
    # CLI callers in this file already pass a path ending in that suffix.
    logger.info(
        f"Data profiling completed successfully. Profile saved to {output_dir}."
    )
    return True


def cli_data_spec_gen(
    data_source: str,
    profile_dir: str,
    config: Config,
    output_dir: str,
    existing_dataspec_path: str = "",
    verify_primary_key: bool = False,  # noqa: FBT001, FBT002 - Passing verify_primary_key to review_primary_key function and allowing the caller to easily enable or disable verification logic without requiring additional configuration or complex parameter structures
    full_data_profiling: bool = False,  # noqa: FBT001, FBT002 - Passing full_data_profiling to check_replace_profile_data function and allowing the caller to easily enable or disable full data profiling without requiring additional configuration or complex parameter structures
    raw_data_folder: str = "",
    enable_anonymisation: bool = False,  # noqa: FBT001, FBT002 - Passing enable_anonymisation to anonymise_samples function and allowing the caller to easily enable or disable anonymisation without requiring additional configuration or complex parameter structures
    published_ref_dataset_path: str = "",
    publish_new_ref_dataset: list | None = None,
    data_dictionary_path: str = "",
):
    """
    Validates the options and generates a data specification from data profiles.

    Args:
        data_source (str): The source of the data (e.g., 'icij', 'dnb').
        profile_dir (str): The directory containing the data profile files.
        config (Config): Configuration settings for data specification generation.
        output_dir (str): Base output directory. A "data_spec/" sub-directory is
            appended to it before it is passed to generate_data_spec.
        existing_dataspec_path (str): Path to an existing data specification file to update. Defaults to "".
        verify_primary_key (bool): Whether to verify primary key detection. Requires raw_data_folder. Defaults to False.
        full_data_profiling (bool): Whether to enable full data profiling. Requires raw_data_folder. Defaults to False.
        raw_data_folder (str): The folder containing the raw data files, if needed for verification. Defaults to "".
        enable_anonymisation (bool): Whether to anonymise sensitive data in the profiling and data specification generation steps. Defaults to False.
        published_ref_dataset_path (str): Path to a published reference dataset file, passed through to generate_data_spec. Defaults to "".
        publish_new_ref_dataset (list | None): Options for publishing a new reference dataset, passed through
            to generate_data_spec unchanged (the CLI supplies a name and an output directory). Defaults to None.
        data_dictionary_path (str): Path to a data dictionary file; when given, the file must exist. Defaults to "".

    Returns:
        Whatever generate_data_spec returns, or None when validation of the
        options fails (an error is logged in that case).
    """
    # Guard clauses: each option below needs extra input that may be missing.
    if verify_primary_key and not raw_data_folder:
        logger.error("To verify primary keys, --raw-data-folder must be provided.")
        return None

    if full_data_profiling and not raw_data_folder:
        logger.error(
            "To enable full data profiling, --raw-data-folder must be provided."
        )
        return None

    if data_dictionary_path and not Path(data_dictionary_path).exists():
        logger.error(
            f"The specified data dictionary file does not exist: {data_dictionary_path}."
        )
        return None

    logger.info("Generating data specification from data profiles")

    # Computed once so the success message reports the same directory that is
    # handed to generate_data_spec (the old message logged output_dir instead).
    spec_output_dir = f"{output_dir}/data_spec/"

    data_spec = generate_data_spec(
        data_source=data_source,
        profile_dir=profile_dir,
        config=config,
        output_dir=spec_output_dir,
        existing_dataspec_path=existing_dataspec_path,
        verify_primary_key=verify_primary_key,
        full_data_profiling=full_data_profiling,
        raw_data_folder=raw_data_folder,
        enable_anonymisation=enable_anonymisation,
        published_ref_dataset_path=published_ref_dataset_path,
        publish_new_ref_dataset=publish_new_ref_dataset,
        data_dictionary_path=data_dictionary_path,
    )

    logger.info(
        f"Data specification generated successfully. Data spec saved to {spec_output_dir}."
    )
    return data_spec


def _build_arg_parser() -> argparse.ArgumentParser:
    """Builds the argument parser for the kyd_dataspec_gen command line."""
    parser = argparse.ArgumentParser(
        prog="kyd_dataspec_gen",
        description="Generates Data Specification files",
    )

    def file_choices(choices: list[str], fname: str) -> str:
        """
        Validates that the given filename has an extension present in the provided list of choices.

        Args:
            choices (list[str]): Allowed file extensions, lower-case and including the dot (e.g., ['.csv', '.json']).
            fname (str): The filename to validate.

        Returns:
            str: The validated filename if its extension is in the list of choices.

        Raises:
            SystemExit: If the file extension is not in the list of choices, triggers a parser error and exits.
        """
        # Compare case-insensitively so "data.CSV" is accepted like "data.csv".
        ext = Path(fname).suffix.lower()
        if ext not in choices:
            parser.error(f"File must be in one of the following formats {choices}")
        return fname

    parser.add_argument(
        "-r",
        "--run-script",
        required=True,
        type=str,
        choices=["full", "profile_data", "data_spec_gen", "report_gen"],
        help="Specify the script to execute: full script, data specification and report generation, or report generation only",
    )
    parser.add_argument(
        "-ds",
        "--data-source",
        type=str,
        required=True,
        help="The data source to profile (e.g., icij, dnb)",
    )
    parser.add_argument(
        "-dt",
        "--data-type",
        type=str,
        choices=["csv", "json", "avro", "txt", "parquet"],
        help="The type of input data to profile (e.g., csv, json, avro, txt, parquet)",
    )
    parser.add_argument(
        "-v",
        "--verify-primary-key",
        action="store_true",
        help="Verify primary key detection, if applicable. Raw data folder path should also be provided if this is opted. Only runs when specified and there are datasets without any primary key and a compound primary key is generated.",
    )
    parser.add_argument(
        "-rd",
        "--raw-data-folder",
        type=str,
        help="The folder containing the raw data files to profile",
    )
    parser.add_argument(
        "-f",
        "--full-data-profiling",
        action="store_true",
        help="Enable full data profiling, which checks the output of data profiler and re-profile datasets that do not have complete data statistics in the data spec generation step.",
    )
    parser.add_argument(
        "-a",
        "--enable-anonymisation",
        action="store_true",
        help="Enable anonymisation of sensitive data in the profiling and data specification generation steps.",
    )
    parser.add_argument(
        "-pr",
        "--published-ref-dataset-path",
        type=lambda p: file_choices([".csv"], p),
        help="The path to the published reference dataset file in CSV format to match against reference datasets in the data spec generation step.",
    )
    parser.add_argument(
        "-nr",
        "--publish-new-ref-dataset",
        nargs="*",
        metavar=("NEW_REF_NAME", "OUTPUT_DIR_PATH"),
        type=str,
        # None means the flag was not given; an empty list means it was given
        # without values. Both cases are resolved after parsing.
        default=None,
        help="Publish new reference dataset from the global schema. Specify the new reference dataset name and the output directory path to publish the new reference dataset CSV file (default: published_reference_dataset, output/published_reference_dataset/).",
    )
    parser.add_argument(
        "-dd",
        "--data-dictionary-path",
        type=lambda p: file_choices([".csv"], p),
        help="The path to the data dictionary file in CSV format to match against columns in the data spec generation step.",
    )
    parser.add_argument(
        "-o",
        "--output-dir",
        type=str,
        default="output/",
        help="The directory where the profile, dataspec json and md output will be saved (default: output/)",
    )
    parser.add_argument(
        "-c",
        "--data-spec-config",
        type=str,
        help="The data specification configuration file path (default: config.toml in the package directory)",
        default=f"{current_dir.parent}/config.toml",
    )
    parser.add_argument(
        "-u",
        "--update-dataspec-file-path",
        type=str,
        default="",
        help="The data path of the existing dataspec json file if you want to keep generated descriptions, primary and foreign keys identification",
    )
    parser.add_argument(
        "-tr",
        "--template-and-report",
        type=str,
        nargs=2,
        metavar=("TEMPLATE_FILE", "REPORT_OUTPUT_FILE"),
        default=["template.md", "_report.md"],
        help="Specify the template file and the output file for the rendered report (default: template.md _report.md)",
    )
    return parser


def _resolve_publish_new_ref_dataset(
    values: list[str] | None, output_dir: str
) -> list[str] | None:
    """Maps the parsed --publish-new-ref-dataset value to what the pipeline receives."""
    # Flag absent: publishing was not requested.
    if values is None:
        return None
    # Flag given with explicit values: pass them through untouched.
    if values:
        return values
    # Flag given without values (empty list): fall back to the documented defaults.
    logger.debug("No publish_new_ref_dataset arguments provided, using defaults.")
    return [
        "published_reference_dataset",
        f"{output_dir}/published_reference_dataset/",
    ]


def _run_profile(args: argparse.Namespace):
    """Runs the profiling step with the CLI arguments; truthy result means success."""
    return cli_profile_data(
        args.data_source,
        args.data_type,
        args.raw_data_folder,
        f"{args.output_dir}/profile/{args.data_source}/",
    )


def _run_data_spec_gen(
    args: argparse.Namespace,
    profile_dir: str,
    publish_new_ref_dataset: list[str] | None,
):
    """Runs the data spec generation step with the CLI arguments and returns its result."""
    # NOTE(review): cli_data_spec_gen appends another "data_spec/" to the
    # directory passed here, while the report_gen mode looks for the spec file
    # directly under "<output_dir>/data_spec/". Confirm where generate_data_spec
    # actually writes; the path is deliberately left unchanged in this block.
    return cli_data_spec_gen(
        data_source=args.data_source,
        profile_dir=profile_dir,
        config=get_config(args.data_spec_config),
        output_dir=f"{args.output_dir}/data_spec/",
        existing_dataspec_path=args.update_dataspec_file_path,
        verify_primary_key=args.verify_primary_key,
        full_data_profiling=args.full_data_profiling,
        raw_data_folder=args.raw_data_folder,
        enable_anonymisation=args.enable_anonymisation,
        published_ref_dataset_path=args.published_ref_dataset_path,
        publish_new_ref_dataset=publish_new_ref_dataset,
        data_dictionary_path=args.data_dictionary_path,
    )


def _render_report(args: argparse.Namespace, data_spec) -> None:
    """Renders the report for data_spec using the template/report CLI options."""
    template_file, report_suffix = args.template_and_report
    render_jinja(
        data_spec,
        template_file,
        f"{args.output_dir}/data_spec/report/{args.data_source}{report_suffix}",
    )


def main():
    """
    Command-line entry point: parses the arguments and runs the selected step(s).

    Depending on --run-script this runs data profiling ("profile_data"), data
    specification generation ("data_spec_gen"), report rendering ("report_gen"),
    or all three in sequence ("full"). Failures are logged and the function
    returns early; nothing is returned to the caller.
    """
    args = _build_arg_parser().parse_args()

    # Existing profile files decide whether profiling has to run first.
    profile_dir = f"{args.output_dir}/profile/{args.data_source}"
    profile_output_files = list(Path(profile_dir).glob("p_*.json"))

    publish_new_ref_dataset = _resolve_publish_new_ref_dataset(
        args.publish_new_ref_dataset, args.output_dir
    )

    if args.run_script == "report_gen":
        logger.info("Running report generation (checking for existing data spec)")
        data_spec_file_path = Path(
            f"{args.output_dir}/data_spec/{args.data_source}_data_spec.json"
        )

        if data_spec_file_path.exists():
            logger.info(
                f"Data specification file found at {data_spec_file_path}. Loading it."
            )
            try:
                with open(data_spec_file_path, "r", encoding="utf-8") as f:
                    data_spec = json.load(f)
            except json.JSONDecodeError:
                logger.error(
                    f"Error decoding JSON from {data_spec_file_path}. Please check the file's integrity."
                )
                return
        else:
            logger.warning(
                f"Data specification file not found at {data_spec_file_path}. Attempting to generate it now."
            )

            if profile_output_files:
                logger.info(
                    f"Data profile files found in {profile_dir}. Skipping data profiling."
                )
            else:
                logger.info(
                    f"No data profiles found for {args.data_source} in {profile_dir}. Running data profiling before generating the data specification."
                )
                # Same handling as the data_spec_gen mode: without profiles
                # there is nothing to build a data spec from.
                if not _run_profile(args):
                    logger.error(
                        "Data profiling failed. Aborting data specification generation."
                    )
                    return

            data_spec = _run_data_spec_gen(args, profile_dir, publish_new_ref_dataset)

        # Proceed to render report only if data_spec was successfully loaded or generated
        if data_spec:
            _render_report(args, data_spec)
        else:
            logger.error(
                "Failed to obtain data specification. Report generation aborted."
            )

    elif args.run_script == "data_spec_gen":
        logger.info(
            "Checking for existing data profiles for data specification generation"
        )

        if not profile_output_files:
            logger.warning(
                f"Data profile files not found in {profile_dir}. Attempting to run data profiling first."
            )
            if not _run_profile(args):
                logger.error(
                    "Data profiling failed. Aborting data specification generation."
                )
                return

        _run_data_spec_gen(args, profile_dir, publish_new_ref_dataset)

    elif args.run_script == "profile_data":
        _run_profile(args)

    elif args.run_script == "full":
        logger.info("Running the full script")
        # Stop at the first failing stage so the success message below is only
        # logged when a report was actually rendered.
        if not _run_profile(args):
            logger.error(
                "Data profiling failed. Aborting data specification generation."
            )
            return

        data_spec = _run_data_spec_gen(args, profile_dir, publish_new_ref_dataset)
        if not data_spec:
            logger.error(
                "Failed to obtain data specification. Report generation aborted."
            )
            return

        _render_report(args, data_spec)
        logger.info(
            f"Full script execution completed successfully. Report generated to {args.output_dir}/data_spec/report/."
        )
    else:
        # Defensive only: argparse's `choices` already rejects other values.
        logger.error(
            "Invalid script option. Please choose 'full', 'profile_data', 'data_spec_gen', or 'report_gen'."
        )
        return


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
