import csv
import fnmatch
import io
import json
import logging
import re
import tempfile
from pathlib import Path
from typing import Optional

import fsspec
import pandas as pd
from dataprofiler import Data, StructuredProfiler, Profiler, ProfilerOptions
from dataprofiler.data_readers.csv_data import CSVData

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Shared fsspec filesystem handle for the "gcs" protocol. It is created once,
# at import time, and reused for every remote listing/read in this module.
fs = fsspec.filesystem("gcs")


def generate_profile_report(
    data: Data | pd.DataFrame | CSVData, output_path: Path
) -> None:
    """
    Profile ``data`` and write the resulting report to ``output_path`` as JSON.

    A ``Profiler`` is built with the structured "category" option enabled and
    ``min_true_samples=20``. When the resulting profiler is a
    ``StructuredProfiler``, its "pretty" report is serialized with a 4-space
    indent; otherwise an error is logged and nothing is written.

    Args:
        data (Data | pd.DataFrame | CSVData): The input data to be profiled.
        output_path (Path): Destination of the JSON report. Missing parent
            directories are created.

    Returns:
        None
    """
    options = ProfilerOptions()
    options.set({"structured_options.category.is_enabled": True})
    profiler = Profiler(data, options=options, min_true_samples=20)  # type: ignore[call-arg] - data is either Data from accepted formats or pd.DataFrame

    # Guard clause: only structured profiles are turned into a report.
    if not isinstance(profiler, StructuredProfiler):
        logger.error("The profile is not a StructuredProfiler. Cannot generate report.")
        return

    pretty_report = profiler.report(report_options={"output_format": "pretty"})
    output_path.parent.mkdir(parents=True, exist_ok=True)
    logger.debug(f"Saving profile report to {output_path}")
    with output_path.open("w", encoding="utf-8") as report_file:
        json.dump(pretty_report, report_file, indent=4)


_GCS_PREFIX = "gs://"


def _sniff_delimiter(sample: str, source: str) -> str:
    """Detect the delimiter of ``sample``; fall back to ',' if sniffing fails."""
    try:
        return str(csv.Sniffer().sniff(sample).delimiter)
    except csv.Error:
        # csv.Sniffer raises csv.Error when it cannot decide (e.g. a
        # single-column file); one such file should not abort the whole batch.
        logger.warning(
            f"Could not detect a CSV delimiter for {source}; falling back to ','"
        )
        return ","


def _list_source_files(input_dir: str, data_type: str) -> list[str]:
    """Return the locations in ``input_dir`` whose extension is ``data_type``."""
    extension = f".{data_type}"
    if input_dir.startswith(_GCS_PREFIX):
        # Remote locations stay plain strings: wrapping them in ``Path`` would
        # collapse "gs://" into "gs:/". The listing entries are re-prefixed
        # with the protocol, as the listing is expected to omit it.
        candidates = [
            f"{_GCS_PREFIX}{entry}" for entry in fs.ls(input_dir, detail=False)
        ]
    else:
        candidates = [
            path.as_posix() for path in Path(input_dir).glob(f"*{extension}")
        ]
    # Match on the real extension (case-insensitively), not on a substring of
    # the name, so e.g. "my_csv_export.json" is not picked up for "csv".
    return [
        location for location in candidates if location.lower().endswith(extension)
    ]


def _load_csv(location: str, is_remote: bool) -> "CSVData | None":
    """Build a ``CSVData`` for one CSV file, or return None if it is empty."""
    opener = fs.open if is_remote else open
    with opener(location, "r", encoding="utf-8") as handle:
        text = handle.read()
    if not text.strip():
        logger.warning(f"Skipping empty CSV file {location}")
        return None
    if is_remote:
        # Remote files are sniffed on their first non-blank line only and are
        # parsed from the content that was already downloaded.
        first_line = next(line for line in text.splitlines() if line.strip())
        delimiter = _sniff_delimiter(first_line, location)
        frame = pd.read_csv(io.StringIO(text), sep=delimiter)
        return CSVData(data=frame, options={"delimiter": delimiter})
    # Local files are sniffed on their whole content and loaded by path.
    delimiter = _sniff_delimiter(text, location)
    return CSVData(location, options={"delimiter": delimiter})


def _profile_staged_remote_file(uri: str, file_name: str, output_path: Path) -> None:
    """Copy a remote non-CSV file to a temporary directory and profile it."""
    with tempfile.TemporaryDirectory() as staging_dir:
        local_copy = (Path(staging_dir) / file_name).as_posix()
        fs.get(uri, local_copy)
        data = Data(local_copy)
        # The report is generated inside the ``with`` block so the staged copy
        # still exists while it is read.
        # NOTE(review): assumes ``Data`` may read the file lazily - confirm.
        if data:
            generate_profile_report(data, output_path)


def profile_data(
    data_source: str,
    data_type: str,
    output_dir: str,
    input_dir: str = "",
    input_data: Optional[pd.DataFrame] = None,
) -> None:
    """
    Profiles a dataset using a data profiler and generates a report.

    This function supports profiling data from either a pandas DataFrame or
    files of specific types (e.g., CSV, JSON, Avro, TXT, Parquet). The profiling
    results are saved as JSON files in the specified output directory.

    Args:
        data_source (str): The name of the data source being profiled. This is
            used to name the report when profiling a DataFrame.
        data_type (str): The type of the data source (case-insensitive).
            Supported types include "csv", "json", "avro", "txt", "parquet",
            and "pd.dataframe".
        output_dir (str): The directory where the profiling reports will be saved.
        input_dir (str, optional): The directory containing the input files to
            be profiled: a local directory, or a location starting with
            "gs://" that is listed through the module-level ``fs`` filesystem.
            Defaults to an empty string.
        input_data (pd.DataFrame, optional): A pandas DataFrame to be profiled.
            Required if `data_type` is "pd.dataframe". Defaults to None.

    Returns:
        None

    Notes:
        - When profiling files, only files whose extension equals `data_type`
          are considered.
        - Reports are saved as "p_<data_source>.json" for a DataFrame and as
          "p_<file stem>.json" for each profiled file.
        - For CSV files the delimiter is sniffed; when it cannot be detected a
          comma is used, and empty CSV files are skipped with a warning.
        - Remote non-CSV files are copied to a temporary directory before
          being loaded.
        - Problems (unsupported type, missing DataFrame, no matching files)
          are logged rather than raised.
    """
    valid_file_types = ("csv", "json", "avro", "txt", "parquet")
    data_type = data_type.lower()

    if data_type == "pd.dataframe":
        if input_data is None:
            logger.error(
                "Data type 'pd.dataframe' requires `input_data`, but none was provided."
            )
            return
        logger.debug("Profiling data from pandas DataFrame")
        generate_profile_report(input_data, Path(output_dir) / f"p_{data_source}.json")
        return

    if data_type not in valid_file_types:
        logger.error(
            f"Invalid data type '{data_type}'. Supported types are: {', '.join(valid_file_types)}"
        )
        return

    is_remote = input_dir.startswith(_GCS_PREFIX)
    source_files = _list_source_files(input_dir, data_type)
    logger.debug(f"Profiling {source_files} files in {input_dir}")
    if not source_files:
        logger.warning(f"No files found in {input_dir} for data type {data_type}")
        return

    for location in source_files:
        file_name = location.rsplit("/", 1)[-1]
        output_path = Path(output_dir) / f"p_{Path(file_name).stem}.json"
        if data_type != "csv" and is_remote:
            _profile_staged_remote_file(location, file_name, output_path)
            continue
        data = _load_csv(location, is_remote) if data_type == "csv" else Data(location)
        # Falsy data (None for a skipped file, or a data object that evaluates
        # false) is not profiled.
        if data:
            generate_profile_report(data, output_path)
