import csv
import logging
import os
from pathlib import Path

from ast import literal_eval

import polars as pl
import fsspec
import io
import tempfile

logger = logging.getLogger(__name__)


def read_dataset(raw_dataset_path: Path) -> pl.DataFrame:
    """
    Reads a dataset from the specified path and returns it as a Polars DataFrame.

    Supports CSV, JSON, and Parquet file formats. The path may be a local file
    or a ``gs:/...`` location; the latter is opened through fsspec. For CSV
    files, the delimiter is automatically detected with ``csv.Sniffer``.

    The file extension is validated before any file or network access, so an
    unsupported type fails fast with a ValueError.

    Args:
        raw_dataset_path (Path): The path to the dataset file.

    Returns:
        pl.DataFrame: The loaded dataset as a Polars DataFrame.

    Raises:
        ValueError: If the file extension is not one of .csv, .json, or .parquet.
        csv.Error: If the delimiter of a CSV file cannot be detected
            (including an empty CSV file).
    """
    logger.info(f"Reading raw dataset from {raw_dataset_path}")
    suffix = raw_dataset_path.suffix
    # Reject unsupported types up front so that a bad extension never triggers
    # filesystem or network access.
    if suffix not in (".csv", ".json", ".parquet"):
        msg = f"Unsupported data type: {suffix}"
        raise ValueError(msg)

    path_str = str(raw_dataset_path)
    if "gs:/" not in path_str:
        return _read_local_dataset(raw_dataset_path, suffix)
    # pathlib collapses the "//" of a "gs://" URL into a single slash;
    # restore it before handing the URL to fsspec.
    return _read_remote_dataset(path_str.replace("gs:/", "gs://"), suffix)


def _sniff_csv_delimiter(sample: str) -> str:
    """Returns the CSV delimiter that ``csv.Sniffer`` detects in ``sample``."""
    delimiter = str(csv.Sniffer().sniff(sample).delimiter)
    logger.debug(f"delimiter in csv detected: {delimiter}")
    return delimiter


def _read_local_dataset(path: Path, suffix: str) -> pl.DataFrame:
    """Loads an already-validated .csv/.json/.parquet file from the local disk."""
    if suffix == ".csv":
        with open(path, "r", encoding="utf-8") as f:
            # The whole file is used as the sniffing sample.
            delimiter = _sniff_csv_delimiter(f.read())
        return pl.read_csv(path, separator=delimiter)
    if suffix == ".json":
        return pl.read_json(path)
    return pl.read_parquet(path)


def _read_remote_dataset(url: str, suffix: str) -> pl.DataFrame:
    """Loads an already-validated .csv/.json/.parquet object through fsspec."""
    fs, fs_path = fsspec.core.url_to_fs(url)
    with fs.open(fs_path, "rb") as f:
        payload = f.read()

    if suffix == ".csv":
        content = payload.decode("utf-8")
        # Only the header line is used as the sniffing sample. An empty file
        # yields an empty sample, so the sniffer raises csv.Error (as for an
        # empty local file) instead of an IndexError.
        header = next(iter(content.splitlines()), "")
        delimiter = _sniff_csv_delimiter(header)
        return pl.read_csv(io.StringIO(content), separator=delimiter)
    if suffix == ".json":
        return pl.read_json(io.StringIO(payload.decode("utf-8")))
    return _read_parquet_bytes(payload)


def _read_parquet_bytes(payload: bytes) -> pl.DataFrame:
    """Loads Parquet bytes via a temporary file that is always cleaned up."""
    tmp = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
    try:
        # Closing the handle flushes the data before polars opens the file.
        with tmp:
            tmp.write(payload)
        return pl.read_parquet(tmp.name)
    finally:
        # Runs on success and on failure, so a failed write or parse does not
        # leak the temp file. Removal itself stays best-effort.
        try:
            os.unlink(tmp.name)
        except OSError:
            logger.warning("Failed to remove temp file %s", tmp.name)


def string_array_to_list(str_array: str) -> list:
    """
    Converts a string representation of a list into an actual Python list.

    This function takes a string input that represents a list (e.g., "[1,2,3]")
    and converts it into a Python list using `literal_eval`. Before parsing,
    newlines and the truncation markers " ... ," and "...," are removed from
    the string.

    If the input is already a list, it is returned as-is. A string that
    evaluates to a tuple is converted to a list. In every other case (the
    input is neither a string nor a list, the string cannot be parsed, or it
    evaluates to something that is not a list or tuple) an error is logged and
    an empty list is returned, so the result is always a list.

    Args:
        str_array (str): A string representing a list (e.g., "[1,2,3]") or an
                         actual list.

    Returns:
        list: The converted Python list if successful, or an empty list if the
              conversion fails.
    """
    if isinstance(str_array, list):
        return str_array

    if not isinstance(str_array, str):
        # e.g. None: there is nothing to parse, so report it the same way as
        # any other failed conversion instead of crashing on `.replace`.
        logger.error(
            "Error parsing string array. Error: expected str or list, "
            f"got {type(str_array).__name__}"
        )
        return []

    clean_str = str_array.replace("\n", "").replace(" ... ,", "").replace("...,", "")

    try:
        # Attempt to parse the string as a Python literal
        parsed = literal_eval(clean_str)
    except (SyntaxError, ValueError, TypeError, RecursionError) as e:
        # literal_eval may raise any of these on malformed input
        # (e.g. TypeError for "{[1]: 2}"); treat them all as a failed parse.
        logger.error(f"Error parsing string array. Error: {e}")
        return []

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        # "(1, 2)" is a valid sequence literal; honor the list return type.
        return list(parsed)

    # Scalars, dicts, sets, ... are valid literals but not list-like.
    logger.error(
        "Error parsing string array. Error: expected a list literal, "
        f"got {type(parsed).__name__}"
    )
    return []
