"""
Convert a docx file to markdown

This module provides a class `Docx2Md` that converts a docx file to markdown format.
It extracts text, images, tables, and hyperlinks from the docx file and formats them into markdown syntax.

It also handles various styles and formatting options such as bold, italic, underline, and strikethrough.
It can also extract metadata details from the docx file.
"""

import logging
from pathlib import Path
from typing import TYPE_CHECKING

from docx import Document
from docx.document import Document as DocxDocument
from lxml import etree as lxml_etree

from .docx_config import DocxConfig
from .docx_constants import docx_namespaces, w_ns
from .docx_file import DocxFile
from .docx_parts_para import ParaPart
from .docx_parts_table import TableParts
from .numbered_item_tracker import ListNumberTracker, TrackerType

if TYPE_CHECKING:
    from .docx_parts_run import RunPart

logger = logging.getLogger(__name__)


class Docx2Md:
    """
    A class to convert a .docx file to markdown format.

    This class extracts text, images, tables, and hyperlinks from a .docx file
    and formats them into markdown syntax. It also handles various styles and
    formatting options such as bold, italic, underline, and strikethrough.

    Attributes
    ----------
    docx_file : Path
        Path to the input .docx file.
    output_dir : Path
        Path to the directory where extracted images will be saved.
    md_file : str or None
        The path to the generated markdown file.
    md_file_content : str or None
        The content of the generated markdown file.
    hyperlinks : dict
        Hyperlinks and image links extracted from the document. They are stored on
        ``config.runtime.hyperlinks`` (keyed by relationship id), not on this instance.

    Methods
    -------
    extract_bookmark_name(xml_string: str) -> str | None
        Extract the w:name value from the first w:bookmarkStart element in the given XML string.
    convert_docx_2_md(output_file: str | Path) -> None
        Convert the .docx file to markdown and save it to the specified output file.
    handle_document_resources(document: Document | None, output_file_name: str) -> None
        Handle the resources in the document, such as hyperlinks and images.

    """

    def __init__(
        self,
        docx_file: Path | str,
        config: DocxConfig,
    ) -> None:
        """
        Initialize the Docx2Md class.

        Args:
            docx_file (Path): Path to the input .docx file.
            config (DocxConfig): Configuration object containing settings for the conversion.

        """
        self.docx_file = Path(docx_file)
        self.output_dir = Path(config.output_image_dir)
        self.md_file = None
        self.md_file_content: str | None = None
        self.document: DocxDocument | None = None
        self.config = config
        self.numbering_templates = {}
        self.numbering_list_lookup = {}
        self.parser = lxml_etree.XMLParser(recover=True)
        # self.list_type = TrackerType.MD_LIST
        self.header_type = TrackerType.MD_HEADING
        self.current_run_info: RunPart | None = None
        self.footnotes = []
        self.endnotes = []
        self.table_parts = TableParts(self, config)

        # Add support for list of header numbering schemes may be a number of trackers
        self.header_tracker: dict[int, ListNumberTracker] = {}

        # Add support for custom styles
        if "bullets" not in config.styles:
            config.styles["bullets"] = {}

    def extract_bookmark_name(self, xml_string: str) -> str | None:
        """
        Return the name of the first bookmark found in an XML fragment.

        The fragment is wrapped in a synthetic ``<root>`` element and parsed
        with ``self.parser``; the first ``w:bookmarkStart`` descendant is then
        looked up and its ``w:name`` attribute returned.

        Args:
            xml_string (str): The XML content as a string.

        Returns:
            str | None: The value of the ``w:name`` attribute, or None when the
            fragment has no ``w:bookmarkStart`` element (or that element has no
            ``w:name`` attribute).

        """
        # Wrap the fragment so the parser always sees a single top-level element
        wrapped_xml = "".join(("<root>", xml_string, "</root>"))
        tree_root = lxml_etree.fromstring(wrapped_xml, parser=self.parser)

        # Only the first bookmark in document order is considered
        first_bookmark = tree_root.find(
            ".//w:bookmarkStart",
            namespaces=docx_namespaces,
        )
        if first_bookmark is None:
            return None

        return first_bookmark.get(f"{w_ns}name")

    def _open_docx(self) -> None:
        """
        Load the source .docx so it is ready for conversion.

        Sets ``self.document`` to the python-docx ``Document`` opened from
        ``self.docx_file`` and ``self.raw_doc`` to a ``DocxFile`` built from the
        configuration and the same path.

        Returns
        -------
            None

        """
        source_path = self.docx_file

        # python-docx is handed the path as a plain string
        self.document = Document(str(source_path))
        self.raw_doc = DocxFile(self.config, source_path)

    def convert_docx_2_md(
        self,
        output_file: str | Path,
    ) -> None:
        """
        Generate a document from the template given

        Args:
        ----
            output_file (Path): the file that it will be written to

        """
        # Process numbering styles
        if self.document is None:
            self._open_docx()

        # Get the file name of the input file (removing extension and leading path)
        output_file_name = Path(output_file).stem

        # Iterate through all the relationships in the document
        # and extract the hyperlinks
        self.handle_document_resources(self.document, output_file_name)

        markdown_lines = []

        in_list = False

        self.config.runtime.list_tracker = None

        if self.document is None:
            return  # pragma: no cover

        self._extract_notes()

        for element in self.document.element.body:  # pyright: ignore[reportAttributeAccessIssue]
            logger.debug(f"Element [{element}]")
            if element.tag.endswith("p"):  # Paragraph
                paragraph = element
                docx_para = ParaPart(self.config, element)
                logger.debug(f"DocxPara: {docx_para}")

                # Check if the paragraph contains a bookmark start element
                # This is used to create an anchor link in the markdown
                if "<w:bookmarkStart" in paragraph.xml:
                    # Extract the bookmark name
                    bookmark_name = self.extract_bookmark_name(paragraph.xml)
                    logger.debug(f"Bookmark name: {bookmark_name}")
                    markdown_lines.append(f'<a name="{bookmark_name}"></a>')

                # Process paragraph: get text, style and custom style marker
                in_list = docx_para.process_para(
                    paragraph,
                    markdown_lines,
                    in_list,
                )
            else:
                in_list = False
                if element.tag.endswith("tbl"):  # Table
                    # markdown_table = self.convert_table_to_md(element)
                    markdown_table = self.table_parts.convert_table_to_text(element)
                    markdown_lines.append(markdown_table)
                elif element.tag.endswith("sdt"):  # Table of Contents
                    markdown_lines.append("<<TOC>>")
                elif element.tag.endswith("sectPr"):  # Section Break
                    # Handle section breaks if needed
                    pass
                # check to see if we have a missing type, if so, log it once only
                elif element.tag not in self.config.runtime.missing_types:
                    self.config.runtime.missing_types[element.tag] = 1
                    logger.warning(
                        f"Missing handler {element}. Prev Line[{markdown_lines[-1] if len(markdown_lines) > 0 else 'None'}] ",
                    )

        # Generate the final markdown content
        self.md_file_content = "\n\n".join(markdown_lines)

        # Add footnotes and endnotes to the end of the document
        if self.footnotes:
            self.md_file_content += "\n\n" + "\n".join(self.footnotes)
        if self.endnotes:
            self.md_file_content += "\n\n" + "\n".join(self.endnotes)

        logger.debug(f"Markdown content: {markdown_lines}")
        # save the markdown text to a file
        with open(str(output_file), "w", encoding="utf-8") as md_file:
            md_file.write(self.md_file_content + "\n")
            logger.info(f"Markdown file saved to {output_file}")

    def handle_document_resources(
        self,
        document: DocxDocument | None,
        output_file_name: str,
    ) -> None:
        """
        Handle the resources in the document, such as hyperlinks and images.

        This method extracts hyperlinks and images from the document and saves
        the images to the specified output directory.

        Args:
        ----
            document (Document): The docx document object to process.
            output_file_name (str): The base name for output files, used for naming extracted images.

        Returns:
        -------
            None

        """
        if document is None:  # pragma: no cover
            logger.warning("No document provided for resource handling.")
            return

        for r, relation in document.part.rels.items():
            if relation.is_external:
                self.config.runtime.hyperlinks[r] = {
                    "val": relation.target_ref,
                    "type": "hyperlink",
                }
            elif "image" in relation.target_ref:
                if not self.config.export_images:
                    self.config.runtime.hyperlinks[r] = {"val": "", "type": "image"}
                    logger.info(f"Skipping image export for {relation.target_ref}")
                    continue

                content_type = relation.target_part.content_type

                # Create the output directory if it doesn't exist
                if not self.output_dir.exists():
                    logger.debug(f"Creating image folder: [{self.output_dir}]")
                    self.output_dir.mkdir(parents=True, exist_ok=True)

                # Save the image to the output directory
                image_data = relation.target_part.blob
                image_extension = content_type.split("/")[-1]
                image_name = f"{output_file_name}_{r}.{image_extension}"
                image_path = self.output_dir / image_name
                self.config.runtime.images[r] = image_path

                with open(image_path, "wb") as img_file:
                    img_file.write(image_data)

                self.config.runtime.hyperlinks[r] = {
                    "val": image_path,
                    "type": "image",
                }

                logger.info(f"Extracted image saved to {image_path}")

    def _extract_notes(self) -> None:
        """Extract footnotes and endnotes from the document."""
        if self.document is None:
            return  # pragma: no cover

        # Extract footnotes
        # if self.document.part.has_part_with_reltype(
        #     "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes",
        # ):
        #     footnotes_part = self.document.part..footnotes_part
        #     if footnotes_part is not None:
        #         for footnote in footnotes_part.footnotes:
        #             footnote_text = "".join(
        #                 self.extract_paragraph_text(p, OutputEncodeType.MARKDOWN)
        #                 for p in footnote.paragraphs
        #             )
        #             self.footnotes.append(f"[^{footnote.id}]: {footnote_text}")

        # # Extract endnotes
        # if self.document.part.has_part_with_reltype(
        #     "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
        # ):
        #     endnotes_part = self.document.part.endnotes_part
        #     if endnotes_part is not None:
        #         for endnote in endnotes_part.endnotes:
        #             endnote_text = "".join(
        #                 self.extract_paragraph_text(p, OutputEncodeType.MARKDOWN)
        #                 for p in endnote.paragraphs
        #             )
        #             self.endnotes.append(f"[^end{endnote.id}]: {endnote_text}")
