"""Slides parser.

Contains parsers for .pptx files.

"""

import os
from pathlib import Path
from typing import Dict

from gpt_index.readers.file.base_parser import BaseParser


class PptxParser(BaseParser):
    """Powerpoint parser.

    Extract text, caption images, and specify slides.

    """

    def _init_parser(self) -> Dict:
        """Init parser."""
        try:
            from pptx import Presentation  # noqa: F401
        except ImportError:
            raise ImportError(
                "The package `python-pptx` is required to read Powerpoint files: "
                "`pip install python-pptx`"
            )
        try:
            import torch  # noqa: F401
        except ImportError:
            raise ImportError(
                "The package `pytorch` is required to caption images: "
                "`pip install torch`"
            )
        try:
            from transformers import (
                AutoTokenizer,
                VisionEncoderDecoderModel,
                ViTFeatureExtractor,
            )
        except ImportError:
            raise ImportError(
                "The package `transformers` is required to caption images: "
                "`pip install transformers`"
            )
        try:
            from PIL import Image  # noqa: F401
        except ImportError:
            raise ImportError(
                "PIL is required to read image files: " "`pip install Pillow`"
            )

        model = VisionEncoderDecoderModel.from_pretrained(
            "nlpconnect/vit-gpt2-image-captioning"
        )
        feature_extractor = ViTFeatureExtractor.from_pretrained(
            "nlpconnect/vit-gpt2-image-captioning"
        )
        tokenizer = AutoTokenizer.from_pretrained(
            "nlpconnect/vit-gpt2-image-captioning"
        )

        return {
            "feature_extractor": feature_extractor,
            "model": model,
            "tokenizer": tokenizer,
        }

    def caption_image(self, tmp_image_file: str) -> str:
        """Generate text caption of image."""
        import torch
        from PIL import Image

        model = self.parser_config["model"]
        feature_extractor = self.parser_config["feature_extractor"]
        tokenizer = self.parser_config["tokenizer"]

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)

        max_length = 16
        num_beams = 4
        gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

        i_image = Image.open(tmp_image_file)
        if i_image.mode != "RGB":
            i_image = i_image.convert(mode="RGB")

        pixel_values = feature_extractor(
            images=[i_image], return_tensors="pt"
        ).pixel_values
        pixel_values = pixel_values.to(device)

        output_ids = model.generate(pixel_values, **gen_kwargs)

        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return preds[0].strip()

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Parse file."""
        from pptx import Presentation

        presentation = Presentation(file)
        result = ""
        for i, slide in enumerate(presentation.slides):
            result += f"\n\nSlide #{i}: \n"
            for shape in slide.shapes:
                if hasattr(shape, "image"):
                    image = shape.image
                    # get image "file" contents
                    image_bytes = image.blob
                    # temporarily save the image to feed into model
                    image_filename = f"tmp_image.{image.ext}"
                    with open(image_filename, "wb") as f:
                        f.write(image_bytes)
                    result += f"\n Image: {self.caption_image(image_filename)}\n\n"

                    os.remove(image_filename)
                if hasattr(shape, "text"):
                    result += f"{shape.text}\n"

        return result