SOAPAssistV00

Runtime error

App Files Files Community

SOAPAssistV00 / gpt_index /readers /file /slides_parser.py

AbeerTrial

Duplicate from AbeerTrial/SOAPAssist

35b22df over 1 year ago

raw

history blame

3.94 kB

	"""Slides parser.

	Contains parsers for .pptx files.

	"""

	import os
	from pathlib import Path
	from typing import Dict

	from gpt_index.readers.file.base_parser import BaseParser


	class PptxParser(BaseParser):
	    """Powerpoint parser.

	    Extracts the text of every shape on every slide of a .pptx file and,
	    for shapes exposing an ``image`` attribute, adds a machine-generated
	    caption produced by the ``nlpconnect/vit-gpt2-image-captioning`` model.

	    """

	    def _init_parser(self) -> Dict:
	        """Check optional dependencies and load the captioning model.

	        Returns:
	            A dict with the keys ``"feature_extractor"``, ``"model"`` and
	            ``"tokenizer"``, each loaded from
	            ``nlpconnect/vit-gpt2-image-captioning``.

	        Raises:
	            ImportError: If ``python-pptx``, ``torch``, ``transformers`` or
	                ``Pillow`` is not installed. The original import failure is
	                attached as the cause.
	        """
	        try:
	            from pptx import Presentation  # noqa: F401
	        except ImportError as err:
	            raise ImportError(
	                "The package `python-pptx` is required to read Powerpoint files: "
	                "`pip install python-pptx`"
	            ) from err
	        try:
	            import torch  # noqa: F401
	        except ImportError as err:
	            raise ImportError(
	                "The package `pytorch` is required to caption images: "
	                "`pip install torch`"
	            ) from err
	        try:
	            from transformers import (
	                AutoTokenizer,
	                VisionEncoderDecoderModel,
	                ViTFeatureExtractor,
	            )
	        except ImportError as err:
	            raise ImportError(
	                "The package `transformers` is required to caption images: "
	                "`pip install transformers`"
	            ) from err
	        try:
	            from PIL import Image  # noqa: F401
	        except ImportError as err:
	            raise ImportError(
	                "PIL is required to read image files: " "`pip install Pillow`"
	            ) from err

	        # All three components come from the same pretrained checkpoint.
	        checkpoint = "nlpconnect/vit-gpt2-image-captioning"
	        model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
	        feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint)
	        tokenizer = AutoTokenizer.from_pretrained(checkpoint)

	        return {
	            "feature_extractor": feature_extractor,
	            "model": model,
	            "tokenizer": tokenizer,
	        }

	    def caption_image(self, tmp_image_file: str) -> str:
	        """Generate a text caption for the image stored at ``tmp_image_file``.

	        Reads ``self.parser_config`` for the ``"model"``,
	        ``"feature_extractor"`` and ``"tokenizer"`` entries.
	        NOTE(review): ``parser_config`` is presumably filled by the base
	        class from ``_init_parser``'s return value -- confirm in BaseParser.

	        Args:
	            tmp_image_file: Path of an image file readable by Pillow.

	        Returns:
	            The first decoded caption, stripped of surrounding whitespace.
	        """
	        import torch
	        from PIL import Image

	        model = self.parser_config["model"]
	        feature_extractor = self.parser_config["feature_extractor"]
	        tokenizer = self.parser_config["tokenizer"]

	        # Run on the GPU when one is available, otherwise on the CPU.
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        model.to(device)

	        gen_kwargs = {"max_length": 16, "num_beams": 4}

	        # Open the image in a context manager so the file handle is released
	        # before the caller deletes the temporary file.
	        with Image.open(tmp_image_file) as raw_image:
	            if raw_image.mode != "RGB":
	                rgb_image = raw_image.convert(mode="RGB")
	            else:
	                rgb_image = raw_image
	            # The pixel data is extracted while the file is still open.
	            pixel_values = feature_extractor(
	                images=[rgb_image], return_tensors="pt"
	            ).pixel_values
	        pixel_values = pixel_values.to(device)

	        output_ids = model.generate(pixel_values, **gen_kwargs)

	        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
	        return preds[0].strip()

	    def parse_file(self, file: Path, errors: str = "ignore") -> str:
	        """Parse a .pptx file into a single text string.

	        Each slide contributes a ``Slide #<index>:`` header (index is
	        zero-based), followed by an ``Image: <caption>`` entry for every
	        shape that has an ``image`` attribute and the ``text`` of every
	        shape that has a ``text`` attribute.

	        Args:
	            file: Path of the presentation to read.
	            errors: Accepted for interface compatibility; not used here.

	        Returns:
	            The concatenated slide headers, image captions and shape texts.
	        """
	        import tempfile

	        from pptx import Presentation

	        presentation = Presentation(file)
	        # Collect fragments and join once instead of repeated `+=`.
	        parts = []
	        # Images are staged in a private temporary directory rather than the
	        # current working directory: this avoids clobbering user files,
	        # collisions between concurrent parses, and leftover files when
	        # captioning raises (the directory is removed on exit either way).
	        with tempfile.TemporaryDirectory() as tmp_dir:
	            for i, slide in enumerate(presentation.slides):
	                parts.append(f"\n\nSlide #{i}: \n")
	                for shape in slide.shapes:
	                    if hasattr(shape, "image"):
	                        image = shape.image
	                        # Dump the embedded image bytes to disk because
	                        # caption_image takes a file path.
	                        image_filename = os.path.join(
	                            tmp_dir, f"tmp_image.{image.ext}"
	                        )
	                        with open(image_filename, "wb") as f:
	                            f.write(image.blob)
	                        caption = self.caption_image(image_filename)
	                        parts.append(f"\n Image: {caption}\n\n")
	                    if hasattr(shape, "text"):
	                        parts.append(f"{shape.text}\n")

	        return "".join(parts)