Spaces:

retopara
/

ragflow

Build error

App Files Files Community

ragflow / deepdoc /parser /ppt_parser.py

jinhai-2012

Update readme and add license (#1018)

9cba22c 9 months ago

raw

history blame

2.15 kB

	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	from io import BytesIO
	from pptx import Presentation


	class RAGFlowPptParser(object):
	    """Extract the text of PowerPoint slides, one string per slide.

	    Instances are callable: ``parser(fnm, from_page, to_page)`` returns a
	    list holding the text of every slide in the requested range.  Shapes
	    are read top-to-bottom and then left-to-right, tables are flattened
	    into ``header: value`` pairs, and grouped shapes are walked
	    recursively.
	    """

	    # Integer codes compared against ``shape.shape_type``.
	    # NOTE(review): these correspond to python-pptx's MSO_SHAPE_TYPE.GROUP
	    # (6) and MSO_SHAPE_TYPE.TABLE (19) -- confirm against the installed
	    # python-pptx version.
	    _GROUP_SHAPE_TYPE = 6
	    _TABLE_SHAPE_TYPE = 19

	    # ``top`` is bucketed by this divisor so that shapes at nearly the same
	    # height sort as one row and are then ordered by ``left``.
	    _ROW_BUCKET = 10

	    def __init__(self):
	        super().__init__()

	    @staticmethod
	    def _reading_order(shape):
	        """Return the ``(row bucket, left)`` sort key for *shape*.

	        A ``top`` or ``left`` of ``None`` is treated as 0.  python-pptx
	        reports ``None`` for shapes that carry no explicit position (for
	        example placeholders that inherit it from the layout); without
	        this guard ``None // 10`` raises ``TypeError`` and aborts the
	        whole parse.
	        """
	        top = shape.top
	        left = shape.left
	        if top is None:
	            top = 0
	        if left is None:
	            left = 0
	        return (top // RAGFlowPptParser._ROW_BUCKET, left)

	    def __extract(self, shape):
	        """Return the text carried by *shape*.

	        * Table shape: one line per data row, each cell rendered as
	          ``"<header>: <value>"`` and joined with ``"; "``.  The first
	          table row supplies the headers; a table without data rows
	          yields ``""``.
	        * Shape with a text frame: the text frame's text.
	        * Group shape: the non-empty texts of its children, in reading
	          order, joined with newlines.

	        Returns ``None`` for any other kind of shape.
	        """
	        kind = shape.shape_type

	        if kind == self._TABLE_SHAPE_TYPE:
	            tb = shape.table
	            n_rows = len(tb.rows)
	            if n_rows < 2:
	                # Only a header row (or nothing at all): no data rows to emit.
	                return ""
	            # Read the header cells once instead of once per data row.
	            headers = [tb.cell(0, j).text for j in range(len(tb.columns))]
	            # The original filtered on ``if tb.cell(i, j)``; a cell object
	            # is always truthy, so that filter never dropped anything.
	            rows = [
	                "; ".join(
	                    header + ": " + tb.cell(i, j).text
	                    for j, header in enumerate(headers)
	                )
	                for i in range(1, n_rows)
	            ]
	            return "\n".join(rows)

	        if shape.has_text_frame:
	            return shape.text_frame.text

	        if kind == self._GROUP_SHAPE_TYPE:
	            children = sorted(shape.shapes, key=self._reading_order)
	            parts = (self.__extract(child) for child in children)
	            return "\n".join(part for part in parts if part)

	        # Pictures, charts, connectors, ...: nothing to extract.
	        return None

	    def __call__(self, fnm, from_page, to_page, callback=None):
	        """Return the text of slides ``from_page`` .. ``to_page - 1``.

	        Args:
	            fnm: Path of the presentation as ``str``; any other value is
	                wrapped in ``BytesIO`` and read as in-memory file content.
	            from_page: Zero-based index of the first slide to include.
	            to_page: Zero-based index at which to stop (exclusive).
	            callback: Accepted for interface compatibility; not used here.

	        Returns:
	            A list with one string per selected slide, containing that
	            slide's non-empty shape texts joined with newlines.

	        Side effects:
	            Sets ``self.total_page`` to the number of slides in the file.
	        """
	        source = fnm if isinstance(fnm, str) else BytesIO(fnm)
	        ppt = Presentation(source)
	        self.total_page = len(ppt.slides)

	        txts = []
	        for i, slide in enumerate(ppt.slides):
	            if i < from_page:
	                continue
	            if i >= to_page:
	                break
	            ordered = sorted(slide.shapes, key=self._reading_order)
	            extracted = (self.__extract(shape) for shape in ordered)
	            txts.append("\n".join(text for text in extracted if text))

	        return txts