SOAPAssistV00

Runtime error

App Files Files Community

SOAPAssistV00 / gpt_index /readers /google_readers /gdocs.py

AbeerTrial

Duplicate from AbeerTrial/SOAPAssist

35b22df over 1 year ago

raw

history blame

5.62 kB

	"""Google docs reader."""

	import logging
	import os
	from typing import Any, List

	from gpt_index.readers.base import BaseReader
	from gpt_index.readers.schema.base import Document

	# OAuth scopes passed to the credential loader and the installed-app flow in
	# GoogleDocsReader._get_credentials; only the read-only Docs scope is requested.
	SCOPES = ["https://www.googleapis.com/auth/documents.readonly"]


	# Copyright 2019 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.


	class GoogleDocsReader(BaseReader):
	    """Google Docs reader.

	    Fetches documents through the Google Docs API (``docs`` service, ``v1``)
	    and flattens their body into plain text, one ``Document`` per document id.

	    """

	    def __init__(self) -> None:
	        """Verify that the Google client libraries can be imported.

	        Raises:
	            ImportError: if `google`, `google_auth_oauthlib` or
	                `googleapiclient` is not installed.
	        """
	        try:
	            import google  # noqa: F401
	            import google_auth_oauthlib  # noqa: F401
	            import googleapiclient  # noqa: F401
	        except ImportError as err:
	            # Chain the original error so the missing module stays visible.
	            raise ImportError(
	                "`google_auth_oauthlib`, `googleapiclient` and `google` "
	                "must be installed to use the GoogleDocsReader.\n"
	                "Please run `pip install --upgrade google-api-python-client "
	                "google-auth-httplib2 google-auth-oauthlib`."
	            ) from err

	    def load_data(self, document_ids: List[str]) -> List[Document]:
	        """Load the text of each requested Google Doc.

	        Args:
	            document_ids (List[str]): a list of document ids.

	        Returns:
	            One Document per id, in input order; each carries its id under
	            the "document_id" key of `extra_info`.

	        Raises:
	            ValueError: if `document_ids` is None.
	        """
	        if document_ids is None:
	            raise ValueError('Must specify a "document_ids" in `load_kwargs`.')

	        return [
	            Document(
	                self._load_doc(document_id),
	                extra_info={"document_id": document_id},
	            )
	            for document_id in document_ids
	        ]

	    def _load_doc(self, document_id: str) -> str:
	        """Load a document from Google Docs.

	        Args:
	            document_id: the document id.

	        Returns:
	            The document text.
	        """
	        import googleapiclient.discovery as discovery

	        credentials = self._get_credentials()
	        docs_service = discovery.build("docs", "v1", credentials=credentials)
	        doc = docs_service.documents().get(documentId=document_id).execute()
	        doc_content = doc.get("body").get("content")
	        return self._read_structural_elements(doc_content)

	    def _get_credentials(self) -> Any:
	        """Get valid user credentials from storage.

	        The file token.json stores the user's access and refresh tokens, and is
	        created automatically when the authorization flow completes for the
	        first time. Both token.json and credentials.json are opened by relative
	        path, so they are resolved against the current working directory.

	        Returns:
	            Credentials, the obtained credential.
	        """
	        from google.auth.transport.requests import Request
	        from google.oauth2.credentials import Credentials
	        from google_auth_oauthlib.flow import InstalledAppFlow

	        creds = None
	        if os.path.exists("token.json"):
	            creds = Credentials.from_authorized_user_file("token.json", SCOPES)
	        # If there are no (valid) credentials available, let the user log in.
	        if not creds or not creds.valid:
	            if creds and creds.expired and creds.refresh_token:
	                creds.refresh(Request())
	            else:
	                flow = InstalledAppFlow.from_client_secrets_file(
	                    "credentials.json", SCOPES
	                )
	                creds = flow.run_local_server(port=0)
	            # Save the credentials for the next run
	            with open("token.json", "w") as token:
	                token.write(creds.to_json())

	        return creds

	    def _read_paragraph_element(self, element: Any) -> str:
	        """Return the text in the given ParagraphElement.

	        Args:
	            element: a ParagraphElement from a Google Doc.

	        Returns:
	            The "content" of the element's "textRun", or "" when the element
	            has no text run or the text run has no content.
	        """
	        text_run = element.get("textRun")
	        if not text_run:
	            return ""
	        # Never hand back None: callers concatenate the result as a string.
	        return text_run.get("content") or ""

	    def _read_structural_elements(self, elements: List[Any]) -> str:
	        """Recurse through a list of Structural Elements.

	        Read a document's text where text may be in nested elements.
	        Elements that are not a paragraph, table or table of contents
	        contribute no text.

	        Args:
	            elements: a list of Structural Elements; None is treated as empty.

	        Returns:
	            The concatenated text of all elements, in document order.
	        """
	        # Collect fragments and join once instead of growing a string with +=.
	        parts: List[str] = []
	        for value in elements or []:
	            if "paragraph" in value:
	                # Separate name so the `elements` parameter is not rebound
	                # while it is being iterated.
	                paragraph_elements = value.get("paragraph").get("elements") or []
	                parts.extend(
	                    self._read_paragraph_element(elem)
	                    for elem in paragraph_elements
	                )
	            elif "table" in value:
	                # The text in table cells are in nested Structural Elements
	                # and tables may be nested.
	                table = value.get("table")
	                for row in table.get("tableRows") or []:
	                    for cell in row.get("tableCells") or []:
	                        parts.append(
	                            self._read_structural_elements(cell.get("content"))
	                        )
	            elif "tableOfContents" in value:
	                # The text in the TOC is also in a Structural Element.
	                toc = value.get("tableOfContents")
	                parts.append(self._read_structural_elements(toc.get("content")))
	        return "".join(parts)


	if __name__ == "__main__":
	    # Manual smoke test: load one hard-coded document and log the result.
	    # The root logger defaults to WARNING, so INFO must be enabled explicitly
	    # or the logging.info call below emits nothing.
	    logging.basicConfig(level=logging.INFO)
	    reader = GoogleDocsReader()
	    logging.info(
	        reader.load_data(document_ids=["11ctUj_tEf5S8vs_dk8_BNi-Zk8wW5YFhXkKqtmU_4B8"])
	    )