Spaces:
Runtime error
Runtime error
File size: 5,619 Bytes
8a58cf3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
"""Google docs reader."""
import logging
import os
from typing import Any, List
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
# OAuth scopes requested when authorizing: read-only access to Google Docs
# documents. Passed both when loading a stored token and when running the
# interactive authorization flow.
SCOPES = ["https://www.googleapis.com/auth/documents.readonly"]
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class GoogleDocsReader(BaseReader):
    """Google Docs reader.

    Reads the text of Google Docs documents through the Google Docs API,
    using the read-only scope listed in ``SCOPES``.
    """

    def __init__(self) -> None:
        """Check that the Google client libraries can be imported.

        Raises:
            ImportError: if `google`, `google_auth_oauthlib` or
                `googleapiclient` cannot be imported.
        """
        try:
            import google  # noqa: F401
            import google_auth_oauthlib  # noqa: F401
            import googleapiclient  # noqa: F401
        except ImportError as err:
            # Chain the original error so the missing package stays visible
            # in the traceback.
            raise ImportError(
                "`google_auth_oauthlib`, `googleapiclient` and `google` "
                "must be installed to use the GoogleDocsReader.\n"
                "Please run `pip install --upgrade google-api-python-client "
                "google-auth-httplib2 google-auth-oauthlib`."
            ) from err

    def load_data(self, document_ids: List[str]) -> List[Document]:
        """Load the text of each requested Google Docs document.

        Args:
            document_ids (List[str]): a list of document ids.

        Returns:
            One Document per id, in input order. Each Document carries the
            extracted text and ``{"document_id": <id>}`` as ``extra_info``.

        Raises:
            ValueError: if ``document_ids`` is None.
        """
        if document_ids is None:
            raise ValueError('Must specify a "document_ids" in `load_kwargs`.')

        return [
            Document(
                self._load_doc(document_id),
                extra_info={"document_id": document_id},
            )
            for document_id in document_ids
        ]

    def _load_doc(self, document_id: str) -> str:
        """Load a document from Google Docs.

        Args:
            document_id: the document id.

        Returns:
            The document text. An empty string when the API response has no
            body content.
        """
        import googleapiclient.discovery as discovery

        credentials = self._get_credentials()
        docs_service = discovery.build("docs", "v1", credentials=credentials)
        doc = docs_service.documents().get(documentId=document_id).execute()
        # Tolerate a missing or null "body"/"content" instead of failing with
        # an AttributeError/TypeError on None.
        doc_content = (doc.get("body") or {}).get("content") or []
        return self._read_structural_elements(doc_content)

    def _get_credentials(self) -> Any:
        """Get valid user credentials from storage.

        The file token.json stores the user's access and refresh tokens, and
        is created automatically when the authorization flow completes for
        the first time. Both token.json and credentials.json are resolved
        relative to the current working directory.

        Returns:
            Credentials, the obtained credential.
        """
        from google.auth.transport.requests import Request
        from google.oauth2.credentials import Credentials
        from google_auth_oauthlib.flow import InstalledAppFlow

        creds = None
        if os.path.exists("token.json"):
            creds = Credentials.from_authorized_user_file("token.json", SCOPES)

        # If there are no (valid) credentials available, let the user log in.
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    "credentials.json", SCOPES
                )
                creds = flow.run_local_server(port=0)
            # Save the credentials for the next run. Only done when they were
            # just refreshed or newly obtained; a still-valid stored token is
            # left untouched.
            with open("token.json", "w") as token:
                token.write(creds.to_json())

        return creds

    def _read_paragraph_element(self, element: Any) -> str:
        """Return the text in the given ParagraphElement.

        Args:
            element: a ParagraphElement from a Google Doc.

        Returns:
            The content of the element's text run, or an empty string when
            the element has no text run or the text run has no content.
        """
        text_run = element.get("textRun")
        if not text_run:
            return ""
        # Never hand None back: callers concatenate the result.
        return text_run.get("content") or ""

    def _read_structural_elements(self, elements: List[Any]) -> str:
        """Recurse through a list of Structural Elements.

        Read a document's text where text may be in nested elements.

        Args:
            elements: a list of Structural Elements.

        Returns:
            The concatenated text of all paragraphs, table cells and table
            of contents entries, in document order. Other element kinds are
            skipped.
        """
        parts: List[str] = []
        for value in elements:
            if "paragraph" in value:
                paragraph = value.get("paragraph") or {}
                for elem in paragraph.get("elements") or []:
                    parts.append(self._read_paragraph_element(elem))
            elif "table" in value:
                # The text in table cells are in nested Structural Elements
                # and tables may be nested.
                table = value.get("table") or {}
                for row in table.get("tableRows") or []:
                    for cell in row.get("tableCells") or []:
                        parts.append(
                            self._read_structural_elements(
                                cell.get("content") or []
                            )
                        )
            elif "tableOfContents" in value:
                # The text in the TOC is also in a Structural Element.
                toc = value.get("tableOfContents") or {}
                parts.append(
                    self._read_structural_elements(toc.get("content") or [])
                )
        return "".join(parts)
if __name__ == "__main__":
    # The root logger defaults to WARNING, so without explicit configuration
    # the logging.info call below would emit nothing.
    logging.basicConfig(level=logging.INFO)
    reader = GoogleDocsReader()
    logging.info(
        reader.load_data(document_ids=["11ctUj_tEf5S8vs_dk8_BNi-Zk8wW5YFhXkKqtmU_4B8"])
    )
|