google-sheet-extractor / sheet_extractor.py
smellslikeml
initial commit
a74e89c
raw
history blame
2.66 kB
import math
import json
import regex
import inspect
import gspread
import guidance
from ast import literal_eval
from transformers import Tool
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient.discovery import build
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from ortools.linear_solver import pywraplp
guidance.llm = guidance.llms.OpenAI("gpt-4")
structure_program = guidance(
'''
{{#user~}}
{{description}}
Help me extract args from the data blob to apply the following algorithm:
{{code}}
----
{{~#each examples}}
Data Blob: {{this.input}}
Result: {{this.output}}
---
{{~/each}}
Please help me extract the input values from a given data blob into a JSON.
Data Blob: {{data_blob}}
Result:
{{~/user}}
{{#assistant~}}
{{gen 'output'}}
{{~/assistant}}
''')
class DataExtractorTool(Tool):
def __init__(self):
super().__init__()
def fetch_data(self, source_link: str) -> str:
"""
Abstract method to fetch raw data from the source.
"""
raise NotImplementedError("Subclasses must implement this method.")
def __call__(self, source_link: str) -> dict:
raw_data = self.fetch_data(source_link)
return raw_data
class GoogleSheetExtractorTool(DataExtractorTool):
name = "google_sheet_extractor_tool"
description = """
Tool to extract data from Google Sheets.
Input: source_link - str - a url or google sheets id
e.x. source_link='https://docs.google.com/spreadsheets/d/SHEETS_ID/'.
Output is a string.
"""
def fetch_data(self, source_link: str) -> str:
# Set up the credentials
scope = ["https://www.googleapis.com/auth/spreadsheets.readonly"]
creds = Credentials.from_authorized_user_file('token.json', scope)
service = build('sheets', 'v4', credentials=creds)
# Open the spreadsheet and get all values
source_link = source_link.rstrip("/")
spreadsheet_id = source_link.split('/')[-1]
range_name = 'Sheet1!A1:Z1000'
sheet = service.spreadsheets()
result = sheet.values().get(spreadsheetId=spreadsheet_id,
range=range_name).execute()
data = result.get('values', [])
# Convert the data to a string representation
raw_data = '\n'.join([','.join(row) for row in data])
print(raw_data)
return raw_data