import json
import os

import gspread
from google.oauth2.service_account import Credentials

from utils_groupclassification.research_html_hybrid import research_html_hybrid
from utils_groupclassification.research_pdf_hybrid import research_pdf_hybrid
from utils_groupclassification.check_openai import co
from src.myLogger import set_logger

# from utils.save_xlsx import save_google_spreadsheet

logger = set_logger("my_app", level="INFO")


def _init_client(auth_path):
    """Build an authorized gspread client from a service-account key file."""
    with open(auth_path, "r") as json_open:
        service_account_key = json.load(json_open)
    credentials = Credentials.from_service_account_info(service_account_key)
    scoped_credentials = credentials.with_scopes(
        [
            "https://spreadsheets.google.com/feeds",
            "https://www.googleapis.com/auth/drive",
        ]
    )
    client = gspread.authorize(scoped_credentials)
    return client
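

# A minimal usage sketch for _init_client. The key path and spreadsheet
# title below are hypothetical, not part of this module; the sheet calls
# are standard gspread API:
#
#     client = _init_client("./service_account.json")
#     sheet = client.open("my-spreadsheet").sheet1
#     sheet.update_acell("A1", "hello")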


def write_gspread(company_information):
    # Thin wrapper that delegates to write_xlsx.
    return write_xlsx(company_information)


def group_change(answer, group):
    """Promote the running group label; Group 3 outranks Group 1-1,
    which outranks the default Group 5."""
    if answer == 'Group 1-1' and group == 'Group 5':
        group = 'Group 1-1'
    elif answer == 'Group 3' and group in ('Group 1-1', 'Group 5'):
        group = 'Group 3'
    return group
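
# For example, once the group has been promoted to 'Group 3' it stays there:
#
#     group_change('Group 1-1', 'Group 5')   # -> 'Group 1-1'
#     group_change('Group 3', 'Group 1-1')   # -> 'Group 3'
#     group_change('Group 1-1', 'Group 3')   # -> 'Group 3' (unchanged)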


def write_xlsx(company_name) -> tuple[str, list[str], list[str], list[str]]:
    base_path = "./gspread"
    os.makedirs(base_path, exist_ok=True)
    # output.json is expected to hold a list of URLs to classify.
    with open("./output.json") as f:
        config = json.load(f)
    input_urls = list(config)
    related_url = []
    unrelated_url = []
    other_url = []
    group = 'Group 5'  # default group until a research result promotes it
    logger.info(f"urls: {input_urls}")
    for url in input_urls:
        logger.info(f"company_name: {company_name}, url: {url}")
        try:
            # Strip stray quote characters left around the URL.
            if url.endswith("']"):
                url = url[:-2]
            elif url.endswith("'"):
                url = url[:-1]
            # Branch on whether the URL points to a PDF or an HTML page.
            is_pdf = url.endswith(".pdf")
            logger.info(f"{'pdf' if is_pdf else 'html'}: {url}")
            # Use co() to judge whether the page content matches the company name.
            judge, reason = co(company_name, url)
            logger.info(f"judge: {judge}, reason: {reason}")
            if judge == 1:
                # Match: run the hybrid research for the detected format.
                if is_pdf:
                    logger.info("research_pdf_hybrid")
                    answer = research_pdf_hybrid(url, company_name)
                else:
                    logger.info("research_html_hybrid")
                    answer = research_html_hybrid(url, company_name)
                group = group_change(answer, group)
                related_url.append(url)
            elif judge == 0:
                # No match: keep the judge's reason as the answer.
                logger.info(f"reason: {reason}")
                answer = reason
                unrelated_url.append(url)
            elif judge == -1:
                # The page content could not be fetched.
                logger.debug("Could not fetch information from the URL")
                answer = "Could not fetch information from the URL"
                other_url.append(url)
        except Exception as e:
            logger.error(f"Error: {e}")
            answer = ""
            other_url.append(url)
    logger.info(f'Group: {group}')
    return group, related_url, unrelated_url, other_url
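

# A minimal smoke-test sketch, assuming output.json exists alongside this
# module and that check_openai.co and the research helpers are configured.
# The company name is a placeholder, not from this module:
if __name__ == "__main__":
    group, related, unrelated, other = write_xlsx("Example Corp")
    logger.info(f"group={group}, related={related}, "
                f"unrelated={unrelated}, other={other}")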