import json
import os

import gspread
from google.oauth2.service_account import Credentials
from utils_groupclassification.research_html_hybrid import research_html_hybrid
from utils_groupclassification.research_pdf_hybrid import research_pdf_hybrid
from utils_groupclassification.check_openai import co
from src.myLogger import set_logger
# from utils.save_xlsx import save_google_spreadsheet
logger = set_logger("my_app", level="INFO")
def _init_client(auth_path):
    """Build an authorized gspread client from a service-account key file."""
    with open(auth_path, "r") as json_open:
        service_account_key = json.load(json_open)
    credentials = Credentials.from_service_account_info(service_account_key)
    scoped_credentials = credentials.with_scopes(
        [
            "https://spreadsheets.google.com/feeds",
            "https://www.googleapis.com/auth/drive",
        ]
    )
    client = gspread.authorize(scoped_credentials)
    return client
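# A minimal usage sketch for _init_client (the key path and spreadsheet key are
# hypothetical, not from this repo); open_by_key/sheet1/append_row are standard
# gspread calls:
#     client = _init_client("./service_account.json")
#     sheet = client.open_by_key("YOUR_SPREADSHEET_KEY").sheet1
#     sheet.append_row(["Example Co., Ltd.", "Group 5"])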
def write_gspread(company_information):
    # Thin wrapper kept for callers expecting a spreadsheet writer; delegates to write_xlsx.
    return write_xlsx(company_information)
def group_change(answer, group):
    """Update the running group with the precedence Group 3 > Group 1-1 > Group 5."""
    if answer == 'Group 1-1' and group == 'Group 5':
        group = 'Group 1-1'
    elif answer == 'Group 3' and group in ('Group 1-1', 'Group 5'):
        group = 'Group 3'
    return group
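# Worked examples of the precedence encoded above (derived from the branches):
#     group_change('Group 1-1', 'Group 5')  -> 'Group 1-1'
#     group_change('Group 3', 'Group 1-1')  -> 'Group 3'
#     group_change('Group 2', 'Group 5')    -> 'Group 5'  (no rule fires)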
def write_xlsx(company_name) -> tuple[str, list[str], list[str], list[str]]:
    """Classify the URLs in output.json for a company and return
    (group, related_url, unrelated_url, other_url)."""
    base_path = "./gspread"
    os.makedirs(base_path, exist_ok=True)
    with open("./output.json") as f:
        config = json.load(f)
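    # Assumption (not verified in this repo): output.json is a flat JSON array
    # of URL strings, e.g. ["https://example.com/ir.pdf", "https://example.com/about"],
    # since it is iterated directly into input_urls below.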
    input_urls = list(config)
    related_url = []
    unrelated_url = []
    other_url = []
    group = 'Group 5'
    logger.info(f"urls: {input_urls}")
    for url in input_urls:
        logger.info(f"company_name: {company_name}, url: {url}")
        try:
            # Strip stray quote characters left around the URL
            if url.endswith("'"):
                url = url[:-1]
            if url.endswith("']"):
                url = url[:-2]
            # Branch on whether the URL points to a PDF or an HTML page
            if url.endswith(".pdf"):
                logger.info(f"pdf: {url}")
                research = research_pdf_hybrid
            else:
                logger.info(f"html: {url}")
                research = research_html_hybrid
            # Use co() to judge whether the content at the URL matches the company name
            judge, reason = co(company_name, url)
            logger.info(f"judge: {judge}, reason: {reason}")
            if judge == 1:
                # Match: run the hybrid research and fold the result into the group
                logger.info(research.__name__)
                answer = research(url, company_name)
                group = group_change(answer, group)
                related_url.append(url)
            elif judge == 0:
                # No match: keep the judge's reason as the answer
                logger.info(f"reason: {reason}")
                answer = reason
                unrelated_url.append(url)
            elif judge == -1:
                # Fetch failed: record that the URL could not be retrieved
                logger.debug("Could not retrieve information from the URL")
                answer = "Could not retrieve information from the URL"
                other_url.append(url)
        except Exception as e:
            logger.error(f"Error: {e}")
            answer = ""
            other_url.append(url)
    logger.info(f'Group: {group}')
    return group, related_url, unrelated_url, other_url
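# A minimal invocation sketch, assuming output.json exists alongside this
# script; the company name is a placeholder, not from this repo:
if __name__ == "__main__":
    group, related, unrelated, other = write_xlsx("Example Co., Ltd.")
    logger.info(
        f"group={group}, related={len(related)}, "
        f"unrelated={len(unrelated)}, other={len(other)}"
    )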