import csv
import datetime
import json
import os

import pandas as pd
import gspread
from google.oauth2.service_account import Credentials

from utils_groupclassification.research_html_hybrid import research_html_hybrid
from utils_groupclassification.research_pdf_hybrid import research_pdf_hybrid
from utils_groupclassification.check_openai import co
from src.myLogger import set_logger

# from utils.save_xlsx import save_google_spreadsheet

logger = set_logger("my_app", level="INFO")


def _init_client(auth_path):
    """Build an authorized gspread client from a service-account key file.

    Args:
        auth_path: Path to a Google service-account JSON key file.

    Returns:
        An authorized ``gspread.Client``.
    """
    # Context manager ensures the key file is closed (the original leaked
    # the open file handle).
    with open(auth_path, "r") as json_open:
        service_account_key = json.load(json_open)
    credentials = Credentials.from_service_account_info(service_account_key)
    scoped_credentials = credentials.with_scopes(
        [
            "https://spreadsheets.google.com/feeds",
            "https://www.googleapis.com/auth/drive",
        ]
    )
    return gspread.authorize(scoped_credentials)


def write_gspread(company_information):
    """Backward-compatible alias; delegates directly to ``write_xlsx``."""
    return write_xlsx(company_information)


def group_change(answer, group):
    """Update the running group classification with a new per-URL answer.

    Priority rules: 'Group 1-1' upgrades only from 'Group 5';
    'Group 3' overrides both 'Group 1-1' and 'Group 5'.
    Any other combination leaves ``group`` unchanged.

    Args:
        answer: Group label returned by a research helper for one URL.
        group: Current accumulated group label.

    Returns:
        The (possibly updated) group label.
    """
    if answer == 'Group 1-1' and group == 'Group 5':
        group = 'Group 1-1'
    elif answer == 'Group 3' and (group == 'Group 1-1' or group == 'Group 5'):
        group = 'Group 3'
    return group


def write_xlsx(company_name) -> tuple[str, list, list, list]:
    """Classify the URLs listed in ./output.json against *company_name*.

    Each URL is judged for relevance with ``co``; relevant pages are
    analysed by the pdf- or html-specific hybrid research helper and the
    overall group label is folded in via ``group_change``.

    Args:
        company_name: Company name used for the relevance judgement.

    Returns:
        ``(group, related_url, unrelated_url, other_url)`` — the final
        group label plus the URLs bucketed by judgement result.
        (The original annotation ``-> list[list]`` was incorrect: the
        function returns a 4-tuple, which is preserved here.)
    """
    base_path = "./gspread"
    os.makedirs(base_path, exist_ok=True)

    with open("./output.json") as f:
        config = json.load(f)

    input_urls = list(config)
    related_url = []
    unrelated_url = []
    other_url = []
    group = 'Group 5'  # lowest-priority default; only upgraded by group_change
    logger.info(f"urls: {input_urls}")

    for url in input_urls:
        logger.info(f"company_name: {company_name}, url: {url}")
        try:
            # Strip stray quote characters left around the URL
            # (leftovers from a serialized list-of-strings).
            if url.endswith("'"):
                url = url[:-1]
            if url.endswith("']"):
                url = url[:-2]

            # Log the content type first, matching the original log order.
            if url.endswith(".pdf"):
                logger.info(f"pdf: {url}")
            else:
                logger.info(f"html: {url}")

            # Judge whether the page belongs to the company.
            judge, reason = co(company_name, url)
            logger.info(f"judge: {judge}, reason: {reason}")

            if judge == 1:
                # Relevant: run the format-specific hybrid research.
                # (The pdf/html branches were duplicated in the original;
                # only the helper call differs, so they are merged here.)
                if url.endswith(".pdf"):
                    logger.info("research_pdf_hybrid")
                    answer = research_pdf_hybrid(url, company_name)
                else:
                    logger.info("research_html_hybrid")
                    answer = research_html_hybrid(url, company_name)
                group = group_change(answer, group)
                related_url.append(url)
            elif judge == 0:
                # Not relevant: keep the model's reason in the log.
                logger.info(f"reason: {reason}")
                unrelated_url.append(url)
            elif judge == -1:
                # Page could not be fetched.
                logger.debug("url先の情報が取得できません")
                other_url.append(url)
        except Exception as e:
            # Best-effort loop: one bad URL must not abort the batch.
            logger.error(f"Error: {e}")
            other_url.append(url)

    logger.info(f'Group: {group}')
    return group, related_url, unrelated_url, other_url