import json
import os

import gspread
from google.oauth2.service_account import Credentials

from utils_groupclassification.research_html_hybrid import research_html_hybrid
from utils_groupclassification.research_pdf_hybrid import research_pdf_hybrid
from utils_groupclassification.check_openai import co
from src.myLogger import set_logger

# from utils.save_xlsx import save_google_spreadsheet

logger = set_logger("my_app", level="INFO")


def _init_client(auth_path):
    """Build an authorized gspread client from a service-account key file."""
    with open(auth_path, "r") as json_open:
        service_account_key = json.load(json_open)
    credentials = Credentials.from_service_account_info(service_account_key)
    scoped_credentials = credentials.with_scopes(
        [
            "https://spreadsheets.google.com/feeds",
            "https://www.googleapis.com/auth/drive",
        ]
    )
    client = gspread.authorize(scoped_credentials)
    return client
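

# A minimal usage sketch for _init_client. The key path and spreadsheet
# title below are hypothetical, not part of this module; the sheet calls
# are standard gspread API:
#
#     client = _init_client("./service_account.json")
#     sheet = client.open("my-spreadsheet").sheet1
#     sheet.update_acell("A1", "hello")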


def write_gspread(company_information):
    # Thin wrapper that delegates to write_xlsx.
    return write_xlsx(company_information)


def group_change(answer, group):
    """Promote the running group label; Group 3 outranks Group 1-1,
    which outranks the default Group 5."""
    if answer == 'Group 1-1' and group == 'Group 5':
        group = 'Group 1-1'
    elif answer == 'Group 3' and group in ('Group 1-1', 'Group 5'):
        group = 'Group 3'
    return group
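
# For example, once the group has been promoted to 'Group 3' it stays there:
#
#     group_change('Group 1-1', 'Group 5')   # -> 'Group 1-1'
#     group_change('Group 3', 'Group 1-1')   # -> 'Group 3'
#     group_change('Group 1-1', 'Group 3')   # -> 'Group 3' (unchanged)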


def write_xlsx(company_name) -> tuple[str, list[str], list[str], list[str]]:
    base_path = "./gspread"
    os.makedirs(base_path, exist_ok=True)
    # output.json is expected to hold a list of URLs to classify.
    with open("./output.json") as f:
        config = json.load(f)
    input_urls = list(config)
    related_url = []
    unrelated_url = []
    other_url = []
    group = 'Group 5'  # default group until a research result promotes it
    logger.info(f"urls: {input_urls}")
    for url in input_urls:
        logger.info(f"company_name: {company_name}, url: {url}")
        try:
            # Strip stray quote characters left around the URL.
            if url.endswith("']"):
                url = url[:-2]
            elif url.endswith("'"):
                url = url[:-1]
            # Branch on whether the URL points to a PDF or an HTML page.
            is_pdf = url.endswith(".pdf")
            logger.info(f"{'pdf' if is_pdf else 'html'}: {url}")
            # Use co() to judge whether the page content matches the company name.
            judge, reason = co(company_name, url)
            logger.info(f"judge: {judge}, reason: {reason}")
            if judge == 1:
                # Match: run the hybrid research for the detected format.
                if is_pdf:
                    logger.info("research_pdf_hybrid")
                    answer = research_pdf_hybrid(url, company_name)
                else:
                    logger.info("research_html_hybrid")
                    answer = research_html_hybrid(url, company_name)
                group = group_change(answer, group)
                related_url.append(url)
            elif judge == 0:
                # No match: keep the judge's reason as the answer.
                logger.info(f"reason: {reason}")
                answer = reason
                unrelated_url.append(url)
            elif judge == -1:
                # The page content could not be fetched.
                logger.debug("Could not fetch information from the URL")
                answer = "Could not fetch information from the URL"
                other_url.append(url)
        except Exception as e:
            logger.error(f"Error: {e}")
            answer = ""
            other_url.append(url)
    logger.info(f'Group: {group}')
    return group, related_url, unrelated_url, other_url
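

# A minimal smoke-test sketch, assuming output.json exists alongside this
# module and that check_openai.co and the research helpers are configured.
# The company name is a placeholder, not from this module:
if __name__ == "__main__":
    group, related, unrelated, other = write_xlsx("Example Corp")
    logger.info(f"group={group}, related={related}, "
                f"unrelated={unrelated}, other={other}")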