# Spaces:       (build-log lines from the original paste, kept as comments)
# Build error
# Build error
from abc import ABC, abstractmethod
from pandas import DataFrame
from transformers import BertJapaneseTokenizer, BertModel
import pickle
import numpy as np
class AmbiguousSearchBackend(ABC):
    """Interface for fuzzy ("ambiguous") search backends.

    Implementations take a free-text query and return a ranked
    DataFrame of candidate matches with columns "類似度" (similarity),
    "名前" (name), and "説明" (description) — the schema used by the
    concrete backends in this file.
    """

    @abstractmethod
    def submit(self, query: str) -> DataFrame:
        """Return ranked search results for *query* as a DataFrame.

        Marked abstract so the ABC actually enforces the interface;
        `abstractmethod` was imported but previously unused, leaving
        this a silent no-op on the base class.
        """
        ...
class DummyAmbiguousSearchBackend(AmbiguousSearchBackend):
    """Stub backend returning a fixed result set, for wiring/UI tests."""

    def submit(self, query: str) -> DataFrame:
        """Return four canned rows; *query* is ignored entirely."""
        similarities = [1, 0.9, 0.8, 0.7]
        names = ["A", "B", "C", "D"]
        descriptions = ["a", "b", "c", "d"]
        return DataFrame(
            {"類似度": similarities, "名前": names, "説明": descriptions}
        )
class SBAmbiguousSearchBackend(AmbiguousSearchBackend):
    """Sentence-BERT-backed fuzzy search over the himitsudogu database.

    Embeds the query with a Japanese Sentence-BERT model and ranks the
    stored gadget descriptions by dot product between the query's unit
    embedding and the precomputed description embedding matrix.
    """

    # Single source of truth for the model id (previously repeated 3x).
    _MODEL_NAME = "sonoisa/sentence-bert-base-ja-mean-tokens-v2"

    def __init__(self):
        super().__init__()
        # NOTE(review): pickle.load is only safe because this .pkl ships
        # with the application; never point this at untrusted data.
        with open("./himitsudogu_db.pkl", "rb") as file:
            self.himitsudogu_db: dict = pickle.load(file)
        # Precomputed embedding matrix for every gadget description,
        # keyed by model id. Presumably rows are unit-normalised so the
        # dot product in submit() is a cosine similarity — TODO confirm.
        self.feature_matrix = self.himitsudogu_db["feature_matrix_s"][
            self._MODEL_NAME
        ]
        # Tokenizer and encoder for the same Sentence-BERT model.
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(self._MODEL_NAME)
        self.model = BertModel.from_pretrained(self._MODEL_NAME)

    def submit(self, query: str) -> DataFrame:
        """Return the (up to) 20 gadgets most similar to *query*.

        The result is indexed by 1-based rank and has columns
        "類似度" (similarity), "名前" (name), "説明" (description).
        """
        # Tokenise the query into model input IDs.
        tokenized = self.tokenizer(query, return_tensors="pt")
        # Run the language model (renamed from `output`, which was later
        # shadowed by the result DataFrame in the original code).
        model_output = self.model(**tokenized)
        # Sentence-level feature vector, normalised to unit length so
        # the dot products below are scale-independent.
        query_vector = model_output["pooler_output"][0].detach().numpy()
        query_unit_vector = query_vector / np.linalg.norm(query_vector)
        # Similarity of the query against every stored description.
        cs_s = self.feature_matrix @ query_unit_vector
        # Description indices, most similar first; keep the top 20.
        top = np.argsort(cs_s)[::-1][:20]
        # Build all columns in one pass instead of inserting row-by-row
        # with DataFrame.loc (each such insert is O(rows)).
        return DataFrame(
            {
                "類似度": [cs_s[i] for i in top],
                "名前": [self.himitsudogu_db["name_s"][i] for i in top],
                "説明": [self.himitsudogu_db["description_s"][i] for i in top],
            },
            index=range(1, len(top) + 1),
        )