import pickle
from abc import ABC, abstractmethod

import numpy as np
from pandas import DataFrame
from transformers import BertJapaneseTokenizer, BertModel


class AmbiguousSearchBackend(ABC):
    """Interface for a fuzzy-search backend: a query in, a ranked DataFrame out."""

    @abstractmethod
    def submit(self, query: str) -> DataFrame:
        pass


class DummyAmbiguousSearchBackend(AmbiguousSearchBackend):
    """Stub backend that always returns the same fixed ranking."""

    def submit(self, query: str) -> DataFrame:
        # Columns: similarity ("類似度"), name ("名前"), description ("説明")
        return DataFrame(
            {
                "類似度": [1, 0.9, 0.8, 0.7],
                "名前": ["A", "B", "C", "D"],
                "説明": ["a", "b", "c", "d"],
            }
        )


class SBAmbiguousSearchBackend(AmbiguousSearchBackend):
    """Backend that ranks secret gadgets by Sentence-BERT feature similarity."""

    def __init__(self):
        super().__init__()
        with open("./himitsudogu_db.pkl", "rb") as file:
            self.himitsudogu_db: dict = pickle.load(file)
        self.feature_matrix = self.himitsudogu_db["feature_matrix_s"][
            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
        ]
        # Use the sonoisa/sentence-bert-base-ja-mean-tokens-v2 model
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(
            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
        )
        self.model = BertModel.from_pretrained(
            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
        )
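
    # From the lookups in submit() below, himitsudogu_db.pkl is assumed to
    # hold at least:
    #   "feature_matrix_s": dict mapping a model name to an (n_items, dim)
    #                       array with one feature vector per gadget description
    #   "name_s":           sequence of gadget names
    #   "description_s":    sequence of gadget descriptions
    # If each row of the matrix is L2-normalized (not verifiable from this
    # file alone), the matrix product in submit() yields cosine similarities.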
    def submit(self, query: str) -> DataFrame:
        # Tokenize the query sentence into a sequence of token IDs
        tokenized = self.tokenizer(query, return_tensors="pt")
        # Run the token IDs through the language model
        output = self.model(**tokenized)
        # Use the pooled output as the sentence's feature vector
        pooler_output = output["pooler_output"]
        query_feature_vector = pooler_output[0].detach().numpy()
        query_feature_unit_vector = query_feature_vector / np.linalg.norm(
            query_feature_vector
        )
        # Take the inner product with each gadget description's feature vector
        cs_s = self.feature_matrix @ query_feature_unit_vector
        # Rank the gadgets by descending inner product
        ranked_index_s = np.argsort(cs_s)[::-1]
        # Columns: similarity ("類似度"), name ("名前"), description ("説明")
        result = DataFrame(columns=["類似度", "名前", "説明"])
        for rank, i in enumerate(ranked_index_s[:20], 1):
            result.loc[rank] = [
                cs_s[i],
                self.himitsudogu_db["name_s"][i],
                self.himitsudogu_db["description_s"][i],
            ]
        return result
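

# Minimal usage sketch. Assumptions: himitsudogu_db.pkl sits next to this
# file, and the query string is only an illustrative example.
if __name__ == "__main__":
    backend: AmbiguousSearchBackend = SBAmbiguousSearchBackend()
    ranking = backend.submit("空を飛べる道具")  # "a gadget that lets you fly"
    print(ranking.head(10))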