import pickle
from abc import ABC, abstractmethod

import numpy as np
from pandas import DataFrame
from transformers import BertJapaneseTokenizer, BertModel


class AmbiguousSearchBackend(ABC):
    @abstractmethod
    def submit(self, query: str) -> DataFrame:
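        """Return search results for `query` as a DataFrame with
        類似度 (similarity), 名前 (name), and 説明 (description) columns."""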
        pass


class DummyAmbiguousSearchBackend(AmbiguousSearchBackend):
    def submit(self, query: str) -> DataFrame:
        return DataFrame(
            {
                "類似度": [1, 0.9, 0.8, 0.7],
                "名前": ["A", "B", "C", "D"],
                "説明": ["a", "b", "c", "d"],
            }
        )


class SBAmbiguousSearchBackend(AmbiguousSearchBackend):
    def __init__(self):
        super().__init__()
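        # Load the pickled gadget database; it is expected to provide
        # "feature_matrix_s", "name_s", and "description_s" entries.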
        with open("./himitsudogu_db.pkl", "rb") as file:
            self.himitsudogu_db: dict = pickle.load(file)
        self.feature_matrix = self.himitsudogu_db["feature_matrix_s"][
            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
        ]
        # Use the sonoisa/sentence-bert-base-ja-mean-tokens-v2 model for encoding
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(
            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
        )
        self.model = BertModel.from_pretrained(
            "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
        )

    def submit(self, query: str) -> DataFrame:
        # Tokenize the query into a sequence of token IDs
        tokenized = self.tokenizer(query, return_tensors="pt")
        # Run the token IDs through the language model
        model_output = self.model(**tokenized)
        # Use the pooler output as the query's feature vector
        pooler_output = model_output["pooler_output"]
        query_feature_vector = pooler_output[0].detach().numpy()
        query_feature_unit_vector = query_feature_vector / np.linalg.norm(
            query_feature_vector
        )
        # Dot product with each gadget description's feature vector
        # (this equals cosine similarity when the stored vectors are unit-normalized)
        cs_s = self.feature_matrix @ query_feature_unit_vector
        # Rank gadgets in descending order of similarity
        ranked_index_s = np.argsort(cs_s)[::-1]
        result = DataFrame(columns=["類似度", "名前", "説明"])
        for rank, i in enumerate(ranked_index_s[:20], 1):
            result.loc[rank] = [
                cs_s[i],
                self.himitsudogu_db["name_s"][i],
                self.himitsudogu_db["description_s"][i],
            ]
        return result
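

# Minimal usage sketch (illustrative, not part of the original module):
# DummyAmbiguousSearchBackend works offline, while SBAmbiguousSearchBackend
# assumes ./himitsudogu_db.pkl exists and downloads the pretrained model.
if __name__ == "__main__":
    backend: AmbiguousSearchBackend = DummyAmbiguousSearchBackend()
    results = backend.submit("空を飛びたい")
    print(results.head())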