Mya-Mya committed on
Commit
f0f53b7
·
1 Parent(s): ab080f6

Create ambiguous_search_backends.py

Browse files
Files changed (1) hide show
  1. ambiguous_search_backends.py +60 -0
ambiguous_search_backends.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from pandas import DataFrame
3
+ from transformers import BertJapaneseTokenizer, BertModel
4
+ import pickle
5
+ import numpy as np
6
+
7
+
8
class AmbiguousSearchBackend(ABC):
    """Interface for backends that answer a free-text query with a result table.

    Subclasses must implement :meth:`submit`; the class itself cannot be
    instantiated because ``submit`` is abstract.
    """

    @abstractmethod
    def submit(self, query: str) -> DataFrame:
        """Run a search for ``query`` and return the results as a DataFrame.

        Args:
            query: Free-text search string supplied by the caller.

        Returns:
            A table of search results. The concrete backends that accompany
            this class use the columns "類似度" (similarity), "名前" (name)
            and "説明" (description).
        """
12
+
13
+
14
class DummyAmbiguousSearchBackend(AmbiguousSearchBackend):
    """Stand-in backend that ignores the query and returns a fixed table."""

    def submit(self, query: str) -> DataFrame:
        """Return the same four-row result table regardless of ``query``.

        Args:
            query: Accepted for interface compatibility; not used.

        Returns:
            A DataFrame with the columns "類似度" (similarity), "名前" (name)
            and "説明" (description), holding four hard-coded rows.
        """
        return DataFrame(
            {
                "類似度": [1, 0.9, 0.8, 0.7],
                "名前": ["A", "B", "C", "D"],
                "説明": ["a", "b", "c", "d"],
            }
        )
23
+
24
+
25
class SBAmbiguousSearchBackend(AmbiguousSearchBackend):
    """Backend that ranks stored items by similarity to a Sentence-BERT query vector.

    The item database is a pickled dict read at construction time. The keys
    this class reads are ``"feature_matrix_s"`` (a mapping from model name to
    a feature matrix), ``"name_s"`` and ``"description_s"``.
    """

    def __init__(
        self,
        db_path: str = "./himitsudogu_db.pkl",
        model_name: str = "sonoisa/sentence-bert-base-ja-mean-tokens-v2",
        top_k: int = 20,
    ):
        """Load the item database, the tokenizer and the language model.

        Args:
            db_path: Path of the pickled database file.
            model_name: Hugging Face model identifier; also the key used to
                select the matching feature matrix inside the database.
            top_k: Maximum number of rows returned by :meth:`submit`
                (expected to be non-negative).
        """
        super().__init__()
        # NOTE(security): pickle.load can execute arbitrary code while
        # deserialising, so db_path must only ever point at a trusted file.
        with open(db_path, "rb") as file:
            self.himitsudogu_db: dict = pickle.load(file)
        self.feature_matrix = self.himitsudogu_db["feature_matrix_s"][model_name]
        self.top_k = top_k
        # The tokenizer and the model must come from the same checkpoint.
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)

    def submit(self, query: str) -> DataFrame:
        """Return the stored items most similar to ``query``, best match first.

        Args:
            query: Free-text search string.

        Returns:
            A DataFrame with the columns "類似度" (similarity), "名前" (name)
            and "説明" (description), at most ``top_k`` rows, indexed by rank
            starting from 1.
        """
        # Morphologically analyse the text and convert it to token IDs.
        tokenized = self.tokenizer(query, return_tensors="pt")
        # Feed the token IDs to the language model.
        model_output = self.model(**tokenized)
        # Take the sentence-level feature vector of the (single) query.
        # NOTE(review): the checkpoint name says "mean-tokens" but this reads
        # pooler_output; the stored feature matrix presumably was built the
        # same way - confirm before changing the pooling here.
        pooler_output = model_output["pooler_output"]
        query_feature_vector = pooler_output[0].detach().numpy()
        norm = np.linalg.norm(query_feature_vector)
        # Guard against a zero vector: dividing by 0 would fill the scores
        # with NaN and make the ranking meaningless.
        if norm > 0:
            query_feature_unit_vector = query_feature_vector / norm
        else:
            query_feature_unit_vector = query_feature_vector
        # Dot product with every stored item's feature vector.
        # Presumably the rows are unit vectors, making this a cosine
        # similarity - TODO confirm against the code that built the database.
        cs_s = self.feature_matrix @ query_feature_unit_vector
        # Largest dot product first, truncated to the requested count.
        ranked_index_s = np.argsort(cs_s)[::-1][: self.top_k]
        name_s = self.himitsudogu_db["name_s"]
        description_s = self.himitsudogu_db["description_s"]
        # Build the table in one go instead of growing it row by row.
        return DataFrame(
            {
                "類似度": [cs_s[i] for i in ranked_index_s],
                "名前": [name_s[i] for i in ranked_index_s],
                "説明": [description_s[i] for i in ranked_index_s],
            },
            index=range(1, len(ranked_index_s) + 1),
        )