from transformers import PreTrainedModel
from transformers.models.bert.modeling_bert import BertOnlyMLMHead

from peptriever.model.bert_embedding import BertEmbeddingConfig, BertForEmbedding


class BiEncoderConfig(BertEmbeddingConfig):
    """Bi-encoder configuration: one maximum sequence length per encoder."""

    max_length1: int
    max_length2: int


class BiEncoder(PreTrainedModel):
    """Two independent BERT encoders that embed the two inputs separately."""

    config_class = BiEncoderConfig

    def __init__(self, config: BiEncoderConfig):
        super().__init__(config)
        # Each encoder gets its own copy of the config, with the matching
        # max_length{1,2} mapped onto max_position_embeddings.
        config1 = _replace_max_length(config, "max_length1")
        self.bert1 = BertForEmbedding(config1)
        config2 = _replace_max_length(config, "max_length2")
        self.bert2 = BertForEmbedding(config2)
        self.post_init()

    def forward(self, x1, x2):
        # Embed both inputs and return the two embeddings side by side.
        y1 = self.forward1(x1)
        y2 = self.forward2(x2)
        return {"y1": y1, "y2": y2}

    def forward2(self, x2):
        y2 = self.bert2(input_ids=x2["input_ids"])
        return y2

    def forward1(self, x1):
        y1 = self.bert1(input_ids=x1["input_ids"])
        return y1


class BiEncoderWithMaskedLM(PreTrainedModel):
    """Bi-encoder with a masked-language-modeling head on top of each encoder."""

    config_class = BiEncoderConfig

    def __init__(self, config: BiEncoderConfig):
        super().__init__(config=config)
        config1 = _replace_max_length(config, "max_length1")
        self.bert1 = BertForEmbedding(config1)
        self.lm_head1 = BertOnlyMLMHead(config=config1)

        config2 = _replace_max_length(config, "max_length2")
        self.bert2 = BertForEmbedding(config2)
        self.lm_head2 = BertOnlyMLMHead(config=config2)
        self.post_init()

    def forward(self, x1, x2):
        # Sequence embeddings plus per-token hidden states; the MLM heads turn
        # the hidden states into token-level prediction scores.
        y1, state1 = self.bert1.forward_with_state(input_ids=x1["input_ids"])
        y2, state2 = self.bert2.forward_with_state(input_ids=x2["input_ids"])
        scores1 = self.lm_head1(state1)
        scores2 = self.lm_head2(state2)
        outputs = {"y1": y1, "y2": y2, "scores1": scores1, "scores2": scores2}
        return outputs


def _replace_max_length(config, length_key):
    """Copy the config, renaming `length_key` to `max_position_embeddings`."""
    c1 = config.__dict__.copy()
    c1["max_position_embeddings"] = c1.pop(length_key)
    config1 = BertEmbeddingConfig(**c1)
    return config1
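

# --- Usage sketch (illustrative addition, not part of the original module) ---
# Assumes BiEncoderConfig accepts the standard BertConfig keyword arguments
# (vocab_size, hidden_size, ...) alongside max_length1/max_length2; the exact
# fields accepted depend on BertEmbeddingConfig, so treat the values below as
# placeholders.
if __name__ == "__main__":
    import torch

    config = BiEncoderConfig(
        vocab_size=30,            # assumed vocabulary size
        hidden_size=256,
        num_hidden_layers=4,
        num_attention_heads=4,
        max_length1=128,          # positions available to the first encoder
        max_length2=512,          # positions available to the second encoder
    )
    model = BiEncoder(config)

    x1 = {"input_ids": torch.randint(0, 30, (2, 128))}
    x2 = {"input_ids": torch.randint(0, 30, (2, 512))}
    out = model(x1, x2)           # {"y1": <embeddings>, "y2": <embeddings>}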