# NOTE: the three lines below were a Hugging Face Spaces page-scrape artifact
# ("Spaces: Runtime error"); converted to comments so the module parses.
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/roatienza/deep-text-recognition-benchmark/blob/master/modules/vitstr.py
"""
import numpy as np

import paddle
import paddle.nn as nn

from ppocr.modeling.backbones.rec_svtrnet import (Block, PatchEmbed, ones_,
                                                  trunc_normal_, zeros_)

# Default [embed_dim, num_heads] for each named model scale; used by
# ViTSTR when `embed_dim` / `num_heads` are not given explicitly.
scale_dim_heads = {'tiny': [192, 3], 'small': [384, 6], 'base': [768, 12]}
class ViTSTR(nn.Layer):
    """ViTSTR: a plain Vision-Transformer encoder used as a scene-text
    recognition backbone.

    The image is split into patches, a learnable class token is prepended,
    learned positional embeddings are added, and the sequence is run through
    ``depth`` transformer blocks.  The first ``seqlen`` output tokens are
    kept as the per-character feature sequence.

    Args:
        img_size (list): input image size as ``[H, W]``.
        in_channels (int): number of input image channels.
        scale (str): model-size key into ``scale_dim_heads``
            (``'tiny'`` / ``'small'`` / ``'base'``); only consulted when
            ``embed_dim`` or ``num_heads`` is ``None``.
        seqlen (int): number of leading output tokens to keep.
        patch_size (list): patch size as ``[H, W]``.
        embed_dim (int, optional): embedding dim; defaults from ``scale``.
        depth (int): number of transformer blocks.
        num_heads (int, optional): attention heads; defaults from ``scale``.
        mlp_ratio (int): hidden/embed ratio of each block's MLP.
        qkv_bias (bool): whether the QKV projections carry a bias.
        qk_scale (float, optional): override for the attention scale factor.
        drop_path_rate (float): maximum stochastic-depth rate; increased
            linearly from 0 across the blocks.
        drop_rate (float): dropout rate after positional embedding / in MLPs.
        attn_drop_rate (float): dropout rate on attention weights.
        norm_layer (str): Python expression for the norm layer class,
            e.g. ``'nn.LayerNorm'``.
        act_layer (str): Python expression for the activation class,
            e.g. ``'nn.GELU'``.
        epsilon (float): LayerNorm epsilon.
        out_channels (int, optional): reported feature dim; defaults to
            ``embed_dim``.
    """

    def __init__(self,
                 img_size=[224, 224],
                 in_channels=1,
                 scale='tiny',
                 seqlen=27,
                 patch_size=[16, 16],
                 embed_dim=None,
                 depth=12,
                 num_heads=None,
                 mlp_ratio=4,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_path_rate=0.,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 norm_layer='nn.LayerNorm',
                 act_layer='nn.GELU',
                 epsilon=1e-6,
                 out_channels=None,
                 **kwargs):
        super().__init__()
        self.seqlen = seqlen
        # Resolve dims/heads from the named scale unless given explicitly.
        embed_dim = embed_dim if embed_dim is not None else scale_dim_heads[
            scale][0]
        num_heads = num_heads if num_heads is not None else scale_dim_heads[
            scale][1]
        out_channels = out_channels if out_channels is not None else embed_dim
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            in_channels=in_channels,
            embed_dim=embed_dim,
            patch_size=patch_size,
            mode='linear')
        num_patches = self.patch_embed.num_patches
        # +1 position for the prepended class token.
        self.pos_embed = self.create_parameter(
            shape=[1, num_patches + 1, embed_dim], default_initializer=zeros_)
        self.add_parameter("pos_embed", self.pos_embed)
        self.cls_token = self.create_parameter(
            shape=[1, 1, embed_dim], default_initializer=zeros_)
        self.add_parameter("cls_token", self.cls_token)
        self.pos_drop = nn.Dropout(p=drop_rate)
        # Stochastic-depth rate grows linearly from 0 to drop_path_rate.
        dpr = np.linspace(0, drop_path_rate, depth)
        # SECURITY NOTE: act_layer / norm_layer are eval'd as Python
        # expressions (config-level contract in PaddleOCR).  Only pass
        # trusted, config-controlled strings here.
        self.blocks = nn.LayerList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=eval(act_layer),
                epsilon=epsilon,
                prenorm=False) for i in range(depth)
        ])
        self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)
        self.out_channels = out_channels

        trunc_normal_(self.pos_embed)
        trunc_normal_(self.cls_token)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal init for Linear weights (zero bias);
        LayerNorm gets weight=1, bias=0."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            # Redundant re-check of isinstance removed: we are already
            # inside the nn.Linear branch.
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward_features(self, x):
        """Embed patches, prepend [CLS], add positions, run the blocks.

        Returns a (B, num_patches + 1, embed_dim) token sequence.
        """
        B = x.shape[0]
        x = self.patch_embed(x)
        # Broadcast the single learnable class token over the batch.
        cls_tokens = paddle.tile(self.cls_token, repeat_times=[B, 1, 1])
        x = paddle.concat((cls_tokens, x), axis=1)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        return x

    def forward(self, x):
        """Return features shaped (B, embed_dim, 1, seqlen) for the head."""
        x = self.forward_features(x)
        # Keep only the first `seqlen` tokens ([CLS] + text positions).
        x = x[:, :self.seqlen]
        # (B, L, C) -> (B, C, L) -> (B, C, 1, L) to mimic a conv feature map.
        return x.transpose([0, 2, 1]).unsqueeze(2)