first commit
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- LICENSE +427 -0
- +38 -0
- annotator/openpose/ +73 -0
- annotator/openpose/ +219 -0
- annotator/openpose/ +86 -0
- annotator/openpose/ +219 -0
- annotator/openpose/ +163 -0
- annotator/segm/ +162 -0
- annotator/segm/modules/ +5 -0
- annotator/segm/modules/ +132 -0
- annotator/segm/modules/ +84 -0
- annotator/segm/modules/ +42 -0
- annotator/segm/modules/ +244 -0
- annotator/segm/modules/ +21 -0
- annotator/segm/modules/ +182 -0
- annotator/segm/modules/src/checks.h +15 -0
- annotator/segm/modules/src/inplace_abn.cpp +95 -0
- annotator/segm/modules/src/inplace_abn.h +88 -0
- annotator/segm/modules/src/inplace_abn_cpu.cpp +119 -0
- annotator/segm/modules/src/ +333 -0
- annotator/segm/modules/src/ +275 -0
- annotator/segm/modules/src/utils/checks.h +15 -0
- annotator/segm/modules/src/utils/common.h +49 -0
- annotator/segm/modules/src/utils/cuda.cuh +71 -0
- annotator/segm/networks/ +337 -0
- annotator/segm/networks/ +13 -0
- annotator/segm/networks/backbone/ +156 -0
- annotator/segm/networks/backbone/ +205 -0
- annotator/segm/networks/backbone/ +149 -0
- annotator/segm/networks/context_encoding/ +64 -0
- annotator/segm/networks/context_encoding/ +226 -0
- annotator/segm/networks/context_encoding/ +48 -0
- annotator/segm/ +167 -0
- annotator/ +49 -0
- +475 -0
- app_files/default_images/mask.png +0 -0
- app_files/default_images/pose.png +0 -0
- app_files/default_images/ref.png +0 -0
- app_files/samples/pose/MEN/full_1.png +0 -0
- app_files/samples/pose/MEN/full_2.png +0 -0
- app_files/samples/pose/MEN/half_back.png +0 -0
- app_files/samples/pose/MEN/half_front.png +0 -0
- app_files/samples/pose/MEN/half_left.png +0 -0
- app_files/samples/pose/WOMEN/pose_0.png +0 -0
- app_files/samples/pose/WOMEN/pose_1.png +0 -0
- app_files/samples/pose/WOMEN/pose_2.png +0 -0
- app_files/samples/pose/WOMEN/pose_3.png +0 -0
- app_files/samples/pose/WOMEN/pose_4.png +0 -0
- app_files/samples/pose/WOMEN/pose_5.png +0 -0
- app_files/samples/pose/WOMEN/pose_6.png +0 -0
@@ -0,0 +1,427 @@
1 |
2 |
Attribution-ShareAlike 4.0 International
3 |
4 |
5 |
Creative Commons Corporation ("Creative Commons") is not a law firm and
6 |
does not provide legal services or legal advice. Distribution of
7 |
Creative Commons public licenses does not create a lawyer-client or
8 |
other relationship. Creative Commons makes its licenses and related
9 |
information available on an "as-is" basis. Creative Commons gives no
10 |
warranties regarding its licenses, any material licensed under their
11 |
terms and conditions, or any related information. Creative Commons
12 |
disclaims all liability for damages resulting from their use to the
13 |
fullest extent possible.
14 |
15 |
Using Creative Commons Public Licenses
16 |
17 |
Creative Commons public licenses provide a standard set of terms and
18 |
conditions that creators and other rights holders may use to share
19 |
original works of authorship and other material subject to copyright
20 |
and certain other rights specified in the public license below. The
21 |
following considerations are for informational purposes only, are not
22 |
exhaustive, and do not form part of our licenses.
23 |
24 |
Considerations for licensors: Our public licenses are
25 |
intended for use by those authorized to give the public
26 |
permission to use material in ways otherwise restricted by
27 |
copyright and certain other rights. Our licenses are
28 |
irrevocable. Licensors should read and understand the terms
29 |
and conditions of the license they choose before applying it.
30 |
Licensors should also secure all rights necessary before
31 |
applying our licenses so that the public can reuse the
32 |
material as expected. Licensors should clearly mark any
33 |
material not subject to the license. This includes other CC-
34 |
licensed material, or material used under an exception or
35 |
limitation to copyright. More considerations for licensors:
36 |
37 |
38 |
Considerations for the public: By using one of our public
39 |
licenses, a licensor grants the public permission to use the
40 |
licensed material under specified terms and conditions. If
41 |
the licensor's permission is not necessary for any reason--for
42 |
example, because of any applicable exception or limitation to
43 |
copyright--then that use is not regulated by the license. Our
44 |
licenses grant only permissions under copyright and certain
45 |
other rights that a licensor has authority to grant. Use of
46 |
the licensed material may still be restricted for other
47 |
reasons, including because others have copyright or other
48 |
rights in the material. A licensor may make special requests,
49 |
such as asking that all changes be marked or described.
50 |
Although not required by our licenses, you are encouraged to
51 |
respect those requests where reasonable. More considerations
52 |
for the public:
53 |
54 |
55 |
56 |
57 |
Creative Commons Attribution-ShareAlike 4.0 International Public
58 |
59 |
60 |
By exercising the Licensed Rights (defined below), You accept and agree
61 |
to be bound by the terms and conditions of this Creative Commons
62 |
Attribution-ShareAlike 4.0 International Public License ("Public
63 |
License"). To the extent this Public License may be interpreted as a
64 |
contract, You are granted the Licensed Rights in consideration of Your
65 |
acceptance of these terms and conditions, and the Licensor grants You
66 |
such rights in consideration of benefits the Licensor receives from
67 |
making the Licensed Material available under these terms and
68 |
69 |
70 |
71 |
Section 1 -- Definitions.
72 |
73 |
a. Adapted Material means material subject to Copyright and Similar
74 |
Rights that is derived from or based upon the Licensed Material
75 |
and in which the Licensed Material is translated, altered,
76 |
arranged, transformed, or otherwise modified in a manner requiring
77 |
permission under the Copyright and Similar Rights held by the
78 |
Licensor. For purposes of this Public License, where the Licensed
79 |
Material is a musical work, performance, or sound recording,
80 |
Adapted Material is always produced where the Licensed Material is
81 |
synched in timed relation with a moving image.
82 |
83 |
b. Adapter's License means the license You apply to Your Copyright
84 |
and Similar Rights in Your contributions to Adapted Material in
85 |
accordance with the terms and conditions of this Public License.
86 |
87 |
c. BY-SA Compatible License means a license listed at
88 |
+, approved by Creative
89 |
Commons as essentially the equivalent of this Public License.
90 |
91 |
d. Copyright and Similar Rights means copyright and/or similar rights
92 |
closely related to copyright including, without limitation,
93 |
performance, broadcast, sound recording, and Sui Generis Database
94 |
Rights, without regard to how the rights are labeled or
95 |
categorized. For purposes of this Public License, the rights
96 |
specified in Section 2(b)(1)-(2) are not Copyright and Similar
97 |
98 |
99 |
e. Effective Technological Measures means those measures that, in the
100 |
absence of proper authority, may not be circumvented under laws
101 |
fulfilling obligations under Article 11 of the WIPO Copyright
102 |
Treaty adopted on December 20, 1996, and/or similar international
103 |
104 |
105 |
f. Exceptions and Limitations means fair use, fair dealing, and/or
106 |
any other exception or limitation to Copyright and Similar Rights
107 |
that applies to Your use of the Licensed Material.
108 |
109 |
g. License Elements means the license attributes listed in the name
110 |
of a Creative Commons Public License. The License Elements of this
111 |
Public License are Attribution and ShareAlike.
112 |
113 |
h. Licensed Material means the artistic or literary work, database,
114 |
or other material to which the Licensor applied this Public
115 |
116 |
117 |
i. Licensed Rights means the rights granted to You subject to the
118 |
terms and conditions of this Public License, which are limited to
119 |
all Copyright and Similar Rights that apply to Your use of the
120 |
Licensed Material and that the Licensor has authority to license.
121 |
122 |
j. Licensor means the individual(s) or entity(ies) granting rights
123 |
under this Public License.
124 |
125 |
k. Share means to provide material to the public by any means or
126 |
process that requires permission under the Licensed Rights, such
127 |
as reproduction, public display, public performance, distribution,
128 |
dissemination, communication, or importation, and to make material
129 |
available to the public including in ways that members of the
130 |
public may access the material from a place and at a time
131 |
individually chosen by them.
132 |
133 |
l. Sui Generis Database Rights means rights other than copyright
134 |
resulting from Directive 96/9/EC of the European Parliament and of
135 |
the Council of 11 March 1996 on the legal protection of databases,
136 |
as amended and/or succeeded, as well as other essentially
137 |
equivalent rights anywhere in the world.
138 |
139 |
m. You means the individual or entity exercising the Licensed Rights
140 |
under this Public License. Your has a corresponding meaning.
141 |
142 |
143 |
Section 2 -- Scope.
144 |
145 |
a. License grant.
146 |
147 |
1. Subject to the terms and conditions of this Public License,
148 |
the Licensor hereby grants You a worldwide, royalty-free,
149 |
non-sublicensable, non-exclusive, irrevocable license to
150 |
exercise the Licensed Rights in the Licensed Material to:
151 |
152 |
a. reproduce and Share the Licensed Material, in whole or
153 |
in part; and
154 |
155 |
b. produce, reproduce, and Share Adapted Material.
156 |
157 |
2. Exceptions and Limitations. For the avoidance of doubt, where
158 |
Exceptions and Limitations apply to Your use, this Public
159 |
License does not apply, and You do not need to comply with
160 |
its terms and conditions.
161 |
162 |
3. Term. The term of this Public License is specified in Section
163 |
164 |
165 |
4. Media and formats; technical modifications allowed. The
166 |
Licensor authorizes You to exercise the Licensed Rights in
167 |
all media and formats whether now known or hereafter created,
168 |
and to make technical modifications necessary to do so. The
169 |
Licensor waives and/or agrees not to assert any right or
170 |
authority to forbid You from making technical modifications
171 |
necessary to exercise the Licensed Rights, including
172 |
technical modifications necessary to circumvent Effective
173 |
Technological Measures. For purposes of this Public License,
174 |
simply making modifications authorized by this Section 2(a)
175 |
(4) never produces Adapted Material.
176 |
177 |
5. Downstream recipients.
178 |
179 |
a. Offer from the Licensor -- Licensed Material. Every
180 |
recipient of the Licensed Material automatically
181 |
receives an offer from the Licensor to exercise the
182 |
Licensed Rights under the terms and conditions of this
183 |
Public License.
184 |
185 |
b. Additional offer from the Licensor -- Adapted Material.
186 |
Every recipient of Adapted Material from You
187 |
automatically receives an offer from the Licensor to
188 |
exercise the Licensed Rights in the Adapted Material
189 |
under the conditions of the Adapter's License You apply.
190 |
191 |
c. No downstream restrictions. You may not offer or impose
192 |
any additional or different terms or conditions on, or
193 |
apply any Effective Technological Measures to, the
194 |
Licensed Material if doing so restricts exercise of the
195 |
Licensed Rights by any recipient of the Licensed
196 |
197 |
198 |
6. No endorsement. Nothing in this Public License constitutes or
199 |
may be construed as permission to assert or imply that You
200 |
are, or that Your use of the Licensed Material is, connected
201 |
with, or sponsored, endorsed, or granted official status by,
202 |
the Licensor or others designated to receive attribution as
203 |
provided in Section 3(a)(1)(A)(i).
204 |
205 |
b. Other rights.
206 |
207 |
1. Moral rights, such as the right of integrity, are not
208 |
licensed under this Public License, nor are publicity,
209 |
privacy, and/or other similar personality rights; however, to
210 |
the extent possible, the Licensor waives and/or agrees not to
211 |
assert any such rights held by the Licensor to the limited
212 |
extent necessary to allow You to exercise the Licensed
213 |
Rights, but not otherwise.
214 |
215 |
2. Patent and trademark rights are not licensed under this
216 |
Public License.
217 |
218 |
3. To the extent possible, the Licensor waives any right to
219 |
collect royalties from You for the exercise of the Licensed
220 |
Rights, whether directly or through a collecting society
221 |
under any voluntary or waivable statutory or compulsory
222 |
licensing scheme. In all other cases the Licensor expressly
223 |
reserves any right to collect such royalties.
224 |
225 |
226 |
Section 3 -- License Conditions.
227 |
228 |
Your exercise of the Licensed Rights is expressly made subject to the
229 |
following conditions.
230 |
231 |
a. Attribution.
232 |
233 |
1. If You Share the Licensed Material (including in modified
234 |
form), You must:
235 |
236 |
a. retain the following if it is supplied by the Licensor
237 |
with the Licensed Material:
238 |
239 |
i. identification of the creator(s) of the Licensed
240 |
Material and any others designated to receive
241 |
attribution, in any reasonable manner requested by
242 |
the Licensor (including by pseudonym if
243 |
244 |
245 |
ii. a copyright notice;
246 |
247 |
iii. a notice that refers to this Public License;
248 |
249 |
iv. a notice that refers to the disclaimer of
250 |
251 |
252 |
v. a URI or hyperlink to the Licensed Material to the
253 |
extent reasonably practicable;
254 |
255 |
b. indicate if You modified the Licensed Material and
256 |
retain an indication of any previous modifications; and
257 |
258 |
c. indicate the Licensed Material is licensed under this
259 |
Public License, and include the text of, or the URI or
260 |
hyperlink to, this Public License.
261 |
262 |
2. You may satisfy the conditions in Section 3(a)(1) in any
263 |
reasonable manner based on the medium, means, and context in
264 |
which You Share the Licensed Material. For example, it may be
265 |
reasonable to satisfy the conditions by providing a URI or
266 |
hyperlink to a resource that includes the required
267 |
268 |
269 |
3. If requested by the Licensor, You must remove any of the
270 |
information required by Section 3(a)(1)(A) to the extent
271 |
reasonably practicable.
272 |
273 |
b. ShareAlike.
274 |
275 |
In addition to the conditions in Section 3(a), if You Share
276 |
Adapted Material You produce, the following conditions also apply.
277 |
278 |
1. The Adapter's License You apply must be a Creative Commons
279 |
license with the same License Elements, this version or
280 |
later, or a BY-SA Compatible License.
281 |
282 |
2. You must include the text of, or the URI or hyperlink to, the
283 |
Adapter's License You apply. You may satisfy this condition
284 |
in any reasonable manner based on the medium, means, and
285 |
context in which You Share Adapted Material.
286 |
287 |
3. You may not offer or impose any additional or different terms
288 |
or conditions on, or apply any Effective Technological
289 |
Measures to, Adapted Material that restrict exercise of the
290 |
rights granted under the Adapter's License You apply.
291 |
292 |
293 |
Section 4 -- Sui Generis Database Rights.
294 |
295 |
Where the Licensed Rights include Sui Generis Database Rights that
296 |
apply to Your use of the Licensed Material:
297 |
298 |
a. for the avoidance of doubt, Section 2(a)(1) grants You the right
299 |
to extract, reuse, reproduce, and Share all or a substantial
300 |
portion of the contents of the database;
301 |
302 |
b. if You include all or a substantial portion of the database
303 |
contents in a database in which You have Sui Generis Database
304 |
Rights, then the database in which You have Sui Generis Database
305 |
Rights (but not its individual contents) is Adapted Material,
306 |
including for purposes of Section 3(b); and
307 |
308 |
c. You must comply with the conditions in Section 3(a) if You Share
309 |
all or a substantial portion of the contents of the database.
310 |
311 |
For the avoidance of doubt, this Section 4 supplements and does not
312 |
replace Your obligations under this Public License where the Licensed
313 |
Rights include other Copyright and Similar Rights.
314 |
315 |
316 |
Section 5 -- Disclaimer of Warranties and Limitation of Liability.
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
c. The disclaimer of warranties and limitation of liability provided
340 |
above shall be interpreted in a manner that, to the extent
341 |
possible, most closely approximates an absolute disclaimer and
342 |
waiver of all liability.
343 |
344 |
345 |
Section 6 -- Term and Termination.
346 |
347 |
a. This Public License applies for the term of the Copyright and
348 |
Similar Rights licensed here. However, if You fail to comply with
349 |
this Public License, then Your rights under this Public License
350 |
terminate automatically.
351 |
352 |
b. Where Your right to use the Licensed Material has terminated under
353 |
Section 6(a), it reinstates:
354 |
355 |
1. automatically as of the date the violation is cured, provided
356 |
it is cured within 30 days of Your discovery of the
357 |
violation; or
358 |
359 |
2. upon express reinstatement by the Licensor.
360 |
361 |
For the avoidance of doubt, this Section 6(b) does not affect any
362 |
right the Licensor may have to seek remedies for Your violations
363 |
of this Public License.
364 |
365 |
c. For the avoidance of doubt, the Licensor may also offer the
366 |
Licensed Material under separate terms or conditions or stop
367 |
distributing the Licensed Material at any time; however, doing so
368 |
will not terminate this Public License.
369 |
370 |
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
371 |
372 |
373 |
374 |
Section 7 -- Other Terms and Conditions.
375 |
376 |
a. The Licensor shall not be bound by any additional or different
377 |
terms or conditions communicated by You unless expressly agreed.
378 |
379 |
b. Any arrangements, understandings, or agreements regarding the
380 |
Licensed Material not stated herein are separate from and
381 |
independent of the terms and conditions of this Public License.
382 |
383 |
384 |
Section 8 -- Interpretation.
385 |
386 |
a. For the avoidance of doubt, this Public License does not, and
387 |
shall not be interpreted to, reduce, limit, restrict, or impose
388 |
conditions on any use of the Licensed Material that could lawfully
389 |
be made without permission under this Public License.
390 |
391 |
b. To the extent possible, if any provision of this Public License is
392 |
deemed unenforceable, it shall be automatically reformed to the
393 |
minimum extent necessary to make it enforceable. If the provision
394 |
cannot be reformed, it shall be severed from this Public License
395 |
without affecting the enforceability of the remaining terms and
396 |
397 |
398 |
c. No term or condition of this Public License will be waived and no
399 |
failure to comply consented to unless expressly agreed to by the
400 |
401 |
402 |
d. Nothing in this Public License constitutes or may be interpreted
403 |
as a limitation upon, or waiver of, any privileges and immunities
404 |
that apply to the Licensor or You, including from the legal
405 |
processes of any jurisdiction or authority.
406 |
407 |
408 |
409 |
410 |
Creative Commons is not a party to its public
411 |
licenses. Notwithstanding, Creative Commons may elect to apply one of
412 |
its public licenses to material it publishes and in those instances
413 |
will be considered the “Licensor.” The text of the Creative Commons
414 |
public licenses is dedicated to the public domain under the CC0 Public
415 |
Domain Dedication. Except for the limited purpose of indicating that
416 |
material is shared under a Creative Commons public license or as
417 |
otherwise permitted by the Creative Commons policies published at
418 |
+, Creative Commons does not authorize the
419 |
use of the trademark "Creative Commons" or any other trademark or logo
420 |
of Creative Commons without its prior written consent including,
421 |
without limitation, in connection with any unauthorized modifications
422 |
to any of its public licenses or any other arrangements,
423 |
understandings, or agreements concerning use of licensed material. For
424 |
the avoidance of doubt, this paragraph does not form part of the
425 |
public licenses.
426 |
427 |
Creative Commons may be contacted at
@@ -0,0 +1,38 @@
1 |
## *ViscoNet*: Bridging and Harmonizing Visual and Textual Conditioning for ControlNet
2 |
[Soon Yau Cheong](
3 |
[Armin Mustafa](
4 |
[Andrew Gilbert](
5 |
6 |
7 |
<a href=''><img src=''></a>
8 |
<a href=''><img src=''></a>
9 |
10 |
11 |
12 |
13 |
### Requirements
14 |
A suitable [conda]( environment named `control` can be created
15 |
and activated with:
16 |
17 |
conda env create -f environment.yaml
18 |
conda activate control
19 |
20 |
### Files
21 |
All model and data files are in [here](
22 |
These include all images used in the human evaluation.
23 |
24 |
### Gradio App
25 |
26 |
1. Download *visconet_v1.pth* and *exp-schp-201908301523-atr.pth* into directory ./models
27 |
2. (Optional) download and unzip it to home directory.
28 |
3. run ```python```
29 |
30 |
### Citation
31 |
32 |
33 |
author = {Cheong, Soon Yau and Mustafa, Armin and Gilbert, Andrew},
34 |
title = {ViscoNet: Bridging and Harmonizing Visual and Textual Conditioning for ControlNet},
35 |
journal = {Arxiv Preprint 2312.03154},
36 |
month = {December},
37 |
year = {2023}}
38 |
@@ -0,0 +1,73 @@
1 |
# Openpose
2 |
# Original from CMU
3 |
# 2nd Edited by
4 |
# 3rd Edited by ControlNet
5 |
6 |
import os
7 |
8 |
9 |
import torch
10 |
import numpy as np
11 |
from . import util
12 |
from .body import Body
13 |
from .hand import Hand
14 |
from annotator.util import annotator_ckpts_path
15 |
16 |
17 |
# Download locations of the pretrained OpenPose checkpoints. OpenposeDetector
# passes them to load_file_from_url when the local files are missing, so they
# must be real URLs -- an empty string makes that download fail.
# NOTE(review): these are the upstream ControlNet checkpoint URLs; confirm they
# are the ones this project intends to use.
body_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth"
hand_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth"
19 |
20 |
21 |
class OpenposeDetector:
    """Render an OpenPose skeleton (body, optionally hands) for an image.

    Wraps the ``Body`` and ``Hand`` estimators and draws their detections on a
    black canvas of the same shape as the input.
    """

    def __init__(self):
        """Locate (downloading if necessary) both checkpoints and build the estimators."""
        body_modelpath = os.path.join(annotator_ckpts_path, "body_pose_model.pth")
        hand_modelpath = os.path.join(annotator_ckpts_path, "hand_pose_model.pth")

        # Check every checkpoint on its own: testing only the hand model would
        # skip the download when just the body model is missing.
        checkpoints = (
            (body_modelpath, body_model_path),
            (hand_modelpath, hand_model_path),
        )
        for local_path, url in checkpoints:
            if not os.path.exists(local_path):
                # Imported lazily so basicsr is only needed when a download happens.
                from basicsr.utils.download_util import load_file_from_url
                load_file_from_url(url, model_dir=annotator_ckpts_path)

        self.body_estimation = Body(body_modelpath)
        self.hand_estimation = Hand(hand_modelpath)

    def __call__(self, oriImg, hand=False):
        """Detect poses in ``oriImg`` and draw them.

        Args:
            oriImg: image array indexed as [row, column, channel]; its channel
                axis is reversed before estimation (presumably RGB -> BGR --
                confirm against the callers).
            hand: when true, hand keypoints are detected and drawn as well.

        Returns:
            ``(canvas, pose)`` where ``canvas`` is the rendered skeleton and
            ``pose`` is ``dict(candidate=..., subset=...)`` holding the body
            estimator's outputs converted to plain lists.
        """
        oriImg = oriImg[:, :, ::-1].copy()
        with torch.no_grad():
            candidate, subset = self.body_estimation(oriImg)
            canvas = np.zeros_like(oriImg)
            canvas = util.draw_bodypose(canvas, candidate, subset)
            if hand:
                hands_list = util.handDetect(candidate, subset, oriImg)
                all_hand_peaks = []
                for x, y, w, is_left in hands_list:
                    # The hand estimator works on a square crop of side w.
                    peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :])
                    # Shift crop-relative coordinates back into image space;
                    # coordinates equal to 0 are left untouched (presumably
                    # they mark undetected keypoints -- TODO confirm).
                    peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x)
                    peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y)
                    # Collect the peaks, otherwise draw_handpose receives an
                    # empty list and no hand is ever drawn.
                    all_hand_peaks.append(peaks)
                canvas = util.draw_handpose(canvas, all_hand_peaks)
            return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist())
50 |
51 |
52 |
class VisconetDetector(OpenposeDetector):
    """OpenposeDetector variant that always draws hands, using thin strokes."""

    def __init__(self):
        """Initialise the body and hand estimators via the parent class."""
        # An empty __init__ would neither parse nor create the estimators
        # that __call__ relies on.
        super().__init__()

    def __call__(self, oriImg):
        """Detect body and hand poses in ``oriImg`` and draw them.

        Args:
            oriImg: image array indexed as [row, column, channel]; its channel
                axis is reversed before estimation (presumably RGB -> BGR --
                confirm against the callers).

        Returns:
            ``(canvas, pose)`` where ``canvas`` is the rendered skeleton and
            ``pose`` is ``dict(candidate=..., subset=...)`` holding the body
            estimator's outputs converted to plain lists.
        """
        oriImg = oriImg[:, :, ::-1].copy()
        with torch.no_grad():
            candidate, subset = self.body_estimation(oriImg)
            canvas = util.draw_bodypose(np.zeros_like(oriImg), candidate, subset, stickwidth=1, circlewidth=2)

            # Detect hands inside the boxes derived from the body pose.
            hands_list = util.handDetect(candidate, subset, oriImg)

            all_hand_peaks = []
            for x, y, w, is_left in hands_list:
                # The hand estimator works on a square crop of side w.
                peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :])
                # Shift crop-relative coordinates back into image space;
                # coordinates equal to 0 are left untouched (presumably they
                # mark undetected keypoints -- TODO confirm).
                peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x)
                peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y)
                # Collect the peaks, otherwise no hand is ever drawn.
                all_hand_peaks.append(peaks)

            canvas = util.draw_handpose(canvas, all_hand_peaks, stickwidth=1)
            return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist())
@@ -0,0 +1,219 @@
1 |
import cv2
2 |
import numpy as np
3 |
import math
4 |
import time
5 |
from scipy.ndimage.filters import gaussian_filter
6 |
import matplotlib.pyplot as plt
7 |
import matplotlib
8 |
import torch
9 |
from torchvision import transforms
10 |
11 |
from . import util
12 |
from .model import bodypose_model
13 |
14 |
class Body(object):
    """OpenPose body-pose estimator.

    Runs ``bodypose_model`` on an image and decodes its heatmaps and part
    affinity fields (PAFs) into keypoint candidates grouped per person.
    """

    def __init__(self, model_path):
        """Build the network and load the checkpoint stored at ``model_path``.

        The model is moved to the GPU when CUDA is available.
        """
        self.model = bodypose_model()
        if torch.cuda.is_available():
            self.model = self.model.cuda()

        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        model_dict = util.transfer(self.model, torch.load(model_path))
        # Actually install the remapped weights and switch to inference mode;
        # without this the network keeps its random initialisation.
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def __call__(self, oriImg):
        """Estimate body keypoints for every person in ``oriImg``.

        Args:
            oriImg: image array indexed as [row, column, channel]
                (the demo at the bottom of this module passes a B,G,R image).

        Returns:
            ``(candidate, subset)``:
            candidate -- one row per detected peak: x, y, score, id.
            subset -- n*20 array, one row per person: columns 0-17 are indices
            into ``candidate`` (-1 when the part is missing), column 18 is the
            total score and column 19 the number of parts found.
        """
        # Multi-scale alternative: [0.5, 1.0, 1.5, 2.0]
        scale_search = [0.5]
        boxsize = 368
        stride = 8
        padValue = 128
        thre1 = 0.1   # minimum heatmap value for a keypoint peak
        thre2 = 0.05  # minimum PAF alignment score of a sampled limb point
        multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
        heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
        paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))

        for scale in multiplier:
            imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
            # HWC -> 1xCxHxW, values mapped to roughly [-0.5, 0.5).
            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)

            data = torch.from_numpy(im).float()
            if torch.cuda.is_available():
                data = data.cuda()
            with torch.no_grad():
                Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
            Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
            Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()

            # Upsample by the stride, crop away the padding, then resize back
            # to the original resolution. The second network output holds the
            # heatmaps, the first one the PAFs.
            heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0))
            heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)

            paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0))
            paf = cv2.resize(paf, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
            paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            paf = cv2.resize(paf, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)

            # Plain running average over the scales. Adding heatmap_avg to
            # itself here would double the accumulator on every extra scale.
            heatmap_avg += heatmap / len(multiplier)
            paf_avg += paf / len(multiplier)

        # ---- 1. Find the local maxima of every part heatmap -----------------
        all_peaks = []
        peak_counter = 0

        for part in range(18):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)

            # Copies of the smoothed map shifted by one pixel in each
            # direction; a peak is >= all four neighbours and above thre1.
            map_left = np.zeros(one_heatmap.shape)
            map_left[1:, :] = one_heatmap[:-1, :]
            map_right = np.zeros(one_heatmap.shape)
            map_right[:-1, :] = one_heatmap[1:, :]
            map_up = np.zeros(one_heatmap.shape)
            map_up[:, 1:] = one_heatmap[:, :-1]
            map_down = np.zeros(one_heatmap.shape)
            map_down[:, :-1] = one_heatmap[:, 1:]

            peaks_binary = np.logical_and.reduce(
                (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
            rows, cols = np.nonzero(peaks_binary)
            # Stored as (x, y, score, id): note the row/column swap. The score
            # is read from the unsmoothed map.
            peaks_with_score_and_id = [
                (x, y, map_ori[y, x], peak_counter + n)
                for n, (x, y) in enumerate(zip(cols, rows))
            ]

            all_peaks.append(peaks_with_score_and_id)
            peak_counter += len(peaks_with_score_and_id)

        # ---- 2. Score candidate limbs with the PAFs --------------------------
        # find connection in the specified sequence, center 29 is in the position 15
        limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
                   [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17],
                   [1, 16], [16, 18], [3, 17], [6, 18]]
        # the middle joints heatmap correspondence
        mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22],
                  [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52],
                  [55, 56], [37, 38], [45, 46]]

        connection_all = []
        special_k = []  # limbs for which one of the two parts has no peak
        mid_num = 10    # number of points sampled along each candidate limb

        for k in range(len(mapIdx)):
            score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
            candA = all_peaks[limbSeq[k][0] - 1]
            candB = all_peaks[limbSeq[k][1] - 1]
            nA = len(candA)
            nB = len(candB)
            if nA != 0 and nB != 0:
                connection_candidate = []
                for i in range(nA):
                    for j in range(nB):
                        vec = np.subtract(candB[j][:2], candA[i][:2])
                        norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
                        norm = max(0.001, norm)  # avoid dividing by zero
                        vec = np.divide(vec, norm)

                        startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num),
                                            np.linspace(candA[i][1], candB[j][1], num=mid_num)))

                        vec_x = np.array([score_mid[int(round(pt[1])), int(round(pt[0])), 0]
                                          for pt in startend])
                        vec_y = np.array([score_mid[int(round(pt[1])), int(round(pt[0])), 1]
                                          for pt in startend])

                        # Alignment of the PAF with the limb direction, with a
                        # penalty for limbs longer than half the image height.
                        score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
                        score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
                            0.5 * oriImg.shape[0] / norm - 1, 0)
                        criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
                        criterion2 = score_with_dist_prior > 0
                        if criterion1 and criterion2:
                            connection_candidate.append(
                                [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])

                # Greedy matching: best score first, each peak used at most once.
                connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
                connection = np.zeros((0, 5))
                for c in range(len(connection_candidate)):
                    i, j, s = connection_candidate[c][0:3]
                    if i not in connection[:, 3] and j not in connection[:, 4]:
                        connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
                        if len(connection) >= min(nA, nB):
                            break

                connection_all.append(connection)
            else:
                special_k.append(k)
                connection_all.append([])

        # ---- 3. Assemble the limbs into people -------------------------------
        # last number in each row is the total parts number of that person
        # the second last number in each row is the score of the overall configuration
        subset = -1 * np.ones((0, 20))
        candidate = np.array([item for sublist in all_peaks for item in sublist])

        for k in range(len(mapIdx)):
            if k not in special_k:
                partAs = connection_all[k][:, 0]
                partBs = connection_all[k][:, 1]
                indexA, indexB = np.array(limbSeq[k]) - 1

                for i in range(len(connection_all[k])):
                    found = 0
                    subset_idx = [-1, -1]
                    for j in range(len(subset)):
                        if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
                            subset_idx[found] = j
                            found += 1

                    if found == 1:
                        j = subset_idx[0]
                        if subset[j][indexB] != partBs[i]:
                            subset[j][indexB] = partBs[i]
                            subset[j][-1] += 1
                            subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
                    elif found == 2:  # if found 2 and disjoint, merge them
                        j1, j2 = subset_idx
                        membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
                        if len(np.nonzero(membership == 2)[0]) == 0:  # merge
                            # +1 compensates for the -1 "missing" markers.
                            subset[j1][:-2] += (subset[j2][:-2] + 1)
                            subset[j1][-2:] += subset[j2][-2:]
                            subset[j1][-2] += connection_all[k][i][2]
                            subset = np.delete(subset, j2, 0)
                        else:  # as like found == 1
                            subset[j1][indexB] = partBs[i]
                            subset[j1][-1] += 1
                            subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]

                    # if find no partA in the subset, create a new subset
                    elif not found and k < 17:
                        row = -1 * np.ones(20)
                        row[indexA] = partAs[i]
                        row[indexB] = partBs[i]
                        row[-1] = 2
                        row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
                        subset = np.vstack([subset, row])

        # delete some rows of subset which has few parts occur
        deleteIdx = []
        for i in range(len(subset)):
            if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
                deleteIdx.append(i)
        subset = np.delete(subset, deleteIdx, axis=0)

        # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
        # candidate: x, y, score, id
        return candidate, subset
210 |
211 |
if __name__ == "__main__":
    # Manual smoke test: run the body estimator on a sample image and show
    # the rendered skeleton.
    body_estimation = Body('../model/body_pose_model.pth')

    test_image = '../images/ski.jpg'
    oriImg = cv2.imread(test_image)  # B,G,R order
    if oriImg is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise FileNotFoundError("could not read test image: " + test_image)
    candidate, subset = body_estimation(oriImg)
    canvas = util.draw_bodypose(oriImg, candidate, subset)
    # Reorder the channels B,G,R -> R,G,B for matplotlib.
    plt.imshow(canvas[:, :, [2, 1, 0]])
    # Without show() the figure is never displayed when run as a script.
    plt.show()
219 |
@@ -0,0 +1,86 @@
1 |
import cv2
2 |
import json
3 |
import numpy as np
4 |
import math
5 |
import time
6 |
from scipy.ndimage.filters import gaussian_filter
7 |
import matplotlib.pyplot as plt
8 |
import matplotlib
9 |
import torch
10 |
from skimage.measure import label
11 |
12 |
from .model import handpose_model
13 |
from . import util
14 |
15 |
class Hand(object):
    """Hand keypoint estimator built on ``handpose_model``.

    Calling an instance with an image returns an array with one ``[x, y]``
    row per keypoint (21 rows). A keypoint whose smoothed heatmap never
    exceeds the threshold is reported as ``[0, 0]``.
    """

    def __init__(self, model_path):
        """Create the network and load the checkpoint stored at ``model_path``."""
        self.model = handpose_model()
        if torch.cuda.is_available():
            self.model = self.model.cuda()

        # Re-key the checkpoint to this module's parameter names and load it.
        # NOTE(review): the two statements after ``util.transfer`` were lost in
        # extraction; load_state_dict/eval are the presumed originals.
        model_dict = util.transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def __call__(self, oriImg):
        """Return the ``(21, 2)`` array of ``[x, y]`` keypoint peaks for ``oriImg``.

        ``oriImg`` is indexed as (height, width, channel); the ``__main__``
        script of this file passes a ``cv2.imread`` result.
        """
        scale_search = [0.5, 1.0, 1.5, 2.0]
        boxsize = 368
        stride = 8
        padValue = 128
        thre = 0.05
        multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
        heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 22))

        for scale in multiplier:
            imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
            # HWC -> 1xCxHxW, scaled to roughly [-0.5, 0.5).
            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)

            data = torch.from_numpy(im).float()
            if torch.cuda.is_available():
                data = data.cuda()
            with torch.no_grad():
                output = self.model(data).cpu().numpy()

            # extract outputs, resize, and remove padding
            heatmap = np.transpose(np.squeeze(output), (1, 2, 0))  # output 1 is heatmaps
            heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)

            heatmap_avg += heatmap / len(multiplier)

        all_peaks = []
        for part in range(21):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)
            binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
            # every value is below the threshold: no peak for this keypoint
            if np.sum(binary) == 0:
                all_peaks.append([0, 0])
                # Without this, np.argmax below would receive an empty list.
                continue

            # Keep only the connected region with the largest total response.
            label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
            max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
            label_img[label_img != max_index] = 0
            map_ori[label_img == 0] = 0

            y, x = util.npmax(map_ori)
            all_peaks.append([x, y])
        return np.array(all_peaks)
76 |
77 |
if __name__ == "__main__":
    # Manual smoke test: estimate hand keypoints on a sample image and show it.
    hand_estimation = Hand('../model/hand_pose_model.pth')

    test_image = '../images/hand.jpg'
    oriImg = cv2.imread(test_image)  # B,G,R order
    peaks = hand_estimation(oriImg)
    # draw_handpose iterates over a sequence of per-hand peak arrays, so the
    # single hand has to be wrapped in a list.
    canvas = util.draw_handpose(oriImg, [peaks], True)
    cv2.imshow('', canvas)
    # NOTE(review): the last line of this script was lost in extraction;
    # cv2.waitKey(0) is the presumed original (keeps the window open).
    cv2.waitKey(0)
@@ -0,0 +1,219 @@
1 |
import torch
2 |
from collections import OrderedDict
3 |
4 |
import torch
5 |
import torch.nn as nn
6 |
7 |
def make_layers(block, no_relu_layers):
    """Build an ``nn.Sequential`` from an ordered layer specification.

    Args:
        block: Ordered mapping of layer name to spec. A name containing
            ``'pool'`` maps to ``[kernel_size, stride, padding]`` and becomes
            an ``nn.MaxPool2d``; any other name maps to
            ``[in_channels, out_channels, kernel_size, stride, padding]`` and
            becomes an ``nn.Conv2d``.
        no_relu_layers: Names of conv layers that must NOT be followed by a
            ReLU.

    Returns:
        An ``nn.Sequential`` whose children are named after the spec keys;
        each inserted ReLU is named ``'relu_' + layer_name``.
    """
    layers = []
    for layer_name, v in block.items():
        if 'pool' in layer_name:
            layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
                                 padding=v[2])
            layers.append((layer_name, layer))
        else:
            conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
                               kernel_size=v[2], stride=v[3],
                               padding=v[4])
            layers.append((layer_name, conv2d))
            if layer_name not in no_relu_layers:
                layers.append(('relu_' + layer_name, nn.ReLU(inplace=True)))

    return nn.Sequential(OrderedDict(layers))
23 |
24 |
class bodypose_model(nn.Module):
    """Six-stage, two-branch convolutional body pose network.

    ``model0`` is a shared feature extractor ending in 128 channels. Every
    stage has two branches: ``L1`` ends in 38 channels and ``L2`` in 19.
    Stages 2-6 take the previous stage's two outputs concatenated with the
    shared features (38 + 19 + 128 = 185 input channels).
    """

    def __init__(self):
        super(bodypose_model, self).__init__()

        # these layers have no relu layer: the last conv of both branches of
        # every stage.
        # NOTE(review): the list previously named 'Mconv7_stage6_L1' twice and
        # never 'Mconv7_stage6_L2', so a ReLU was applied to the final L2
        # output. Generating the names removes that typo.
        no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2'] + [
            'Mconv7_stage%d_L%d' % (stage, branch)
            for stage in range(2, 7)
            for branch in (1, 2)
        ]
        blocks = {}
        block0 = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3_CPM', [512, 256, 3, 1, 1]),
            ('conv4_4_CPM', [256, 128, 3, 1, 1])
        ])

        # Stage 1
        block1_1 = OrderedDict([
            ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
            ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
        ])

        block1_2 = OrderedDict([
            ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
            ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
        ])
        blocks['block1_1'] = block1_1
        blocks['block1_2'] = block1_2

        self.model0 = make_layers(block0, no_relu_layers)

        # Stages 2 - 6
        for i in range(2, 7):
            blocks['block%d_1' % i] = OrderedDict([
                ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
            ])

            blocks['block%d_2' % i] = OrderedDict([
                ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
            ])

        # Turn every spec into an actual nn.Sequential.
        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_1 = blocks['block1_1']
        self.model2_1 = blocks['block2_1']
        self.model3_1 = blocks['block3_1']
        self.model4_1 = blocks['block4_1']
        self.model5_1 = blocks['block5_1']
        self.model6_1 = blocks['block6_1']

        self.model1_2 = blocks['block1_2']
        self.model2_2 = blocks['block2_2']
        self.model3_2 = blocks['block3_2']
        self.model4_2 = blocks['block4_2']
        self.model5_2 = blocks['block5_2']
        self.model6_2 = blocks['block6_2']

    def forward(self, x):
        """Return the stage-6 ``(L1, L2)`` outputs for input batch ``x``."""
        out1 = self.model0(x)

        out1_1 = self.model1_1(out1)
        out1_2 = self.model1_2(out1)
        # Each later stage sees both branch outputs plus the shared features.
        out2 = torch.cat([out1_1, out1_2, out1], 1)

        out2_1 = self.model2_1(out2)
        out2_2 = self.model2_2(out2)
        out3 = torch.cat([out2_1, out2_2, out1], 1)

        out3_1 = self.model3_1(out3)
        out3_2 = self.model3_2(out3)
        out4 = torch.cat([out3_1, out3_2, out1], 1)

        out4_1 = self.model4_1(out4)
        out4_2 = self.model4_2(out4)
        out5 = torch.cat([out4_1, out4_2, out1], 1)

        out5_1 = self.model5_1(out5)
        out5_2 = self.model5_2(out5)
        out6 = torch.cat([out5_1, out5_2, out1], 1)

        out6_1 = self.model6_1(out6)
        out6_2 = self.model6_2(out6)

        return out6_1, out6_2
142 |
143 |
class handpose_model(nn.Module):
    """Six-stage convolutional hand pose network.

    ``model1_0`` is a feature extractor ending in 128 channels and
    ``model1_1`` maps it to 22 channels. Stages 2-6 each take the previous
    stage's 22-channel output concatenated with the shared features
    (22 + 128 = 150 input channels) and produce 22 channels again.
    """

    def __init__(self):
        super(handpose_model, self).__init__()

        # these layers have no relu layer
        no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',
                          'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
        # stage 1
        block1_0 = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3', [512, 512, 3, 1, 1]),
            ('conv4_4', [512, 512, 3, 1, 1]),
            ('conv5_1', [512, 512, 3, 1, 1]),
            ('conv5_2', [512, 512, 3, 1, 1]),
            ('conv5_3_CPM', [512, 128, 3, 1, 1])
        ])

        block1_1 = OrderedDict([
            ('conv6_1_CPM', [128, 512, 1, 1, 0]),
            ('conv6_2_CPM', [512, 22, 1, 1, 0])
        ])

        blocks = {}
        blocks['block1_0'] = block1_0
        blocks['block1_1'] = block1_1

        # stage 2-6
        for i in range(2, 7):
            blocks['block%d' % i] = OrderedDict([
                ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
                ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
            ])

        # Turn every spec into an actual nn.Sequential.
        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_0 = blocks['block1_0']
        self.model1_1 = blocks['block1_1']
        self.model2 = blocks['block2']
        self.model3 = blocks['block3']
        self.model4 = blocks['block4']
        self.model5 = blocks['block5']
        self.model6 = blocks['block6']

    def forward(self, x):
        """Return the stage-6 output (22 channels) for input batch ``x``."""
        out1_0 = self.model1_0(x)
        out1_1 = self.model1_1(out1_0)
        # Each later stage sees the previous output plus the shared features.
        concat_stage2 = torch.cat([out1_1, out1_0], 1)
        out_stage2 = self.model2(concat_stage2)
        concat_stage3 = torch.cat([out_stage2, out1_0], 1)
        out_stage3 = self.model3(concat_stage3)
        concat_stage4 = torch.cat([out_stage3, out1_0], 1)
        out_stage4 = self.model4(concat_stage4)
        concat_stage5 = torch.cat([out_stage4, out1_0], 1)
        out_stage5 = self.model5(concat_stage5)
        concat_stage6 = torch.cat([out_stage5, out1_0], 1)
        out_stage6 = self.model6(concat_stage6)
        return out_stage6
218 |
219 |
@@ -0,0 +1,163 @@
1 |
import math
2 |
import numpy as np
3 |
import matplotlib
4 |
import cv2
5 |
6 |
7 |
def padRightDownCorner(img, stride, padValue):
    """Pad ``img`` at the bottom and right so both sides are multiples of ``stride``.

    Args:
        img: Array indexed as (height, width, channel).
        stride: Positive integer the padded height and width must divide by.
        padValue: Value written into the padded area.

    Returns:
        ``(img_padded, pad)`` where ``pad`` is ``[up, left, down, right]``;
        ``up`` and ``left`` are always 0.
    """
    h = img.shape[0]
    w = img.shape[1]

    # (-n) % stride is the distance from n up to the next multiple of stride.
    pad = [0, 0, (-h) % stride, (-w) % stride]  # up, left, down, right

    # The last row/column is used only as a shape/dtype template (it is zeroed
    # before padValue is added). Slicing with [-1:] rather than [-2:-1] keeps
    # the template non-empty for images that are one pixel high or wide, which
    # were previously left unpadded.
    img_padded = img
    pad_down = np.tile(img_padded[-1:, :, :] * 0 + padValue, (pad[2], 1, 1))
    img_padded = np.concatenate((img_padded, pad_down), axis=0)
    pad_right = np.tile(img_padded[:, -1:, :] * 0 + padValue, (1, pad[3], 1))
    img_padded = np.concatenate((img_padded, pad_right), axis=1)

    return img_padded, pad
28 |
29 |
# transfer caffe model to pytorch which will match the layer name
30 |
def transfer(model, model_weights):
    """Re-key ``model_weights`` to the parameter names of ``model``.

    Every key of ``model.state_dict()`` is looked up in ``model_weights``
    with its first dot-separated component removed (for example
    ``'model0.conv1_1.weight'`` is read from ``'conv1_1.weight'``).

    Returns:
        A dict mapping each state-dict key of ``model`` to the matching entry
        of ``model_weights``.

    Raises:
        KeyError: If ``model_weights`` lacks one of the stripped names.
    """
    return {
        weights_name: model_weights[weights_name.partition('.')[2]]
        for weights_name in model.state_dict().keys()
    }
35 |
36 |
# draw the body keypoint and lims
37 |
def draw_bodypose(canvas, candidate, subset, stickwidth=4, circlewidth=4):
    """Draw detected body keypoints and limbs onto ``canvas``.

    Args:
        canvas: Image to draw on. Keypoint circles are drawn in place; limbs
            are alpha-blended, so the returned image must be used.
        candidate: Array whose row ``k`` starts with the ``x, y`` position of
            candidate keypoint ``k``.
        subset: One row per person; entries 0-17 are indices into
            ``candidate`` and ``-1`` marks a missing keypoint.
        stickwidth: Half-thickness of the limb ellipses.
        circlewidth: Radius of the keypoint circles.

    Returns:
        The image with keypoints and limbs drawn.
    """
    # 1-based keypoint index pairs; 1 is subtracted before indexing ``subset``.
    limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
               [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17],
               [1, 16], [16, 18], [3, 17], [6, 18]]

    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0],
              [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255],
              [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]

    # Keypoints.
    for i in range(18):
        for n in range(len(subset)):
            index = int(subset[n][i])
            if index == -1:
                continue
            x, y = candidate[index][0:2]
            cv2.circle(canvas, (int(x), int(y)), circlewidth, colors[i], thickness=-1)

    # Limbs: only the first 17 entries of limbSeq are drawn.
    for i in range(17):
        for n in range(len(subset)):
            index = subset[n][np.array(limbSeq[i]) - 1]
            if -1 in index:
                continue
            cur_canvas = canvas.copy()
            # NOTE: Y holds candidate column 0 and X column 1; they are passed
            # to OpenCV as (mY, mX), i.e. in column-0-first order.
            Y = candidate[index.astype(int), 0]
            X = candidate[index.astype(int), 1]
            mX = np.mean(X)
            mY = np.mean(Y)
            length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
            angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
            polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
            cv2.fillConvexPoly(cur_canvas, polygon, colors[i])
            canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
    return canvas
71 |
72 |
# image drawed by opencv is not good.
73 |
def draw_handpose(canvas, all_hand_peaks, show_number=False, stickwidth=2):
    """Draw hand skeletons onto ``canvas`` in place and return it.

    Args:
        canvas: Image to draw on.
        all_hand_peaks: Sequence with one array of ``[x, y]`` keypoints per
            hand (indices 0-20 are used by ``edges``).
        show_number: If true, write each keypoint's index next to it.
        stickwidth: Line thickness of the skeleton edges.

    Returns:
        ``canvas``.
    """
    edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10],
             [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]

    for peaks in all_hand_peaks:
        for ie, e in enumerate(edges):
            # Draw an edge only when none of its endpoint coordinates is zero.
            if np.all(peaks[e]):
                x1, y1 = peaks[e[0]]
                x2, y2 = peaks[e[1]]
                # Plain ints: some OpenCV builds reject numpy scalar points.
                cv2.line(canvas, (int(x1), int(y1)), (int(x2), int(y2)),
                         matplotlib.colors.hsv_to_rgb([ie/float(len(edges)), 1.0, 1.0])*255, thickness=stickwidth)

        for i, keypoint in enumerate(peaks):
            x, y = keypoint
            x, y = int(x), int(y)
            cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
            if show_number:
                cv2.putText(canvas, str(i), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), lineType=cv2.LINE_AA)
    return canvas
90 |
91 |
# detect hand according to body pose keypoints
92 |
# please refer to
93 |
def handDetect(candidate, subset, oriImg):
    """Estimate a square hand box for every arm whose three joints were found.

    Args:
        candidate: Array whose row ``k`` starts with the ``x, y`` position of
            candidate keypoint ``k``.
        subset: One row per person; entries are indices into ``candidate`` and
            ``-1`` marks a missing keypoint.
        oriImg: Image indexed as (height, width, ...); only its size is used.

    Returns:
        ``[[x, y, w, True if left hand else False]]``.
        width=height since the network require squared input.
        x, y is the coordinate of top left.
        Boxes narrower than 20 pixels after clipping to the image are dropped.
    """
    # right hand: wrist 4, elbow 3, shoulder 2
    # left hand: wrist 7, elbow 6, shoulder 5
    ratioWristElbow = 0.33
    detect_result = []
    image_height, image_width = oriImg.shape[0:2]
    for person in subset.astype(int):
        # if any of three not detected
        has_left = np.sum(person[[5, 6, 7]] == -1) == 0
        has_right = np.sum(person[[2, 3, 4]] == -1) == 0
        if not (has_left or has_right):
            continue
        hands = []
        # left hand
        if has_left:
            left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
            x1, y1 = candidate[left_shoulder_index][:2]
            x2, y2 = candidate[left_elbow_index][:2]
            x3, y3 = candidate[left_wrist_index][:2]
            hands.append([x1, y1, x2, y2, x3, y3, True])
        # right hand
        if has_right:
            right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
            x1, y1 = candidate[right_shoulder_index][:2]
            x2, y2 = candidate[right_elbow_index][:2]
            x3, y3 = candidate[right_wrist_index][:2]
            hands.append([x1, y1, x2, y2, x3, y3, False])

        for x1, y1, x2, y2, x3, y3, is_left in hands:
            # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbow)
            #          = (1 + ratio) * pos_wrist - ratio * pos_elbow
            x = x3 + ratioWristElbow * (x3 - x2)
            y = y3 + ratioWristElbow * (y3 - y2)
            distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
            distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
            # width = 1.5 * max(wrist-elbow distance, 0.9 * elbow-shoulder distance)
            width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
            # x-y refers to the center --> offset to topLeft point
            x -= width / 2
            y -= width / 2  # width = height
            # overflow the image
            if x < 0: x = 0
            if y < 0: y = 0
            width1 = width
            width2 = width
            if x + width > image_width: width1 = image_width - x
            if y + width > image_height: width2 = image_height - y
            width = min(width1, width2)
            # the minimum accepted hand box size is 20 pixels
            if width >= 20:
                detect_result.append([int(x), int(y), int(width), is_left])

    return detect_result
156 |
157 |
# get max index of 2d array
158 |
def npmax(array):
    """Return ``(row, column)`` of the maximum of a 2-D array.

    The row is the first one whose maximum equals the global maximum; the
    column is the first position of that maximum inside the row.
    """
    arrayindex = array.argmax(1)  # per-row column of the row maximum
    arrayvalue = array.max(1)     # per-row maximum
    i = arrayvalue.argmax()       # row that holds the global maximum
    j = arrayindex[i]
    return i, j
@@ -0,0 +1,162 @@
1 |
# Self-Correction-Human-Parsing
2 |
# Original
3 |
4 |
import os
5 |
import torch
6 |
import numpy as np
7 |
from PIL import Image
8 |
import cv2
9 |
10 |
import torchvision.transforms as T
11 |
12 |
from .transforms import transform_logits, get_affine_transform
13 |
from . import networks
14 |
from annotator.util import annotator_ckpts_path
15 |
from huggingface_hub import snapshot_download
16 |
17 |
# Per-dataset configuration: network input size, number of classes, and the
# class names in label-index order.
dataset_settings = {
    'lip': {
        'input_size': [473, 473],
        'num_classes': 20,
        'label': ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat',
                  'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm',
                  'Left-leg', 'Right-leg', 'Left-shoe', 'Right-shoe']
    },
    'atr': {
        'input_size': [512, 512],
        'num_classes': 18,
        'label': ['Background', 'Hat', 'Hair', 'Sunglasses', 'Upper-clothes', 'Skirt', 'Pants', 'Dress', 'Belt',
                  'Left-shoe', 'Right-shoe', 'Face', 'Left-leg', 'Right-leg', 'Left-arm', 'Right-arm', 'Bag', 'Scarf']
    },
    'pascal': {
        'input_size': [512, 512],
        'num_classes': 7,
        'label': ['Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs'],
    }
}
37 |
38 |
39 |
def get_palette(num_cls):
    """ Returns the color map for visualizing the segmentation mask.

    Args:
        num_cls: Number of classes

    Returns:
        The color map: a flat list of ``3 * num_cls`` integers, three
        consecutive channel values per class index.
    """
    n = num_cls
    palette = [0] * (n * 3)
    for j in range(0, n):
        lab = j
        palette[j * 3 + 0] = 0
        palette[j * 3 + 1] = 0
        palette[j * 3 + 2] = 0
        i = 0
        # Consume the class index three bits at a time; each triple sets one
        # bit per channel, starting at the most significant bit (7 - i).
        while lab:
            palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
            palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
            palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
            i += 1
            lab >>= 3
    return palette
61 |
62 |
class Segmentator(torch.nn.Module):
63 |
def __init__(self, dataset='lip'):
64 |
65 |
66 |
num_classes = dataset_settings[dataset]['num_classes']
67 |
input_size = dataset_settings[dataset]['input_size']
68 |
label = dataset_settings[dataset]['label']
69 |
70 |
if dataset == 'atr':
71 |
72 |
elif dataset == 'lip':
73 |
74 |
75 |
model_path = os.path.join(annotator_ckpts_path, model_path)
76 |
77 |
snapshot_download(repo_id="soonyau/visconet", allow_patterns="exp-schp-201908301523-atr.pth", local_dir=annotator_ckpts_path)
78 |
79 |
self.model = networks.init_model('resnet101', num_classes=num_classes, pretrained=None)
80 |
state_dict = torch.load(model_path)['state_dict']
81 |
from collections import OrderedDict
82 |
new_state_dict = OrderedDict()
83 |
for k, v in state_dict.items():
84 |
name = k[7:] # remove `module.`
85 |
new_state_dict[name] = v
86 |
87 |
88 |
89 |
self.palette = get_palette(num_classes)
90 |
91 |
self.transform = T.Compose([
92 |
93 |
T.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])
94 |
95 |
self.aspect_ratio = input_size[1] * 1.0 / input_size[0]
96 |
self.input_size = np.asarray(input_size)
97 |
98 |
def _box2cs(self, box):
99 |
x, y, w, h = box[:4]
100 |
return self._xywh2cs(x, y, w, h)
101 |
102 |
def _xywh2cs(self, x, y, w, h):
103 |
center = np.zeros((2), dtype=np.float32)
104 |
center[0] = x + w * 0.5
105 |
center[1] = y + h * 0.5
106 |
if w > self.aspect_ratio * h:
107 |
h = w * 1.0 / self.aspect_ratio
108 |
elif w < self.aspect_ratio * h:
109 |
w = h * self.aspect_ratio
110 |
scale = np.array([w, h], dtype=np.float32)
111 |
return center, scale
112 |
113 |
def preprocess(self, image:np.array):
114 |
# convert numpy to cv2
115 |
image = image[:,:,::-1]
116 |
h, w, _ = image.shape
117 |
118 |
# Get person center and scale
119 |
person_center, s = self._box2cs([0, 0, w - 1, h - 1])
120 |
r = 0
121 |
trans = get_affine_transform(person_center, s, r, self.input_size)
122 |
input = cv2.warpAffine(
123 |
124 |
125 |
(int(self.input_size[1]), int(self.input_size[0])),
126 |
127 |
128 |
borderValue=(0, 0, 0))
129 |
130 |
input = self.transform(input)
131 |
meta = {
132 |
'center': person_center,
133 |
'height': h,
134 |
'width': w,
135 |
'scale': s,
136 |
'rotation': r
137 |
138 |
139 |
return input, meta
140 |
141 |
142 |
def __call__(self, input_image):
143 |
image, meta = self.preprocess(input_image)
144 |
c = meta['center']
145 |
s = meta['scale']
146 |
w = meta['width']
147 |
h = meta['height']
148 |
input_size = list(self.input_size)
149 |
device = next(self.parameters()).device
150 |
output = self.model(image.unsqueeze(0).to(device))
151 |
upsample = torch.nn.Upsample(size=input_size, mode='bilinear', align_corners=True)
152 |
upsample_output = upsample(output[0][-1][0].unsqueeze(0))
153 |
upsample_output = upsample_output.squeeze()
154 |
upsample_output = upsample_output.permute(1, 2, 0) # CHW -> HWC
155 |
logits_result = transform_logits(, c, s, w, h, input_size=input_size)
156 |
parsing_result = np.argmax(logits_result, axis=2)
157 |
output_img = Image.fromarray(np.asarray(parsing_result, dtype=np.uint8))
158 |
#return output_img
159 |
160 |
return output_img
161 |
#return np.array(output_img)
162 |
@@ -0,0 +1,5 @@
1 |
from .bn import ABN, InPlaceABN, InPlaceABNSync
2 |
from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
3 |
from .misc import GlobalAvgPool2d, SingleGPU
4 |
from .residual import IdentityResidualBlock
5 |
from .dense import DenseModule
@@ -0,0 +1,132 @@
1 |
import torch
2 |
import torch.nn as nn
3 |
import torch.nn.functional as functional
4 |
5 |
6 |
from queue import Queue
7 |
except ImportError:
8 |
from Queue import Queue
9 |
10 |
from .functions import *
11 |
12 |
13 |
class ABN(nn.Module):
    """Activated Batch Normalization

    This gathers a `BatchNorm2d` and an activation function in a single module
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
        """Creates an Activated Batch Normalization module

        Parameters
        ----------
        num_features : int
            Number of feature channels in the input and output.
        eps : float
            Small constant to prevent numerical issues.
        momentum : float
            Momentum factor applied to compute running statistics as.
        affine : bool
            If `True` apply learned scale and shift transformation after normalization.
        activation : str
            Name of the activation functions, one of: `relu`, `leaky_relu`, `elu` or `none`.
        slope : float
            Negative slope for the `leaky_relu` activation.
        """
        super(ABN, self).__init__()
        self.num_features = num_features
        self.affine = affine
        self.eps = eps
        self.momentum = momentum
        self.activation = activation
        self.slope = slope
        if self.affine:
            self.weight = nn.Parameter(torch.ones(num_features))
            self.bias = nn.Parameter(torch.zeros(num_features))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Reset running statistics to (0, 1) and, if affine, weight/bias to (1, 0)."""
        nn.init.constant_(self.running_mean, 0)
        nn.init.constant_(self.running_var, 1)
        if self.affine:
            nn.init.constant_(self.weight, 1)
            nn.init.constant_(self.bias, 0)

    def forward(self, x):
        """Apply batch normalization followed by the configured activation."""
        x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
                                  self.training, self.momentum, self.eps)

        if self.activation == ACT_RELU:
            return functional.relu(x, inplace=True)
        elif self.activation == ACT_LEAKY_RELU:
            return functional.leaky_relu(x, negative_slope=self.slope, inplace=True)
        elif self.activation == ACT_ELU:
            return functional.elu(x, inplace=True)
        else:
            return x

    def __repr__(self):
        rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
              ' affine={affine}, activation={activation}'
        if self.activation == "leaky_relu":
            rep += ', slope={slope})'
        else:
            rep += ')'
        return rep.format(name=self.__class__.__name__, **self.__dict__)
82 |
83 |
84 |
class InPlaceABN(ABN):
    """InPlace Activated Batch Normalization"""

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
        """Creates an InPlace Activated Batch Normalization module

        Parameters
        ----------
        num_features : int
            Number of feature channels in the input and output.
        eps : float
            Small constant to prevent numerical issues.
        momentum : float
            Momentum factor applied to compute running statistics as.
        affine : bool
            If `True` apply learned scale and shift transformation after normalization.
        activation : str
            Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
        slope : float
            Negative slope for the `leaky_relu` activation.
        """
        super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)

    def forward(self, x):
        """Run ``inplace_abn`` on ``x`` and return the first of its three results."""
        x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
                              self.training, self.momentum, self.eps, self.activation, self.slope)
        return x
111 |
112 |
113 |
class InPlaceABNSync(ABN):
    """InPlace Activated Batch Normalization with cross-GPU synchronization
    This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`.
    """

    def forward(self, x):
        """Run ``inplace_abn_sync`` on ``x`` and return the first of its three results."""
        x, _, _ = inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
                                   self.training, self.momentum, self.eps, self.activation, self.slope)
        return x

    def __repr__(self):
        rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
              ' affine={affine}, activation={activation}'
        if self.activation == "leaky_relu":
            rep += ', slope={slope})'
        else:
            rep += ')'
        return rep.format(name=self.__class__.__name__, **self.__dict__)
131 |
132 |
@@ -0,0 +1,84 @@
1 |
import torch
2 |
import torch.nn as nn
3 |
import torch.nn.functional as functional
4 |
5 |
from models._util import try_index
6 |
from .bn import ABN
7 |
8 |
9 |
class DeeplabV3(nn.Module):
10 |
def __init__(self,
11 |
12 |
13 |
14 |
dilations=(12, 24, 36),
15 |
16 |
17 |
super(DeeplabV3, self).__init__()
18 |
self.pooling_size = pooling_size
19 |
20 |
self.map_convs = nn.ModuleList([
21 |
nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
22 |
nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
23 |
nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
24 |
nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
25 |
26 |
self.map_bn = norm_act(hidden_channels * 4)
27 |
28 |
self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
29 |
self.global_pooling_bn = norm_act(hidden_channels)
30 |
31 |
self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
32 |
self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
33 |
self.red_bn = norm_act(out_channels)
34 |
35 |
self.reset_parameters(self.map_bn.activation, self.map_bn.slope)
36 |
37 |
def reset_parameters(self, activation, slope):
38 |
gain = nn.init.calculate_gain(activation, slope)
39 |
for m in self.modules():
40 |
if isinstance(m, nn.Conv2d):
41 |
nn.init.xavier_normal_(, gain)
42 |
if hasattr(m, "bias") and m.bias is not None:
43 |
nn.init.constant_(m.bias, 0)
44 |
elif isinstance(m, ABN):
45 |
if hasattr(m, "weight") and m.weight is not None:
46 |
nn.init.constant_(m.weight, 1)
47 |
if hasattr(m, "bias") and m.bias is not None:
48 |
nn.init.constant_(m.bias, 0)
49 |
50 |
def forward(self, x):
51 |
# Map convolutions
52 |
out =[m(x) for m in self.map_convs], dim=1)
53 |
out = self.map_bn(out)
54 |
out = self.red_conv(out)
55 |
56 |
# Global pooling
57 |
pool = self._global_pooling(x)
58 |
pool = self.global_pooling_conv(pool)
59 |
pool = self.global_pooling_bn(pool)
60 |
pool = self.pool_red_conv(pool)
61 |
if or self.pooling_size is None:
62 |
pool = pool.repeat(1, 1, x.size(2), x.size(3))
63 |
64 |
out += pool
65 |
out = self.red_bn(out)
66 |
return out
67 |
68 |
def _global_pooling(self, x):
69 |
if or self.pooling_size is None:
70 |
pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
71 |
pool = pool.view(x.size(0), x.size(1), 1, 1)
72 |
73 |
pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
74 |
min(try_index(self.pooling_size, 1), x.shape[3]))
75 |
padding = (
76 |
(pooling_size[1] - 1) // 2,
77 |
(pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
78 |
(pooling_size[0] - 1) // 2,
79 |
(pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
80 |
81 |
82 |
pool = functional.avg_pool2d(x, pooling_size, stride=1)
83 |
pool = functional.pad(pool, pad=padding, mode="replicate")
84 |
return pool
@@ -0,0 +1,42 @@
1 |
from collections import OrderedDict
2 |
3 |
import torch
4 |
import torch.nn as nn
5 |
6 |
from .bn import ABN
7 |
8 |
9 |
class DenseModule(nn.Module):
    """Stack of densely connected bottleneck layers.

    Layer ``i`` receives the concatenation of the module input and the outputs
    of all previous layers, applies ``norm_act`` + 1x1 conv (to
    ``growth * bottleneck_factor`` channels) and then ``norm_act`` + 3x3 conv
    (to ``growth`` channels).

    NOTE(review): several lines of this class were lost in extraction; the
    ``nn.Sequential(OrderedDict(...))`` wrappers, ``dilation=dilation`` and the
    ``@property`` decorator are reconstructed and should be confirmed against
    the original file.
    """

    def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
        super(DenseModule, self).__init__()
        self.in_channels = in_channels
        self.growth = growth
        self.layers = layers

        self.convs1 = nn.ModuleList()
        self.convs3 = nn.ModuleList()
        for i in range(self.layers):
            self.convs1.append(nn.Sequential(OrderedDict([
                ("bn", norm_act(in_channels)),
                ("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False))
            ])))
            self.convs3.append(nn.Sequential(OrderedDict([
                ("bn", norm_act(self.growth * bottleneck_factor)),
                ("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False,
                                   dilation=dilation))
            ])))
            # The next layer also sees this layer's ``growth`` output channels.
            in_channels += self.growth

    @property
    def out_channels(self):
        """Number of channels produced by ``forward``."""
        return self.in_channels + self.growth * self.layers

    def forward(self, x):
        """Return the input concatenated with every layer's output along dim 1."""
        inputs = [x]
        for i in range(self.layers):
            x = torch.cat(inputs, dim=1)
            x = self.convs1[i](x)
            x = self.convs3[i](x)
            inputs += [x]

        return torch.cat(inputs, dim=1)
@@ -0,0 +1,244 @@
1 |
from os import path
2 |
import torch
3 |
import torch.distributed as dist
4 |
import torch.autograd as autograd
5 |
import torch.cuda.comm as comm
6 |
from torch.autograd.function import once_differentiable
7 |
from torch.utils.cpp_extension import load
8 |
9 |
_src_path = path.join(path.dirname(path.abspath(__file__)), "src")
10 |
_backend = load(name="inplace_abn",
11 |
12 |
sources=[path.join(_src_path, f) for f in [
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
# Activation names
21 |
ACT_RELU = "relu"
22 |
ACT_LEAKY_RELU = "leaky_relu"
23 |
ACT_ELU = "elu"
24 |
ACT_NONE = "none"
25 |
26 |
27 |
def _check(fn, *args, **kwargs):
28 |
success = fn(*args, **kwargs)
29 |
if not success:
30 |
raise RuntimeError("CUDA Error encountered in {}".format(fn))
31 |
32 |
33 |
def _broadcast_shape(x):
34 |
out_size = []
35 |
for i, s in enumerate(x.size()):
36 |
if i != 1:
37 |
38 |
39 |
40 |
return out_size
41 |
42 |
43 |
def _reduce(x):
44 |
if len(x.size()) == 2:
45 |
return x.sum(dim=0)
46 |
47 |
n, c = x.size()[0:2]
48 |
return x.contiguous().view((n, c, -1)).sum(2).sum(0)
49 |
50 |
51 |
def _count_samples(x):
52 |
count = 1
53 |
for i, s in enumerate(x.size()):
54 |
if i != 1:
55 |
count *= s
56 |
return count
57 |
58 |
59 |
def _act_forward(ctx, x):
    """Apply the activation selected by ``ctx.activation`` to ``x`` via the backend.

    NOTE(review): the bodies of the ELU and NONE branches were missing from
    this copy. The ELU branch is restored as a call to the backend's exported
    ``elu_forward``, mirroring ``_act_backward`` -- confirm against upstream.

    Args:
        ctx: Context object carrying ``activation`` and, for leaky ReLU, ``slope``.
        x: Tensor handed to the backend activation routine.
    """
    if ctx.activation == ACT_LEAKY_RELU:
        _backend.leaky_relu_forward(x, ctx.slope)
    elif ctx.activation == ACT_ELU:
        _backend.elu_forward(x)
    elif ctx.activation == ACT_NONE:
        # No activation requested: leave x as produced by the normalization.
        pass
66 |
67 |
68 |
def _act_backward(ctx, x, dx):
    """Run the backend backward routine for the activation in ``ctx.activation``.

    The backend registers these routines as "backward computation and
    inversion", and both ``x`` and ``dx`` are passed to them.

    Args:
        ctx: Context object carrying ``activation`` and, for leaky ReLU, ``slope``.
        x: Activated output saved by the forward pass.
        dx: Gradient with respect to ``x``.
    """
    if ctx.activation == ACT_LEAKY_RELU:
        _backend.leaky_relu_backward(x, dx, ctx.slope)
    elif ctx.activation == ACT_ELU:
        _backend.elu_backward(x, dx)
    elif ctx.activation == ACT_NONE:
        # The branch body was missing in this copy; nothing to undo here.
        pass
75 |
76 |
77 |
class InPlaceABN(autograd.Function):
78 |
79 |
def forward(ctx, x, weight, bias, running_mean, running_var,
80 |
training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
81 |
# Save context
82 |
+ = training
83 |
ctx.momentum = momentum
84 |
ctx.eps = eps
85 |
ctx.activation = activation
86 |
ctx.slope = slope
87 |
ctx.affine = weight is not None and bias is not None
88 |
89 |
# Prepare inputs
90 |
count = _count_samples(x)
91 |
x = x.contiguous()
92 |
weight = weight.contiguous() if ctx.affine else x.new_empty(0)
93 |
bias = bias.contiguous() if ctx.affine else x.new_empty(0)
94 |
95 |
96 |
mean, var = _backend.mean_var(x)
97 |
98 |
# Update running stats
99 |
running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
100 |
running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))
101 |
102 |
# Mark in-place modified tensors
103 |
ctx.mark_dirty(x, running_mean, running_var)
104 |
105 |
mean, var = running_mean.contiguous(), running_var.contiguous()
106 |
107 |
108 |
# BN forward + activation
109 |
_backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
110 |
_act_forward(ctx, x)
111 |
112 |
# Output
113 |
ctx.var = var
114 |
ctx.save_for_backward(x, var, weight, bias)
115 |
ctx.mark_non_differentiable(running_mean, running_var)
116 |
return x, running_mean, running_var
117 |
118 |
119 |
120 |
def backward(ctx, dz, _drunning_mean, _drunning_var):
121 |
z, var, weight, bias = ctx.saved_tensors
122 |
dz = dz.contiguous()
123 |
124 |
# Undo activation
125 |
_act_backward(ctx, z, dz)
126 |
127 |
128 |
edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
129 |
130 |
# TODO: implement simplified CUDA backward for inference mode
131 |
edz = dz.new_zeros(dz.size(1))
132 |
eydz = dz.new_zeros(dz.size(1))
133 |
134 |
dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
135 |
# dweight = eydz * weight.sign() if ctx.affine else None
136 |
dweight = eydz if ctx.affine else None
137 |
if dweight is not None:
138 |
dweight[weight < 0] *= -1
139 |
dbias = edz if ctx.affine else None
140 |
141 |
return dx, dweight, dbias, None, None, None, None, None, None, None
142 |
143 |
144 |
class InPlaceABNSync(autograd.Function):
145 |
146 |
def forward(cls, ctx, x, weight, bias, running_mean, running_var,
147 |
training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
148 |
# Save context
149 |
+ = training
150 |
ctx.momentum = momentum
151 |
ctx.eps = eps
152 |
ctx.activation = activation
153 |
ctx.slope = slope
154 |
ctx.affine = weight is not None and bias is not None
155 |
156 |
# Prepare inputs
157 |
ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1
158 |
159 |
# count = _count_samples(x)
160 |
batch_size = x.new_tensor([x.shape[0]], dtype=torch.long)
161 |
162 |
x = x.contiguous()
163 |
weight = weight.contiguous() if ctx.affine else x.new_empty(0)
164 |
bias = bias.contiguous() if ctx.affine else x.new_empty(0)
165 |
166 |
167 |
mean, var = _backend.mean_var(x)
168 |
if ctx.world_size > 1:
169 |
# get global batch size
170 |
if equal_batches:
171 |
batch_size *= ctx.world_size
172 |
173 |
dist.all_reduce(batch_size, dist.ReduceOp.SUM)
174 |
175 |
ctx.factor = x.shape[0] / float(batch_size.item())
176 |
177 |
mean_all = mean.clone() * ctx.factor
178 |
dist.all_reduce(mean_all, dist.ReduceOp.SUM)
179 |
180 |
var_all = (var + (mean - mean_all) ** 2) * ctx.factor
181 |
dist.all_reduce(var_all, dist.ReduceOp.SUM)
182 |
183 |
mean = mean_all
184 |
var = var_all
185 |
186 |
# Update running stats
187 |
running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
188 |
count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1]
189 |
running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))
190 |
191 |
# Mark in-place modified tensors
192 |
ctx.mark_dirty(x, running_mean, running_var)
193 |
194 |
mean, var = running_mean.contiguous(), running_var.contiguous()
195 |
196 |
197 |
# BN forward + activation
198 |
_backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
199 |
_act_forward(ctx, x)
200 |
201 |
# Output
202 |
ctx.var = var
203 |
ctx.save_for_backward(x, var, weight, bias)
204 |
ctx.mark_non_differentiable(running_mean, running_var)
205 |
return x, running_mean, running_var
206 |
207 |
208 |
209 |
def backward(ctx, dz, _drunning_mean, _drunning_var):
210 |
z, var, weight, bias = ctx.saved_tensors
211 |
dz = dz.contiguous()
212 |
213 |
# Undo activation
214 |
_act_backward(ctx, z, dz)
215 |
216 |
217 |
edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
218 |
edz_local = edz.clone()
219 |
eydz_local = eydz.clone()
220 |
221 |
if ctx.world_size > 1:
222 |
edz *= ctx.factor
223 |
dist.all_reduce(edz, dist.ReduceOp.SUM)
224 |
225 |
eydz *= ctx.factor
226 |
dist.all_reduce(eydz, dist.ReduceOp.SUM)
227 |
228 |
edz_local = edz = dz.new_zeros(dz.size(1))
229 |
eydz_local = eydz = dz.new_zeros(dz.size(1))
230 |
231 |
dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
232 |
# dweight = eydz_local * weight.sign() if ctx.affine else None
233 |
dweight = eydz_local if ctx.affine else None
234 |
if dweight is not None:
235 |
dweight[weight < 0] *= -1
236 |
dbias = edz_local if ctx.affine else None
237 |
238 |
return dx, dweight, dbias, None, None, None, None, None, None, None
239 |
240 |
241 |
inplace_abn = InPlaceABN.apply
242 |
inplace_abn_sync = InPlaceABNSync.apply
243 |
244 |
__all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
@@ -0,0 +1,21 @@
1 |
import torch.nn as nn
2 |
import torch
3 |
import torch.distributed as dist
4 |
5 |
class GlobalAvgPool2d(nn.Module):
    """Global average pooling over the input's spatial dimensions"""

    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()

    def forward(self, inputs):
        """Average ``inputs`` over everything after its first two dimensions.

        The trailing dimensions are flattened into one axis and averaged, so
        the result keeps only the first two dimensions of ``inputs``.
        """
        dims = inputs.size()
        flattened = inputs.view((dims[0], dims[1], -1))
        return flattened.mean(dim=2)
13 |
14 |
class SingleGPU(nn.Module):
    """Wrap a module so that its input is moved to the GPU before the call."""

    def __init__(self, module):
        """Store the wrapped module.

        Args:
            module: Callable module invoked by :meth:`forward`.
        """
        super(SingleGPU, self).__init__()
        # forward() reads self.module, so it has to be assigned here; the
        # assignment was missing in this copy of the file.
        self.module = module

    def forward(self, input):
        """Call the wrapped module on ``input.cuda(non_blocking=True)``."""
        return self.module(input.cuda(non_blocking=True))
21 |
@@ -0,0 +1,182 @@
1 |
from collections import OrderedDict
2 |
3 |
import torch.nn as nn
4 |
5 |
6 |
import torch.nn.functional as functional
7 |
8 |
9 |
class ResidualBlock(nn.Module):
10 |
"""Configurable residual block
11 |
12 |
13 |
14 |
in_channels : int
15 |
Number of input channels.
16 |
channels : list of int
17 |
Number of channels in the internal feature maps. Can either have two or three elements: if two construct
18 |
a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
19 |
`3 x 3` then `1 x 1` convolutions.
20 |
stride : int
21 |
Stride of the first `3 x 3` convolution
22 |
dilation : int
23 |
Dilation to apply to the `3 x 3` convolutions.
24 |
groups : int
25 |
Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
26 |
bottleneck blocks.
27 |
norm_act : callable
28 |
Function to create normalization / activation Module.
29 |
dropout: callable
30 |
Function to create Dropout Module.
31 |
32 |
33 |
def __init__(self,
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
super(ResidualBlock, self).__init__()
42 |
43 |
# Check parameters for inconsistencies
44 |
if len(channels) != 2 and len(channels) != 3:
45 |
raise ValueError("channels must contain either two or three values")
46 |
if len(channels) == 2 and groups != 1:
47 |
raise ValueError("groups > 1 are only valid if len(channels) == 3")
48 |
49 |
is_bottleneck = len(channels) == 3
50 |
need_proj_conv = stride != 1 or in_channels != channels[-1]
51 |
52 |
if not is_bottleneck:
53 |
bn2 = norm_act(channels[1])
54 |
bn2.activation = ACT_NONE
55 |
layers = [
56 |
("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
57 |
58 |
("bn1", norm_act(channels[0])),
59 |
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
60 |
61 |
("bn2", bn2)
62 |
63 |
if dropout is not None:
64 |
layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
65 |
66 |
bn3 = norm_act(channels[2])
67 |
bn3.activation = ACT_NONE
68 |
layers = [
69 |
("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)),
70 |
("bn1", norm_act(channels[0])),
71 |
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False,
72 |
groups=groups, dilation=dilation)),
73 |
("bn2", norm_act(channels[1])),
74 |
("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)),
75 |
("bn3", bn3)
76 |
77 |
if dropout is not None:
78 |
layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
79 |
self.convs = nn.Sequential(OrderedDict(layers))
80 |
81 |
if need_proj_conv:
82 |
self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
83 |
self.proj_bn = norm_act(channels[-1])
84 |
self.proj_bn.activation = ACT_NONE
85 |
86 |
def forward(self, x):
87 |
if hasattr(self, "proj_conv"):
88 |
residual = self.proj_conv(x)
89 |
residual = self.proj_bn(residual)
90 |
91 |
residual = x
92 |
x = self.convs(x) + residual
93 |
94 |
if self.convs.bn1.activation == ACT_LEAKY_RELU:
95 |
return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True)
96 |
elif self.convs.bn1.activation == ACT_ELU:
97 |
return functional.elu(x, inplace=True)
98 |
99 |
return x
100 |
101 |
102 |
class IdentityResidualBlock(nn.Module):
103 |
def __init__(self,
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
"""Configurable identity-mapping residual block
112 |
113 |
114 |
115 |
in_channels : int
116 |
Number of input channels.
117 |
channels : list of int
118 |
Number of channels in the internal feature maps. Can either have two or three elements: if two construct
119 |
a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
120 |
`3 x 3` then `1 x 1` convolutions.
121 |
stride : int
122 |
Stride of the first `3 x 3` convolution
123 |
dilation : int
124 |
Dilation to apply to the `3 x 3` convolutions.
125 |
groups : int
126 |
Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
127 |
bottleneck blocks.
128 |
norm_act : callable
129 |
Function to create normalization / activation Module.
130 |
dropout: callable
131 |
Function to create Dropout Module.
132 |
133 |
super(IdentityResidualBlock, self).__init__()
134 |
135 |
# Check parameters for inconsistencies
136 |
if len(channels) != 2 and len(channels) != 3:
137 |
raise ValueError("channels must contain either two or three values")
138 |
if len(channels) == 2 and groups != 1:
139 |
raise ValueError("groups > 1 are only valid if len(channels) == 3")
140 |
141 |
is_bottleneck = len(channels) == 3
142 |
need_proj_conv = stride != 1 or in_channels != channels[-1]
143 |
144 |
self.bn1 = norm_act(in_channels)
145 |
if not is_bottleneck:
146 |
layers = [
147 |
("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
148 |
149 |
("bn2", norm_act(channels[0])),
150 |
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
151 |
152 |
153 |
if dropout is not None:
154 |
layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
155 |
156 |
layers = [
157 |
("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
158 |
("bn2", norm_act(channels[0])),
159 |
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
160 |
groups=groups, dilation=dilation)),
161 |
("bn3", norm_act(channels[1])),
162 |
("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
163 |
164 |
if dropout is not None:
165 |
layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
166 |
self.convs = nn.Sequential(OrderedDict(layers))
167 |
168 |
if need_proj_conv:
169 |
self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
170 |
171 |
def forward(self, x):
172 |
if hasattr(self, "proj_conv"):
173 |
bn1 = self.bn1(x)
174 |
shortcut = self.proj_conv(bn1)
175 |
176 |
shortcut = x.clone()
177 |
bn1 = self.bn1(x)
178 |
179 |
out = self.convs(bn1)
180 |
181 |
182 |
return out
@@ -0,0 +1,15 @@
1 |
#pragma once
2 |
3 |
#include <ATen/ATen.h>
4 |
5 |
// Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6 |
#ifndef AT_CHECK
7 |
8 |
9 |
10 |
#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11 |
#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12 |
#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13 |
14 |
15 |
@@ -0,0 +1,95 @@
1 |
#include <torch/extension.h>
2 |
3 |
#include <vector>
4 |
5 |
#include "inplace_abn.h"
6 |
7 |
// Dispatches the mean/variance computation to the CPU, CUDA or CUDA
// half-precision implementation, depending on where `x` lives and on its
// scalar type.
std::vector<at::Tensor> mean_var(at::Tensor x) {
  if (!x.is_cuda()) {
    return mean_var_cpu(x);
  }
  if (x.type().scalarType() == at::ScalarType::Half) {
    return mean_var_cuda_h(x);
  }
  return mean_var_cuda(x);
}
18 |
19 |
// Dispatches the forward computation (registered as "In-place forward
// computation") to the CPU, CUDA or CUDA half-precision implementation,
// selected from the device and scalar type of `x`.
at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
                   bool affine, float eps) {
  if (!x.is_cuda()) {
    return forward_cpu(x, mean, var, weight, bias, affine, eps);
  }
  if (x.type().scalarType() == at::ScalarType::Half) {
    return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
  }
  return forward_cuda(x, mean, var, weight, bias, affine, eps);
}
31 |
32 |
// Dispatches the first part of the backward computation to the CPU, CUDA or
// CUDA half-precision implementation, selected from the device and scalar
// type of `z`. Returns the pair {edz, eydz}.
std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
                                 bool affine, float eps) {
  if (!z.is_cuda()) {
    return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
  }
  if (z.type().scalarType() == at::ScalarType::Half) {
    return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
  }
  return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
}
44 |
45 |
// Dispatches the second part of the backward computation to the CPU, CUDA or
// CUDA half-precision implementation, selected from the device and scalar
// type of `z`. Returns the gradient with respect to the input.
at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
                    at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
  if (!z.is_cuda()) {
    return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
  }
  if (z.type().scalarType() == at::ScalarType::Half) {
    return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
  }
  return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
}
57 |
58 |
// Applies leaky ReLU with the given negative slope to `z` in place, using
// ATen's in-place operator.
void leaky_relu_forward(at::Tensor z, float slope) {
  at::leaky_relu_(z, slope);
}
61 |
62 |
// Dispatches the leaky ReLU backward step (registered as "backward
// computation and inversion") to the CPU, CUDA or CUDA half-precision
// implementation, selected from the device and scalar type of `z`.
void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
  if (!z.is_cuda()) {
    leaky_relu_backward_cpu(z, dz, slope);
    return;
  }
  if (z.type().scalarType() == at::ScalarType::Half) {
    leaky_relu_backward_cuda_h(z, dz, slope);
    return;
  }
  leaky_relu_backward_cuda(z, dz, slope);
}
73 |
74 |
// Applies ELU to `z` in place.
// NOTE(review): the body was missing from this copy of the file. It is
// restored with ATen's in-place ELU operator, by analogy with
// leaky_relu_forward -- confirm against the upstream source.
void elu_forward(at::Tensor z) {
  at::elu_(z);
}
77 |
78 |
// Dispatches the ELU backward step (registered as "backward computation and
// inversion") to the CPU or CUDA implementation depending on where `z` lives.
// Unlike the other dispatchers there is no half-precision variant.
void elu_backward(at::Tensor z, at::Tensor dz) {
  if (!z.is_cuda()) {
    elu_backward_cpu(z, dz);
    return;
  }
  elu_backward_cuda(z, dz);
}
85 |
86 |
87 |
m.def("mean_var", &mean_var, "Mean and variance computation");
88 |
m.def("forward", &forward, "In-place forward computation");
89 |
m.def("edz_eydz", &edz_eydz, "First part of backward computation");
90 |
m.def("backward", &backward, "Second part of backward computation");
91 |
m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
92 |
m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
93 |
m.def("elu_forward", &elu_forward, "Elu forward computation");
94 |
m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
95 |
@@ -0,0 +1,88 @@
1 |
#pragma once
2 |
3 |
#include <ATen/ATen.h>
4 |
5 |
#include <vector>
6 |
7 |
std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
8 |
std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
9 |
std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);
10 |
11 |
at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
12 |
bool affine, float eps);
13 |
at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
14 |
bool affine, float eps);
15 |
at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
16 |
bool affine, float eps);
17 |
18 |
std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
19 |
bool affine, float eps);
20 |
std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
21 |
bool affine, float eps);
22 |
std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
23 |
bool affine, float eps);
24 |
25 |
at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
26 |
at::Tensor edz, at::Tensor eydz, bool affine, float eps);
27 |
at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
28 |
at::Tensor edz, at::Tensor eydz, bool affine, float eps);
29 |
at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
30 |
at::Tensor edz, at::Tensor eydz, bool affine, float eps);
31 |
32 |
void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
33 |
void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
34 |
void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);
35 |
36 |
void elu_backward_cpu(at::Tensor z, at::Tensor dz);
37 |
void elu_backward_cuda(at::Tensor z, at::Tensor dz);
38 |
39 |
// Splits the shape of `x` into three extents: `num` is the size of dimension
// 0, `chn` the size of dimension 1, and `sp` the product of the sizes of all
// remaining dimensions (1 when there are none).
static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
  num = x.size(0);
  chn = x.size(1);
  sp = 1;
  for (int64_t i = 2; i < x.ndimension(); ++i) {
    sp *= x.size(i);
  }
}
46 |
47 |
48 |
* Specialized CUDA reduction functions for BN
49 |
50 |
#ifdef __CUDACC__
51 |
52 |
#include "utils/cuda.cuh"
53 |
54 |
template <typename T, typename Op>
55 |
__device__ T reduce(Op op, int plane, int N, int S) {
56 |
T sum = (T)0;
57 |
for (int batch = 0; batch < N; ++batch) {
58 |
for (int x = threadIdx.x; x < S; x += blockDim.x) {
59 |
sum += op(batch, plane, x);
60 |
61 |
62 |
63 |
// sum over NumThreads within a warp
64 |
sum = warpSum(sum);
65 |
66 |
// 'transpose', and reduce within warp again
67 |
__shared__ T shared[32];
68 |
69 |
if (threadIdx.x % WARP_SIZE == 0) {
70 |
shared[threadIdx.x / WARP_SIZE] = sum;
71 |
72 |
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
73 |
// zero out the other entries in shared
74 |
shared[threadIdx.x] = (T)0;
75 |
76 |
77 |
if (threadIdx.x / WARP_SIZE == 0) {
78 |
sum = warpSum(shared[threadIdx.x]);
79 |
if (threadIdx.x == 0) {
80 |
shared[0] = sum;
81 |
82 |
83 |
84 |
85 |
// Everyone picks it up, should be broadcast into the whole gradInput
86 |
return shared[0];
87 |
88 |
@@ -0,0 +1,119 @@
1 |
#include <ATen/ATen.h>
2 |
3 |
#include <vector>
4 |
5 |
#include "utils/checks.h"
6 |
#include "inplace_abn.h"
7 |
8 |
// Sums `x` over every dimension except dimension 1. Two-dimensional inputs
// are summed over dimension 0; otherwise the trailing dimensions are
// flattened and summed first, followed by dimension 0.
at::Tensor reduce_sum(at::Tensor x) {
  if (x.ndimension() == 2) {
    return x.sum(0);
  }
  auto flat = x.view({x.size(0), x.size(1), -1});
  return flat.sum(-1).sum(0);
}
16 |
17 |
// Reshapes `v` so that it can be broadcast against `x`. For a two-dimensional
// `x` the vector is returned unchanged; otherwise `v` is viewed as
// {1, -1, 1, ...} with one trailing unit dimension per dimension of `x`
// beyond the second.
// NOTE(review): the loop body was missing from this copy. Appending unit
// dimensions is the reconstruction that gives the view the same rank as `x`
// -- confirm against the upstream source.
at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
  if (x.ndimension() == 2) {
    return v;
  }
  std::vector<int64_t> broadcast_size = {1, -1};
  for (int64_t i = 2; i < x.ndimension(); ++i) {
    broadcast_size.push_back(1);
  }
  return v.view(broadcast_size);
}
28 |
29 |
// Returns the product of all dimension sizes of `x` except dimension 1.
int64_t count(at::Tensor x) {
  int64_t total = x.size(0);
  for (int64_t i = 2; i < x.ndimension(); ++i) {
    total *= x.size(i);
  }
  return total;
}
36 |
37 |
// Undoes the affine transform applied by forward_cpu: subtracts `bias` and
// divides by `|weight| + eps`, both broadcast against `z`. When `affine` is
// false, `z` is returned unchanged.
at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
  if (!affine) {
    return z;
  }
  auto scale = broadcast_to(at::abs(weight) + eps, z);
  return (z - broadcast_to(bias, z)) / scale;
}
44 |
45 |
// Computes the mean and the variance of `x` over every dimension except
// dimension 1. The variance is divided by the full element count (no
// `count - 1` correction is applied here). Returns {mean, var}.
std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
  auto num = count(x);
  auto mean = reduce_sum(x) / num;
  auto centered = x - broadcast_to(mean, x);
  auto var = reduce_sum(centered.pow(2)) / num;

  return {mean, var};
}
53 |
54 |
// Normalizes `x` in place on the CPU: x = (x - mean) * rsqrt(var + eps) * gamma
// (+ bias when `affine`), where gamma is `|weight| + eps` for the affine case
// and 1 otherwise. Returns the modified `x`.
at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
                       bool affine, float eps) {
  auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
  auto mul = at::rsqrt(var + eps) * gamma;

  x.sub_(broadcast_to(mean, x));
  x.mul_(broadcast_to(mul, x));
  if (affine) {
    x.add_(broadcast_to(bias, x));
  }

  return x;
}
65 |
66 |
// Computes the two reductions used by the backward pass on the CPU:
//   edz  = sum(dz)
//   eydz = sum(y * dz), with y obtained by undoing the affine transform on z.
// Both sums run over every dimension except dimension 1. Returns {edz, eydz}.
std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
                                     bool affine, float eps) {
  auto edz = reduce_sum(dz);
  auto y = invert_affine(z, weight, bias, affine, eps);
  auto eydz = reduce_sum(y * dz);

  return {edz, eydz};
}
74 |
75 |
// Computes the input gradient on the CPU:
//   dx = (dz - edz / num - y * eydz / num) * mul
// where y is z with the affine transform undone, num is the element count
// excluding dimension 1, and mul is rsqrt(var + eps), additionally scaled by
// `|weight| + eps` when `affine` is set.
at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
                        at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
  auto y = invert_affine(z, weight, bias, affine, eps);
  auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);

  auto num = count(z);
  auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
  return dx;
}
84 |
85 |
// Backward step of the in-place leaky ReLU on the CPU. For every element
// whose activated value is negative, the gradient is scaled by `slope` and
// the activated value is divided by `slope`, which undoes the activation.
// Both tensors are modified in place.
// NOTE(review): the pointer accessors were garbled in this copy and are
// restored as `data<scalar_t>()`; the CHECK_CPU guards are likewise restored
// from the macros in utils/checks.h -- confirm against the upstream source.
// The linear indexing presumably requires contiguous tensors.
void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
  CHECK_CPU(z);
  CHECK_CPU(dz);

  AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] {
    int64_t count = z.numel();
    auto *_z = z.data<scalar_t>();
    auto *_dz = dz.data<scalar_t>();

    for (int64_t i = 0; i < count; ++i) {
      if (_z[i] < 0) {
        _z[i] *= 1 / slope;
        _dz[i] *= slope;
      }
    }
  }));
}
102 |
103 |
// Backward step of the in-place ELU on the CPU. For every element whose
// activated value is negative, the gradient is scaled by (z + 1) and the
// activated value is then inverted with log1p. Both tensors are modified in
// place.
// NOTE(review): the pointer accessors were garbled in this copy and are
// restored as `data<scalar_t>()`; the CHECK_CPU guards are likewise restored
// from the macros in utils/checks.h -- confirm against the upstream source.
// The linear indexing presumably requires contiguous tensors.
void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
  CHECK_CPU(z);
  CHECK_CPU(dz);

  AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] {
    int64_t count = z.numel();
    auto *_z = z.data<scalar_t>();
    auto *_dz = dz.data<scalar_t>();

    for (int64_t i = 0; i < count; ++i) {
      if (_z[i] < 0) {
        // The gradient factor must use the activated value, so scale dz
        // before z is inverted. The previous order inverted z first and
        // scaled by (log1p(z) + 1), which disagrees with the CUDA version.
        _dz[i] *= (_z[i] + 1.f);
        _z[i] = log1p(_z[i]);
      }
    }
  }));
}
@@ -0,0 +1,333 @@
1 |
#include <ATen/ATen.h>
2 |
3 |
#include <thrust/device_ptr.h>
4 |
#include <thrust/transform.h>
5 |
6 |
#include <vector>
7 |
8 |
#include "utils/checks.h"
9 |
#include "utils/cuda.cuh"
10 |
#include "inplace_abn.h"
11 |
12 |
#include <ATen/cuda/CUDAContext.h>
13 |
14 |
// Operations for reduce
15 |
template<typename T>
16 |
struct SumOp {
17 |
__device__ SumOp(const T *t, int c, int s)
18 |
: tensor(t), chn(c), sp(s) {}
19 |
__device__ __forceinline__ T operator()(int batch, int plane, int n) {
20 |
return tensor[(batch * chn + plane) * sp + n];
21 |
22 |
const T *tensor;
23 |
const int chn;
24 |
const int sp;
25 |
26 |
27 |
template<typename T>
28 |
struct VarOp {
29 |
__device__ VarOp(T m, const T *t, int c, int s)
30 |
: mean(m), tensor(t), chn(c), sp(s) {}
31 |
__device__ __forceinline__ T operator()(int batch, int plane, int n) {
32 |
T val = tensor[(batch * chn + plane) * sp + n];
33 |
return (val - mean) * (val - mean);
34 |
35 |
const T mean;
36 |
const T *tensor;
37 |
const int chn;
38 |
const int sp;
39 |
40 |
41 |
template<typename T>
42 |
struct GradOp {
43 |
__device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
44 |
: weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
45 |
__device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
46 |
T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
47 |
T _dz = dz[(batch * chn + plane) * sp + n];
48 |
return Pair<T>(_dz, _y * _dz);
49 |
50 |
const T weight;
51 |
const T bias;
52 |
const T *z;
53 |
const T *dz;
54 |
const int chn;
55 |
const int sp;
56 |
57 |
58 |
59 |
* mean_var
60 |
61 |
62 |
template<typename T>
63 |
__global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
64 |
int plane = blockIdx.x;
65 |
T norm = T(1) / T(num * sp);
66 |
67 |
T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
68 |
69 |
T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;
70 |
71 |
if (threadIdx.x == 0) {
72 |
mean[plane] = _mean;
73 |
var[plane] = _var;
74 |
75 |
76 |
77 |
std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
78 |
79 |
80 |
// Extract dimensions
81 |
int64_t num, chn, sp;
82 |
get_dims(x, num, chn, sp);
83 |
84 |
// Prepare output tensors
85 |
auto mean = at::empty({chn}, x.options());
86 |
auto var = at::empty({chn}, x.options());
87 |
88 |
// Run kernel
89 |
dim3 blocks(chn);
90 |
dim3 threads(getNumThreads(sp));
91 |
auto stream = at::cuda::getCurrentCUDAStream();
92 |
AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
93 |
mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
94 |
95 |
96 |
97 |
num, chn, sp);
98 |
99 |
100 |
return {mean, var};
101 |
102 |
103 |
104 |
* forward
105 |
106 |
107 |
template<typename T>
108 |
__global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
109 |
bool affine, float eps, int num, int chn, int sp) {
110 |
int plane = blockIdx.x;
111 |
112 |
T _mean = mean[plane];
113 |
T _var = var[plane];
114 |
T _weight = affine ? abs(weight[plane]) + eps : T(1);
115 |
T _bias = affine ? bias[plane] : T(0);
116 |
117 |
T mul = rsqrt(_var + eps) * _weight;
118 |
119 |
for (int batch = 0; batch < num; ++batch) {
120 |
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
121 |
T _x = x[(batch * chn + plane) * sp + n];
122 |
T _y = (_x - _mean) * mul + _bias;
123 |
124 |
x[(batch * chn + plane) * sp + n] = _y;
125 |
126 |
127 |
128 |
129 |
at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
130 |
bool affine, float eps) {
131 |
132 |
133 |
134 |
135 |
136 |
137 |
// Extract dimensions
138 |
int64_t num, chn, sp;
139 |
get_dims(x, num, chn, sp);
140 |
141 |
// Run kernel
142 |
dim3 blocks(chn);
143 |
dim3 threads(getNumThreads(sp));
144 |
auto stream = at::cuda::getCurrentCUDAStream();
145 |
AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
146 |
forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
147 |
148 |
149 |
150 |
151 |
152 |
affine, eps, num, chn, sp);
153 |
154 |
155 |
return x;
156 |
157 |
158 |
159 |
* edz_eydz
160 |
161 |
162 |
template<typename T>
163 |
__global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
164 |
T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
165 |
int plane = blockIdx.x;
166 |
167 |
T _weight = affine ? abs(weight[plane]) + eps : 1.f;
168 |
T _bias = affine ? bias[plane] : 0.f;
169 |
170 |
Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
171 |
172 |
173 |
if (threadIdx.x == 0) {
174 |
edz[plane] = res.v1;
175 |
eydz[plane] = res.v2;
176 |
177 |
178 |
179 |
std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
180 |
bool affine, float eps) {
181 |
182 |
183 |
184 |
185 |
186 |
// Extract dimensions
187 |
int64_t num, chn, sp;
188 |
get_dims(z, num, chn, sp);
189 |
190 |
auto edz = at::empty({chn}, z.options());
191 |
auto eydz = at::empty({chn}, z.options());
192 |
193 |
// Run kernel
194 |
dim3 blocks(chn);
195 |
dim3 threads(getNumThreads(sp));
196 |
auto stream = at::cuda::getCurrentCUDAStream();
197 |
AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
198 |
edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
199 |
200 |
201 |
202 |
203 |
204 |
205 |
affine, eps, num, chn, sp);
206 |
207 |
208 |
return {edz, eydz};
209 |
210 |
211 |
212 |
* backward
213 |
214 |
215 |
template<typename T>
216 |
__global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
217 |
const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
218 |
int plane = blockIdx.x;
219 |
220 |
T _weight = affine ? abs(weight[plane]) + eps : 1.f;
221 |
T _bias = affine ? bias[plane] : 0.f;
222 |
T _var = var[plane];
223 |
T _edz = edz[plane];
224 |
T _eydz = eydz[plane];
225 |
226 |
T _mul = _weight * rsqrt(_var + eps);
227 |
T count = T(num * sp);
228 |
229 |
for (int batch = 0; batch < num; ++batch) {
230 |
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
231 |
T _dz = dz[(batch * chn + plane) * sp + n];
232 |
T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;
233 |
234 |
dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
235 |
236 |
237 |
238 |
239 |
at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
240 |
at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
// Extract dimensions
250 |
int64_t num, chn, sp;
251 |
get_dims(z, num, chn, sp);
252 |
253 |
auto dx = at::zeros_like(z);
254 |
255 |
// Run kernel
256 |
dim3 blocks(chn);
257 |
dim3 threads(getNumThreads(sp));
258 |
auto stream = at::cuda::getCurrentCUDAStream();
259 |
AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
260 |
backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
affine, eps, num, chn, sp);
270 |
271 |
272 |
return dx;
273 |
274 |
275 |
276 |
* activations
277 |
278 |
279 |
template<typename T>
280 |
inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
281 |
// Create thrust pointers
282 |
thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
283 |
thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
284 |
285 |
auto stream = at::cuda::getCurrentCUDAStream();
286 |
287 |
th_dz, th_dz + count, th_z, th_dz,
288 |
[slope] __device__ (const T& dz) { return dz * slope; },
289 |
[] __device__ (const T& z) { return z < 0; });
290 |
291 |
th_z, th_z + count, th_z,
292 |
[slope] __device__ (const T& z) { return z / slope; },
293 |
[] __device__ (const T& z) { return z < 0; });
294 |
295 |
296 |
void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
297 |
298 |
299 |
300 |
int64_t count = z.numel();
301 |
302 |
AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
303 |
leaky_relu_backward_impl<scalar_t>(<scalar_t>(),<scalar_t>(), slope, count);
304 |
305 |
306 |
307 |
template<typename T>
308 |
inline void elu_backward_impl(T *z, T *dz, int64_t count) {
309 |
// Create thrust pointers
310 |
thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
311 |
thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
312 |
313 |
auto stream = at::cuda::getCurrentCUDAStream();
314 |
315 |
th_dz, th_dz + count, th_z, th_z, th_dz,
316 |
[] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
317 |
[] __device__ (const T& z) { return z < 0; });
318 |
319 |
th_z, th_z + count, th_z,
320 |
[] __device__ (const T& z) { return log1p(z); },
321 |
[] __device__ (const T& z) { return z < 0; });
322 |
323 |
324 |
void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
325 |
326 |
327 |
328 |
int64_t count = z.numel();
329 |
330 |
AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
331 |
elu_backward_impl<scalar_t>(<scalar_t>(),<scalar_t>(), count);
332 |
333 |
@@ -0,0 +1,275 @@
1 |
#include <ATen/ATen.h>
2 |
3 |
#include <cuda_fp16.h>
4 |
5 |
#include <vector>
6 |
7 |
#include "utils/checks.h"
8 |
#include "utils/cuda.cuh"
9 |
#include "inplace_abn.h"
10 |
11 |
#include <ATen/cuda/CUDAContext.h>
12 |
13 |
// Operations for reduce
14 |
struct SumOpH {
15 |
__device__ SumOpH(const half *t, int c, int s)
16 |
: tensor(t), chn(c), sp(s) {}
17 |
__device__ __forceinline__ float operator()(int batch, int plane, int n) {
18 |
return __half2float(tensor[(batch * chn + plane) * sp + n]);
19 |
20 |
const half *tensor;
21 |
const int chn;
22 |
const int sp;
23 |
24 |
25 |
struct VarOpH {
26 |
__device__ VarOpH(float m, const half *t, int c, int s)
27 |
: mean(m), tensor(t), chn(c), sp(s) {}
28 |
__device__ __forceinline__ float operator()(int batch, int plane, int n) {
29 |
const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
30 |
return (t - mean) * (t - mean);
31 |
32 |
const float mean;
33 |
const half *tensor;
34 |
const int chn;
35 |
const int sp;
36 |
37 |
38 |
struct GradOpH {
39 |
__device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
40 |
: weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
41 |
__device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
42 |
float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
43 |
float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
44 |
return Pair<float>(_dz, _y * _dz);
45 |
46 |
const float weight;
47 |
const float bias;
48 |
const half *z;
49 |
const half *dz;
50 |
const int chn;
51 |
const int sp;
52 |
53 |
54 |
55 |
* mean_var
56 |
57 |
58 |
__global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
59 |
int plane = blockIdx.x;
60 |
float norm = 1.f / static_cast<float>(num * sp);
61 |
62 |
float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
63 |
64 |
float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;
65 |
66 |
if (threadIdx.x == 0) {
67 |
mean[plane] = _mean;
68 |
var[plane] = _var;
69 |
70 |
71 |
72 |
std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
73 |
74 |
75 |
// Extract dimensions
76 |
int64_t num, chn, sp;
77 |
get_dims(x, num, chn, sp);
78 |
79 |
// Prepare output tensors
80 |
auto mean = at::empty({chn},x.options().dtype(at::kFloat));
81 |
auto var = at::empty({chn},x.options().dtype(at::kFloat));
82 |
83 |
// Run kernel
84 |
dim3 blocks(chn);
85 |
dim3 threads(getNumThreads(sp));
86 |
auto stream = at::cuda::getCurrentCUDAStream();
87 |
mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
88 |
89 |
90 |
91 |
num, chn, sp);
92 |
93 |
return {mean, var};
94 |
95 |
96 |
97 |
* forward
98 |
99 |
100 |
__global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
101 |
bool affine, float eps, int num, int chn, int sp) {
102 |
int plane = blockIdx.x;
103 |
104 |
const float _mean = mean[plane];
105 |
const float _var = var[plane];
106 |
const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
107 |
const float _bias = affine ? bias[plane] : 0.f;
108 |
109 |
const float mul = rsqrt(_var + eps) * _weight;
110 |
111 |
for (int batch = 0; batch < num; ++batch) {
112 |
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
113 |
half *x_ptr = x + (batch * chn + plane) * sp + n;
114 |
float _x = __half2float(*x_ptr);
115 |
float _y = (_x - _mean) * mul + _bias;
116 |
117 |
*x_ptr = __float2half(_y);
118 |
119 |
120 |
121 |
122 |
at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
123 |
bool affine, float eps) {
124 |
125 |
126 |
127 |
128 |
129 |
130 |
// Extract dimensions
131 |
int64_t num, chn, sp;
132 |
get_dims(x, num, chn, sp);
133 |
134 |
// Run kernel
135 |
dim3 blocks(chn);
136 |
dim3 threads(getNumThreads(sp));
137 |
auto stream = at::cuda::getCurrentCUDAStream();
138 |
forward_kernel_h<<<blocks, threads, 0, stream>>>(
139 |
140 |
141 |
142 |
143 |
144 |
affine, eps, num, chn, sp);
145 |
146 |
return x;
147 |
148 |
149 |
__global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
150 |
float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
151 |
int plane = blockIdx.x;
152 |
153 |
float _weight = affine ? abs(weight[plane]) + eps : 1.f;
154 |
float _bias = affine ? bias[plane] : 0.f;
155 |
156 |
Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
157 |
158 |
159 |
if (threadIdx.x == 0) {
160 |
edz[plane] = res.v1;
161 |
eydz[plane] = res.v2;
162 |
163 |
164 |
165 |
std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
166 |
bool affine, float eps) {
167 |
168 |
169 |
170 |
171 |
172 |
// Extract dimensions
173 |
int64_t num, chn, sp;
174 |
get_dims(z, num, chn, sp);
175 |
176 |
auto edz = at::empty({chn},z.options().dtype(at::kFloat));
177 |
auto eydz = at::empty({chn},z.options().dtype(at::kFloat));
178 |
179 |
// Run kernel
180 |
dim3 blocks(chn);
181 |
dim3 threads(getNumThreads(sp));
182 |
auto stream = at::cuda::getCurrentCUDAStream();
183 |
edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
184 |
185 |
186 |
187 |
188 |
189 |
190 |
affine, eps, num, chn, sp);
191 |
192 |
return {edz, eydz};
193 |
194 |
195 |
__global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
196 |
const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
197 |
int plane = blockIdx.x;
198 |
199 |
float _weight = affine ? abs(weight[plane]) + eps : 1.f;
200 |
float _bias = affine ? bias[plane] : 0.f;
201 |
float _var = var[plane];
202 |
float _edz = edz[plane];
203 |
float _eydz = eydz[plane];
204 |
205 |
float _mul = _weight * rsqrt(_var + eps);
206 |
float count = float(num * sp);
207 |
208 |
for (int batch = 0; batch < num; ++batch) {
209 |
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
210 |
float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
211 |
float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;
212 |
213 |
dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
214 |
215 |
216 |
217 |
218 |
at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
219 |
at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
// Extract dimensions
229 |
int64_t num, chn, sp;
230 |
get_dims(z, num, chn, sp);
231 |
232 |
auto dx = at::zeros_like(z);
233 |
234 |
// Run kernel
235 |
dim3 blocks(chn);
236 |
dim3 threads(getNumThreads(sp));
237 |
auto stream = at::cuda::getCurrentCUDAStream();
238 |
backward_kernel_h<<<blocks, threads, 0, stream>>>(
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
affine, eps, num, chn, sp);
248 |
249 |
return dx;
250 |
251 |
252 |
// Half-precision leaky-ReLU backward, applied in place to both buffers.
// For every element whose stored output z is negative:
//   dz <- dz * slope      (gradient through the negative branch)
//   z  <- z / slope       (undo the activation on the stored output)
// Elements with z >= 0 are left untouched.
//
// z, dz : buffers of `count` half values, modified in place
// slope : negative-side slope; it is used as a divisor, so it must be non-zero
// count : number of elements to process
__global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
  // Index and stride are 64-bit so that they cannot wrap when `count`
  // (an int64_t) exceeds the range of int.
  const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < count; i += stride) {
    float _z = __half2float(z[i]);
    if (_z < 0) {
      dz[i] = __float2half(__half2float(dz[i]) * slope);
      z[i] = __float2half(_z / slope);
    }
  }
}
261 |
262 |
void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
263 |
264 |
265 |
266 |
int64_t count = z.numel();
267 |
dim3 threads(getNumThreads(count));
268 |
dim3 blocks = (count + threads.x - 1) / threads.x;
269 |
auto stream = at::cuda::getCurrentCUDAStream();
270 |
leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
271 |
272 |
273 |
slope, count);
274 |
275 |
@@ -0,0 +1,15 @@
1 |
#pragma once
2 |
3 |
#include <ATen/ATen.h>
4 |
5 |
// Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6 |
#ifndef AT_CHECK
7 |
8 |
9 |
10 |
#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11 |
#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12 |
#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13 |
14 |
15 |
@@ -0,0 +1,49 @@
1 |
#pragma once
2 |
3 |
#include <ATen/ATen.h>
4 |
5 |
6 |
* Functions to share code between CPU and GPU
7 |
8 |
9 |
#ifdef __CUDACC__
10 |
// CUDA versions
11 |
12 |
#define HOST_DEVICE __host__ __device__
13 |
#define INLINE_HOST_DEVICE __host__ __device__ inline
14 |
#define FLOOR(x) floor(x)
15 |
16 |
#if __CUDA_ARCH__ >= 600
17 |
// Recent compute capabilities have block-level atomicAdd for all data types, so we use that
18 |
#define ACCUM(x,y) atomicAdd_block(&(x),(y))
19 |
20 |
// Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
21 |
// and use the known atomicCAS-based implementation for double
22 |
template<typename data_t>
23 |
__device__ inline data_t atomic_add(data_t *address, data_t val) {
24 |
return atomicAdd(address, val);
25 |
26 |
27 |
28 |
__device__ inline double atomic_add(double *address, double val) {
29 |
unsigned long long int* address_as_ull = (unsigned long long int*)address;
30 |
unsigned long long int old = *address_as_ull, assumed;
31 |
do {
32 |
assumed = old;
33 |
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
34 |
} while (assumed != old);
35 |
return __longlong_as_double(old);
36 |
37 |
38 |
#define ACCUM(x,y) atomic_add(&(x),(y))
39 |
#endif // #if __CUDA_ARCH__ >= 600
40 |
41 |
42 |
// CPU versions
43 |
44 |
45 |
#define INLINE_HOST_DEVICE inline
46 |
#define FLOOR(x) std::floor(x)
47 |
#define ACCUM(x,y) (x) += (y)
48 |
49 |
#endif // #ifdef __CUDACC__
@@ -0,0 +1,71 @@
1 |
#pragma once
2 |
3 |
4 |
* General settings and functions
5 |
6 |
const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 1024;

// Choose a block size for nElem elements: the smallest entry of
// {32, 64, 128, 256, 512, MAX_BLOCK_SIZE} that is >= nElem, or MAX_BLOCK_SIZE
// when nElem is larger than every entry.
static int getNumThreads(int nElem) {
  const int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
  for (int i = 0; i < 6; ++i) {
    if (nElem <= threadSizes[i]) {
      return threadSizes[i];
    }
  }
  // Larger than every candidate: cap at the maximum block size so that every
  // control path returns a value.
  return MAX_BLOCK_SIZE;
}
18 |
19 |
20 |
* Reduction utilities
21 |
22 |
template <typename T>
23 |
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
24 |
unsigned int mask = 0xffffffff) {
25 |
#if CUDART_VERSION >= 9000
26 |
return __shfl_xor_sync(mask, value, laneMask, width);
27 |
28 |
return __shfl_xor(value, laneMask, width);
29 |
30 |
31 |
32 |
// Bit index of the most significant set bit of val, computed as 31 minus the
// number of leading zero bits reported by __clz.
__device__ __forceinline__ int getMSB(int val) {
  const int leading_zeros = __clz(val);
  return 31 - leading_zeros;
}
33 |
34 |
template<typename T>
35 |
struct Pair {
36 |
T v1, v2;
37 |
__device__ Pair() {}
38 |
__device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
39 |
__device__ Pair(T v) : v1(v), v2(v) {}
40 |
__device__ Pair(int v) : v1(v), v2(v) {}
41 |
__device__ Pair &operator+=(const Pair<T> &a) {
42 |
v1 += a.v1;
43 |
v2 += a.v2;
44 |
return *this;
45 |
46 |
47 |
48 |
template<typename T>
49 |
static __device__ __forceinline__ T warpSum(T val) {
50 |
#if __CUDA_ARCH__ >= 300
51 |
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
52 |
val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
53 |
54 |
55 |
__shared__ T values[MAX_BLOCK_SIZE];
56 |
values[threadIdx.x] = val;
57 |
58 |
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
59 |
for (int i = 1; i < WARP_SIZE; i++) {
60 |
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
61 |
62 |
63 |
return val;
64 |
65 |
66 |
template<typename T>
67 |
static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
68 |
value.v1 = warpSum(value.v1);
69 |
value.v2 = warpSum(value.v2);
70 |
return value;
71 |
@@ -0,0 +1,337 @@
1 |
#!/usr/bin/env python
2 |
# -*- encoding: utf-8 -*-
3 |
4 |
5 |
@Author : Peike Li
6 |
@Contact : [email protected]
7 |
@File :
8 |
@Time : 8/4/19 3:35 PM
9 |
@Desc :
10 |
@License : This source code is licensed under the license found in the
11 |
LICENSE file in the root directory of this source tree.
12 |
13 |
14 |
import functools
15 |
16 |
import torch
17 |
import torch.nn as nn
18 |
from torch.nn import functional as F
19 |
# Note: the InPlaceABNSync implementation used here comes from the local `modules` package (imported below).
20 |
# By default, the InplaceABNSync module contains a BatchNorm Layer and a LeakyReLu layer
21 |
from ..modules import InPlaceABNSync
22 |
23 |
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
24 |
25 |
affine_par = True
26 |
27 |
pretrained_settings = {
28 |
'resnet101': {
29 |
'imagenet': {
30 |
'input_space': 'BGR',
31 |
'input_size': [3, 224, 224],
32 |
'input_range': [0, 1],
33 |
'mean': [0.406, 0.456, 0.485],
34 |
'std': [0.225, 0.224, 0.229],
35 |
'num_classes': 1000
36 |
37 |
38 |
39 |
40 |
41 |
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 convolution with padding 1.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Stride of the convolution (default 1).

    Returns:
        An ``nn.Conv2d`` with ``kernel_size=3``, ``padding=1`` and
        ``bias=False``.
    """
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)
45 |
46 |
47 |
class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 conv, 3x3 (optionally dilated) conv, 1x1 conv.

    The block outputs ``planes * expansion`` channels and adds a shortcut
    (optionally transformed by ``downsample``) before the final ReLU.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1, multi_grid=1):
        """Create the three conv/norm stages.

        Args:
            inplanes: Channels of the input tensor.
            planes: Channels of the two inner convolutions.
            stride: Stride of the 3x3 convolution.
            dilation: Base dilation of the 3x3 convolution.
            downsample: Optional module applied to the input to build the
                shortcut; ``None`` uses the input unchanged.
            fist_dilation: Unused; kept (spelling included) so existing
                callers that pass it keep working.
            multi_grid: Factor multiplied into ``dilation`` to obtain the
                3x3 convolution's dilation and padding.
        """
        super(Bottleneck, self).__init__()
        out_planes = planes * self.expansion
        # Padding equals dilation so the 3x3 conv keeps the spatial size at stride 1.
        effective_dilation = dilation * multi_grid

        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=effective_dilation, dilation=effective_dilation, bias=False)
        self.bn2 = BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = BatchNorm2d(out_planes)
        # Out-of-place ReLU follows the norm layers; the in-place one is only
        # applied to the freshly allocated sum at the end of forward().
        self.relu = nn.ReLU(inplace=False)
        self.relu_inplace = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride

    def forward(self, x):
        """Run the block and return ``relu(f(x) + shortcut(x))``."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        # Out-of-place add: creates a new tensor instead of mutating bn3's output.
        out = out + residual
        out = self.relu_inplace(out)

        return out
86 |
87 |
88 |
class PSPModule(nn.Module):
89 |
90 |
91 |
Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
92 |
93 |
94 |
def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
95 |
super(PSPModule, self).__init__()
96 |
97 |
self.stages = []
98 |
self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
99 |
self.bottleneck = nn.Sequential(
100 |
nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
101 |
102 |
103 |
104 |
105 |
def _make_stage(self, features, out_features, size):
106 |
prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
107 |
conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
108 |
bn = InPlaceABNSync(out_features)
109 |
return nn.Sequential(prior, conv, bn)
110 |
111 |
def forward(self, feats):
112 |
h, w = feats.size(2), feats.size(3)
113 |
priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
114 |
self.stages] + [feats]
115 |
bottle = self.bottleneck(, 1))
116 |
return bottle
117 |
118 |
119 |
class ASPPModule(nn.Module):
120 |
121 |
122 |
Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
123 |
124 |
125 |
def __init__(self, features, inner_features=256, out_features=512, dilations=(12, 24, 36)):
126 |
super(ASPPModule, self).__init__()
127 |
128 |
self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
129 |
nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
130 |
131 |
132 |
self.conv2 = nn.Sequential(
133 |
nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
134 |
135 |
self.conv3 = nn.Sequential(
136 |
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
137 |
138 |
self.conv4 = nn.Sequential(
139 |
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
140 |
141 |
self.conv5 = nn.Sequential(
142 |
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
143 |
144 |
145 |
self.bottleneck = nn.Sequential(
146 |
nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
147 |
148 |
149 |
150 |
151 |
def forward(self, x):
152 |
_, _, h, w = x.size()
153 |
154 |
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
155 |
156 |
feat2 = self.conv2(x)
157 |
feat3 = self.conv3(x)
158 |
feat4 = self.conv4(x)
159 |
feat5 = self.conv5(x)
160 |
out =, feat2, feat3, feat4, feat5), 1)
161 |
162 |
bottle = self.bottleneck(out)
163 |
return bottle
164 |
165 |
166 |
class Edge_Module(nn.Module):
167 |
168 |
Edge Learning Branch
169 |
170 |
171 |
def __init__(self, in_fea=[256, 512, 1024], mid_fea=256, out_fea=2):
172 |
super(Edge_Module, self).__init__()
173 |
174 |
self.conv1 = nn.Sequential(
175 |
nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
176 |
177 |
178 |
self.conv2 = nn.Sequential(
179 |
nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
180 |
181 |
182 |
self.conv3 = nn.Sequential(
183 |
nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
184 |
185 |
186 |
self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, dilation=1, bias=True)
187 |
self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, padding=0, dilation=1, bias=True)
188 |
189 |
def forward(self, x1, x2, x3):
190 |
_, _, h, w = x1.size()
191 |
192 |
edge1_fea = self.conv1(x1)
193 |
edge1 = self.conv4(edge1_fea)
194 |
edge2_fea = self.conv2(x2)
195 |
edge2 = self.conv4(edge2_fea)
196 |
edge3_fea = self.conv3(x3)
197 |
edge3 = self.conv4(edge3_fea)
198 |
199 |
edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear', align_corners=True)
200 |
edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear', align_corners=True)
201 |
edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear', align_corners=True)
202 |
edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear', align_corners=True)
203 |
204 |
edge =[edge1, edge2, edge3], dim=1)
205 |
edge_fea =[edge1_fea, edge2_fea, edge3_fea], dim=1)
206 |
edge = self.conv5(edge)
207 |
208 |
return edge, edge_fea
209 |
210 |
211 |
class Decoder_Module(nn.Module):
212 |
213 |
Parsing Branch Decoder Module.
214 |
215 |
216 |
def __init__(self, num_classes):
217 |
super(Decoder_Module, self).__init__()
218 |
self.conv1 = nn.Sequential(
219 |
nn.Conv2d(512, 256, kernel_size=1, padding=0, dilation=1, bias=False),
220 |
221 |
222 |
self.conv2 = nn.Sequential(
223 |
nn.Conv2d(256, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False),
224 |
225 |
226 |
self.conv3 = nn.Sequential(
227 |
nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False),
228 |
229 |
nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False),
230 |
231 |
232 |
233 |
self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
234 |
235 |
def forward(self, xt, xl):
236 |
_, _, h, w = xl.size()
237 |
xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True)
238 |
xl = self.conv2(xl)
239 |
x =[xt, xl], dim=1)
240 |
x = self.conv3(x)
241 |
seg = self.conv4(x)
242 |
return seg, x
243 |
244 |
245 |
class ResNet(nn.Module):
246 |
def __init__(self, block, layers, num_classes):
247 |
self.inplanes = 128
248 |
super(ResNet, self).__init__()
249 |
self.conv1 = conv3x3(3, 64, stride=2)
250 |
self.bn1 = BatchNorm2d(64)
251 |
self.relu1 = nn.ReLU(inplace=False)
252 |
self.conv2 = conv3x3(64, 64)
253 |
self.bn2 = BatchNorm2d(64)
254 |
self.relu2 = nn.ReLU(inplace=False)
255 |
self.conv3 = conv3x3(64, 128)
256 |
self.bn3 = BatchNorm2d(128)
257 |
self.relu3 = nn.ReLU(inplace=False)
258 |
259 |
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
260 |
261 |
self.layer1 = self._make_layer(block, 64, layers[0])
262 |
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
263 |
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
264 |
self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, multi_grid=(1, 1, 1))
265 |
266 |
self.context_encoding = PSPModule(2048, 512)
267 |
268 |
self.edge = Edge_Module()
269 |
self.decoder = Decoder_Module(num_classes)
270 |
271 |
self.fushion = nn.Sequential(
272 |
nn.Conv2d(1024, 256, kernel_size=1, padding=0, dilation=1, bias=False),
273 |
274 |
275 |
nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
276 |
277 |
278 |
def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
279 |
downsample = None
280 |
if stride != 1 or self.inplanes != planes * block.expansion:
281 |
downsample = nn.Sequential(
282 |
nn.Conv2d(self.inplanes, planes * block.expansion,
283 |
kernel_size=1, stride=stride, bias=False),
284 |
BatchNorm2d(planes * block.expansion, affine=affine_par))
285 |
286 |
layers = []
287 |
generate_multi_grid = lambda index, grids: grids[index % len(grids)] if isinstance(grids, tuple) else 1
288 |
layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample,
289 |
multi_grid=generate_multi_grid(0, multi_grid)))
290 |
self.inplanes = planes * block.expansion
291 |
for i in range(1, blocks):
292 |
293 |
block(self.inplanes, planes, dilation=dilation, multi_grid=generate_multi_grid(i, multi_grid)))
294 |
295 |
return nn.Sequential(*layers)
296 |
297 |
def forward(self, x):
298 |
x = self.relu1(self.bn1(self.conv1(x)))
299 |
x = self.relu2(self.bn2(self.conv2(x)))
300 |
x = self.relu3(self.bn3(self.conv3(x)))
301 |
x = self.maxpool(x)
302 |
x2 = self.layer1(x)
303 |
x3 = self.layer2(x2)
304 |
x4 = self.layer3(x3)
305 |
x5 = self.layer4(x4)
306 |
x = self.context_encoding(x5)
307 |
parsing_result, parsing_fea = self.decoder(x, x2)
308 |
# Edge Branch
309 |
edge_result, edge_fea = self.edge(x2, x3, x4)
310 |
# Fusion Branch
311 |
x =[parsing_fea, edge_fea], dim=1)
312 |
fusion_result = self.fushion(x)
313 |
return [[parsing_result, fusion_result], [edge_result]]
314 |
315 |
316 |
def initialize_pretrained_model(model, settings, pretrained='./models/resnet101-imagenet.pth'):
    """Attach preprocessing metadata to ``model`` and optionally load weights.

    Args:
        model: Module exposing ``state_dict()`` and ``load_state_dict()``.
        settings: Mapping with the keys ``input_space``, ``input_size``,
            ``input_range``, ``mean`` and ``std``; each value is stored on
            ``model`` as an attribute of the same name.
        pretrained: Path of a checkpoint readable by ``torch.load``, or
            ``None`` to skip weight loading.

    Entries of the checkpoint whose key starts with ``fc.`` are ignored;
    every other entry overrides the model's current value for that key.
    """
    model.input_space = settings['input_space']
    model.input_size = settings['input_size']
    model.input_range = settings['input_range']
    model.mean = settings['mean']
    model.std = settings['std']

    if pretrained is None:
        return

    # map_location='cpu' lets a checkpoint written from a GPU be read on a
    # CPU-only host; load_state_dict copies the values into the model's own
    # parameters afterwards.
    saved_state_dict = torch.load(pretrained, map_location='cpu')
    new_params = model.state_dict().copy()
    for key, value in saved_state_dict.items():
        # Skip the checkpoint's 'fc' entries.
        if key.split('.')[0] != 'fc':
            new_params[key] = value
    # Apply the merged parameters; without this the merge has no effect.
    model.load_state_dict(new_params)
331 |
332 |
333 |
def resnet101(num_classes=20, pretrained='./models/resnet101-imagenet.pth'):
    """Build the network with a [3, 4, 23, 3] Bottleneck layout.

    Args:
        num_classes: Number of output classes passed to ``ResNet``.
        pretrained: Checkpoint path forwarded to
            ``initialize_pretrained_model``; ``None`` skips weight loading.

    Returns:
        The constructed ``ResNet`` instance, initialised with the
        ``pretrained_settings['resnet101']['imagenet']`` metadata.
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
    settings = pretrained_settings['resnet101']['imagenet']
    initialize_pretrained_model(model, settings, pretrained)
    return model
@@ -0,0 +1,13 @@
1 |
from __future__ import absolute_import
2 |
3 |
from ..networks.AugmentCE2P import resnet101
4 |
5 |
__factory = {
6 |
'resnet101': resnet101,
7 |
8 |
9 |
10 |
def init_model(name, *args, **kwargs):
    """Instantiate a registered model architecture by name.

    Args:
        name: Key into the module-level ``__factory`` registry.
        *args: Positional arguments forwarded to the registered constructor.
        **kwargs: Keyword arguments forwarded to the registered constructor.

    Returns:
        Whatever the registered constructor returns.

    Raises:
        KeyError: If ``name`` is not a registered architecture.
    """
    if name not in __factory:
        raise KeyError("Unknown model arch: {}".format(name))
    return __factory[name](*args, **kwargs)
@@ -0,0 +1,156 @@
1 |
#!/usr/bin/env python
2 |
# -*- encoding: utf-8 -*-
3 |
4 |
5 |
@Author : Peike Li
6 |
@Contact : [email protected]
7 |
@File :
8 |
@Time : 8/4/19 3:35 PM
9 |
@Desc :
10 |
@License : This source code is licensed under the license found in the
11 |
LICENSE file in the root directory of this source tree.
12 |
13 |
14 |
import torch.nn as nn
15 |
import math
16 |
import functools
17 |
18 |
from modules import InPlaceABN, InPlaceABNSync
19 |
20 |
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21 |
22 |
__all__ = ['mobilenetv2']
23 |
24 |
25 |
def conv_bn(inp, oup, stride):
26 |
return nn.Sequential(
27 |
nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
28 |
29 |
30 |
31 |
32 |
33 |
def conv_1x1_bn(inp, oup):
34 |
return nn.Sequential(
35 |
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
36 |
37 |
38 |
39 |
40 |
41 |
class InvertedResidual(nn.Module):
42 |
def __init__(self, inp, oup, stride, expand_ratio):
43 |
super(InvertedResidual, self).__init__()
44 |
self.stride = stride
45 |
assert stride in [1, 2]
46 |
47 |
hidden_dim = round(inp * expand_ratio)
48 |
self.use_res_connect = self.stride == 1 and inp == oup
49 |
50 |
if expand_ratio == 1:
51 |
self.conv = nn.Sequential(
52 |
# dw
53 |
nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
54 |
55 |
56 |
# pw-linear
57 |
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
58 |
59 |
60 |
61 |
self.conv = nn.Sequential(
62 |
# pw
63 |
nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
64 |
65 |
66 |
# dw
67 |
nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
68 |
69 |
70 |
# pw-linear
71 |
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
72 |
73 |
74 |
75 |
def forward(self, x):
76 |
if self.use_res_connect:
77 |
return x + self.conv(x)
78 |
79 |
return self.conv(x)
80 |
81 |
82 |
class MobileNetV2(nn.Module):
83 |
def __init__(self, n_class=1000, input_size=224, width_mult=1.):
84 |
super(MobileNetV2, self).__init__()
85 |
block = InvertedResidual
86 |
input_channel = 32
87 |
last_channel = 1280
88 |
interverted_residual_setting = [
89 |
# t, c, n, s
90 |
[1, 16, 1, 1],
91 |
[6, 24, 2, 2], # layer 2
92 |
[6, 32, 3, 2], # layer 3
93 |
[6, 64, 4, 2],
94 |
[6, 96, 3, 1], # layer 4
95 |
[6, 160, 3, 2],
96 |
[6, 320, 1, 1], # layer 5
97 |
98 |
99 |
# building first layer
100 |
assert input_size % 32 == 0
101 |
input_channel = int(input_channel * width_mult)
102 |
self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
103 |
self.features = [conv_bn(3, input_channel, 2)]
104 |
# building inverted residual blocks
105 |
for t, c, n, s in interverted_residual_setting:
106 |
output_channel = int(c * width_mult)
107 |
for i in range(n):
108 |
if i == 0:
109 |
self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
110 |
111 |
self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
112 |
input_channel = output_channel
113 |
# building last several layers
114 |
self.features.append(conv_1x1_bn(input_channel, self.last_channel))
115 |
# make it nn.Sequential
116 |
self.features = nn.Sequential(*self.features)
117 |
118 |
# building classifier
119 |
self.classifier = nn.Sequential(
120 |
121 |
nn.Linear(self.last_channel, n_class),
122 |
123 |
124 |
125 |
126 |
def forward(self, x):
127 |
x = self.features(x)
128 |
x = x.mean(3).mean(2)
129 |
x = self.classifier(x)
130 |
return x
131 |
132 |
def _initialize_weights(self):
133 |
for m in self.modules():
134 |
if isinstance(m, nn.Conv2d):
135 |
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
136 |
+, math.sqrt(2. / n))
137 |
if m.bias is not None:
138 |
139 |
elif isinstance(m, BatchNorm2d):
140 |
141 |
142 |
elif isinstance(m, nn.Linear):
143 |
n = m.weight.size(1)
144 |
+, 0.01)
145 |
146 |
147 |
148 |
def mobilenetv2(pretrained=False, **kwargs):
149 |
"""Constructs a MobileNet_V2 model.
150 |
151 |
pretrained (bool): If True, returns a model pre-trained on ImageNet
152 |
153 |
model = MobileNetV2(n_class=1000, **kwargs)
154 |
if pretrained:
155 |
model.load_state_dict(load_url(model_urls['mobilenetv2']), strict=False)
156 |
return model
@@ -0,0 +1,205 @@
1 |
#!/usr/bin/env python
2 |
# -*- encoding: utf-8 -*-
3 |
4 |
5 |
@Author : Peike Li
6 |
@Contact : [email protected]
7 |
@File :
8 |
@Time : 8/4/19 3:35 PM
9 |
@Desc :
10 |
@License : This source code is licensed under the license found in the
11 |
LICENSE file in the root directory of this source tree.
12 |
13 |
14 |
import functools
15 |
import torch.nn as nn
16 |
import math
17 |
from torch.utils.model_zoo import load_url
18 |
19 |
from modules import InPlaceABNSync
20 |
21 |
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
22 |
23 |
__all__ = ['ResNet', 'resnet18', 'resnet50', 'resnet101'] # resnet101 is coming soon!
24 |
25 |
model_urls = {
26 |
'resnet18': '',
27 |
'resnet50': '',
28 |
'resnet101': ''
29 |
30 |
31 |
32 |
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 convolution with padding 1.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Stride of the convolution (default 1).

    Returns:
        An ``nn.Conv2d`` with ``kernel_size=3``, ``padding=1`` and
        ``bias=False``.
    """
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)
36 |
37 |
38 |
class BasicBlock(nn.Module):
    """Two-layer residual block: 3x3 conv, norm, ReLU, 3x3 conv, norm, plus shortcut."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        """Create the two conv/norm stages.

        Args:
            inplanes: Channels of the input tensor.
            planes: Channels produced by both convolutions.
            stride: Stride of the first convolution.
            downsample: Optional module applied to the input to build the
                shortcut; ``None`` uses the input unchanged.
        """
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = BatchNorm2d(planes)
        # NOTE(review): BatchNorm2d here is an in-place ABN partial, which
        # presumably keeps its output for the backward pass -- confirm. To be
        # safe the ReLU that directly follows a norm layer is out-of-place and
        # the in-place ReLU is only used on the freshly allocated sum.
        self.relu = nn.ReLU(inplace=False)
        self.relu_inplace = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run the block and return ``relu(f(x) + shortcut(x))``."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        # Out-of-place add: creates a new tensor instead of mutating bn2's output.
        out = out + residual
        out = self.relu_inplace(out)

        return out
68 |
69 |
70 |
class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 conv, 3x3 conv, 1x1 conv, plus shortcut.

    The block outputs ``planes * expansion`` channels.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        """Create the three conv/norm stages.

        Args:
            inplanes: Channels of the input tensor.
            planes: Channels of the two inner convolutions.
            stride: Stride of the 3x3 convolution.
            downsample: Optional module applied to the input to build the
                shortcut; ``None`` uses the input unchanged.
        """
        super(Bottleneck, self).__init__()
        out_planes = planes * self.expansion

        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = BatchNorm2d(out_planes)
        # NOTE(review): BatchNorm2d here is an in-place ABN partial, which
        # presumably keeps its output for the backward pass -- confirm. To be
        # safe the ReLU that directly follows a norm layer is out-of-place and
        # the in-place ReLU is only used on the freshly allocated sum.
        self.relu = nn.ReLU(inplace=False)
        self.relu_inplace = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run the block and return ``relu(f(x) + shortcut(x))``."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        # Out-of-place add: creates a new tensor instead of mutating bn3's output.
        out = out + residual
        out = self.relu_inplace(out)

        return out
107 |
108 |
109 |
class ResNet(nn.Module):
110 |
111 |
def __init__(self, block, layers, num_classes=1000):
112 |
self.inplanes = 128
113 |
super(ResNet, self).__init__()
114 |
self.conv1 = conv3x3(3, 64, stride=2)
115 |
self.bn1 = BatchNorm2d(64)
116 |
self.relu1 = nn.ReLU(inplace=True)
117 |
self.conv2 = conv3x3(64, 64)
118 |
self.bn2 = BatchNorm2d(64)
119 |
self.relu2 = nn.ReLU(inplace=True)
120 |
self.conv3 = conv3x3(64, 128)
121 |
self.bn3 = BatchNorm2d(128)
122 |
self.relu3 = nn.ReLU(inplace=True)
123 |
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
124 |
125 |
self.layer1 = self._make_layer(block, 64, layers[0])
126 |
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
127 |
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
128 |
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
129 |
self.avgpool = nn.AvgPool2d(7, stride=1)
130 |
self.fc = nn.Linear(512 * block.expansion, num_classes)
131 |
132 |
for m in self.modules():
133 |
if isinstance(m, nn.Conv2d):
134 |
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
135 |
+, math.sqrt(2. / n))
136 |
elif isinstance(m, BatchNorm2d):
137 |
138 |
139 |
140 |
def _make_layer(self, block, planes, blocks, stride=1):
141 |
downsample = None
142 |
if stride != 1 or self.inplanes != planes * block.expansion:
143 |
downsample = nn.Sequential(
144 |
nn.Conv2d(self.inplanes, planes * block.expansion,
145 |
kernel_size=1, stride=stride, bias=False),
146 |
BatchNorm2d(planes * block.expansion),
147 |
148 |
149 |
layers = []
150 |
layers.append(block(self.inplanes, planes, stride, downsample))
151 |
self.inplanes = planes * block.expansion
152 |
for i in range(1, blocks):
153 |
layers.append(block(self.inplanes, planes))
154 |
155 |
return nn.Sequential(*layers)
156 |
157 |
def forward(self, x):
158 |
x = self.relu1(self.bn1(self.conv1(x)))
159 |
x = self.relu2(self.bn2(self.conv2(x)))
160 |
x = self.relu3(self.bn3(self.conv3(x)))
161 |
x = self.maxpool(x)
162 |
163 |
x = self.layer1(x)
164 |
x = self.layer2(x)
165 |
x = self.layer3(x)
166 |
x = self.layer4(x)
167 |
168 |
x = self.avgpool(x)
169 |
x = x.view(x.size(0), -1)
170 |
x = self.fc(x)
171 |
172 |
return x
173 |
174 |
175 |
def resnet18(pretrained=False, **kwargs):
176 |
"""Constructs a ResNet-18 model.
177 |
178 |
pretrained (bool): If True, returns a model pre-trained on ImageNet
179 |
180 |
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
181 |
if pretrained:
182 |
183 |
return model
184 |
185 |
186 |
def resnet50(pretrained=False, **kwargs):
    """Build a ResNet-50: ``Bottleneck`` blocks with stage depths [3, 4, 6, 3].

    Args:
        pretrained (bool): If True, load the checkpoint referenced by
            ``model_urls['resnet50']`` (non-strict) before returning. The
            original docstring describes these weights as pre-trained on
            ImageNet.
        **kwargs: Forwarded unchanged to the ``ResNet`` constructor.
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    state = load_url(model_urls['resnet50'])
    # strict=False tolerates keys that do not match between checkpoint and model.
    net.load_state_dict(state, strict=False)
    return net
195 |
196 |
197 |
def resnet101(pretrained=False, **kwargs):
    """Build a ResNet-101: ``Bottleneck`` blocks with stage depths [3, 4, 23, 3].

    Args:
        pretrained (bool): If True, load the checkpoint referenced by
            ``model_urls['resnet101']`` (non-strict) before returning. The
            original docstring describes these weights as pre-trained on
            ImageNet.
        **kwargs: Forwarded unchanged to the ``ResNet`` constructor.
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    state = load_url(model_urls['resnet101'])
    # strict=False tolerates keys that do not match between checkpoint and model.
    net.load_state_dict(state, strict=False)
    return net
@@ -0,0 +1,149 @@
1 |
#!/usr/bin/env python
2 |
# -*- encoding: utf-8 -*-
3 |
4 |
5 |
@Author : Peike Li
6 |
@Contact : [email protected]
7 |
@File :
8 |
@Time : 8/11/19 8:58 PM
9 |
@Desc :
10 |
@License : This source code is licensed under the license found in the
11 |
LICENSE file in the root directory of this source tree.
12 |
13 |
import functools
14 |
import torch.nn as nn
15 |
import math
16 |
from torch.utils.model_zoo import load_url
17 |
18 |
from modules import InPlaceABNSync
19 |
20 |
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21 |
22 |
__all__ = ['ResNeXt', 'resnext101'] # support resnext 101
23 |
24 |
model_urls = {
25 |
'resnext50': '',
26 |
'resnext101': ''
27 |
28 |
29 |
30 |
def conv3x3(in_planes, out_planes, stride=1):
31 |
"3x3 convolution with padding"
32 |
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
33 |
padding=1, bias=False)
34 |
35 |
36 |
class GroupBottleneck(nn.Module):
    """Bottleneck block whose 3x3 convolution is a grouped convolution.

    Layout: 1x1 conv -> grouped 3x3 conv -> 1x1 conv widening to
    ``planes * 2`` channels. The block input (passed through ``downsample``
    when one is given) is added in place before the final ReLU.
    """

    # Output channels are ``planes * expansion``.
    expansion = 2

    def __init__(self, inplanes, planes, stride=1, groups=1, downsample=None):
        super().__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = BatchNorm2d(planes)
        # The stride and the channel grouping both live on the 3x3 convolution.
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, groups=groups, bias=False)
        self.bn2 = BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 2, kernel_size=1, bias=False)
        self.bn3 = BatchNorm2d(planes * 2)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # The shortcut is evaluated after the main path, as in the original.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)
73 |
74 |
75 |
class ResNeXt(nn.Module):
76 |
77 |
def __init__(self, block, layers, groups=32, num_classes=1000):
78 |
self.inplanes = 128
79 |
super(ResNeXt, self).__init__()
80 |
self.conv1 = conv3x3(3, 64, stride=2)
81 |
self.bn1 = BatchNorm2d(64)
82 |
self.relu1 = nn.ReLU(inplace=True)
83 |
self.conv2 = conv3x3(64, 64)
84 |
self.bn2 = BatchNorm2d(64)
85 |
self.relu2 = nn.ReLU(inplace=True)
86 |
self.conv3 = conv3x3(64, 128)
87 |
self.bn3 = BatchNorm2d(128)
88 |
self.relu3 = nn.ReLU(inplace=True)
89 |
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
90 |
91 |
self.layer1 = self._make_layer(block, 128, layers[0], groups=groups)
92 |
self.layer2 = self._make_layer(block, 256, layers[1], stride=2, groups=groups)
93 |
self.layer3 = self._make_layer(block, 512, layers[2], stride=2, groups=groups)
94 |
self.layer4 = self._make_layer(block, 1024, layers[3], stride=2, groups=groups)
95 |
self.avgpool = nn.AvgPool2d(7, stride=1)
96 |
self.fc = nn.Linear(1024 * block.expansion, num_classes)
97 |
98 |
for m in self.modules():
99 |
if isinstance(m, nn.Conv2d):
100 |
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels // m.groups
101 |
+, math.sqrt(2. / n))
102 |
elif isinstance(m, BatchNorm2d):
103 |
104 |
105 |
106 |
def _make_layer(self, block, planes, blocks, stride=1, groups=1):
107 |
downsample = None
108 |
if stride != 1 or self.inplanes != planes * block.expansion:
109 |
downsample = nn.Sequential(
110 |
nn.Conv2d(self.inplanes, planes * block.expansion,
111 |
kernel_size=1, stride=stride, bias=False),
112 |
BatchNorm2d(planes * block.expansion),
113 |
114 |
115 |
layers = []
116 |
layers.append(block(self.inplanes, planes, stride, groups, downsample))
117 |
self.inplanes = planes * block.expansion
118 |
for i in range(1, blocks):
119 |
layers.append(block(self.inplanes, planes, groups=groups))
120 |
121 |
return nn.Sequential(*layers)
122 |
123 |
def forward(self, x):
124 |
x = self.relu1(self.bn1(self.conv1(x)))
125 |
x = self.relu2(self.bn2(self.conv2(x)))
126 |
x = self.relu3(self.bn3(self.conv3(x)))
127 |
x = self.maxpool(x)
128 |
129 |
x = self.layer1(x)
130 |
x = self.layer2(x)
131 |
x = self.layer3(x)
132 |
x = self.layer4(x)
133 |
134 |
x = self.avgpool(x)
135 |
x = x.view(x.size(0), -1)
136 |
x = self.fc(x)
137 |
138 |
return x
139 |
140 |
141 |
def resnext101(pretrained=False, **kwargs):
    """Build a ResNeXt-101: ``GroupBottleneck`` blocks, stage depths [3, 4, 23, 3].

    Args:
        pretrained (bool): If True, load the checkpoint referenced by
            ``model_urls['resnext101']`` (non-strict) before returning. The
            original docstring describes these weights as pre-trained on
            Places.
        **kwargs: Forwarded unchanged to the ``ResNeXt`` constructor.
    """
    net = ResNeXt(GroupBottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    state = load_url(model_urls['resnext101'])
    # strict=False tolerates keys that do not match between checkpoint and model.
    net.load_state_dict(state, strict=False)
    return net
@@ -0,0 +1,64 @@
1 |
#!/usr/bin/env python
2 |
# -*- encoding: utf-8 -*-
3 |
4 |
5 |
@Author : Peike Li
6 |
@Contact : [email protected]
7 |
@File :
8 |
@Time : 8/4/19 3:36 PM
9 |
@Desc :
10 |
@License : This source code is licensed under the license found in the
11 |
LICENSE file in the root directory of this source tree.
12 |
13 |
14 |
import torch
15 |
import torch.nn as nn
16 |
from torch.nn import functional as F
17 |
18 |
from modules import InPlaceABNSync
19 |
20 |
21 |
class ASPPModule(nn.Module):
22 |
23 |
24 |
Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
25 |
26 |
def __init__(self, features, out_features=512, inner_features=256, dilations=(12, 24, 36)):
27 |
super(ASPPModule, self).__init__()
28 |
29 |
self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
30 |
nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
31 |
32 |
33 |
self.conv2 = nn.Sequential(
34 |
nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
35 |
36 |
self.conv3 = nn.Sequential(
37 |
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
38 |
39 |
self.conv4 = nn.Sequential(
40 |
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
41 |
42 |
self.conv5 = nn.Sequential(
43 |
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
44 |
45 |
46 |
self.bottleneck = nn.Sequential(
47 |
nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
48 |
49 |
50 |
51 |
52 |
def forward(self, x):
53 |
_, _, h, w = x.size()
54 |
55 |
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
56 |
57 |
feat2 = self.conv2(x)
58 |
feat3 = self.conv3(x)
59 |
feat4 = self.conv4(x)
60 |
feat5 = self.conv5(x)
61 |
out =, feat2, feat3, feat4, feat5), 1)
62 |
63 |
bottle = self.bottleneck(out)
64 |
return bottle
@@ -0,0 +1,226 @@
1 |
#!/usr/bin/env python
2 |
# -*- encoding: utf-8 -*-
3 |
4 |
5 |
@Author : Peike Li
6 |
@Contact : [email protected]
7 |
@File :
8 |
@Time : 8/4/19 3:36 PM
9 |
@Desc :
10 |
@License : This source code is licensed under the license found in the
11 |
LICENSE file in the root directory of this source tree.
12 |
13 |
14 |
import functools
15 |
16 |
import torch
17 |
import torch.nn as nn
18 |
from torch.autograd import Variable
19 |
from torch.nn import functional as F
20 |
21 |
from modules import InPlaceABNSync
22 |
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
23 |
24 |
25 |
class _SelfAttentionBlock(nn.Module):
26 |
27 |
The basic implementation for self-attention block/non-local block
28 |
29 |
30 |
31 |
in_channels : the dimension of the input feature map
32 |
key_channels : the dimension after the key/query transform
33 |
value_channels : the dimension after the value transform
34 |
scale : choose the scale to downsample the input feature maps (save memory cost)
35 |
36 |
37 |
position-aware context features.(w/o concate or add with the input)
38 |
39 |
40 |
def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
41 |
super(_SelfAttentionBlock, self).__init__()
42 |
self.scale = scale
43 |
self.in_channels = in_channels
44 |
self.out_channels = out_channels
45 |
self.key_channels = key_channels
46 |
self.value_channels = value_channels
47 |
if out_channels == None:
48 |
self.out_channels = in_channels
49 |
self.pool = nn.MaxPool2d(kernel_size=(scale, scale))
50 |
self.f_key = nn.Sequential(
51 |
nn.Conv2d(in_channels=self.in_channels, out_channels=self.key_channels,
52 |
kernel_size=1, stride=1, padding=0),
53 |
54 |
55 |
self.f_query = self.f_key
56 |
self.f_value = nn.Conv2d(in_channels=self.in_channels, out_channels=self.value_channels,
57 |
kernel_size=1, stride=1, padding=0)
58 |
self.W = nn.Conv2d(in_channels=self.value_channels, out_channels=self.out_channels,
59 |
kernel_size=1, stride=1, padding=0)
60 |
nn.init.constant(self.W.weight, 0)
61 |
nn.init.constant(self.W.bias, 0)
62 |
63 |
def forward(self, x):
64 |
batch_size, h, w = x.size(0), x.size(2), x.size(3)
65 |
if self.scale > 1:
66 |
x = self.pool(x)
67 |
68 |
value = self.f_value(x).view(batch_size, self.value_channels, -1)
69 |
value = value.permute(0, 2, 1)
70 |
query = self.f_query(x).view(batch_size, self.key_channels, -1)
71 |
query = query.permute(0, 2, 1)
72 |
key = self.f_key(x).view(batch_size, self.key_channels, -1)
73 |
74 |
sim_map = torch.matmul(query, key)
75 |
sim_map = (self.key_channels ** -.5) * sim_map
76 |
sim_map = F.softmax(sim_map, dim=-1)
77 |
78 |
context = torch.matmul(sim_map, value)
79 |
context = context.permute(0, 2, 1).contiguous()
80 |
context = context.view(batch_size, self.value_channels, *x.size()[2:])
81 |
context = self.W(context)
82 |
if self.scale > 1:
83 |
context = F.upsample(input=context, size=(h, w), mode='bilinear', align_corners=True)
84 |
return context
85 |
86 |
87 |
class SelfAttentionBlock2D(_SelfAttentionBlock):
88 |
def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
89 |
super(SelfAttentionBlock2D, self).__init__(in_channels,
90 |
91 |
92 |
93 |
94 |
95 |
96 |
class BaseOC_Module(nn.Module):
97 |
98 |
Implementation of the BaseOC module
99 |
100 |
in_features / out_features: the channels of the input / output feature maps.
101 |
dropout: we choose 0.05 as the default value.
102 |
size: you can apply multiple sizes. Here we only use one size.
103 |
104 |
features fused with Object context information.
105 |
106 |
107 |
def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
108 |
super(BaseOC_Module, self).__init__()
109 |
self.stages = []
110 |
self.stages = nn.ModuleList(
111 |
[self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
112 |
self.conv_bn_dropout = nn.Sequential(
113 |
nn.Conv2d(2 * in_channels, out_channels, kernel_size=1, padding=0),
114 |
115 |
116 |
117 |
118 |
def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
119 |
return SelfAttentionBlock2D(in_channels,
120 |
121 |
122 |
123 |
124 |
125 |
def forward(self, feats):
126 |
priors = [stage(feats) for stage in self.stages]
127 |
context = priors[0]
128 |
for i in range(1, len(priors)):
129 |
context += priors[i]
130 |
output = self.conv_bn_dropout([context, feats], 1))
131 |
return output
132 |
133 |
134 |
class BaseOC_Context_Module(nn.Module):
135 |
136 |
Output only the context features.
137 |
138 |
in_features / out_features: the channels of the input / output feature maps.
139 |
dropout: specify the dropout ratio
140 |
fusion: We provide two different fusion method, "concat" or "add"
141 |
size: we find that directly learn the attention weights on even 1/8 feature maps is hard.
142 |
143 |
features after "concat" or "add"
144 |
145 |
146 |
def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
147 |
super(BaseOC_Context_Module, self).__init__()
148 |
self.stages = []
149 |
self.stages = nn.ModuleList(
150 |
[self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
151 |
self.conv_bn_dropout = nn.Sequential(
152 |
nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0),
153 |
154 |
155 |
156 |
def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
157 |
return SelfAttentionBlock2D(in_channels,
158 |
159 |
160 |
161 |
162 |
163 |
def forward(self, feats):
164 |
priors = [stage(feats) for stage in self.stages]
165 |
context = priors[0]
166 |
for i in range(1, len(priors)):
167 |
context += priors[i]
168 |
output = self.conv_bn_dropout(context)
169 |
return output
170 |
171 |
172 |
class ASP_OC_Module(nn.Module):
173 |
def __init__(self, features, out_features=256, dilations=(12, 24, 36)):
174 |
super(ASP_OC_Module, self).__init__()
175 |
self.context = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=3, padding=1, dilation=1, bias=True),
176 |
177 |
BaseOC_Context_Module(in_channels=out_features, out_channels=out_features,
178 |
key_channels=out_features // 2, value_channels=out_features,
179 |
dropout=0, sizes=([2])))
180 |
self.conv2 = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
181 |
182 |
self.conv3 = nn.Sequential(
183 |
nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
184 |
185 |
self.conv4 = nn.Sequential(
186 |
nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
187 |
188 |
self.conv5 = nn.Sequential(
189 |
nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
190 |
191 |
192 |
self.conv_bn_dropout = nn.Sequential(
193 |
nn.Conv2d(out_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
194 |
195 |
196 |
197 |
198 |
def _cat_each(self, feat1, feat2, feat3, feat4, feat5):
199 |
assert (len(feat1) == len(feat2))
200 |
z = []
201 |
for i in range(len(feat1)):
202 |
z.append([i], feat2[i], feat3[i], feat4[i], feat5[i]), 1))
203 |
return z
204 |
205 |
def forward(self, x):
206 |
if isinstance(x, Variable):
207 |
_, _, h, w = x.size()
208 |
elif isinstance(x, tuple) or isinstance(x, list):
209 |
_, _, h, w = x[0].size()
210 |
211 |
raise RuntimeError('unknown input type')
212 |
213 |
feat1 = self.context(x)
214 |
feat2 = self.conv2(x)
215 |
feat3 = self.conv3(x)
216 |
feat4 = self.conv4(x)
217 |
feat5 = self.conv5(x)
218 |
219 |
if isinstance(x, Variable):
220 |
out =, feat2, feat3, feat4, feat5), 1)
221 |
elif isinstance(x, tuple) or isinstance(x, list):
222 |
out = self._cat_each(feat1, feat2, feat3, feat4, feat5)
223 |
224 |
raise RuntimeError('unknown input type')
225 |
output = self.conv_bn_dropout(out)
226 |
return output
@@ -0,0 +1,48 @@
1 |
#!/usr/bin/env python
2 |
# -*- encoding: utf-8 -*-
3 |
4 |
5 |
@Author : Peike Li
6 |
@Contact : [email protected]
7 |
@File :
8 |
@Time : 8/4/19 3:36 PM
9 |
@Desc :
10 |
@License : This source code is licensed under the license found in the
11 |
LICENSE file in the root directory of this source tree.
12 |
13 |
14 |
import torch
15 |
import torch.nn as nn
16 |
from torch.nn import functional as F
17 |
18 |
from modules import InPlaceABNSync
19 |
20 |
21 |
class PSPModule(nn.Module):
22 |
23 |
24 |
Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
25 |
26 |
def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
27 |
super(PSPModule, self).__init__()
28 |
29 |
self.stages = []
30 |
self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
31 |
self.bottleneck = nn.Sequential(
32 |
nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
33 |
34 |
35 |
36 |
37 |
def _make_stage(self, features, out_features, size):
38 |
prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
39 |
conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
40 |
bn = InPlaceABNSync(out_features)
41 |
return nn.Sequential(prior, conv, bn)
42 |
43 |
def forward(self, feats):
44 |
h, w = feats.size(2), feats.size(3)
45 |
priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
46 |
self.stages] + [feats]
47 |
bottle = self.bottleneck(, 1))
48 |
return bottle
@@ -0,0 +1,167 @@
1 |
# ------------------------------------------------------------------------------
2 |
# Copyright (c) Microsoft
3 |
# Licensed under the MIT License.
4 |
# Written by Bin Xiao ([email protected])
5 |
# ------------------------------------------------------------------------------
6 |
7 |
from __future__ import absolute_import
8 |
from __future__ import division
9 |
from __future__ import print_function
10 |
11 |
import numpy as np
12 |
import cv2
13 |
import torch
14 |
15 |
class BRG2Tensor_transform(object):
    """Callable that converts a numpy image to a channel-first torch tensor."""

    def __call__(self, pic):
        """Move the last axis of ``pic`` to the front and wrap it as a tensor.

        A ``torch.ByteTensor`` result (uint8 input) is converted with
        ``.float()``; any other dtype is returned as produced by
        ``torch.from_numpy``.
        """
        channel_first = pic.transpose((2, 0, 1))
        tensor = torch.from_numpy(channel_first)
        return tensor.float() if isinstance(tensor, torch.ByteTensor) else tensor
22 |
23 |
class BGR2RGB_transform(object):
    """Callable that reverses the order of the three leading-axis channels."""

    def __call__(self, tensor):
        """Return ``tensor`` with its first-axis entries reordered as 2, 1, 0."""
        reversed_channels = [2, 1, 0]
        return tensor[reversed_channels, :, :]
26 |
27 |
def flip_back(output_flipped, matched_parts):
    """Undo a horizontal flip on a batch of per-joint maps.

    The last axis is reversed, then the channels of every pair in
    ``matched_parts`` are exchanged.

    Args:
        output_flipped: numpy.ndarray of shape
            (batch_size, num_joints, height, width).
        matched_parts: iterable of index pairs; ``pair[0]`` and ``pair[1]``
            are the joint channels to exchange.

    Returns:
        A new contiguous array holding the flipped-back maps. The input array
        is left untouched.

    Raises:
        AssertionError: if ``output_flipped`` is not 4-dimensional.
    """
    assert output_flipped.ndim == 4,\
        'output_flipped should be [batch_size, num_joints, height, width]'

    # ``[..., ::-1]`` alone is a view: swapping channels on it would write
    # through to the caller's array, and the negative strides cannot be wrapped
    # by ``torch.from_numpy``. Copy once up front to avoid both problems.
    output_flipped = output_flipped[:, :, :, ::-1].copy()

    for pair in matched_parts:
        first, second = pair[0], pair[1]
        # Advanced indexing on the right-hand side materialises a copy, so the
        # exchange is safe without an explicit temporary.
        output_flipped[:, [first, second], :, :] = \
            output_flipped[:, [second, first], :, :]

    return output_flipped
42 |
43 |
44 |
def fliplr_joints(joints, joints_vis, width, matched_parts):
    """Flip joint coordinates horizontally, in place.

    Column 0 of ``joints`` is mirrored as ``width - x - 1``. The rows of each
    pair in ``matched_parts`` are then exchanged in both ``joints`` and
    ``joints_vis``. Both input arrays are modified.

    Returns:
        ``(joints * joints_vis, joints_vis)``.
    """
    # Mirror the first coordinate column.
    joints[:, 0] = width - joints[:, 0] - 1

    # Exchange the rows of each matched pair.
    for pair in matched_parts:
        first, second = pair[0], pair[1]
        joints[[first, second], :] = joints[[second, first], :]
        joints_vis[[first, second], :] = joints_vis[[second, first], :]

    masked = joints * joints_vis
    return masked, joints_vis
59 |
60 |
61 |
def transform_preds(coords, center, scale, input_size):
    """Map the first two columns of each row of ``coords`` through the inverse affine transform.

    The transform comes from
    ``get_affine_transform(center, scale, 0, input_size, inv=1)``. Columns
    beyond the first two are left as zeros in the returned array, which has
    the same shape as ``coords``.
    """
    mapped = np.zeros(coords.shape)
    inverse = get_affine_transform(center, scale, 0, input_size, inv=1)
    for row, point in enumerate(coords):
        mapped[row, 0:2] = affine_transform(point[0:2], inverse)
    return mapped
67 |
68 |
def transform_parsing(pred, center, scale, width, height, input_size):
69 |
70 |
trans = get_affine_transform(center, scale, 0, input_size, inv=1)
71 |
target_pred = cv2.warpAffine(
72 |
73 |
74 |
(int(width), int(height)), #(int(width), int(height)),
75 |
76 |
77 |
78 |
79 |
return target_pred
80 |
81 |
def transform_logits(logits, center, scale, width, height, input_size):
82 |
83 |
trans = get_affine_transform(center, scale, 0, input_size, inv=1)
84 |
channel = logits.shape[2]
85 |
target_logits = []
86 |
for i in range(channel):
87 |
target_logit = cv2.warpAffine(
88 |
89 |
90 |
(int(width), int(height)), #(int(width), int(height)),
91 |
92 |
93 |
94 |
95 |
target_logits = np.stack(target_logits,axis=2)
96 |
97 |
return target_logits
98 |
99 |
100 |
def get_affine_transform(center,
101 |
102 |
103 |
104 |
shift=np.array([0, 0], dtype=np.float32),
105 |
106 |
if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
107 |
108 |
scale = np.array([scale, scale])
109 |
110 |
scale_tmp = scale
111 |
112 |
src_w = scale_tmp[0]
113 |
dst_w = output_size[1]
114 |
dst_h = output_size[0]
115 |
116 |
rot_rad = np.pi * rot / 180
117 |
src_dir = get_dir([0, src_w * -0.5], rot_rad)
118 |
dst_dir = np.array([0, (dst_w-1) * -0.5], np.float32)
119 |
120 |
src = np.zeros((3, 2), dtype=np.float32)
121 |
dst = np.zeros((3, 2), dtype=np.float32)
122 |
src[0, :] = center + scale_tmp * shift
123 |
src[1, :] = center + src_dir + scale_tmp * shift
124 |
dst[0, :] = [(dst_w-1) * 0.5, (dst_h-1) * 0.5]
125 |
dst[1, :] = np.array([(dst_w-1) * 0.5, (dst_h-1) * 0.5]) + dst_dir
126 |
127 |
src[2:, :] = get_3rd_point(src[0, :], src[1, :])
128 |
dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
129 |
130 |
if inv:
131 |
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
132 |
133 |
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
134 |
135 |
return trans
136 |
137 |
138 |
def affine_transform(pt, t):
139 |
new_pt = np.array([pt[0], pt[1], 1.]).T
140 |
new_pt =, new_pt)
141 |
return new_pt[:2]
142 |
143 |
144 |
def get_3rd_point(a, b):
    """Return ``b`` offset by the vector ``a - b`` rotated a quarter turn.

    With ``d = a - b`` the result is ``b + (-d[1], d[0])``.
    """
    delta = a - b
    quarter_turn = np.array([-delta[1], delta[0]], dtype=np.float32)
    return b + quarter_turn
147 |
148 |
149 |
def get_dir(src_point, rot_rad):
    """Rotate the 2-D point ``src_point`` by ``rot_rad`` radians.

    Returns:
        A two-element list ``[x*cos - y*sin, x*sin + y*cos]``.
    """
    sin_r = np.sin(rot_rad)
    cos_r = np.cos(rot_rad)
    x, y = src_point[0], src_point[1]
    return [x * cos_r - y * sin_r, x * sin_r + y * cos_r]
157 |
158 |
159 |
def crop(img, center, scale, output_size, rot=0):
160 |
trans = get_affine_transform(center, scale, rot, output_size)
161 |
162 |
dst_img = cv2.warpAffine(img,
163 |
164 |
(int(output_size[1]), int(output_size[0])),
165 |
166 |
167 |
return dst_img
@@ -0,0 +1,49 @@
1 |
import numpy as np
2 |
import cv2
3 |
import os
4 |
5 |
6 |
annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')
7 |
8 |
9 |
def HWC3(x):
    """Convert a uint8 image with 1, 3 or 4 channels to a 3-channel HxWx3 array.

    * 2-D input is treated as a single-channel image.
    * 1 channel: the channel is repeated three times.
    * 3 channels: the input is returned as-is (same object).
    * 4 channels: the first three channels are blended over white using the
      fourth channel as alpha.

    Raises:
        AssertionError: if the dtype is not uint8, the array is not 2-D/3-D,
            or the channel count is not 1, 3 or 4.
    """
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    channels = x.shape[2]
    assert channels in (1, 3, 4)

    if channels == 1:
        return np.concatenate([x, x, x], axis=2)
    if channels == 3:
        return x

    # channels == 4: blend the colour channels over a white background.
    rgb = x[:, :, 0:3].astype(np.float32)
    opacity = x[:, :, 3:4].astype(np.float32) / 255.0
    blended = rgb * opacity + 255.0 * (1.0 - opacity)
    return blended.clip(0, 255).astype(np.uint8)
26 |
27 |
28 |
def resize_image(input_image, resolution):
    """Resize an HxWxC image so its shorter side is about ``resolution``.

    Both output dimensions are rounded to the nearest multiple of 64.
    Lanczos interpolation is used when enlarging (scale factor > 1) and area
    interpolation otherwise.
    """
    H, W, _ = input_image.shape
    k = float(resolution) / min(float(H), float(W))

    # Scale, then snap each side to a multiple of 64.
    new_h = int(np.round(float(H) * k / 64.0)) * 64
    new_w = int(np.round(float(W) * k / 64.0)) * 64

    if k > 1:
        method = cv2.INTER_LANCZOS4
    else:
        method = cv2.INTER_AREA
    # cv2.resize takes the target size as (width, height).
    return cv2.resize(input_image, (new_w, new_h), interpolation=method)
39 |
40 |
def pad_image(img, min_aspect_ratio=0.625):
    """Pad a narrow HxWxC image with black columns up to a minimum width/height ratio.

    If ``W / H`` is below ``min_aspect_ratio``, the image is pasted onto a
    zero-filled, 3-channel canvas of width ``int(min_aspect_ratio * H)``, with
    the spare columns split between left and right. Otherwise the input is
    returned unchanged (same object).
    """
    H, W, C = img.shape
    if not (W / H < min_aspect_ratio):
        return img

    target_w = int(min_aspect_ratio * H)
    left = (target_w - W) // 2
    canvas = np.zeros((H, target_w, 3), dtype=img.dtype)
    canvas[:, left:left + W, :] = img
    return canvas
@@ -0,0 +1,475 @@
1 |
from share import *
2 |
import config
3 |
import os
4 |
import cv2
5 |
import einops
6 |
import gradio as gr
7 |
import numpy as np
8 |
import torch
9 |
import random
10 |
import re
11 |
from datetime import datetime
12 |
from glob import glob
13 |
import argparse
14 |
15 |
from pytorch_lightning import seed_everything
16 |
from torchvision.transforms import ToPILImage
17 |
from annotator.util import pad_image, resize_image, HWC3
18 |
from annotator.openpose import OpenposeDetector
19 |
from cldm.model import create_model, load_state_dict
20 |
from cldm.ddim_hacked import DDIMSampler
21 |
from pathlib import Path
22 |
from PIL import Image
23 |
from omegaconf import OmegaConf
24 |
from ldm.util import instantiate_from_config, log_txt_as_img
25 |
from visconet.segm import ATRSegmentCropper as SegmentCropper
26 |
from huggingface_hub import snapshot_download
27 |
28 |
# supply directory of visual prompt images
29 |
HF_REPO = 'soonyau/visconet'
30 |
GALLERY_PATH = Path('./fashion/')
31 |
32 |
33 |
34 |
DEMO = True
35 |
36 |
APP_FILES_PATH = Path('./app_files')
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
'DeepFakes':[1.0, 1.0, 1.0,
45 |
1.0, 1.0, 1.0,
46 |
0.5, 0.5, 0.5,
47 |
0.0, 0.0, 0.0, 0.0,],
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
'Pose': [0.0,0.0,0.0,
57 |
58 |
59 |
60 |
'Texture Transfer': [1.0,1.0,1.0,
61 |
62 |
63 |
64 |
65 |
66 |
ignore_style_list = ['headwear', 'accesories', 'shoes']
67 |
68 |
global device
69 |
global segmentor
70 |
global apply_openpose
71 |
global style_encoder
72 |
global model
73 |
global ddim_sampler
74 |
75 |
def convert_fname(long_name):
    """Split a combined DeepFashion-style name into relative ``.jpg`` paths.

    ``long_name`` holds one or more names joined by ``'___'``, each containing
    the literal ``'fashion'``. The gender is read from characters 7-9 of the
    whole string. Every piece matching ``<GENDER><category>id<digits>_<d><desc>``
    becomes
    ``<GENDER>/<category>/id_<first 8 digits>/<remaining digits>_<d>_<desc>.jpg``.

    Returns:
        list of str: one formatted path per regex match, in order.
    """
    gender = 'MEN' if long_name[7:10] == 'MEN' else 'WOMEN'

    pieces = long_name.replace('fashion','').split('___')

    # One pattern per gender, capturing category, id digits, view number and
    # view description.
    patterns = {
        'MEN': r'MEN(\w+)id(\d+)_(\d)(\w+)',
        'WOMEN': r'WOMEN(\w+)id(\d+)_(\d)(\w+)',
    }
    matches = re.findall(patterns[gender], ' '.join(pieces))

    paths = []
    for category, id_num, view_num, view_desc in matches:
        stem = f'{gender}/{category}/id_{id_num[:8]}/{id_num[8:]}_{view_num}_{view_desc}'
        paths.append(stem +'.jpg')
    return paths
90 |
91 |
def fetch_deepfashion(deepfashion_names):
92 |
src_name, dst_name = convert_fname(deepfashion_names)
93 |
input_image = np.array(
94 |
pose_image = np.array(
95 |
mask_image ='.jpg','_mask.png'))
96 |
97 |
temp = src_name.replace('.jpg','').split('/')
98 |
lastfolder = temp.pop(-1).replace('_','/', 1)
99 |
style_folder = style_root/('/'.join(temp+[lastfolder]))
100 |
viscon_images = []
101 |
for style_name in style_names:
102 |
f_path = style_folder/f'{style_name}.jpg'
103 |
if os.path.exists(str(f_path)):
104 |
105 |
106 |
107 |
return [input_image, pose_image, mask_image, *viscon_images]
108 |
109 |
def select_gallery_image(evt: gr.SelectData):
110 |
111 |
112 |
def select_default_strength(strength_config):
    """Return the ``SCALE_CONFIG`` entry stored under ``strength_config``.

    Raises:
        KeyError: if ``strength_config`` is not a key of ``SCALE_CONFIG``.
    """
    preset = SCALE_CONFIG[strength_config]
    return preset
114 |
115 |
def change_all_scales(scale):
    """Return a list of 13 copies of ``float(scale)``, one per control-scale slot."""
    value = float(scale)
    return [value for _ in range(13)]
117 |
118 |
def encode_style_images(style_images):
119 |
style_embeddings = []
120 |
121 |
for style_name, style_image in zip(style_names, style_images):
122 |
if style_image == None:
123 |
style_image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
124 |
125 |
#style_image = style_image.resize((224,224))
126 |
style_image = style_encoder.preprocess(style_image).to(device)
127 |
style_emb = style_encoder.postprocess(style_encoder(style_image)[0])
128 |
129 |
130 |
styles = torch.tensor(np.array(style_embeddings)).squeeze(-2).unsqueeze(0).float().to(device)
131 |
return styles
132 |
133 |
def save_viscon_images(*viscon_images):
134 |
ret_images = []
135 |
for image, name in zip(viscon_images, style_names):
136 |
fname = str(VISCON_IMAGE_PATH/name)+'.jpg'
137 |
if image:
138 |
image = image.resize((224,224))
139 |
if os.path.exists(fname):
140 |
141 |
142 |
143 |
return ret_images
144 |
145 |
146 |
def extract_pose_mask(input_image, detect_resolution,
                      ignore_head=True, ignore_hair=False):
    """Produce a pose map and a human mask for ``input_image``.

    The image is first padded to a minimum aspect ratio of 0.625. The pose
    map comes from ``apply_openpose`` run on the padded image resized to
    ``detect_resolution``. The mask comes from the ``'human_mask'`` entry
    returned by ``segmentor`` for the padded image.

    Returns:
        list: ``[detected_map, mask]`` where ``detected_map`` is the
        ``HWC3``-converted pose map and ``mask`` is a mode-``'L'`` PIL image.
    """
    padded = pad_image(input_image, min_aspect_ratio=0.625)

    # skeleton
    pose_input = resize_image(padded, detect_resolution)
    skeleton, _ = apply_openpose(pose_input, hand=True)
    skeleton = HWC3(skeleton)

    # human mask
    segments = segmentor(padded, ignore_head=ignore_head, ignore_hair=ignore_hair)
    mask_pixels = np.array(segments['human_mask']*255, dtype=np.uint8)
    mask = Image.fromarray(mask_pixels, mode='L')

    return [skeleton, mask]
159 |
160 |
def extract_fashion(input_image):
161 |
162 |
# style images
163 |
cropped = segmentor(input_image)
164 |
cropped_images = []
165 |
for style_name in style_names:
166 |
if style_name in cropped and style_name not in ignore_style_list:
167 |
168 |
169 |
170 |
171 |
return [*cropped_images]
172 |
173 |
def get_image_files(image_path, ret_image=True, exts=['.jpg','.jpeg','.png']):
174 |
images = []
175 |
for ext in exts:
176 |
images += [x for x in glob(str(Path(image_path)/f'*{ext}'))]
177 |
if ret_image:
178 |
images = [ for x in images]
179 |
return images
180 |
181 |
def log_sample(seed, results, prompt, skeleton_image, mask_image, control_scales, *viscon_images):
182 |
time_str ="%Y-%m-%d_%H-%M-%S")
183 |
184 |
log_dir = LOG_PATH/time_str
185 |
os.makedirs(str(log_dir), exist_ok=True)
186 |
187 |
# save result
188 |
concat = np.hstack((skeleton_image, *results))
189 |
190 |
191 |
for i, result in enumerate(results):
192 |
193 |
194 |
# save text
195 |
with open(str(log_dir/'info.txt'),'w') as f:
196 |
f.write(f'prompt: {prompt} \n')
197 |
f.write(f'seed: {seed}\n')
198 |
control_str = [str(x) for x in control_scales]
199 |
f.write(','.join(control_str) + '\n')
200 |
# save vison images
201 |
for style_name, style_image in zip(style_names, viscon_images):
202 |
if style_image is not None:
203 |
204 |
205 |
206 |
# Generate `num_samples` images with the DDIM sampler and return them as a
# list of uint8 arrays.
#
# Visible flow: build the mask tensor from `mask_image`, encode the
# visual-condition images, turn `pose_image` into the control tensor, assemble
# the conditional and unconditional dicts, sample, decode with
# `model.decode_first_stage`, log the run through `log_sample`, return `results`.
#
# NOTE(review): several lines are missing from this listing (gutter numbers
# with no code after them): the end of the signature, the statement after the
# seed is chosen, the bodies of the three `if config.save_memory:` checks and
# the tail of the `ddim_sampler.sample(...)` call. `viscon_images` is used
# below but its declaration is not visible -- presumably a trailing
# `*viscon_images` parameter; confirm against the original file.
def process(prompt, a_prompt, n_prompt, num_samples,
207 |
ddim_steps, scale, seed, eta, mask_image, pose_image,
208 |
c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0,
209 |
210 |
211 |
with torch.no_grad():
212 |
# scales are kept in c12 .. c0 order
control_scales = [c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0]
213 |
# mean over the last axis, divided by 255
mask = torch.tensor(mask_image.mean(-1)/255.,dtype=torch.float) #(512,512), [0,1]
214 |
mask = mask.unsqueeze(0).to(device) # (1, 512, 512)
215 |
style_emb = encode_style_images(viscon_images)
216 |
217 |
# fix me
218 |
detected_map = HWC3(pose_image)
219 |
#detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
220 |
H, W, C = detected_map.shape
221 |
control = torch.from_numpy(detected_map.copy()).float().to(device) / 255.0
222 |
# the same control map is repeated once per requested sample
control = torch.stack([control for _ in range(num_samples)], dim=0)
223 |
control = einops.rearrange(control, 'b h w c -> b c h w').clone()
224 |
225 |
# -1 requests a random seed; the draw is limited to 0..65535
if seed == -1:
226 |
seed = random.randint(0, 65535)
227 |
228 |
229 |
if config.save_memory:
230 |
231 |
# repeat factor: num_samples along the first axis, 1 for every other axis
new_style_shape = [num_samples] + [1] * (len(style_emb.shape)-1)
232 |
233 |
cond = {"c_concat": [control],
234 |
"c_crossattn": [style_emb.repeat(new_style_shape)],
235 |
"c_text": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)],
236 |
'c_concat_mask': [mask.repeat(num_samples, 1, 1, 1)]}
237 |
238 |
# same keys as `cond`, with zeroed style embedding and mask and the negative prompt as text
un_cond = {"c_concat": [control],
239 |
"c_crossattn": [torch.zeros_like(style_emb).repeat(new_style_shape)],
240 |
"c_text":[model.get_learned_conditioning([n_prompt] * num_samples)],
241 |
'c_concat_mask': [torch.zeros_like(mask).repeat(num_samples, 1, 1, 1)]}
242 |
243 |
# sampling shape: 4 channels at 1/8 of the control map's height and width
shape = (4, H // 8, W // 8)
244 |
245 |
if config.save_memory:
246 |
247 |
248 |
model.control_scales = control_scales
249 |
250 |
# NOTE(review): the remaining arguments of this call (gutter lines 252-253) are missing from this listing
samples, _ = ddim_sampler.sample(ddim_steps, num_samples,
251 |
shape, cond, verbose=False, eta=eta,
252 |
253 |
254 |
255 |
if config.save_memory:
256 |
257 |
258 |
x_samples = model.decode_first_stage(samples)
259 |
# channels-last, mapped through *127.5 + 127.5, clipped to 0..255 and stored as uint8
x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
260 |
261 |
results = [x_samples[i] for i in range(num_samples)]
262 |
263 |
264 |
log_sample(seed, results, prompt, detected_map, mask_image, control_scales, *viscon_images)
265 |
return results
266 |
267 |
# Look up the file `<VISCON_IMAGE_PATH>/<name><file_ext>`; returns None when
# no such file exists.
# NOTE(review): the statement for the file-exists case (gutter line 271) is
# missing from this listing -- restore it from the original file.
def get_image(name, file_ext='.jpg'):
268 |
fname = str(VISCON_IMAGE_PATH/name)+file_ext
269 |
if not os.path.exists(fname):
270 |
return None
271 |
272 |
273 |
# Look up the file `<VISCON_IMAGE_PATH>/<name><file_ext>`; returns None when
# no such file exists, otherwise an `np.array(...)` built from it.
# NOTE(review): the argument of the final `np.array(` call is truncated in
# this listing -- restore it from the original file.
def get_image_numpy(name, file_ext='.png'):
274 |
fname = str(VISCON_IMAGE_PATH/name)+file_ext
275 |
if not os.path.exists(fname):
276 |
return None
277 |
return np.array(
278 |
279 |
# Build the Gradio `Blocks` UI (with `.queue()` enabled) and return it.
#
# Visible layout: a left column holding the pose/mask inputs with sample pose
# galleries, the visual-condition image slots with garment galleries, the
# control-strength sliders and advanced options; a right column holding the
# output gallery, sample count, seed and text prompt. Event handlers are wired
# after the layout is declared.
#
# NOTE(review): this listing is damaged. Several event bindings start with
# `+, inputs=...` (the component, event name and `fn=` are gone), one
# `.select(...)` call and one gallery guard are cut off, and indentation is
# lost, so nesting of the `with` blocks cannot be read from here. Restore the
# affected lines from the original file before changing this function.
def create_app():
280 |
block = gr.Blocks().queue()
281 |
with block:
282 |
with gr.Row():
283 |
# NOTE(review): the link target of "Video tutorial" is empty in this listing
gr.Markdown("## ViscoNet: Visual ControlNet with Human Pose and Fashion <br> [Video tutorial](")
284 |
with gr.Row():
285 |
with gr.Column():
286 |
with gr.Accordion("Get pose and mask", open=False):
287 |
with gr.Row():
288 |
# defaults are loaded by name ('ref', 'pose', 'mask') through get_image_numpy
input_image = gr.Image(source='upload', type="numpy", label='input image', value=np.array(get_image_numpy('ref')))
289 |
pose_image = gr.Image(source='upload', type="numpy", label='pose', value=np.array(get_image_numpy('pose')))
290 |
mask_image = gr.Image(source='upload', type="numpy", label='mask', value=np.array(get_image_numpy('mask')))
291 |
with gr.Accordion("Samples", open=False):
292 |
with gr.Tab('Female'):
293 |
samples = get_image_files(str(SAMPLE_IMAGE_PATH/'pose/WOMEN/'))
294 |
female_pose_gallery = gr.Gallery(label='pose', show_label=False, value=samples).style(grid=3, height='auto')
295 |
with gr.Tab('Male'):
296 |
samples = get_image_files(str(SAMPLE_IMAGE_PATH/'pose/MEN/'))
297 |
male_pose_gallery = gr.Gallery(label='pose', show_label=False, value=samples).style(grid=3, height='auto')
298 |
with gr.Row():
299 |
#pad_checkbox = gr.Checkbox(label='Pad pose to square', value=True)
300 |
ignorehead_checkbox = gr.Checkbox(label='Ignore face in masking (for DeepFake)', value=True)
301 |
ignorehair_checkbox = gr.Checkbox(label='Ignore hair in masking', value=False, visible=True)
302 |
with gr.Row():
303 |
#ignore_head_checkbox = gr.Checkbox(label='Ignore head', value=False)
304 |
get_pose_button = gr.Button(label="Get pose", value='Get pose')
305 |
get_fashion_button = gr.Button(label="Get visual", value='Get visual prompt')
306 |
307 |
308 |
with gr.Accordion("Visual Conditions", open=False):
309 |
gr.Markdown('Drag-and-drop, or click from samples below.')
310 |
with gr.Column():
311 |
viscon_images = []
312 |
viscon_images_names2index = {}
313 |
viscon_len = len(style_names)
314 |
v_idx = 0
315 |
316 |
with gr.Row():
317 |
# one image slot per style name for the first 8 entries of `style_names`;
# names in `ignore_style_list` get a hidden slot
for _ in range(8):
318 |
viscon_name = style_names[v_idx]
319 |
vis = False if viscon_name in ignore_style_list else True
320 |
viscon_images.append(gr.Image(source='upload', type="pil", min_height=112, min_width=112, label=viscon_name, value=get_image(viscon_name), visible=vis))
321 |
viscon_images_names2index[viscon_name] = v_idx
322 |
v_idx += 1
323 |
324 |
viscon_button = gr.Button(value='Save as Default',visible=False if DEMO else True)
325 |
326 |
# each entry pairs a gallery component with the garment name it feeds
viscon_galleries = []
327 |
328 |
with gr.Column():
329 |
with gr.Accordion("Female", open=False):
330 |
for garment, number in zip(['hair', 'top', 'bottom', 'outer'], [150, 500, 500, 250]):
331 |
with gr.Tab(garment):
332 |
samples = []
333 |
# NOTE(review): gutter line 333 has no code in this listing; the Male branch
# below guards its glob with a path-exists check at the same position --
# presumably the same guard belongs here; confirm against the original file.
334 |
samples = glob(os.path.join(WOMEN_GALLERY_PATH, f'**/{garment}.jpg'), recursive=True)
335 |
#samples = glob(f'/home/soon/datasets/deepfashion_inshop/styles_default/WOMEN/**/{garment}.jpg', recursive=True)
336 |
# random.choices draws with replacement, so the gallery may contain repeats
samples = random.choices(samples, k=number)
337 |
# the gallery label is 'hair' for every garment tab (label is not shown)
viscon_gallery = gr.Gallery(label='hair', allow_preview=False, show_label=False, value=samples).style(grid=4, height='auto')
338 |
viscon_galleries.append({'component':viscon_gallery, 'inputs':[garment]})
339 |
with gr.Accordion("Male", open=False):
340 |
for garment, number in zip(['hair', 'top', 'bottom', 'outer'], [150, 500, 500, 250]):
341 |
with gr.Tab(garment):
342 |
samples = []
343 |
if MEN_GALLERY_PATH and os.path.exists(MEN_GALLERY_PATH):
344 |
samples = glob(os.path.join(MEN_GALLERY_PATH, f'**/{garment}.jpg'), recursive=True)
345 |
samples = random.choices(samples, k=number)
346 |
viscon_gallery = gr.Gallery(label='hair', allow_preview=False, show_label=False, value=samples).style(grid=4, height='auto')
347 |
viscon_galleries.append({'component':viscon_gallery, 'inputs':[garment]})
348 |
349 |
with gr.Accordion("Control Strength Scaling", open=False):
350 |
gr.Markdown("smaller value for stronger textual influence. c12 is highest spatial resolution controlling textures")
351 |
with gr.Row():
352 |
strength_select = gr.Dropdown(list(SCALE_CONFIG.keys()), label='strength settings', value=DEFAULT_SCALE_CONFIG)
353 |
scale_all = gr.Slider(label=f'set all scales', minimum=0, maximum=1, value=DEFAULT_CONTROL_SCALE, step=0.05)
354 |
355 |
# 13 sliders labelled c12 down to c0, created in rows of 3, 3, 3 and 4;
# each default is scale_values[12 - c_idx]
control_scales = []
356 |
c_idx = 12
357 |
with gr.Accordion("Advanced settings", open=False):
358 |
with gr.Row():
359 |
for _ in range(3):
360 |
control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
361 |
c_idx -= 1
362 |
with gr.Row():
363 |
for _ in range(3):
364 |
control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
365 |
c_idx -= 1
366 |
with gr.Row():
367 |
for _ in range(3):
368 |
control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
369 |
c_idx -= 1
370 |
with gr.Row():
371 |
for _ in range(4):
372 |
control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
373 |
c_idx -= 1
374 |
375 |
with gr.Accordion("Advanced options", open=False):
376 |
with gr.Row():
377 |
detect_resolution = gr.Slider(label="OpenPose Resolution", minimum=128, maximum=512, value=512, step=1)
378 |
ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=50, value=20, step=1)
379 |
scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=12.0, step=0.1)
380 |
381 |
eta = gr.Number(label="eta (DDIM)", value=0.0, visible=False)
382 |
a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
383 |
n_prompt = gr.Textbox(label="Negative Prompt",
384 |
value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, sunglasses, hat')
385 |
with gr.Column():
386 |
result_gallery = gr.Gallery(label='Output', show_label=False, show_download_button=True, elem_id="gallery").style(grid=1, height='auto')
387 |
with gr.Row():
388 |
# the sample-count slider tops out at 4 when DEMO is set, 8 otherwise
max_samples = 8 if not DEMO else 4
389 |
num_samples = gr.Slider(label="Images", minimum=1, maximum=max_samples, value=1, step=1)
390 |
seed = gr.Slider(label="Seed (-1 for random)", minimum=-1, maximum=2147483647, step=1, value=1561194236)#randomize=True) #value=1561194234)
391 |
if not DEMO:
392 |
DF_DEMO = 'fashionWOMENTees_Tanksid0000762403_1front___fashionWOMENTees_Tanksid0000762403_1front'
393 |
DF_EVAL = 'fashionWOMENBlouses_Shirtsid0000035501_1front___fashionWOMENBlouses_Shirtsid0000035501_1front'
394 |
DF_RESULT ="fashionWOMENTees_Tanksid0000796209_1front___fashionWOMENTees_Tanksid0000796209_2side"
395 |
deepfashion_names = gr.Textbox(label='Deepfashion name', value=DF_EVAL)
396 |
gr.Markdown("Default config reconstruct image faithful to pose, mask and visual condition. Reduce control strength to tip balance towards text prompt for more creativity.")
397 |
prompt = gr.Textbox(label="Text Prompt", value="")
398 |
399 |
run_button = gr.Button(label="Run")
400 |
401 |
402 |
# NOTE(review): every line below that starts with `+, inputs=` is a truncated
# event binding (component, event name and `fn=` are missing from this listing)
+, inputs=None, outputs=input_image)
403 |
+, inputs=None, outputs=input_image)
404 |
for vision_gallery in viscon_galleries:
405 |
# map the gallery's garment name back to the index of its image slot
viscon_idx = viscon_images_names2index[vision_gallery['inputs'][0]]
406 |
# NOTE(review): the rest of this `.select(...)` call (gutter line 407) is missing
vision_gallery['component'].select(fn=select_gallery_image, inputs=None,
407 |
408 |
# input order handed to `process`: scalar settings, mask, pose, then the
# control-scale sliders and the visual-condition image slots
ips = [prompt, a_prompt, n_prompt, num_samples, ddim_steps, scale, seed, eta, mask_image, pose_image,
409 |
*control_scales, *viscon_images]
410 |
+, inputs=ips, outputs=[result_gallery])
411 |
prompt.submit(fn=process, inputs=ips, outputs=[result_gallery])
412 |
+, inputs=[input_image, detect_resolution,
413 |
ignorehead_checkbox, ignorehair_checkbox],
414 |
outputs=[pose_image, mask_image])
415 |
+, inputs=input_image, outputs=[*viscon_images])
416 |
+, inputs=[*viscon_images], outputs=[*viscon_images])
417 |
+, inputs=[strength_select], outputs=[*control_scales])
418 |
scale_all.release(fn=change_all_scales, inputs=[scale_all], outputs=[*control_scales])
419 |
if not DEMO:
420 |
deepfashion_names.submit(fn=fetch_deepfashion, inputs=[deepfashion_names], outputs=[input_image, pose_image, mask_image, *viscon_images])
421 |
return block
422 |
423 |
# Script entry point: parse CLI options, load the project config, create the
# segmentor / pose detector / style encoder / diffusion model, fetch the
# gallery archive if `GALLERY_PATH` is absent, then build and launch the app.
#
# NOTE(review): this listing is damaged. The first `snapshot_download(...)`
# call, the `model =` assignment and the body of the `with ZipFile(...)` block
# are truncated, and the empty strings assigned to `zip_name` and passed as
# `server_name` may be extraction losses rather than the real values --
# confirm all of these against the original file.
if __name__ == "__main__":
424 |
# NOTE(review): this description does not match what the script does (it launches the Gradio app)
parser = argparse.ArgumentParser(description='Calculate image-text similarity score.')
425 |
426 |
parser.add_argument('--gpu', type=int, default=0, help='GPU id')
427 |
parser.add_argument('--config', type=str, default='./configs/visconet_v1.yaml')
428 |
parser.add_argument('--ckpt', type=str, default='./models/visconet_v1.pth')
429 |
# the default is an empty string (falsy), so sharing stays off unless the flag is passed
parser.add_argument('--public_link', action='store_true', default='', help='Create public link')
430 |
args = parser.parse_args()
431 |
432 |
# these names are read as globals by the functions defined earlier in this file
global device
433 |
global segmentor
434 |
global apply_openpose
435 |
global style_encoder
436 |
global model
437 |
global ddim_sampler
438 |
439 |
# use the selected CUDA device when CUDA is available, otherwise the CPU
device = f'cuda:{args.gpu}' if torch.cuda.is_available() else 'cpu'
440 |
config_file = args.config
441 |
model_ckpt = args.ckpt
442 |
443 |
# dataset locations and style names come from the training section of the config
proj_config = OmegaConf.load(config_file)
444 |
style_names = proj_config.dataset.train.params.style_names
445 |
data_root = Path(proj_config.dataset.train.params.image_root)
446 |
image_root = data_root/proj_config.dataset.train.params.image_dir
447 |
style_root = data_root/proj_config.dataset.train.params.style_dir
448 |
pose_root = data_root/proj_config.dataset.train.params.pose_dir
449 |
mask_root = data_root/proj_config.dataset.train.params.mask_dir
450 |
451 |
segmentor = SegmentCropper()
452 |
apply_openpose = OpenposeDetector()
453 |
454 |
# NOTE(review): the remaining arguments of this call (gutter lines 455-456) are missing
snapshot_download(repo_id=HF_REPO, local_dir='./models',
455 |
456 |
457 |
style_encoder = instantiate_from_config(proj_config.model.style_embedding_config).to(device)
458 |
# the model is created on the CPU and its weights are loaded from the checkpoint
model = create_model(config_file).cpu()
459 |
model.load_state_dict(load_state_dict(model_ckpt, location=device))
460 |
461 |
# NOTE(review): the right-hand side of this assignment is missing from this listing
model =
462 |
model.cond_stage_model.device = device
463 |
ddim_sampler = DDIMSampler(model)
464 |
465 |
# fetch and unpack the gallery archive only when the gallery directory is absent
if not GALLERY_PATH.exists():
466 |
zip_name = ''
467 |
snapshot_download(repo_id=HF_REPO, allow_patterns=zip_name, local_dir='.')
468 |
from zipfile import ZipFile
469 |
with ZipFile(zip_name, 'r') as zip_ref:
470 |
471 |
472 |
473 |
# Calling the main function with parsed arguments
474 |
block = create_app()
475 |
block.launch(server_name='', share=args.public_link)
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |
![]() |