Ashhar committed on
Commit
3683b73
·
1 Parent(s): 270fed8

face match detector

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +0 -19
  2. README.md +6 -5
  3. __pycache__/app.cpython-312.pyc +0 -0
  4. app.py +75 -207
  5. facewrapper/dict/data1.bin +0 -3
  6. facewrapper/dict/data2.bin +0 -3
  7. facewrapper/dict/data3.bin +0 -3
  8. facewrapper/dict/detect.bin +0 -3
  9. facewrapper/facewrapper.py +0 -32
  10. facewrapper/libs/libimutils.so +0 -0
  11. facewrapper/libs/libimutils.so_for_ubuntu22 +0 -0
  12. facewrapper/libs/libttvfaceengine6.so +0 -3
  13. gradio/demo.py +0 -115
  14. gradio/examples/1.jpg +0 -0
  15. gradio/examples/2.jpg +0 -0
  16. gradio/examples/3.jpg +0 -0
  17. gradio/examples/4.jpg +0 -0
  18. openvino/cache.json +0 -0
  19. openvino/libgna.so +0 -3
  20. openvino/libgna.so.2 +0 -3
  21. openvino/libgna.so.3.0.0.1455 +0 -3
  22. openvino/libopenvino.so +0 -3
  23. openvino/libopenvino_auto_batch_plugin.so +0 -0
  24. openvino/libopenvino_auto_plugin.so +0 -0
  25. openvino/libopenvino_c.so +0 -0
  26. openvino/libopenvino_gapi_preproc.so +0 -3
  27. openvino/libopenvino_hetero_plugin.so +0 -0
  28. openvino/libopenvino_intel_cpu_plugin.so +0 -3
  29. openvino/libopenvino_intel_gna_plugin.so +0 -3
  30. openvino/libopenvino_intel_hddl_plugin.so +0 -3
  31. openvino/libopenvino_intel_myriad_plugin.so +0 -3
  32. openvino/libopenvino_ir_frontend.so +0 -0
  33. openvino/libopenvino_onnx_frontend.so +0 -3
  34. openvino/libopenvino_paddle_frontend.so +0 -0
  35. openvino/libopenvino_tensorflow_fe.so +0 -3
  36. openvino/pcie-ma2x8x.mvcmd +0 -3
  37. openvino/plugins.xml +0 -27
  38. openvino/usb-ma2x8x.mvcmd +0 -3
  39. openvino/vpu_custom_kernels/binarization.bin +0 -3
  40. openvino/vpu_custom_kernels/binarization.cl +0 -67
  41. openvino/vpu_custom_kernels/binary_convolution.bin +0 -3
  42. openvino/vpu_custom_kernels/binary_convolution.cl +0 -95
  43. openvino/vpu_custom_kernels/binary_convolution1x1.bin +0 -3
  44. openvino/vpu_custom_kernels/binary_convolution1x1.cl +0 -117
  45. openvino/vpu_custom_kernels/binary_convolution3x3.bin +0 -3
  46. openvino/vpu_custom_kernels/binary_convolution3x3.cl +0 -278
  47. openvino/vpu_custom_kernels/convolution1x1_chw.bin +0 -3
  48. openvino/vpu_custom_kernels/convolution1x1_chw.cl +0 -114
  49. openvino/vpu_custom_kernels/convolution1x1_hwc.bin +0 -3
  50. openvino/vpu_custom_kernels/convolution1x1_hwc.cl +0 -126
Dockerfile DELETED
@@ -1,19 +0,0 @@
1
- FROM ubuntu:20.04
2
- RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
3
- RUN apt-get update -y
4
- RUN apt-get install -y python3 python3-pip python3-opencv
5
- RUN apt-get install -y libcurl4-openssl-dev libssl-dev
6
- RUN mkdir -p /home/FaceOnLive_v6
7
- RUN mkdir -p /home/FaceOnLive_v6/facewrapper
8
- WORKDIR /home/FaceOnLive_v6
9
- COPY ./facewrapper ./facewrapper
10
- COPY ./facewrapper/libs/libimutils.so /usr/lib
11
- COPY ./gradio ./gradio
12
- COPY ./openvino /usr/lib
13
- COPY ./app.py ./app.py
14
- COPY ./run.sh .
15
- COPY ./requirements.txt ./requirements.txt
16
- RUN pip3 install -r requirements.txt
17
- RUN chmod a+x run.sh
18
- CMD ["./run.sh"]
19
- EXPOSE 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,11 +1,12 @@
1
  ---
2
- title: Face Recognition SDK
3
- emoji: 🤨🙂
4
  colorFrom: yellow
5
- colorTo: pink
6
- sdk: docker
 
 
7
  pinned: false
8
- license: mit
9
  short_description: FaceOnLive On-Premise Solution
10
  ---
11
 
 
1
  ---
2
+ title: Face Matching Tool
3
+ emoji: 🤡
4
  colorFrom: yellow
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 4.36.1
8
+ app_file: app.py
9
  pinned: false
 
10
  short_description: FaceOnLive On-Premise Solution
11
  ---
12
 
__pycache__/app.cpython-312.pyc ADDED
Binary file (3.92 kB). View file
 
app.py CHANGED
@@ -1,217 +1,85 @@
1
- import sys
2
- sys.path.append('.')
3
-
4
- from flask import Flask, request, jsonify
5
- from time import gmtime, strftime
6
- import os
7
- import base64
8
- import json
9
  import cv2
10
- import numpy as np
 
 
 
11
 
12
- from facewrapper.facewrapper import ttv_version
13
- from facewrapper.facewrapper import ttv_get_hwid
14
- from facewrapper.facewrapper import ttv_init
15
- from facewrapper.facewrapper import ttv_init_offline
16
- from facewrapper.facewrapper import ttv_extract_feature
17
- from facewrapper.facewrapper import ttv_compare_feature
18
 
19
- app = Flask(__name__)
20
 
21
- app.config['SITE'] = "http://0.0.0.0:8000/"
22
- app.config['DEBUG'] = False
23
 
24
- licenseKey = os.environ.get("LICENSE_KEY")
25
- licensePath = "license.txt"
26
- modelFolder = os.path.abspath(os.path.dirname(__file__)) + '/facewrapper/dict'
27
 
28
- version = ttv_version()
29
- print("version: ", version.decode('utf-8'))
 
 
 
30
 
31
- ret = ttv_init(modelFolder.encode('utf-8'), licenseKey.encode('utf-8'))
32
- if ret != 0:
33
- print(f"online init failed: {ret}");
34
 
35
- hwid = ttv_get_hwid()
36
- print("hwid: ", hwid.decode('utf-8'))
 
 
37
 
38
- ret = ttv_init_offline(modelFolder.encode('utf-8'), licensePath.encode('utf-8'))
39
- if ret != 0:
40
- print(f"offline init failed: {ret}")
41
- exit(-1)
42
- else:
43
- print(f"offline init ok")
44
-
45
- else:
46
- print(f"online init ok")
47
-
48
- @app.route('/api/compare_face', methods=['POST'])
49
- def compare_face():
50
- file1 = request.files['image1']
51
- image1 = cv2.imdecode(np.fromstring(file1.read(), np.uint8), cv2.IMREAD_COLOR)
52
- if image1 is None:
53
- result = "image1: is null!"
54
- status = "ok"
55
- response = jsonify({"status": status, "data": {"result": result}})
56
- response.status_code = 200
57
- response.headers["Content-Type"] = "application/json; charset=utf-8"
58
- return response
59
-
60
- file2 = request.files['image2']
61
- image2 = cv2.imdecode(np.fromstring(file2.read(), np.uint8), cv2.IMREAD_COLOR)
62
- if image2 is None:
63
- result = "image2: is null!"
64
- status = "ok"
65
- response = jsonify({"status": status, "data": {"result": result}})
66
- response.status_code = 200
67
- response.headers["Content-Type"] = "application/json; charset=utf-8"
68
- return response
69
-
70
- faceRect1 = np.zeros([4], dtype=np.int32)
71
- feature1 = np.zeros([2048], dtype=np.uint8)
72
- featureSize1 = np.zeros([1], dtype=np.int32)
73
-
74
- ret = ttv_extract_feature(image1, image1.shape[1], image1.shape[0], faceRect1, feature1, featureSize1)
75
- if ret <= 0:
76
- if ret == -1:
77
- result = "license error!"
78
- elif ret == -2:
79
- result = "init error!"
80
- elif ret == 0:
81
- result = "image1: no face detected!"
82
-
83
- status = "ok"
84
- response = jsonify({"status": status, "data": {"result": result}})
85
- response.status_code = 200
86
- response.headers["Content-Type"] = "application/json; charset=utf-8"
87
- return response
88
-
89
- faceRect2 = np.zeros([4], dtype=np.int32)
90
- feature2 = np.zeros([2048], dtype=np.uint8)
91
- featureSize2 = np.zeros([1], dtype=np.int32)
92
-
93
- ret = ttv_extract_feature(image2, image2.shape[1], image2.shape[0], faceRect2, feature2, featureSize2)
94
- if ret <= 0:
95
- if ret == -1:
96
- result = "license error!"
97
- elif ret == -2:
98
- result = "init error!"
99
- elif ret == 0:
100
- result = "image2: no face detected!"
101
-
102
- status = "ok"
103
- response = jsonify({"status": status, "data": {"result": result}})
104
- response.status_code = 200
105
- response.headers["Content-Type"] = "application/json; charset=utf-8"
106
- return response
107
-
108
- similarity = ttv_compare_feature(feature1, feature2)
109
- if similarity > 0.7:
110
- result = "same"
111
- else:
112
- result = "different"
113
-
114
- status = "ok"
115
- response = jsonify(
116
- {
117
- "status": status,
118
- "data": {
119
- "result": result,
120
- "similarity": float(similarity),
121
- "face1": {"x1": int(faceRect1[0]), "y1": int(faceRect1[1]), "x2": int(faceRect1[2]), "y2" : int(faceRect1[3])},
122
- "face2": {"x1": int(faceRect2[0]), "y1": int(faceRect2[1]), "x2": int(faceRect2[2]), "y2" : int(faceRect2[3])},
123
- }
124
- })
125
-
126
- response.status_code = 200
127
- response.headers["Content-Type"] = "application/json; charset=utf-8"
128
- return response
129
-
130
-
131
- @app.route('/api/compare_face_base64', methods=['POST'])
132
- def coompare_face_base64():
133
- content = request.get_json()
134
- imageBase641 = content['image1']
135
- image1 = cv2.imdecode(np.frombuffer(base64.b64decode(imageBase641), dtype=np.uint8), cv2.IMREAD_COLOR)
136
-
137
- if image1 is None:
138
- result = "image1: is null!"
139
- status = "ok"
140
- response = jsonify({"status": status, "data": {"result": result}})
141
- response.status_code = 200
142
- response.headers["Content-Type"] = "application/json; charset=utf-8"
143
- return response
144
-
145
- imageBase642 = content['image2']
146
- image2 = cv2.imdecode(np.frombuffer(base64.b64decode(imageBase642), dtype=np.uint8), cv2.IMREAD_COLOR)
147
-
148
- if image2 is None:
149
- result = "image2: is null!"
150
- status = "ok"
151
- response = jsonify({"status": status, "data": {"result": result}})
152
- response.status_code = 200
153
- response.headers["Content-Type"] = "application/json; charset=utf-8"
154
- return response
155
-
156
- faceRect1 = np.zeros([4], dtype=np.int32)
157
- feature1 = np.zeros([2048], dtype=np.uint8)
158
- featureSize1 = np.zeros([1], dtype=np.int32)
159
-
160
- ret = ttv_extract_feature(image1, image1.shape[1], image1.shape[0], faceRect1, feature1, featureSize1)
161
- if ret <= 0:
162
- if ret == -1:
163
- result = "license error!"
164
- elif ret == -2:
165
- result = "init error!"
166
- elif ret == 0:
167
- result = "image1: no face detected!"
168
-
169
- status = "ok"
170
- response = jsonify({"status": status, "data": {"result": result}})
171
- response.status_code = 200
172
- response.headers["Content-Type"] = "application/json; charset=utf-8"
173
- return response
174
-
175
- faceRect2 = np.zeros([4], dtype=np.int32)
176
- feature2 = np.zeros([2048], dtype=np.uint8)
177
- featureSize2 = np.zeros([1], dtype=np.int32)
178
-
179
- ret = ttv_extract_feature(image2, image2.shape[1], image2.shape[0], faceRect2, feature2, featureSize2)
180
- if ret <= 0:
181
- if ret == -1:
182
- result = "license error!"
183
- elif ret == -2:
184
- result = "init error!"
185
- elif ret == 0:
186
- result = "image2: no face detected!"
187
-
188
- status = "ok"
189
- response = jsonify({"status": status, "data": {"result": result}})
190
- response.status_code = 200
191
- response.headers["Content-Type"] = "application/json; charset=utf-8"
192
- return response
193
-
194
- similarity = ttv_compare_feature(feature1, feature2)
195
- if similarity > 0.7:
196
- result = "same"
197
  else:
198
- result = "different"
199
-
200
- status = "ok"
201
- response = jsonify(
202
- {
203
- "status": status,
204
- "data": {
205
- "result": result,
206
- "similarity": float(similarity),
207
- "face1": {"x1": int(faceRect1[0]), "y1": int(faceRect1[1]), "x2": int(faceRect1[2]), "y2" : int(faceRect1[3])},
208
- "face2": {"x1": int(faceRect2[0]), "y1": int(faceRect2[1]), "x2": int(faceRect2[2]), "y2" : int(faceRect2[3])},
209
- }
210
- })
211
- response.status_code = 200
212
- response.headers["Content-Type"] = "application/json; charset=utf-8"
213
- return response
214
-
215
- if __name__ == '__main__':
216
- port = int(os.environ.get("PORT", 8000))
217
- app.run(host='0.0.0.0', port=port)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import cv2
2
+ import face_recognition
3
+ import gradio as gr
4
+ import datetime as DT
5
+ import pytz
6
 
7
+ ipAddress = None
 
 
 
 
 
8
 
 
9
 
10
+ def __nowInIST():
11
+ return DT.datetime.now(pytz.timezone("Asia/Kolkata"))
12
 
 
 
 
13
 
14
+ def __attachIp(request: gr.Request):
15
+ global ipAddress
16
+ x_forwarded_for = request.headers.get('x-forwarded-for')
17
+ if x_forwarded_for:
18
+ ipAddress = x_forwarded_for
19
 
 
 
 
20
 
21
+ def pprint(log: str):
22
+ now = __nowInIST()
23
+ now = now.strftime("%Y-%m-%d %H:%M:%S")
24
+ print(f"[{now}] [{ipAddress}] {log}")
25
 
26
+
27
+ def __findFaceEncodings(imagePath):
28
+ image = cv2.imread(imagePath)
29
+ faceEncodings = face_recognition.face_encodings(image)
30
+ return faceEncodings[0] if len(faceEncodings) > 0 else None
31
+
32
+
33
+ def predictMatch(firstImage, secondImage):
34
+ image1Encoding = __findFaceEncodings(firstImage)
35
+ image2Encoding = __findFaceEncodings(secondImage)
36
+
37
+ pprint("Starting the job...")
38
+ distance = face_recognition.face_distance([image1Encoding], image2Encoding)
39
+ distancePercent = round(distance[0] * 100)
40
+ matchPercent = 100 - distancePercent
41
+ pprint(f"Job finished. Match : {matchPercent}%")
42
+
43
+ isSame = matchPercent > 50
44
+
45
+ amplificationFactor = 1.5
46
+ adjustment = (amplificationFactor - 0.5) * 100
47
+
48
+ outputTexts = []
49
+ if isSame:
50
+ outputTexts.append("The images are of the same person ✅")
51
+ matchScore = round((matchPercent + adjustment) / (100 + adjustment) * 100)
52
+ outputTexts.append(f"\nMatch Score: {matchScore}%")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  else:
54
+ outputTexts.append("The images are not of the same person ❌")
55
+
56
+ outputText = "\n".join(outputTexts)
57
+ pprint(f"{outputText=}")
58
+
59
+ return outputText
60
+
61
+
62
+ with gr.Row(elem_classes=["main-container"]):
63
+ with gr.Row(elem_classes=["img-container"]):
64
+ firstImage = gr.Image(type='filepath', height=250, elem_classes=["image"])
65
+ secondImage = gr.Image(type='filepath', height=250, elem_classes=["image"])
66
+
67
+ with gr.Row(elem_classes=["output-container"]):
68
+ result = gr.Textbox(label="Result", elem_classes=["output"])
69
+
70
+
71
+ with gr.Interface(
72
+ fn=predictMatch,
73
+ inputs=[
74
+ firstImage,
75
+ secondImage,
76
+ ],
77
+ outputs=[
78
+ result
79
+ ],
80
+ title="Face Match Detector",
81
+ allow_flagging="never",
82
+ ) as demo:
83
+ demo.load(__attachIp, None, None)
84
+
85
+ demo.launch(debug=True)
facewrapper/dict/data1.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:36cf5fcc49345989a86839a53529314ec1fe5d621c377a1952bc7538d55e7f1b
3
- size 16255630
 
 
 
 
facewrapper/dict/data2.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f25fb0cd3d70cb84c258e7109620f411c087e0875828d6ab86cc9c4838d49bec
3
- size 11875339
 
 
 
 
facewrapper/dict/data3.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:06daf36a8545f59ac104415e8b2d38072d06abc027cb346fd4a6c6029fed55b4
3
- size 90716260
 
 
 
 
facewrapper/dict/detect.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b505c320dd8add047f107549849a307d0c6f518f01c1d3402bce9e13a765146
3
- size 28463173
 
 
 
 
facewrapper/facewrapper.py DELETED
@@ -1,32 +0,0 @@
1
- import ctypes, ctypes.util
2
- from ctypes import *
3
- from numpy.ctypeslib import ndpointer
4
- import sys
5
- import os
6
-
7
- lib_path = os.path.abspath(os.path.dirname(__file__)) + '/libs/libttvfaceengine6.so'
8
- liveness_engine = cdll.LoadLibrary(lib_path)
9
-
10
- ttv_version = liveness_engine.ttv_version
11
- ttv_version.argtypes = []
12
- ttv_version.restype = ctypes.c_char_p
13
-
14
- ttv_get_hwid = liveness_engine.ttv_get_hwid
15
- ttv_get_hwid.argtypes = []
16
- ttv_get_hwid.restype = ctypes.c_char_p
17
-
18
- ttv_init = liveness_engine.ttv_init
19
- ttv_init.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
20
- ttv_init.restype = ctypes.c_int32
21
-
22
- ttv_init_offline = liveness_engine.ttv_init_offline
23
- ttv_init_offline.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
24
- ttv_init_offline.restype = ctypes.c_int32
25
-
26
- ttv_extract_feature = liveness_engine.ttv_extract_feature
27
- ttv_extract_feature.argtypes = [ndpointer(ctypes.c_ubyte, flags='C_CONTIGUOUS'), ctypes.c_int32, ctypes.c_int32, ndpointer(ctypes.c_int32, flags='C_CONTIGUOUS'), ndpointer(ctypes.c_ubyte, flags='C_CONTIGUOUS'), ndpointer(ctypes.c_int32, flags='C_CONTIGUOUS')]
28
- ttv_extract_feature.restype = ctypes.c_int
29
-
30
- ttv_compare_feature = liveness_engine.ttv_compare_feature
31
- ttv_compare_feature.argtypes = [ndpointer(ctypes.c_ubyte, flags='C_CONTIGUOUS'), ndpointer(ctypes.c_ubyte, flags='C_CONTIGUOUS')]
32
- ttv_compare_feature.restype = ctypes.c_double
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
facewrapper/libs/libimutils.so DELETED
Binary file (412 kB)
 
facewrapper/libs/libimutils.so_for_ubuntu22 DELETED
Binary file (412 kB)
 
facewrapper/libs/libttvfaceengine6.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:01d366a336938c06f53a77d36a590a571898cccb297b768a4df7490baac12b80
3
- size 4998416
 
 
 
 
gradio/demo.py DELETED
@@ -1,115 +0,0 @@
1
- import gradio as gr
2
- import requests
3
- import json
4
- from PIL import Image
5
-
6
- def compare_face(frame1, frame2):
7
- url = "http://127.0.0.1:8000/api/compare_face"
8
- files = {'image1': open(frame1, 'rb'), 'image2': open(frame2, 'rb')}
9
-
10
- r = requests.post(url=url, files=files)
11
- faces = None
12
-
13
- try:
14
- image1 = Image.open(frame1)
15
- image2 = Image.open(frame2)
16
-
17
- face1 = None
18
- face2 = None
19
- data = r.json().get('data')
20
- if data.get('face1') is not None:
21
- face = data.get('face1')
22
- x1 = face.get('x1')
23
- y1 = face.get('y1')
24
- x2 = face.get('x2')
25
- y2 = face.get('y2')
26
- if x1 < 0:
27
- x1 = 0
28
- if y1 < 0:
29
- y1 = 0
30
- if x2 >= image1.width:
31
- x2 = image1.width - 1
32
- if y2 >= image1.height:
33
- y2 = image1.height - 1
34
-
35
- face1 = image1.crop((x1, y1, x2, y2))
36
- face_image_ratio = face1.width / float(face1.height)
37
- resized_w = int(face_image_ratio * 150)
38
- resized_h = 150
39
-
40
- face1 = face1.resize((int(resized_w), int(resized_h)))
41
-
42
- if data.get('face2') is not None:
43
- face = data.get('face2')
44
- x1 = face.get('x1')
45
- y1 = face.get('y1')
46
- x2 = face.get('x2')
47
- y2 = face.get('y2')
48
-
49
- if x1 < 0:
50
- x1 = 0
51
- if y1 < 0:
52
- y1 = 0
53
- if x2 >= image2.width:
54
- x2 = image2.width - 1
55
- if y2 >= image2.height:
56
- y2 = image2.height - 1
57
-
58
- face2 = image2.crop((x1, y1, x2, y2))
59
- face_image_ratio = face2.width / float(face2.height)
60
- resized_w = int(face_image_ratio * 150)
61
- resized_h = 150
62
-
63
- face2 = face2.resize((int(resized_w), int(resized_h)))
64
-
65
- if face1 is not None and face2 is not None:
66
- new_image = Image.new('RGB',(face1.width + face2.width + 10, 150), (80,80,80))
67
-
68
- new_image.paste(face1,(0,0))
69
- new_image.paste(face2,(face1.width + 10, 0))
70
- faces = new_image.copy()
71
- elif face1 is not None and face2 is None:
72
- new_image = Image.new('RGB',(face1.width + face1.width + 10, 150), (80,80,80))
73
-
74
- new_image.paste(face1,(0,0))
75
- faces = new_image.copy()
76
- elif face1 is None and face2 is not None:
77
- new_image = Image.new('RGB',(face2.width + face2.width + 10, 150), (80,80,80))
78
-
79
- new_image.paste(face2,(face2.width + 10, 0))
80
- faces = new_image.copy()
81
- except:
82
- pass
83
-
84
- return [r.json(), faces]
85
-
86
- with gr.Blocks() as demo:
87
- gr.Markdown(
88
- """
89
- # Face Recognition
90
- Get your own Face Recognition Server by duplicating this space.<br/>
91
- Or run on your own machine using docker.<br/>
92
- ```docker run -it -p 7860:7860 --platform=linux/amd64 \
93
- -e LICENSE_KEY="YOUR_VALUE_HERE" \
94
- registry.hf.space/faceonlive-face-recognition-sdk:latest ```<br/><br/>
95
- Contact us at https://faceonlive.com for issues and support.<br/>
96
- """
97
- )
98
- with gr.Row():
99
- with gr.Column():
100
- compare_face_input1 = gr.Image(type='filepath', height=480)
101
- gr.Examples(['gradio/examples/1.jpg', 'gradio/examples/2.jpg'],
102
- inputs=compare_face_input1)
103
- compare_face_button = gr.Button("Compare Face")
104
- with gr.Column():
105
- compare_face_input2 = gr.Image(type='filepath', height=480)
106
- gr.Examples(['gradio/examples/3.jpg', 'gradio/examples/4.jpg'],
107
- inputs=compare_face_input2)
108
- with gr.Column():
109
- compare_face_output = gr.Image(type="pil", height=300)
110
- compare_result_output = gr.JSON(label='Result')
111
-
112
- compare_face_button.click(compare_face, inputs=[compare_face_input1, compare_face_input2], outputs=[compare_result_output, compare_face_output])
113
- gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FFaceOnLive%2FFace-Recognition-SDK"><img src="https://api.visitorbadge.io/api/combined?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FFaceOnLive%2FFace-Recognition-SDK&labelColor=%23ff8a65&countColor=%2337d67a&style=flat&labelStyle=upper" /></a>')
114
-
115
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gradio/examples/1.jpg DELETED
Binary file (14.1 kB)
 
gradio/examples/2.jpg DELETED
Binary file (5.97 kB)
 
gradio/examples/3.jpg DELETED
Binary file (13.3 kB)
 
gradio/examples/4.jpg DELETED
Binary file (6.47 kB)
 
openvino/cache.json DELETED
The diff for this file is too large to render. See raw diff
 
openvino/libgna.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3
- size 3120536
 
 
 
 
openvino/libgna.so.2 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3
- size 3120536
 
 
 
 
openvino/libgna.so.3.0.0.1455 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3
- size 3120536
 
 
 
 
openvino/libopenvino.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd216848c1ba78e62360c12c9684df0c160f6962f3d900e5918cc042b42b2b46
3
- size 13495416
 
 
 
 
openvino/libopenvino_auto_batch_plugin.so DELETED
Binary file (391 kB)
 
openvino/libopenvino_auto_plugin.so DELETED
Binary file (371 kB)
 
openvino/libopenvino_c.so DELETED
Binary file (305 kB)
 
openvino/libopenvino_gapi_preproc.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ac5ce0a8f3acefb41e8aa8161f78035dafff25c4b8c3485ebc541573b2b15f0
3
- size 1312920
 
 
 
 
openvino/libopenvino_hetero_plugin.so DELETED
Binary file (367 kB)
 
openvino/libopenvino_intel_cpu_plugin.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:afe05ada6d5b11495a21787fa6ab0162fc40f7a9ab97be78f7b7185126d15b18
3
- size 33299880
 
 
 
 
openvino/libopenvino_intel_gna_plugin.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef15b623e7f81788160c4056ccd5e887a8184affe381e84a906646ef36cae1ab
3
- size 4067016
 
 
 
 
openvino/libopenvino_intel_hddl_plugin.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:96362327fbc404e88583bdcd2a526ccbf4ca26d4ecdb8898234be7986d9b8b2b
3
- size 5894680
 
 
 
 
openvino/libopenvino_intel_myriad_plugin.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e596436002565356b80400e0d7e50093d53d338f623b171f658de527477852de
3
- size 6120168
 
 
 
 
openvino/libopenvino_ir_frontend.so DELETED
Binary file (343 kB)
 
openvino/libopenvino_onnx_frontend.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0770ed09d471b20bffcf4ef57ab1fb002db04c4404598bd5c52a4418a67f5441
3
- size 3781640
 
 
 
 
openvino/libopenvino_paddle_frontend.so DELETED
Binary file (987 kB)
 
openvino/libopenvino_tensorflow_fe.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2dadbcd8ba32cec02873caf8dcc644d1d8856cdcd2978c603e5bac169e01bb9
3
- size 2723864
 
 
 
 
openvino/pcie-ma2x8x.mvcmd DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f03146453508f2bcab1589907bccaa429b48db6123a7b8a428d6ce221d1fbb4d
3
- size 2099248
 
 
 
 
openvino/plugins.xml DELETED
@@ -1,27 +0,0 @@
1
- <ie>
2
- <plugins>
3
- <plugin name="AUTO" location="libopenvino_auto_plugin.so">
4
- <properties>
5
- <property key="MULTI_WORK_MODE_AS_AUTO" value="YES"/>
6
- </properties>
7
- </plugin>
8
- <plugin name="BATCH" location="libopenvino_auto_batch_plugin.so">
9
- </plugin>
10
- <plugin name="CPU" location="libopenvino_intel_cpu_plugin.so">
11
- </plugin>
12
- <plugin name="GNA" location="libopenvino_intel_gna_plugin.so">
13
- </plugin>
14
- <plugin name="GPU" location="libopenvino_intel_gpu_plugin.so">
15
- </plugin>
16
- <plugin name="HETERO" location="libopenvino_hetero_plugin.so">
17
- </plugin>
18
- <plugin name="MULTI" location="libopenvino_auto_plugin.so">
19
- </plugin>
20
- <plugin name="MYRIAD" location="libopenvino_intel_myriad_plugin.so">
21
- </plugin>
22
- <plugin name="HDDL" location="libopenvino_intel_hddl_plugin.so">
23
- </plugin>
24
- <plugin name="VPUX" location="libopenvino_intel_vpux_plugin.so">
25
- </plugin>
26
- </plugins>
27
- </ie>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvino/usb-ma2x8x.mvcmd DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:faf33388b88708177a358fcb4704eba04b1cf9e88d6a047f90c833d686140a2e
3
- size 2298632
 
 
 
 
openvino/vpu_custom_kernels/binarization.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e0de6082c7bacca2ff5ad131f0afc44304fc792a6d99e7829399eb61491a0ac
3
- size 19632
 
 
 
 
openvino/vpu_custom_kernels/binarization.cl DELETED
@@ -1,67 +0,0 @@
1
- // Copyright (C) 2018-2022 Intel Corporation
2
- // SPDX-License-Identifier: Apache-2.0
3
- //
4
-
5
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
- #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
-
8
- __kernel void binarization(
9
- const __global half *__restrict src_data,
10
- const __global half *__restrict input_low_high,
11
- const __global half *__restrict dst_data,
12
- int switch_out,
13
- int input_low_high_size,
14
- int W,
15
- int H)
16
- {
17
- __local half local_src[15 * 1024];
18
- __local half local_dst[15 * 1024];
19
-
20
- event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
21
- wait_group_events(1, &e1);
22
-
23
- int c = get_global_id(2);
24
- int C = get_global_size(2);
25
-
26
- half dst_low = switch_out ? 1.h : -1.h;
27
- half dst_high = switch_out ? -1.h : 1.h;
28
-
29
- half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c];
30
-
31
- for (int h = 0; h < H; h++) {
32
-
33
- __local const half *__restrict addr_src = local_src + h * W;
34
- __local half *__restrict addr_dst = local_dst + h * W;
35
-
36
- #if 1
37
- for (int w = 0; w < W / 8; w++) {
38
-
39
- half8 h_src_val8 = (*((__local half8 *)addr_src + w));
40
-
41
- short8 cond1;
42
- cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh);
43
- cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh);
44
- cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh);
45
- cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh);
46
- cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh);
47
- cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh);
48
- cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh);
49
- cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh);
50
-
51
- cond1 = ~(cond1 - (short8)1);
52
-
53
- short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high);
54
-
55
- *((__local half8 *)addr_dst + w) = as_half8(res);
56
- }
57
- #endif
58
- for (int w = W & (~0x7); w < W; w++) {
59
- addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high;
60
- }
61
- }
62
-
63
- barrier(CLK_LOCAL_MEM_FENCE);
64
-
65
- event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
66
- wait_group_events(1, &e2);
67
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvino/vpu_custom_kernels/binary_convolution.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:12c349d6f73c233b158e1d67af31715c7b8bda79f191b1e759476e01e65bb64a
3
- size 10764
 
 
 
 
openvino/vpu_custom_kernels/binary_convolution.cl DELETED
@@ -1,95 +0,0 @@
1
- // Copyright (C) 2018-2022 Intel Corporation
2
- // SPDX-License-Identifier: Apache-2.0
3
- //
4
-
5
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
-
7
- int extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
8
-
9
- __kernel void binary_convolution(
10
- const __global half *restrict src_data,
11
- const __global uchar *restrict weights_data,
12
- __global half *restrict dst_data,
13
- float pad_value,
14
-
15
- int IW,
16
- int IH,
17
- int IC,
18
-
19
- int DW,
20
- int DH,
21
-
22
- int GC,
23
-
24
- int KW,
25
- int KH,
26
-
27
- int PW,
28
- int PH,
29
-
30
- int SW,
31
- int SH)
32
- {
33
- int ipad_value = ((pad_value > 0.f) ? 1 : 0);
34
- int c = get_global_id(2);
35
- int y = get_global_id(1);
36
- int x = get_global_id(0);
37
-
38
- int OC = get_global_size(2);
39
- int OH = get_global_size(1);
40
- int OW = get_global_size(0);
41
-
42
- int KD = 1;
43
- int SD = 0;
44
- int DD = 0;
45
- int PD = 0;
46
- int ID = 1;
47
- int OD = 1;
48
-
49
- int nbits = 8;
50
-
51
- int g = c % GC;
52
- int oc = c / GC;
53
- int oh = y;
54
- int ow = x;
55
-
56
- for (int od = 0; od < OD; od++) {
57
- int oidx = g * OC / GC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow;
58
-
59
- int res = 0;
60
-
61
- for (int ic = 0; ic < IC / GC; ic++) {
62
- for (int kd = 0; kd < KD; kd++) {
63
- for (int kh = 0; kh < KH; kh++) {
64
- for (int kw = 0; kw < KW; kw++) {
65
- int widx = g * OC / GC * IC / GC * KD * KH * KW
66
- + oc * IC / GC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW
67
- + kh * KW + kw;
68
-
69
- int w = extract_weights(weights_data[widx / nbits], (widx % nbits));
70
-
71
- int s;
72
-
73
- int iw = ow * SW - PW + kw * DW;
74
- int ih = oh * SH - PH + kh * DH;
75
- int id = od * SD - PD + kd * DD;
76
-
77
- if (iw < 0 || iw >= (int)IW || ih < 0 || ih >= (int)IH || id < 0
78
- || id >= (int)ID) {
79
- s = ipad_value;
80
- } else {
81
- int iidx = g * IC / GC * ID * IH * IW + ic * ID * IH * IW + id * IH * IW
82
- + ih * IW + iw;
83
-
84
- s = ((src_data[iidx] > 0.f) ? 1 : 0);
85
- }
86
-
87
- res += s ^ w;
88
- }
89
- }
90
- }
91
- }
92
-
93
- dst_data[oidx] = (half)(IC / GC * KD * KH * KW - 2 * res);
94
- }
95
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvino/vpu_custom_kernels/binary_convolution1x1.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6deff31d62aa84c643fbeba77e7dcd4ae5d9b488c1c98e07fffeb58ff8e9b945
3
- size 76316
 
 
 
 
openvino/vpu_custom_kernels/binary_convolution1x1.cl DELETED
@@ -1,117 +0,0 @@
1
- // Copyright (C) 2018-2022 Intel Corporation
2
- // SPDX-License-Identifier: Apache-2.0
3
- //
4
-
5
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
- #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
-
8
- ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
9
-
10
- __kernel void binary_convolution(
11
- const __global half *restrict src_data,
12
- const __global uchar *restrict weights_data,
13
- __global half *restrict dst_data,
14
- float pad_value,
15
-
16
- int IW,
17
- int IH,
18
- int IC,
19
-
20
- int DW,
21
- int DH,
22
-
23
- int GC,
24
-
25
- int KW,
26
- int KH,
27
-
28
- int PW,
29
- int PH,
30
-
31
- int SW,
32
- int SH,
33
-
34
- int OW)
35
- {
36
- __local half src_local[32 * 1024];
37
- __local half dst_local[2 * 1024];
38
-
39
- const int oh = get_group_id(0);
40
- const int oc = get_group_id(1);
41
- const int OH = get_global_size(0);
42
- const int OC = get_global_size(1);
43
-
44
- const int gc = oc / (OC / GC);
45
-
46
- if (oh * SH >= 0 && oh * SH <= IH - 1) {
47
- const __global half *src = src_data + (gc * IC / GC) * IW * IH + (SH * oh) * IW;
48
-
49
- event_t e1 = async_work_group_copy_2D2D(
50
- src_local, // dst
51
- src, // src
52
- IW, // num_elements_per_line,
53
- IC / GC, // num_lines,
54
- IH * IW - IW, // src_line_stride,
55
- 0, // dst_line_stride,
56
- 0);
57
- wait_group_events(1, &e1);
58
- }
59
-
60
- half pad_value_half = convert_half(pad_value);
61
-
62
- //padding row
63
- if (oh * SH > IH - 1) {
64
- __local half *dst = src_local;
65
- for (int c = 0; c < IC / GC; c++) {
66
- #pragma unroll 8
67
- for (int j = 0; j < IW; j++) {
68
- dst[j] = pad_value_half;
69
- }
70
- dst += IW;
71
- }
72
- }
73
-
74
- int OWS = SW * OW;
75
- ushort8 in;
76
-
77
- for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
78
- ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
79
- for (int ic = 0; ic < IC / GC; ++ic) {
80
- __local half *src = (__local half *)((__local half8 *)(src_local + ic * IW) + ows8);
81
- int weight_pos = oc * IC / GC + ic;
82
- ushort w =
83
- extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
84
-
85
- if ((ows8 * 8) <= IW - 1) {
86
- in = *((__local ushort8 *)(src));
87
- }
88
-
89
- //padding column
90
- if (ows8 * 8 + 7 > IW - 1) {
91
- int boundary = (IW - 1) - ows8 * 8 + 1;
92
- boundary = boundary < 0 ? 0 : boundary;
93
- for (int offset = boundary; offset < 8; offset++) {
94
- *((half *)(&in) + offset) = pad_value_half;
95
- }
96
- }
97
-
98
- ushort8 w8 = (ushort8)(w);
99
-
100
- ushort8 cond =
101
- (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
102
-
103
- val += (cond ^ w8);
104
- }
105
-
106
- ushort8 val_shift = val << 1;
107
- int boundary = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
108
- for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
109
- *(dst_local + ow) = (half)(IC / GC - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
110
- }
111
- }
112
-
113
- barrier(CLK_LOCAL_MEM_FENCE);
114
-
115
- event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
116
- wait_group_events(1, &e2);
117
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvino/vpu_custom_kernels/binary_convolution3x3.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:55e3c3f8863ff7a3583bcc7340d1e226775f5f14cfb11dd32bd671764570f7cb
3
- size 104136
 
 
 
 
openvino/vpu_custom_kernels/binary_convolution3x3.cl DELETED
@@ -1,278 +0,0 @@
1
- // Copyright (C) 2018-2022 Intel Corporation
2
- // SPDX-License-Identifier: Apache-2.0
3
- //
4
-
5
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
- #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
-
8
- ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
9
-
10
- __kernel void binary_convolution(
11
- const __global half *restrict src_data,
12
- const __global uchar *restrict weights_data,
13
- const __global half *restrict dst_data,
14
- float pad_value,
15
-
16
- int IW,
17
- int IH,
18
- int IC,
19
-
20
- int DW,
21
- int DH,
22
-
23
- int GC,
24
-
25
- int KW,
26
- int KH,
27
-
28
- int PW,
29
- int PH,
30
-
31
- int SW,
32
- int SH,
33
-
34
- int OW)
35
- {
36
- __local half src_local[32 * 1024];
37
- __local half dst_local[2 * 1024];
38
-
39
- const int oh = get_group_id(0);
40
- const int oc = get_group_id(1);
41
- const int OH = get_global_size(0);
42
- const int OC = get_global_size(1);
43
-
44
- const int gc = oc / (OC / GC);
45
-
46
- if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows
47
- {
48
- event_t e = async_work_group_copy_3D3D(
49
- src_local, // dst
50
- src_data + (gc * IC / GC) * IW * IH + (SH * oh - 1) * IW, // src
51
- IW, // num_elements_per_line
52
- 3, // num_lines
53
- DH * IW - IW, // src_line_stride
54
- 0, // dst_line_stride
55
- IC / GC, // num planes
56
- IH * IW - 3 * DH * IW, // src plane stride
57
- 0, // dst plane stride
58
- 0);
59
- wait_group_events(1, &e);
60
- } else {
61
- int ih = oh * SH - 1;
62
- if (ih >= 0 && ih <= IH - 1) //dma for first row
63
- {
64
- event_t e = async_work_group_copy_2D2D(
65
- src_local, // dst
66
- src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
67
- IW, // num_elements_per_line,
68
- IC / GC, // num_lines,
69
- IH * IW - IW, // src_line_stride,
70
- 2 * IW, // dst_line_stride,
71
- 0);
72
-
73
- wait_group_events(1, &e);
74
- }
75
- ih = oh * SH - 1 + DH;
76
- if (ih >= 0 && ih <= IH - 1) //dma for second row
77
- {
78
- event_t e = async_work_group_copy_2D2D(
79
- src_local + IW, // dst
80
- src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
81
- IW, // num_elements_per_line,
82
- IC / GC, // num_lines,
83
- IH * IW - IW, // src_line_stride,
84
- 2 * IW, // dst_line_stride,
85
- 0);
86
- wait_group_events(1, &e);
87
- }
88
- ih = oh * SH - 1 + 2 * DH;
89
- if (ih >= 0 && ih <= IH - 1) //dma for third row
90
- {
91
- event_t e = async_work_group_copy_2D2D(
92
- src_local + 2 * IW, // dst
93
- src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
94
- IW, // num_elements_per_line,
95
- IC / GC, // num_lines,
96
- IH * IW - IW, // src_line_stride,
97
- 2 * IW, // dst_line_stride,
98
- 0);
99
- wait_group_events(1, &e);
100
- }
101
- }
102
-
103
- half pad_value_half = convert_half(pad_value);
104
-
105
- //padding row
106
- if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) {
107
- __local half *dst = src_local;
108
- for (int c = 0; c < IC / GC; c++) {
109
- #pragma unroll 8
110
- for (int j = 0; j < IW; j++) {
111
- dst[j] = pad_value_half;
112
- }
113
- dst += 3 * IW;
114
- }
115
- }
116
- if (oh * SH + DH - 1 > IH - 1) {
117
- __local half *dst = src_local + IW;
118
- for (int c = 0; c < IC / GC; c++) {
119
- #pragma unroll 8
120
- for (int j = 0; j < IW; j++) {
121
- dst[j] = pad_value_half;
122
- }
123
- dst += 3 * IW;
124
- }
125
- }
126
- if (oh * SH + DH + DH - 1 > IH - 1) {
127
- __local half *dst = src_local + 2 * IW;
128
- for (int c = 0; c < IC / GC; c++) {
129
- #pragma unroll 8
130
- for (int j = 0; j < IW; j++) {
131
- dst[j] = pad_value_half;
132
- }
133
- dst += 3 * IW;
134
- }
135
- }
136
-
137
- int OWS = SW * OW;
138
-
139
- ushort8 in00;
140
- ushort8 in01;
141
- ushort8 in02;
142
- ushort8 in10;
143
- ushort8 in11;
144
- ushort8 in12;
145
- ushort8 in20;
146
- ushort8 in21;
147
- ushort8 in22;
148
-
149
- for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
150
- ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
151
- for (int ic = 0; ic < IC / GC; ++ic) {
152
- __local half *src =
153
- (__local half *)((__local half8 *)(src_local + ic * IW * 3 + IW + DW - 1) + ows8);
154
- int weight_pos = oc * IC / GC * 3 * 3 + ic * 3 * 3;
155
- ushort w0 = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
156
- ushort w1 = extract_weights(weights_data[((weight_pos + 1)) / 8], ((weight_pos + 1) % 8));
157
- ushort w2 = extract_weights(weights_data[((weight_pos + 2)) / 8], ((weight_pos + 2) % 8));
158
- ushort w3 = extract_weights(weights_data[((weight_pos + 3)) / 8], ((weight_pos + 3) % 8));
159
- ushort w4 = extract_weights(weights_data[((weight_pos + 4)) / 8], ((weight_pos + 4) % 8));
160
- ushort w5 = extract_weights(weights_data[((weight_pos + 5)) / 8], ((weight_pos + 5) % 8));
161
- ushort w6 = extract_weights(weights_data[((weight_pos + 6)) / 8], ((weight_pos + 6) % 8));
162
- ushort w7 = extract_weights(weights_data[((weight_pos + 7)) / 8], ((weight_pos + 7) % 8));
163
- ushort w8 = extract_weights(weights_data[((weight_pos + 8)) / 8], ((weight_pos + 8) % 8));
164
-
165
- if ((ows8 * 8) - 1 <= IW - 1) {
166
- in00 = *((__local ushort8 *)(src - IW - DW));
167
- in01 = *((__local ushort8 *)(src - IW));
168
- in02 = *((__local ushort8 *)(src - IW + DW));
169
-
170
- in10 = *((__local ushort8 *)(src - DW));
171
- in11 = *((__local ushort8 *)(src));
172
- in12 = *((__local ushort8 *)(src + DW));
173
-
174
- in20 = *((__local ushort8 *)(src + IW - DW));
175
- in21 = *((__local ushort8 *)(src + IW));
176
- in22 = *((__local ushort8 *)(src + IW + DW));
177
- }
178
-
179
- //padding column
180
- if (ows8 * 8 - 1 < 0) {
181
- int boundary = 1 - ows8 * 8;
182
- boundary = boundary > 8 ? 8 : boundary;
183
- for (int offset = 0; offset < boundary; offset++) {
184
- *((half *)(&in00) + offset) = pad_value_half;
185
- *((half *)(&in10) + offset) = pad_value_half;
186
- *((half *)(&in20) + offset) = pad_value_half;
187
- }
188
- }
189
- if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) {
190
- int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1;
191
- boundary = boundary < 0 ? 0 : boundary;
192
- for (int offset = boundary; offset < 8; offset++) {
193
- *((half *)(&in02) + offset) = pad_value_half;
194
- *((half *)(&in12) + offset) = pad_value_half;
195
- *((half *)(&in22) + offset) = pad_value_half;
196
- }
197
- }
198
- if ((ows8 * 8 + 7) + DW - 1 > IW - 1) {
199
- int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1;
200
- boundary = boundary < 0 ? 0 : boundary;
201
- for (int offset = boundary; offset < 8; offset++) {
202
- *((half *)(&in01) + offset) = pad_value_half;
203
- *((half *)(&in11) + offset) = pad_value_half;
204
- *((half *)(&in21) + offset) = pad_value_half;
205
- }
206
- }
207
- if ((ows8 * 8 + 7) - 1 > IW - 1) {
208
- int boundary = (IW - 1 + 1) - ows8 * 8 + 1;
209
- boundary = boundary < 0 ? 0 : boundary;
210
- for (int offset = boundary; offset < 8; offset++) {
211
- *((half *)(&in00) + offset) = pad_value_half;
212
- *((half *)(&in10) + offset) = pad_value_half;
213
- *((half *)(&in20) + offset) = pad_value_half;
214
- }
215
- }
216
-
217
- ushort8 w00 = (ushort8)(w0);
218
- ushort8 w01 = (ushort8)(w1);
219
- ushort8 w02 = (ushort8)(w2);
220
- ushort8 w10 = (ushort8)(w3);
221
- ushort8 w11 = (ushort8)(w4);
222
- ushort8 w12 = (ushort8)(w5);
223
- ushort8 w20 = (ushort8)(w6);
224
- ushort8 w21 = (ushort8)(w7);
225
- ushort8 w22 = (ushort8)(w8);
226
-
227
- ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ?
228
- (ushort8)(1) :
229
- (ushort8)(0);
230
- ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ?
231
- (ushort8)(1) :
232
- (ushort8)(0);
233
- ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ?
234
- (ushort8)(1) :
235
- (ushort8)(0);
236
- ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ?
237
- (ushort8)(1) :
238
- (ushort8)(0);
239
- ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ?
240
- (ushort8)(1) :
241
- (ushort8)(0);
242
- ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ?
243
- (ushort8)(1) :
244
- (ushort8)(0);
245
- ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ?
246
- (ushort8)(1) :
247
- (ushort8)(0);
248
- ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ?
249
- (ushort8)(1) :
250
- (ushort8)(0);
251
- ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ?
252
- (ushort8)(1) :
253
- (ushort8)(0);
254
-
255
- val += (cond0 ^ w00);
256
- val += (cond1 ^ w01);
257
- val += (cond2 ^ w02);
258
- val += (cond3 ^ w10);
259
- val += (cond4 ^ w11);
260
- val += (cond5 ^ w12);
261
- val += (cond6 ^ w20);
262
- val += (cond7 ^ w21);
263
- val += (cond8 ^ w22);
264
- }
265
-
266
- ushort8 val_shift = val << 1;
267
- int boundary = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
268
- for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
269
- *(dst_local + ow) =
270
- (half)(IC / GC * KH * KW - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
271
- }
272
- }
273
-
274
- barrier(CLK_LOCAL_MEM_FENCE);
275
-
276
- event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
277
- wait_group_events(1, &e2);
278
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvino/vpu_custom_kernels/convolution1x1_chw.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8717c8429d41a69337007871137f06a9e6b38c685b5b3fecc634fade0eaa7e7f
3
- size 9220
 
 
 
 
openvino/vpu_custom_kernels/convolution1x1_chw.cl DELETED
@@ -1,114 +0,0 @@
1
- // Copyright (C) 2018-2022 Intel Corporation
2
- // SPDX-License-Identifier: Apache-2.0
3
- //
4
-
5
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
- #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
-
8
- __kernel void Convolution1x1_NCHW(
9
- const __global half *in,
10
- const __global half *out,
11
- const __global half *w,
12
- int IW,
13
- int IH,
14
- int IC,
15
- int OW,
16
- int OH,
17
- int OC)
18
- {
19
- __local half in_local[8 * 1024];
20
- __local half out_local[8 * 1024];
21
-
22
- event_t e1 = async_work_group_copy_2D2D(
23
- in_local, // dst
24
- in + get_group_id(0) * IW, // src
25
- IW, // num_elements_per_line,
26
- IC, // num_lines,
27
- IW * IH - IW, // src_line_stride,
28
- 0, // dst_line_stride,
29
- 0);
30
- wait_group_events(1, &e1);
31
-
32
- int oh = get_global_id(0);
33
- int oc = get_global_id(1);
34
-
35
- int stride;
36
- int write_output = 0;
37
- __global half *src;
38
-
39
- __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
40
- __global half *w1 = (__global half *)(&w[oc * IC]);
41
-
42
- for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
43
- uint iw = ow;
44
- uint ih = oh;
45
-
46
- half8 val8_0 = 0.0f;
47
-
48
- __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
49
- __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
50
- __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
51
- __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
52
- __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
53
- __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
54
- __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
55
- __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
56
-
57
- for (uint ic = 0; ic < IC / 8; ic++) {
58
- val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
59
- val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
60
- val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
61
- val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
62
- val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
63
- val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
64
- val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
65
- val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
66
- }
67
-
68
- for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
69
- val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
70
- }
71
- *((__local half8 *)&out_local[ow + 0]) = (val8_0);
72
- }
73
-
74
- uint iw = (OW & (~0x7));
75
- uint ih = oh;
76
-
77
- half8 val8_0 = 0.0f;
78
-
79
- __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
80
- __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
81
- __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
82
- __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
83
- __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
84
- __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
85
- __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
86
- __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
87
-
88
- for (uint ic = 0; ic < IC / 8; ic++) {
89
- val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
90
- val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
91
- val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
92
- val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
93
- val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
94
- val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
95
- val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
96
- val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
97
- }
98
-
99
- for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
100
- val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
101
- }
102
- for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
103
- out_local[ow + 0] = (val8_0[ow % 8]);
104
- }
105
-
106
- barrier(CLK_LOCAL_MEM_FENCE);
107
-
108
- event_t e2 = async_work_group_copy(
109
- out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
110
- out_local,
111
- OW,
112
- 0);
113
- wait_group_events(1, &e2);
114
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
openvino/vpu_custom_kernels/convolution1x1_hwc.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b6122a6bf6f50d2c7fc612d4e286559f9c96746e166892d192e1264e1ce5a2c
3
- size 4304
 
 
 
 
openvino/vpu_custom_kernels/convolution1x1_hwc.cl DELETED
@@ -1,126 +0,0 @@
1
- // Copyright (C) 2018-2022 Intel Corporation
2
- // SPDX-License-Identifier: Apache-2.0
3
- //
4
-
5
- #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
- #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7
-
8
- __kernel void Convolution1x1_NHWC(
9
- const __global half *in,
10
- const __global half *out,
11
- const __global half *w,
12
- int IW,
13
- int IH,
14
- int IC,
15
- int OW,
16
- int OH,
17
- int OC)
18
- {
19
-
20
- __local half in_local[8 * 1024];
21
- __local half out_local[8 * 1024];
22
-
23
- const int sizeAct = IW * IC;
24
-
25
- event_t e1 = async_work_group_copy(in_local, in + get_group_id(0) * sizeAct, sizeAct, 0);
26
- wait_group_events(1, &e1);
27
-
28
- int oh = get_global_id(0);
29
- int oc = get_global_id(1);
30
-
31
- int stride;
32
- int write_output = 0;
33
- __global half *src;
34
-
35
- __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
36
- __global half *w1 = (__global half *)(&w[oc * IC]);
37
-
38
- for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
39
- uint iw = ow;
40
- uint ih = oh;
41
-
42
- half8 val8_0 = 0.0f;
43
- half8 val8_1 = 0.0f;
44
- half8 val8_2 = 0.0f;
45
- half8 val8_3 = 0.0f;
46
- half8 val8_4 = 0.0f;
47
- half8 val8_5 = 0.0f;
48
- half8 val8_6 = 0.0f;
49
- half8 val8_7 = 0.0f;
50
-
51
- __local half8 *in8_0 = (__local half8 *)(&in_local[(iw + 0) * IC]);
52
- __local half8 *in8_1 = (__local half8 *)(&in_local[(iw + 1) * IC]);
53
- __local half8 *in8_2 = (__local half8 *)(&in_local[(iw + 2) * IC]);
54
- __local half8 *in8_3 = (__local half8 *)(&in_local[(iw + 3) * IC]);
55
- __local half8 *in8_4 = (__local half8 *)(&in_local[(iw + 4) * IC]);
56
- __local half8 *in8_5 = (__local half8 *)(&in_local[(iw + 5) * IC]);
57
- __local half8 *in8_6 = (__local half8 *)(&in_local[(iw + 6) * IC]);
58
- __local half8 *in8_7 = (__local half8 *)(&in_local[(iw + 7) * IC]);
59
-
60
- for (uint ic = 0; ic < IC / 8; ++ic) {
61
- val8_0 += (in8_0[ic]) * (w8[ic]);
62
- val8_1 += (in8_1[ic]) * (w8[ic]);
63
- val8_2 += (in8_2[ic]) * (w8[ic]);
64
- val8_3 += (in8_3[ic]) * (w8[ic]);
65
- val8_4 += (in8_4[ic]) * (w8[ic]);
66
- val8_5 += (in8_5[ic]) * (w8[ic]);
67
- val8_6 += (in8_6[ic]) * (w8[ic]);
68
- val8_7 += (in8_7[ic]) * (w8[ic]);
69
- }
70
-
71
- half val_0 = 0.0f;
72
- half val_1 = 0.0f;
73
- half val_2 = 0.0f;
74
- half val_3 = 0.0f;
75
- half val_4 = 0.0f;
76
- half val_5 = 0.0f;
77
- half val_6 = 0.0f;
78
- half val_7 = 0.0f;
79
- for (uint ic = IC & (~0x7); ic < IC; ++ic) {
80
- val_0 += *((__local half *)in8_0 + ic) * (*((__global half *)w8 + ic));
81
- val_1 += *((__local half *)in8_1 + ic) * (*((__global half *)w8 + ic));
82
- val_2 += *((__local half *)in8_2 + ic) * (*((__global half *)w8 + ic));
83
- val_3 += *((__local half *)in8_3 + ic) * (*((__global half *)w8 + ic));
84
- val_4 += *((__local half *)in8_4 + ic) * (*((__global half *)w8 + ic));
85
- val_5 += *((__local half *)in8_5 + ic) * (*((__global half *)w8 + ic));
86
- val_6 += *((__local half *)in8_6 + ic) * (*((__global half *)w8 + ic));
87
- val_7 += *((__local half *)in8_7 + ic) * (*((__global half *)w8 + ic));
88
- }
89
- out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0;
90
- out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1;
91
- out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2;
92
- out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3;
93
- out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4;
94
- out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5;
95
- out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6;
96
- out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7;
97
- }
98
- for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
99
-
100
- uint iw = ow;
101
- uint ih = oh;
102
-
103
- half8 val8 = 0.0f;
104
-
105
- __local half8 *in8 = (__local half8 *)(&in_local[iw * IC]);
106
-
107
- for (uint ic = 0; ic < IC / 8; ++ic) {
108
- val8 += (in8[ic]) * (w8[ic]);
109
- }
110
-
111
- half val = 0.0f;
112
- for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
113
- val += (*((__local half *)in8 + ic)) * (*((__global half *)w8 + ic));
114
- }
115
- out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val;
116
- }
117
-
118
- barrier(CLK_LOCAL_MEM_FENCE);
119
-
120
- event_t e2 = async_work_group_copy(
121
- out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
122
- out_local,
123
- OW,
124
- 0);
125
- wait_group_events(1, &e2);
126
- }