Spaces:
Sleeping
Sleeping
Upload 12 files
Browse files- .gitattributes +1 -0
- preprocessing/20words_mean_face.npy +3 -0
- preprocessing/30word - Copy.csv +30 -0
- preprocessing/30word.csv +30 -0
- preprocessing/README.md +23 -0
- preprocessing/anhtrasn.json +30 -0
- preprocessing/crop_mouth_from_video.py +283 -0
- preprocessing/extract_audio_from_video.py +56 -0
- preprocessing/shape_predictor_68_face_landmarks.dat +3 -0
- preprocessing/transform.py +61 -0
- preprocessing/utils.py +149 -0
- preprocessing/vietnamese_detected_face_30_words.csv +0 -0
- preprocessing/vietnamese_detected_face_30_words_have_snr.csv +0 -0
.gitattributes
CHANGED
|
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
preprocessing/shape_predictor_68_face_landmarks.dat filter=lfs diff=lfs merge=lfs -text
|
preprocessing/20words_mean_face.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dbf68b2044171e1160716df7c53e8bbfaa0ee8c61fb41171d04cb6092bb81422
|
| 3 |
+
size 1168
|
preprocessing/30word - Copy.csv
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ο»ΏthΓ΄ng,1916,,,,thΓ΄ng,tin,cα»§a,vΓ ,cΓ‘c,cΓ³,trong,lΓ ,ngΓ y,ΔΓ£,ΔαΊ§u,theo,cΓ΄ng,tΖ°,quΓ½
|
| 2 |
+
tin,1740,,,,1916,1740,1687,1640,1566,1513,1512,1344,1330,1284,1202,1197,1165,1148,1119
|
| 3 |
+
cα»§a,1687,,,,nhα»―ng,thΓ nh,cho,vα»,tαΊΏ,vα»,phα»,thΓ‘ng,Δα»ng,sαΊ£n,vα»i,Δược,chΓnh,sα»,dΓ΅i
|
| 4 |
+
vΓ ,1640,,,,1076,1065,1050,971,937,925,919,903,902,883,873,869,797,760,750
|
| 5 |
+
cΓ‘c,1566,,,,,,,,,,,,,,,,,,
|
| 6 |
+
cΓ³,1513,,,,,,,,,,,,,,,,,,
|
| 7 |
+
trong,1512,,,,,,,,,,,,,,,,,,
|
| 8 |
+
lΓ ,1344,,,,,,,,,,,,,,,,,,
|
| 9 |
+
ngΓ y,1330,,,,,,,,,,,,,,,,,,
|
| 10 |
+
ΔΓ£,1284,,,,,,,,,,,,,,,,,,
|
| 11 |
+
ΔαΊ§u,1202,,,,,,,,,,,,,,,,,,
|
| 12 |
+
theo,1197,,,,,,,,,,,,,,,,,,
|
| 13 |
+
cΓ΄ng,1165,,,,,,,,,,,,,,,,,,
|
| 14 |
+
tΖ°,1148,,,,,,,,,,,,,,,,,,
|
| 15 |
+
quΓ½,1119,,,,,,,,,,,,,,,,,,
|
| 16 |
+
nhα»―ng,1076,,,,,,,,,,,,,,,,,,
|
| 17 |
+
thΓ nh,1065,,,,,,,,,,,,,,,,,,
|
| 18 |
+
cho,1050,,,,,,,,,,,,,,,,,,
|
| 19 |
+
vα»,971,,,,,,,,,,,,,,,,,,
|
| 20 |
+
tαΊΏ,937,,,,,,,,,,,,,,,,,,
|
| 21 |
+
vα»,925,,,,,,,,,,,,,,,,,,
|
| 22 |
+
phα»,919,,,,,,,,,,,,,,,,,,
|
| 23 |
+
thΓ‘ng,903,,,,,,,,,,,,,,,,,,
|
| 24 |
+
Δα»ng,902,,,,,,,,,,,,,,,,,,
|
| 25 |
+
sαΊ£n,883,,,,,,,,,,,,,,,,,,
|
| 26 |
+
vα»i,873,,,,,,,,,,,,,,,,,,
|
| 27 |
+
Δược,869,,,,,,,,,,,,,,,,,,
|
| 28 |
+
chΓnh,797,,,,,,,,,,,,,,,,,,
|
| 29 |
+
sα»,760,,,,,,,,,,,,,,,,,,
|
| 30 |
+
dΓ΅i,750,,,,,,,,,,,,,,,,,,
|
preprocessing/30word.csv
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
thΓ΄ng,1916
|
| 2 |
+
tin,1740
|
| 3 |
+
cα»§a,1687
|
| 4 |
+
vΓ ,1640
|
| 5 |
+
cΓ‘c,1566
|
| 6 |
+
cΓ³,1513
|
| 7 |
+
trong,1512
|
| 8 |
+
lΓ ,1344
|
| 9 |
+
ngΓ y,1330
|
| 10 |
+
ΔΓ£,1284
|
| 11 |
+
ΔαΊ§u,1202
|
| 12 |
+
theo,1197
|
| 13 |
+
cΓ΄ng,1165
|
| 14 |
+
tΖ°,1148
|
| 15 |
+
quΓ½,1119
|
| 16 |
+
nhα»―ng,1076
|
| 17 |
+
thΓ nh,1065
|
| 18 |
+
cho,1050
|
| 19 |
+
vα»,971
|
| 20 |
+
tαΊΏ,937
|
| 21 |
+
vα»,925
|
| 22 |
+
phα»,919
|
| 23 |
+
thΓ‘ng,903
|
| 24 |
+
Δα»ng,902
|
| 25 |
+
sαΊ£n,883
|
| 26 |
+
vα»i,873
|
| 27 |
+
Δược,869
|
| 28 |
+
chΓnh,797
|
| 29 |
+
sα»,760
|
| 30 |
+
dΓ΅i,750
|
preprocessing/README.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### Pre-processing
|
| 2 |
+
|
| 3 |
+
* To get mouth ROIs
|
| 4 |
+
|
| 5 |
+
Run mouth cropping script to save grayscale mouth ROIs. We assume you save cropped mouths to *`$TCN_LIPREADING_ROOT/datasets/visual_data/`*. You can choose `--testset-only` to produce testing set.
|
| 6 |
+
|
| 7 |
+
```Shell
|
| 8 |
+
python crop_mouth_from_video.py --video-direc <LRW-DIREC> \
|
| 9 |
+
--landmark-direc <LANDMARK-DIREC> \
|
| 10 |
+
--save-direc <MOUTH-ROIS-DIRECTORY> \
|
| 11 |
+
--convert-gray \
|
| 12 |
+
--testset-only
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
* To get audio waveforms
|
| 16 |
+
|
| 17 |
+
Run format conversion script to extract audio waveforms (.npz) from raw videos. We assume you save audio waveforms to *`$TCN_LIPREADING_ROOT/datasets/audio_data/`*. You can choose `--testset-only` to produce testing set.
|
| 18 |
+
|
| 19 |
+
```Shell
|
| 20 |
+
python extract_audio_from_video.py --video-direc <LRW-DIREC> \
|
| 21 |
+
--save-direc <AUDIO-WAVEFORMS-DIRECTORY> \
|
| 22 |
+
--testset-only
|
| 23 |
+
```
|
preprocessing/anhtrasn.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"transcript": "Xin kΓnh chΓ o quΓ½ vα» KΓnh mα»i quΓ½ vα»",
|
| 3 |
+
"words": [
|
| 4 |
+
{
|
| 5 |
+
"end_time": 7.6,
|
| 6 |
+
"start_time": 0.0,
|
| 7 |
+
"word": "Xin"
|
| 8 |
+
},
|
| 9 |
+
{
|
| 10 |
+
"end_time": 7.8,
|
| 11 |
+
"start_time": 7.6,
|
| 12 |
+
"word": "kΓnh"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"end_time": 8.0,
|
| 16 |
+
"start_time": 7.8,
|
| 17 |
+
"word": "chΓ o"
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"end_time": 8.0,
|
| 21 |
+
"start_time": 8.0,
|
| 22 |
+
"word": "quΓ½"
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"end_time": 8.2,
|
| 26 |
+
"start_time": 8.0,
|
| 27 |
+
"word": "vα»"
|
| 28 |
+
}
|
| 29 |
+
]
|
| 30 |
+
}
|
preprocessing/crop_mouth_from_video.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#! /usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
# Copyright 2020 Imperial College London (Pingchuan Ma)
|
| 5 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
| 6 |
+
|
| 7 |
+
""" Crop Mouth ROIs from videos for lipreading"""
|
| 8 |
+
|
| 9 |
+
# from msilib.schema import File
|
| 10 |
+
from ast import Pass
|
| 11 |
+
import os
|
| 12 |
+
import cv2 # OpenCV λΌμ΄λΈλ¬λ¦¬
|
| 13 |
+
import glob # 리λ
μ€μ κ²½λ‘ νκΈ°λ²μ μ¬μ©νμ¬ μνλ ν΄λ/νμΌ λ¦¬μ€νΈ μ»μ
|
| 14 |
+
import argparse # λͺ
λ Ήν μΈμλ₯Ό νμ±ν΄μ£Όλ λͺ¨λ
|
| 15 |
+
import numpy as np
|
| 16 |
+
from collections import deque # collections λͺ¨λμ μλ λ°ν¬ λΆλ¬μ€κΈ° # λ°ν¬: μ€νκ³Ό νλ₯Ό ν©μΉ μλ£κ΅¬μ‘°
|
| 17 |
+
|
| 18 |
+
from utils import * # utils.py λͺ¨λμ μλ λͺ¨λ ν¨μ λΆλ¬μ€κΈ°
|
| 19 |
+
from transform import * # transform.py λͺ¨λμ μλ λͺ¨λ ν¨μ λΆλ¬μ€κΈ°
|
| 20 |
+
|
| 21 |
+
import dlib # face landmark μ°Ύλ λΌμ΄λΈλ¬λ¦¬
|
| 22 |
+
import face_alignment # face landmark μ°Ύλ λΌμ΄λΈλ¬λ¦¬
|
| 23 |
+
from PIL import Image
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# μΈμκ°μ λ°μμ μ²λ¦¬νλ ν¨μ
|
| 27 |
+
def load_args(default_config=None):
    """Build and parse the command-line arguments for mouth-ROI preprocessing.

    :param default_config: unused; kept for interface compatibility.
    :return: argparse.Namespace with all preprocessing options.
    """
    parser = argparse.ArgumentParser(description='Lipreading Pre-processing')

    # -- utils
    parser.add_argument('--video-direc', default=None, help='raw video directory')
    parser.add_argument('--video-format', default='.mp4', help='raw video format')
    parser.add_argument('--landmark-direc', default=None, help='landmark directory')
    parser.add_argument('--filename-path', default='./vietnamese_detected_face_30.csv', help='list of detected video and its subject ID')
    parser.add_argument('--save-direc', default=None, help='the directory of saving mouth ROIs')
    # -- mean face utils
    parser.add_argument('--mean-face', default='./20words_mean_face.npy', help='mean face pathname')
    # -- mouthROIs utils
    parser.add_argument('--crop-width', default=96, type=int, help='the width of mouth ROIs')
    parser.add_argument('--crop-height', default=96, type=int, help='the height of mouth ROIs')
    parser.add_argument('--start-idx', default=48, type=int, help='the start of landmark index')
    parser.add_argument('--stop-idx', default=68, type=int, help='the end of landmark index')
    parser.add_argument('--window-margin', default=12, type=int, help='window margin for smoothed_landmarks')
    # -- convert to gray scale
    parser.add_argument('--convert-gray', default=False, action='store_true', help='convert2grayscale')
    # -- test set only
    parser.add_argument('--testset-only', default=False, action='store_true', help='process testing set only')

    return parser.parse_args()
|
| 54 |
+
|
| 55 |
+
args = load_args()  # parse CLI arguments once at module level; used throughout this script

# -- mean face utils
STD_SIZE = (256, 256)  # output size of the warped (aligned) frame
mean_face_landmarks = np.load(args.mean_face)  # reference landmarks (default: 20words_mean_face.npy)
stablePntsIDs = [33, 36, 39, 42, 45]  # landmark indices used as stable anchors when fitting the alignment transform
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# μμμμ λλλ§ν¬ λ°μμ μ
μ μλΌλ΄κΈ°
|
| 64 |
+
def crop_patch( video_pathname, landmarks):

    """Crop mouth patches from a video.

    Each frame is aligned to the mean face with a similarity transform
    estimated from landmarks smoothed over a sliding window of
    ``args.window_margin`` frames, then the mouth region
    (landmarks ``args.start_idx``..``args.stop_idx``) is cut out.

    :param str video_pathname: pathname of the input video
    :param list landmarks: per-frame interpolated landmarks
    :return: ndarray of cropped mouth patches, or None if the video yields
        fewer frames than ``len(landmarks)`` (loop exhausts without return)
    """

    frame_idx = 0
    frame_gen = read_video(video_pathname)  # generator of BGR frames

    while True:
        try:
            frame = frame_gen.__next__() ## -- BGR
        except StopIteration:  # video exhausted
            break
        if frame_idx == 0:
            # lazily create the smoothing window and the output sequence
            q_frame, q_landmarks = deque(), deque()
            sequence = []

        q_landmarks.append(landmarks[frame_idx])
        q_frame.append(frame)
        if len(q_frame) == args.window_margin:
            # average landmarks over the window to stabilise the transform
            smoothed_landmarks = np.mean(q_landmarks, axis=0)
            cur_landmarks = q_landmarks.popleft()
            cur_frame = q_frame.popleft()
            # -- affine transformation: align the current frame to the mean face
            trans_frame, trans = warp_img( smoothed_landmarks[stablePntsIDs, :],
                                           mean_face_landmarks[stablePntsIDs, :],
                                           cur_frame,
                                           STD_SIZE)
            trans_landmarks = trans(cur_landmarks)
            # -- crop mouth patch
            sequence.append( cut_patch( trans_frame,
                                        trans_landmarks[args.start_idx:args.stop_idx],
                                        args.crop_height//2,
                                        args.crop_width//2,))
        if frame_idx == len(landmarks)-1:
            # last frame reached: flush the remaining window using the most
            # recently fitted transform `trans`
            while q_frame:
                cur_frame = q_frame.popleft()
                # -- transform frame
                trans_frame = apply_transform( trans, cur_frame, STD_SIZE)
                # -- transform landmarks
                trans_landmarks = trans(q_landmarks.popleft())
                # -- crop mouth patch
                sequence.append( cut_patch( trans_frame,
                                            trans_landmarks[args.start_idx:args.stop_idx],
                                            args.crop_height//2,
                                            args.crop_width//2,))
            return np.array(sequence)  # stacked mouth patches
        frame_idx += 1
    return None
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# λλλ§ν¬ 보κ°
|
| 119 |
+
def landmarks_interpolate(landmarks):

    """Fill in missing per-frame landmarks.

    Gaps between detected frames are linearly interpolated; leading and
    trailing undetected frames are padded with the nearest detection.

    :param list landmarks: per-frame landmarks (None where detection failed)
    :return: fully populated landmark list, or None if nothing was detected
    """

    detected = [i for i, lm in enumerate(landmarks) if lm is not None]

    # nothing detected at all -> give up
    if not detected:
        return None

    # interpolate across every gap between consecutive detections
    for prev_idx, next_idx in zip(detected, detected[1:]):
        if next_idx - prev_idx > 1:
            landmarks = linear_interpolate(landmarks, prev_idx, next_idx)

    detected = [i for i, lm in enumerate(landmarks) if lm is not None]
    # -- Corner case: frames at the beginning or end that were never detected
    # are padded with the closest detected landmarks.
    if detected:
        landmarks[:detected[0]] = [landmarks[detected[0]]] * detected[0]
        landmarks[detected[-1]:] = [landmarks[detected[-1]]] * (len(landmarks) - detected[-1])

    detected = [i for i, lm in enumerate(landmarks) if lm is not None]
    # every frame must now carry a landmark
    assert len(detected) == len(landmarks), "not every frame has landmark"

    return landmarks
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def get_yield(output_video):
    """Yield the frames of *output_video* one at a time."""
    yield from output_video
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# Read the CSV index: one "<filename>,<person_id>" entry per line.
lines = open(args.filename_path).read().splitlines()
# With --testset-only keep only files whose parent directory is named 'test'.
lines = list(filter(lambda x: 'test' == x.split('/')[-2], lines)) if args.testset_only else lines

# NOTE(review): indentation of this script was reconstructed from a diff view;
# the nesting of the crop/save section below should be confirmed upstream.
for filename_idx, line in enumerate(lines):

    # filename, person id
    filename, person_id = line.split(',')
    print('idx: {} \tProcessing.\t{}'.format(filename_idx, filename))

    video_pathname = os.path.join(args.video_direc, filename+args.video_format)  # input video path
    landmarks_pathname = os.path.join(args.landmark_direc, filename+'.npz')      # per-frame landmark file
    dst_pathname = os.path.join( args.save_direc, filename+'.npz')               # output mouth-ROI file

    assert os.path.isfile(video_pathname), "File does not exist. Path input: {}".format(video_pathname)

    # If no landmark file exists for an .mp4 video, detect landmarks with dlib
    # and write the .npz ourselves.
    if not os.path.exists(landmarks_pathname) and video_pathname.split('.')[-1] == 'mp4':

        def get_face_landmark(img):
            # Detect faces with dlib's HOG detector and fit the bundled
            # 68-point shape predictor.
            detector_hog = dlib.get_frontal_face_detector()
            dlib_rects = detector_hog(img, 1)
            model_path = os.path.dirname(os.path.abspath(__file__)) + '/shape_predictor_68_face_landmarks.dat'
            landmark_predictor = dlib.shape_predictor(model_path)

            # collect one 68-point list per detected face
            list_landmarks = []
            for dlib_rect in dlib_rects:
                points = landmark_predictor(img, dlib_rect)
                list_points = list(map(lambda p: (p.x, p.y), points.parts()))
                list_landmarks.append(list_points)

            # NOTE(review): img.shape is (rows, cols), so unpacking it as
            # (width, height) swaps the axes — harmless here because the
            # *_rate values below are never used, but worth confirming.
            input_width, input_height = img.shape
            output_width, output_height = (256, 256)
            width_rate = input_width / output_width
            height_rate = input_height / output_height
            img_rate = [(width_rate, height_rate)]*68
            face_rate = np.array(img_rate)
            eye_rate = np.array(img_rate[36:48])

            # at least one face detected
            if list_landmarks:
                # NOTE(review): returns inside the loop's scope — only the
                # landmarks of the LAST detected face survive.
                for dlib_rect, landmark in zip(dlib_rects, list_landmarks):
                    face_landmark = np.array(landmark)  # full 68-point face landmarks
                    eye_landmark = np.array(landmark[36:48])  # eye landmarks
                return face_landmark, eye_landmark
            # no face detected: fall back to all-zero landmarks
            else:
                landmark = [(0.0, 0.0)] * 68
                face_landmark = np.array(landmark)  # placeholder face landmarks
                eye_landmark = np.array(landmark[36:48])  # placeholder eye landmarks
                return face_landmark, eye_landmark

        target_frames = 29  # number of frames each clip is resampled to
        video = videoToArray(video_pathname, is_gray=args.convert_gray)  # video frames as ndarray
        output_video = frameAdjust(video, target_frames)  # frame sampling to target_frames

        multi_sub_landmarks = []
        person_landmarks = []
        frame_landmarks = []
        for frame_idx, frame in enumerate(get_yield(output_video)):
            print(f'\n ------------frame {frame_idx}------------ ')

            facial_landmarks, eye_landmarks = get_face_landmark(frame)  # detect landmarks with dlib

            # Mimic the record layout of the precomputed landmark files;
            # pose/fitting-score fields are fixed placeholder values.
            person_landmarks = {
                'id': 0,
                'most_recent_fitting_scores': np.array([2.0,2.0,2.0]),
                'facial_landmarks': facial_landmarks,
                'roll': 7,
                'yaw': 3.5,
                'eye_landmarks': eye_landmarks,
                'fitting_scores_updated': True,
                'pitch': -0.05
            }
            frame_landmarks.append(person_landmarks)
            multi_sub_landmarks.append(np.array(frame_landmarks.copy(), dtype=object))

        multi_sub_landmarks = np.array(multi_sub_landmarks)  # list to numpy
        save2npz(landmarks_pathname, data=multi_sub_landmarks)  # persist detected landmarks
        print('\n ------------ save npz ------------ \n')

    # A landmark file already exists for this video.
    else:

        assert os.path.isfile(landmarks_pathname), "File does not exist. Path input: {}".format(landmarks_pathname)

        # skip videos whose mouth ROIs were already produced
        if os.path.exists(dst_pathname):
            continue

        multi_sub_landmarks = np.load( landmarks_pathname, allow_pickle=True)['data']
        landmarks = [None] * len( multi_sub_landmarks)  # one slot per frame
        for frame_idx in range(len(landmarks)):
            try:
                # facial landmarks of this person in this frame
                landmarks[frame_idx] = multi_sub_landmarks[frame_idx][int(person_id)]['facial_landmarks'].astype(np.float64)
            except IndexError:  # person_id not present in this frame
                continue

        # Crop and save only when the landmarks are not the all-zero placeholder.
        # NOTE(review): comparing a list of ndarrays with != relies on list
        # comparison semantics (length/identity first); confirm the intent.
        landmarks_empty_list = []
        landmarks_empty = [(0, 0)]*68
        landmarks_empty = np.array(landmarks_empty, dtype=object)
        for i in range(len(landmarks_empty)):
            landmarks_empty_list.append(landmarks_empty.copy())
        condition = landmarks != landmarks_empty_list
        if condition:
            # -- pre-process landmarks: interpolate frames not being detected.
            preprocessed_landmarks = landmarks_interpolate(landmarks)
            # interpolation failed (no detections at all)
            if not preprocessed_landmarks:
                continue

            # -- crop
            sequence = crop_patch(video_pathname, preprocessed_landmarks)
            assert sequence is not None, "cannot crop from {}.".format(filename)

            # -- save (grayscale, or BGR->RGB via channel reversal)
            data = convert_bgr2gray(sequence) if args.convert_gray else sequence[...,::-1]
            save2npz(dst_pathname, data=data)

print('Done.')
|
preprocessing/extract_audio_from_video.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#! /usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
# Copyright 2020 Imperial College London (Pingchuan Ma)
|
| 5 |
+
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
"""Transforms mp4 audio to npz. Code has strong assumptions on the dataset organization!"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import librosa # μμ λ°μ΄ν° λΆμ λΌμ΄λΈλ¬λ¦¬
|
| 12 |
+
import argparse # λͺ
λ Ήν μΈμλ₯Ό νμ±ν΄μ£Όλ λͺ¨λ
|
| 13 |
+
|
| 14 |
+
from utils import * # utils.py λͺ¨λμ μλ λͺ¨λ ν¨μ(read_txt_lines(), save2npz(), read_video()) λΆλ¬μ€κΈ°
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# μΈμκ°μ λ°μμ μ²λ¦¬νλ ν¨μ
|
| 18 |
+
def load_args(default_config=None):
    """Build and parse the command-line arguments for audio extraction.

    :param default_config: unused; kept for interface compatibility.
    :return: argparse.Namespace with the extraction options.
    """
    parser = argparse.ArgumentParser(description='Extract Audio Waveforms')

    # -- utils
    parser.add_argument('--video-direc', default=None, help='raw video directory')
    parser.add_argument('--filename-path', default='./vietnamese_detected_face_30.csv', help='list of detected video and its subject ID')
    parser.add_argument('--save-direc', default=None, help='the directory of saving audio waveforms (.npz)')
    # -- test set only
    parser.add_argument('--testset-only', default=False, action='store_true', help='process testing set only')

    return parser.parse_args()
|
| 33 |
+
|
| 34 |
+
args = load_args()  # parse CLI arguments

# Read the CSV index: one "<filename>,<person_id>" entry per line ('\n'-split).
lines = open(args.filename_path).read().splitlines()
# With --testset-only keep only files whose parent directory is named 'test'.
lines = list(filter(lambda x: 'test' == x.split('/')[-2], lines)) if args.testset_only else lines

for filename_idx, line in enumerate(lines):

    # filename, person id
    filename, person_id = line.split(',')
    print('idx: {} \tProcessing.\t{}'.format(filename_idx, filename))

    video_pathname = os.path.join(args.video_direc, filename+'.mp4')  # input video path
    dst_pathname = os.path.join( args.save_direc, filename+'.npz')    # output waveform path

    assert os.path.isfile(video_pathname), "File does not exist. Path input: {}".format(video_pathname)

    # librosa normalises audio to [-1, 1] on load; passing an explicit
    # sr=16000 (instead of sr=None) resamples the track to 16 kHz.
    # Only the trailing 19456 samples (~1.216 s at 16 kHz) are kept.
    data = librosa.load(video_pathname, sr=16000)[0][-19456:]
    save2npz(dst_pathname, data=data)  # save the waveform as .npz
|
preprocessing/shape_predictor_68_face_landmarks.dat
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fbdc2cb80eb9aa7a758672cbfdda32ba6300efe9b6e6c7a299ff7e736b11b92f
|
| 3 |
+
size 99693937
|
preprocessing/transform.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2 # OpenCV λΌμ΄λΈλ¬λ¦¬
|
| 2 |
+
import numpy as np
|
| 3 |
+
from skimage import transform as tf # μ΄λ―Έμ§ λ³ν λͺ¨λ
|
| 4 |
+
|
| 5 |
+
# -- Landmark interpolation:
|
| 6 |
+
def linear_interpolate(landmarks, start_idx, stop_idx):
    """Linearly interpolate landmarks between two detected frames.

    Entries strictly between *start_idx* and *stop_idx* are overwritten
    in place with values interpolated from the two endpoint arrays.

    :return: the (mutated) landmarks list.
    """
    start = landmarks[start_idx]
    stop = landmarks[stop_idx]
    delta = stop - start          # total landmark displacement over the gap
    span = stop_idx - start_idx   # gap width in frames
    for offset in range(1, span):
        # same arithmetic as the original: fraction of the gap times delta
        landmarks[start_idx + offset] = start + offset / float(span) * delta
    return landmarks
|
| 13 |
+
|
| 14 |
+
# -- Face Transformation
|
| 15 |
+
# src: μ
λ ₯ μμ, dst: μΆλ ₯/κ²°κ³Ό μμ
|
| 16 |
+
def warp_img(src, dst, img, std_size):
    """Warp *img* with a similarity transform mapping *src* points onto *dst*.

    :param src: landmark points in the input frame
    :param dst: corresponding reference (mean-face) points
    :param img: frame to warp
    :param std_size: output (height, width) of the warped frame
    :return: (warped uint8 image, the fitted transform)
    """
    tform = tf.estimate_transform('similarity', src, dst)  # fit the transform
    warped = tf.warp(img, inverse_map=tform.inverse, output_shape=std_size)
    # tf.warp returns a float image in [0, 1]; rescale to uint8
    warped = (warped * 255).astype('uint8')
    return warped, tform
|
| 22 |
+
|
| 23 |
+
def apply_transform(transform, img, std_size):
    """Warp *img* with a previously fitted *transform*.

    :param transform: transform returned by :func:`warp_img`
    :param img: frame to warp
    :param std_size: output (height, width) of the warped frame
    :return: warped uint8 image
    """
    warped = tf.warp(img, inverse_map=transform.inverse, output_shape=std_size)
    # tf.warp returns a float image in [0, 1]; rescale to uint8
    return (warped * 255).astype('uint8')
|
| 28 |
+
|
| 29 |
+
# -- Crop
|
| 30 |
+
def cut_patch(img, landmarks, height, width, threshold=5):
    """Cut a (2*height) x (2*width) patch centred on the landmark centroid.

    The centre is clamped so the patch stays inside the image.

    :param img: source image (H x W [x C] ndarray)
    :param landmarks: (N, 2) array of (x, y) points; their mean is the centre
    :param height: half-height of the patch
    :param width: half-width of the patch
    :param threshold: tolerated centre bias in pixels
    :return: copy of the cropped region
    """
    center_x, center_y = np.mean(landmarks, axis=0)

    # clamp near the top / left borders
    # NOTE(review): each raise below follows the clamp that makes it
    # unreachable; the original ordering is preserved on purpose.
    if center_y - height < 0:
        center_y = height
    if center_y - height < 0 - threshold:
        raise Exception('too much bias in height')
    if center_x - width < 0:
        center_x = width
    if center_x - width < 0 - threshold:
        raise Exception('too much bias in width')

    # clamp near the bottom / right borders
    if center_y + height > img.shape[0]:
        center_y = img.shape[0] - height
    if center_y + height > img.shape[0] + threshold:
        raise Exception('too much bias in height')
    if center_x + width > img.shape[1]:
        center_x = img.shape[1] - width
    if center_x + width > img.shape[1] + threshold:
        raise Exception('too much bias in width')

    # slice bounds (rounded) and copy so the caller owns the data
    top = int(round(center_y) - round(height))
    bottom = int(round(center_y) + round(height))
    left = int(round(center_x) - round(width))
    right = int(round(center_x) + round(width))
    return np.copy(img[top:bottom, left:right])
|
| 57 |
+
|
| 58 |
+
# -- RGB to GRAY
|
| 59 |
+
def convert_bgr2gray(data):
    """Convert a sequence of BGR frames to one stacked grayscale ndarray."""
    gray_frames = [cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in data]
    # stack along a new leading (frame) axis
    return np.stack(gray_frames, axis=0)
|
preprocessing/utils.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#coding=utf-8
|
| 2 |
+
import os
|
| 3 |
+
import cv2 # OpenCV λΌμ΄λΈλ¬λ¦¬
|
| 4 |
+
import numpy as np
|
| 5 |
+
from PIL import Image
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# -- IO utils
|
| 9 |
+
# ν
μ€νΈ λΌμΈ λΆλ¬μ€κΈ°
|
| 10 |
+
def read_txt_lines(filepath):
    """Read a text file and return its lines as a list (newlines stripped)."""
    # fail fast with a clear message when the path is wrong
    assert os.path.isfile( filepath ), "Error when trying to read txt file, path does not exist: {}".format(filepath)

    with open(filepath) as handle:
        return handle.read().splitlines()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# npz μ μ₯
|
| 21 |
+
def save2npz(filename, data=None):
    """Save *data* to a compressed ``.npz`` archive under the key ``data``.

    The original file defined this function twice back-to-back with
    identical bodies (the second shadowed the first); the duplicates are
    merged into this single definition.

    Missing parent directories of *filename* are created automatically.

    :param filename: str, the filename where the data will be saved.
    :param data: ndarray, arrays to save to the file.
    :raises AssertionError: if *data* is None.
    """
    assert data is not None, "data is {}".format(data)

    # create the target directory if it does not exist yet
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    np.savez_compressed(filename, data=data)
|
| 38 |
+
|
| 39 |
+
# Load a video lazily, frame by frame
def read_video(filename):
    """Yield the frames of a video file one at a time.

    :param filename: str, path to the video file.
    :yields: ndarray, one BGR frame per iteration (OpenCV's native channel order).
    """
    cap = cv2.VideoCapture(filename)  # open the video file / capture object
    try:
        while cap.isOpened():  # capture was opened (initialized) successfully
            # ret: was a frame read successfully?
            # frame: the next image of the stream, in BGR order
            ret, frame = cap.read()
            if not ret:  # end of stream or read error
                break
            yield frame  # hand the frame to the caller, suspending execution here
    finally:
        # Release the capture even if the consumer abandons the generator early;
        # the original only released after the stream was fully drained, leaking
        # the handle otherwise.
        cap.release()
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Fetch basic video metadata
def get_video_info(infilename, is_print=False):
    """Return basic metadata (frame count, size, fps) for a video file.

    :param infilename: str, path to the video file.
    :param is_print: bool, echo the metadata to stdout when True.
    :return: dict with keys 'length', 'width', 'height', 'fps'.
    """
    cap = cv2.VideoCapture(infilename)
    if not cap.isOpened():
        print("could not open : ", infilename)
        cap.release()
        # NOTE(review): exits with status 0 on failure — consider raising instead.
        exit(0)

    video_info = {
        'length': int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),   # number of frames
        'width': int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),    # frame width in px
        'height': int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),  # frame height in px
        'fps': cap.get(cv2.CAP_PROP_FPS),                   # frames per second
    }
    cap.release()

    if is_print:
        for key in ('length', 'width', 'height', 'fps'):
            print('{} : '.format(key), video_info[key])

    return video_info
|
| 83 |
+
|
| 84 |
+
# Video -> Numpy
# Adapted from: https://github.com/khazit/Lip2Word/blob/master/lipReader.py#L22
def videoToArray(video_pathname, is_gray=True):
    """Load a whole video file into a single numpy array of frames.

    :param video_pathname: str, path to the video file.
    :param is_gray: bool, convert frames to grayscale when True.
    :return: uint8 ndarray of shape (n_frames, height, width) when is_gray,
        otherwise (n_frames, height, width, 3) in BGR order.
    """
    cap = cv2.VideoCapture(video_pathname)  # open the video file

    if not cap.isOpened():
        print("could not open : ", video_pathname)
        cap.release()
        # NOTE(review): exits with status 0 on failure — consider raising instead.
        exit(0)

    n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))  # reported frame count
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))     # frame width in px
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))   # frame height in px

    # Allocate the buffer once with the final dtype. The original built a
    # float64 array and then made a second full copy via astype(np.uint8).
    if is_gray:
        video = np.zeros((n_frames, height, width), dtype=np.uint8)
    else:
        video = np.zeros((n_frames, height, width, 3), dtype=np.uint8)

    i = 0
    # CAP_PROP_FRAME_COUNT can over/under-report on some containers; bound the
    # index so a surplus frame cannot raise IndexError on video[i].
    while i < n_frames:
        success, frame = cap.read()
        if not success:
            break
        if is_gray:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        video[i] = frame
        i += 1

    cap.release()  # release the capture handle

    return video  # all frames stacked along axis 0
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Frame sampling (normalize the number of frames)
# Adapted from: https://github.com/khazit/Lip2Word/blob/master/lipReader.py#L62
def frameAdjust(video, target_frames=29):
    """Resample a video along the time axis to exactly target_frames frames.

    Longer videos are uniformly subsampled; shorter ones are padded by
    repeating the last frame. Assumes video has at least one frame.

    :param video: ndarray of shape (n_frames, ...), frames-first video data.
    :param target_frames: int, desired number of output frames.
    :return: ndarray with target_frames frames (uint8 when padding occurs).
    """
    n_frames = video.shape[0]

    if target_frames == n_frames:
        return video  # already the right length

    if n_frames > target_frames:
        # Uniformly spaced sample positions over [0, n_frames-1], rounded to
        # the nearest integer frame index.
        idx = np.around(np.linspace(0, n_frames - 1, target_frames)).astype(np.int32)
        return video[idx]

    # Too few frames: keep every original frame, then pad by repeating the
    # last one. Vectorized slice assignment replaces the original Python loop
    # (which also ran one redundant iteration rewriting index n_frames-1).
    output_video = np.zeros((target_frames, *video.shape[1:]), dtype=np.uint8)
    output_video[:n_frames] = video
    output_video[n_frames:] = video[n_frames - 1]
    return output_video
|
preprocessing/vietnamese_detected_face_30_words.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
preprocessing/vietnamese_detected_face_30_words_have_snr.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|