Spaces:
Running
Running
Joshua Lochner
committed on
Commit
·
1286fe5
1
Parent(s):
c4f250e
Add support for mute action type and remove videos with full action type
Browse files
- src/preprocess.py +10 -5
- src/shared.py +1 -0
src/preprocess.py
CHANGED
|
@@ -8,7 +8,7 @@ import segment
|
|
| 8 |
from tqdm import tqdm
|
| 9 |
from dataclasses import dataclass, field
|
| 10 |
from transformers import HfArgumentParser
|
| 11 |
-
from shared import CATGEGORY_OPTIONS, START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE, GeneralArguments, CustomTokens
|
| 12 |
import csv
|
| 13 |
import re
|
| 14 |
import random
|
|
@@ -582,7 +582,7 @@ def main():
|
|
| 582 |
|
| 583 |
if line['category'] not in allowed_categories:
|
| 584 |
continue
|
| 585 |
-
if line['actionType']
|
| 586 |
continue
|
| 587 |
|
| 588 |
# Ignore hidden items
|
|
@@ -616,9 +616,16 @@ def main():
|
|
| 616 |
'submission_time': float(line['timeSubmitted'])/1e3,
|
| 617 |
'reputation': reputation,
|
| 618 |
'category': line['category'],
|
| 619 |
-
|
| 620 |
})
|
| 621 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
# Remove duplicate sponsor segments by choosing best (most votes)
|
| 623 |
if not preprocess_args.keep_duplicate_segments:
|
| 624 |
logger.info('Remove duplicate segments')
|
|
@@ -647,8 +654,6 @@ def main():
|
|
| 647 |
# Always include segments locked by VIPs, regardless of view count
|
| 648 |
del db[key]
|
| 649 |
|
| 650 |
-
# TODO remove videos that contain a full-video label?
|
| 651 |
-
|
| 652 |
logger.info(f'Saved {len(db)} videos')
|
| 653 |
|
| 654 |
with open(processed_db_path, 'w') as fp:
|
|
|
|
| 8 |
from tqdm import tqdm
|
| 9 |
from dataclasses import dataclass, field
|
| 10 |
from transformers import HfArgumentParser
|
| 11 |
+
from shared import ACTION_OPTIONS, CATGEGORY_OPTIONS, START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE, GeneralArguments, CustomTokens
|
| 12 |
import csv
|
| 13 |
import re
|
| 14 |
import random
|
|
|
|
| 582 |
|
| 583 |
if line['category'] not in allowed_categories:
|
| 584 |
continue
|
| 585 |
+
if line['actionType'] not in ACTION_OPTIONS:
|
| 586 |
continue
|
| 587 |
|
| 588 |
# Ignore hidden items
|
|
|
|
| 616 |
'submission_time': float(line['timeSubmitted'])/1e3,
|
| 617 |
'reputation': reputation,
|
| 618 |
'category': line['category'],
|
| 619 |
+
'action': line['actionType'],
|
| 620 |
})
|
| 621 |
|
| 622 |
+
# First, remove videos that contain a full-video label
|
| 623 |
+
# (may confuse model since disclaimers and such aren't labelled)
|
| 624 |
+
# Must do it here before removing duplicate segments
|
| 625 |
+
for key in list(db):
|
| 626 |
+
if any(x['action'] == 'full' for x in db[key]):
|
| 627 |
+
del db[key]
|
| 628 |
+
|
| 629 |
# Remove duplicate sponsor segments by choosing best (most votes)
|
| 630 |
if not preprocess_args.keep_duplicate_segments:
|
| 631 |
logger.info('Remove duplicate segments')
|
|
|
|
| 654 |
# Always include segments locked by VIPs, regardless of view count
|
| 655 |
del db[key]
|
| 656 |
|
|
|
|
|
|
|
| 657 |
logger.info(f'Saved {len(db)} videos')
|
| 658 |
|
| 659 |
with open(processed_db_path, 'w') as fp:
|
src/shared.py
CHANGED
|
@@ -8,6 +8,7 @@ from typing import Optional
|
|
| 8 |
from dataclasses import dataclass, field
|
| 9 |
from enum import Enum
|
| 10 |
|
|
|
|
| 11 |
|
| 12 |
CATGEGORY_OPTIONS = {
|
| 13 |
'SPONSOR': 'Sponsor',
|
|
|
|
| 8 |
from dataclasses import dataclass, field
|
| 9 |
from enum import Enum
|
| 10 |
|
| 11 |
+
ACTION_OPTIONS = ['skip', 'mute', 'full']
|
| 12 |
|
| 13 |
CATGEGORY_OPTIONS = {
|
| 14 |
'SPONSOR': 'Sponsor',
|