yuneun92 committed · verified
Commit bcb1848 · 1 Parent(s): f59d2d3

Upload 13 files

utils/__init__.py ADDED
File without changes
utils/arguments.py ADDED
@@ -0,0 +1,56 @@
+ """
+ Training argument definitions.
+ """
+ # Predefined arguments
+ from argparse import ArgumentParser
+
+ # User-defined variables
+ ROOT_DIR = ""  # Project root directory path
+ BERT_PRETRAINED_DIR = "klue/roberta-large"  # Path of the pretrained BERT model
+ DATA_PREFIX = "data"  # Parent directory of the data files
+ CHECKPOINT_DIR = 'model'  # Directory for model checkpoints
+ LOG_PATH = 'logs'  # Directory for training logs
+
+ def get_train_args():
+     """
+     Set up the training arguments.
+     """
+     parser = ArgumentParser(description='I_S', allow_abbrev=False)
+
+     # Argument parsing
+     parser.add_argument('--model_name', type=str, default='KCSN')
+
+     # Model settings
+     parser.add_argument('--pooling_type', type=str, default='max_pooling')
+     parser.add_argument('--classifier_intermediate_dim', type=int, default=100)
+     parser.add_argument('--nonlinear_type', type=str, default='tanh')
+
+     # BERT settings
+     parser.add_argument('--bert_pretrained_dir', type=str, default=BERT_PRETRAINED_DIR)
+
+     # Training settings
+     parser.add_argument('--margin', type=float, default=1.0)
+     parser.add_argument('--lr', type=float, default=2e-5)
+     parser.add_argument('--optimizer', type=str, default='adam')
+     parser.add_argument('--dropout', type=float, default=0.5)
+     parser.add_argument('--num_epochs', type=int, default=50)
+     parser.add_argument('--batch_size', type=int, default=16)
+     parser.add_argument('--lr_decay', type=float, default=0.95)
+     parser.add_argument('--patience', type=int, default=10)
+
+     # Train, dev, and test data file paths
+     parser.add_argument('--train_file', type=str, default=f'{DATA_PREFIX}/train_unsplit.txt')
+     parser.add_argument('--dev_file', type=str, default=f'{DATA_PREFIX}/dev_unsplit.txt')
+     parser.add_argument('--test_file', type=str, default=f'{DATA_PREFIX}/test_unsplit.txt')
+     parser.add_argument('--name_list_path', type=str, default=f'{DATA_PREFIX}/name_list.txt')
+     parser.add_argument('--ws', type=int, default=10)  # Window size
+
+     parser.add_argument('--length_limit', type=int, default=510)  # Sequence length limit
+
+     # Checkpoint and log directories
+     parser.add_argument('--checkpoint_dir', type=str, default=CHECKPOINT_DIR)
+     parser.add_argument('--training_logs', type=str, default=LOG_PATH)
+
+     args, _ = parser.parse_known_args()
+
+     return args
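
A minimal usage sketch (hypothetical; it only assumes the package is importable as `utils` from the project root):

    from utils.arguments import get_train_args

    args = get_train_args()
    print(args.model_name)           # 'KCSN'
    print(args.bert_pretrained_dir)  # 'klue/roberta-large'
    print(args.train_file)           # 'data/train_unsplit.txt'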
utils/data_prep.py ADDED
@@ -0,0 +1,433 @@
+ """
+ Author:
+ """
+ import copy
+ import re
+ from typing import Any
+
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from sklearn.model_selection import train_test_split
+ from ckonlpy.tag import Twitter
+ from tqdm import tqdm
+
+ # Use a morphological analyzer that lets the user add words to its dictionary
+ # (names registered in name_list are added later so they can be recognized and segmented).
+ twitter = Twitter()
+
+
+ def load_data(filename) -> Any:
+     """
+     Load data from the given file.
+     """
+     return torch.load(filename)
+
+
+ def NML(seg_sents, mention_positions, ws):
+     """
+     Nearest Mention Location: among the positions where a candidate speaker is
+     mentioned, find the mention closest to the quote.
+
+     Parameters:
+     - seg_sents: list of segmented sentences
+     - mention_positions: all positions where the candidate speaker is mentioned
+       [(sentence_index, word_index), ...]
+     - ws: number of sentences considered before/after the quote
+
+     Returns:
+     - (sentence_index, word_index) of the nearest mention
+     """
+     def word_dist(pos):
+         """
+         Return the word-level distance between the quote and a mention of the
+         candidate speaker.
+
+         Parameters:
+         - pos: position of the mention (sentence_index, word_index)
+
+         Returns:
+         - word-level distance between the mention and the quote
+         """
+         if pos[0] == ws:
+             w_d = ws * 2
+         elif pos[0] < ws:
+             w_d = sum(len(
+                 sent) for sent in seg_sents[pos[0] + 1:ws]) + len(seg_sents[pos[0]][pos[1] + 1:])
+         else:
+             w_d = sum(
+                 len(sent) for sent in seg_sents[ws + 1:pos[0]]) + len(seg_sents[pos[0]][:pos[1]])
+         return w_d
+
+     # Sort the mention positions by their distance to the quote, nearest first
+     sorted_positions = sorted(mention_positions, key=word_dist)
+
+     # Return the Nearest Mention Location
+     return sorted_positions[0]
+
+
+ def max_len_cut(seg_sents, mention_pos, max_len):
+     """
+     Truncate the given sentences to the maximum length (max_len) the model accepts.
+
+     Parameters:
+     - seg_sents: list of segmented sentences
+     - mention_pos: position of the candidate mention (sentence_index, word_index)
+     - max_len: maximum input length
+
+     Returns:
+     - seg_sents: the truncated sentences
+     - mention_pos: the adjusted mention position
+     """
+
+     # Character-level length of each sentence
+     sent_char_lens = [sum(len(word) for word in sent) for sent in seg_sents]
+
+     # Total character length
+     sum_char_len = sum(sent_char_lens)
+
+     # For each sentence, the position of the word to cut next (the last one)
+     running_cut_idx = [len(sent) - 1 for sent in seg_sents]
+
+     while sum_char_len > max_len:
+         max_len_sent_idx = max(list(enumerate(sent_char_lens)), key=lambda x: x[1])[0]
+
+         if max_len_sent_idx == mention_pos[0] and running_cut_idx[max_len_sent_idx] == mention_pos[1]:
+             running_cut_idx[max_len_sent_idx] -= 1
+
+         if max_len_sent_idx == mention_pos[0] and running_cut_idx[max_len_sent_idx] < mention_pos[1]:
+             mention_pos[1] -= 1
+
+         reduced_char_len = len(
+             seg_sents[max_len_sent_idx][running_cut_idx[max_len_sent_idx]])
+         sent_char_lens[max_len_sent_idx] -= reduced_char_len
+         sum_char_len -= reduced_char_len
+
+         # Delete the word at the cut position
+         del seg_sents[max_len_sent_idx][running_cut_idx[max_len_sent_idx]]
+
+         # Update the cut position
+         running_cut_idx[max_len_sent_idx] -= 1
+
+     return seg_sents, mention_pos
+
+
+ def seg_and_mention_location(raw_sents_in_list, alias2id):
+     """
+     Segment the given sentences and find the positions where speaker names are mentioned.
+
+     Parameters:
+     - raw_sents_in_list: list of raw sentences to segment
+     - alias2id: dictionary mapping each character's names (and aliases) to an ID
+
+     Returns:
+     - seg_sents: sentences segmented into words
+     - character_mention_poses: all mention positions per character
+       {character1_id: [[sent_idx, word_idx], ...]}
+     - name_list_index: list of the mentioned character IDs
+     """
+
+     character_mention_poses = {}
+     seg_sents = []
+     id_pattern = ['&C{:02d}&'.format(i) for i in range(51)]
+
+     for sent_idx, sent in enumerate(raw_sents_in_list):
+         raw_sent_with_split = sent.split()
+
+         for word_idx, word in enumerate(raw_sent_with_split):
+             match = re.search(r'&C\d{1,2}&', word)
+
+             # If the word carries a name of the form &C00&, keep it as result
+             if match:
+                 result = match.group(0)
+
+                 if alias2id[result] in character_mention_poses:
+                     character_mention_poses[alias2id[result]].append([sent_idx, word_idx])
+                 else:
+                     character_mention_poses[alias2id[result]] = [[sent_idx, word_idx]]
+
+         seg_sents.append(raw_sent_with_split)
+
+     name_list_index = list(character_mention_poses.keys())
+
+     return seg_sents, character_mention_poses, name_list_index
+
+
+ def create_CSS(seg_sents, candidate_mention_poses, args):
+     """
+     Create candidate-specific segments (CSS) for each speaker candidate in an instance.
+
+     parameters:
+         seg_sents: list of 2 * ws + 1 segmented sentences
+         candidate_mention_poses: dictionary of mention positions per candidate, of the form
+             {character index: [[sentence index, word index in sentence] of mention 1,...]...}.
+         args: object holding the run arguments
+
+     return:
+         Returned contents are in lists, in which each element corresponds to a candidate.
+         The order of candidates is consistent with that in list(candidate_mention_poses.keys()).
+         many_css: the candidate-specific segment (CSS) of each candidate.
+         many_sent_char_len: character-level length information of each CSS
+             [[character-level length of sentence 1,...] of the CSS of candidate 1,...].
+         many_mention_pos: position of the mention nearest to the quote within each CSS
+             [(sentence-level index of nearest mention in CSS,
+               character-level index of the leftmost character of nearest mention in CSS,
+               character-level index of the rightmost character + 1) of candidate 1,...].
+         many_quote_idx: sentence index of the quote within each CSS
+         many_cut_css: each CSS after the maximum-length limit is applied
+     """
+     ws = args.ws
+     max_len = args.length_limit
+     model_name = args.model_name
+
+     # assert len(seg_sents) == ws * 2 + 1
+
+     many_css = []
+     many_sent_char_lens = []
+     many_mention_poses = []
+     many_quote_idxes = []
+     many_cut_css = []
+
+     for candidate_idx in candidate_mention_poses.keys():
+         nearest_pos = NML(seg_sents, candidate_mention_poses[candidate_idx], ws)
+
+         if nearest_pos[0] <= ws:
+             CSS = copy.deepcopy(seg_sents[nearest_pos[0]:ws + 1])
+             mention_pos = [0, nearest_pos[1]]
+             quote_idx = ws - nearest_pos[0]
+         else:
+             CSS = copy.deepcopy(seg_sents[ws:nearest_pos[0] + 1])
+             mention_pos = [nearest_pos[0] - ws, nearest_pos[1]]
+             quote_idx = 0
+
+         cut_CSS, mention_pos = max_len_cut(CSS, mention_pos, max_len)
+         sent_char_lens = [sum(len(word) for word in sent) for sent in cut_CSS]
+
+         mention_pos_left = sum(sent_char_lens[:mention_pos[0]]) + sum(
+             len(x) for x in cut_CSS[mention_pos[0]][:mention_pos[1]])
+         mention_pos_right = mention_pos_left + len(cut_CSS[mention_pos[0]][mention_pos[1]])
+
+         if model_name == 'CSN':
+             mention_pos = (mention_pos[0], mention_pos_left, mention_pos_right)
+             cat_CSS = ''.join([''.join(sent) for sent in cut_CSS])
+         elif model_name == 'KCSN':
+             mention_pos = (mention_pos[0], mention_pos_left, mention_pos_right, mention_pos[1])
+             cat_CSS = ' '.join([' '.join(sent) for sent in cut_CSS])
+
+         many_css.append(cat_CSS)
+         many_sent_char_lens.append(sent_char_lens)
+         many_mention_poses.append(mention_pos)
+         many_quote_idxes.append(quote_idx)
+         many_cut_css.append(cut_CSS)
+
+     return many_css, many_sent_char_lens, many_mention_poses, many_quote_idxes, many_cut_css
+
+
+ class ISDataset(Dataset):
+     """
+     Dataset subclass for speaker identification.
+     """
+     def __init__(self, data_list):
+         super(ISDataset, self).__init__()
+         self.data = data_list
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         return self.data[idx]
+
+
+ def build_data_loader(data_file, alias2id, args, save_name=None) -> DataLoader:
+     """
+     Build the data loader for training.
+     """
+     # Add the names to the analyzer's dictionary
+     for alias in alias2id:
+         twitter.add_dictionary(alias, 'Noun')
+
+     # Read the file line by line
+     with open(data_file, 'r', encoding='utf-8') as fin:
+         data_lines = fin.readlines()
+
+     # Preprocessing
+     data_list = []
+
+     for i, line in enumerate(tqdm(data_lines)):
+         offset = i % 31
+
+         if offset == 0:
+             instance_index = line.strip().split()[-1]
+             raw_sents_in_list = []
+             continue
+
+         if offset < 22:
+             raw_sents_in_list.append(line.strip())
+
+         if offset == 22:
+             speaker_name = line.strip().split()[-1]
+
+             # Drop empty lines
+             filtered_list = [li for li in raw_sents_in_list if li]
+
+             # Segment the sentences and extract character mention positions
+             seg_sents, candidate_mention_poses, name_list_index = seg_and_mention_location(
+                 filtered_list, alias2id)
+
+             # Create the CSS
+             css, sent_char_lens, mention_poses, quote_idxes, cut_css = create_CSS(
+                 seg_sents, candidate_mention_poses, args)
+
+             # Candidate list
+             candidates_list = list(candidate_mention_poses.keys())
+
+             # Build the one-hot label
+             one_hot_label = [0 if character_idx != alias2id[speaker_name]
+                              else 1 for character_idx in candidate_mention_poses.keys()]
+
+             true_index = one_hot_label.index(1) if 1 in one_hot_label else 0
+
+         if offset == 24:
+             category = line.strip().split()[-1]
+
+         if offset == 25:
+             name = ' '.join(line.strip().split()[1:])
+
+         if offset == 26:
+             scene = line.strip().split()[-1]
+
+         if offset == 27:
+             place = line.strip().split()[-1]
+
+         if offset == 28:
+             time = line.strip().split()[-1]
+
+         if offset == 29:
+             cut_position = line.strip().split()[-1]
+             data_list.append((seg_sents, css, sent_char_lens, mention_poses, quote_idxes,
+                               cut_css, one_hot_label, true_index, category, name_list_index,
+                               name, scene, place, time, cut_position, candidates_list,
+                               instance_index))
+     # Build the data loader
+     data_loader = DataLoader(ISDataset(data_list), batch_size=1, collate_fn=lambda x: x[0])
+
+     # Save the data list if a save name was given
+     if save_name is not None:
+         torch.save(data_list, save_name)
+
+     return data_loader
+
+
+ def load_data_loader(saved_filename: str) -> DataLoader:
+     """
+     Load data from a saved file and wrap it in a DataLoader.
+     """
+     # Load the saved data list
+     data_list = load_data(saved_filename)
+     return DataLoader(ISDataset(data_list), batch_size=1, collate_fn=lambda x: x[0])
+
+
+ def split_train_val_test(data_file, alias2id, args, save_name=None, test_size=0.2, val_size=0.1, random_state=13):
+     """
+     Build data loaders using the existing validation scheme.
+     Split the given data file into train, validation, and test sets and create a
+     DataLoader for each.
+
+     Parameters:
+     - data_file: path of the data file to split
+     - alias2id: dictionary mapping character names to IDs
+     - args: object holding the run arguments
+     - save_name: file name for saving the split data
+     - test_size: fraction of the test set (default: 0.2)
+     - val_size: fraction of the validation set (default: 0.1)
+     - random_state: random seed (default: 13)
+
+     Returns:
+     - train_loader: training data loader
+     - val_loader: validation data loader
+     - test_loader: test data loader
+     """
+
+     # Add the names to the analyzer's dictionary
+     for alias in alias2id:
+         twitter.add_dictionary(alias, 'Noun')
+
+     # Load the instances from the file
+     with open(data_file, 'r', encoding='utf-8') as fin:
+         data_lines = fin.readlines()
+
+     # Preprocessing
+     data_list = []
+
+     for i, line in enumerate(tqdm(data_lines)):
+         offset = i % 31
+
+         if offset == 0:
+             instance_index = line.strip().split()[-1]
+             raw_sents_in_list = []
+             continue
+
+         if offset < 22:
+             raw_sents_in_list.append(line.strip())
+
+         if offset == 22:
+             speaker_name = line.strip().split()[-1]
+
+             # Drop empty lines
+             filtered_list = [li for li in raw_sents_in_list if li]
+
+             # Segment the sentences and extract character mention positions
+             seg_sents, candidate_mention_poses, name_list_index = seg_and_mention_location(
+                 filtered_list, alias2id)
+
+             # Create the CSS
+             css, sent_char_lens, mention_poses, quote_idxes, cut_css = create_CSS(
+                 seg_sents, candidate_mention_poses, args)
+
+             # Candidate list
+             candidates_list = list(candidate_mention_poses.keys())
+
+             # Build the one-hot label
+             one_hot_label = [0 if character_idx != alias2id[speaker_name]
+                              else 1 for character_idx in candidate_mention_poses.keys()]
+
+             true_index = one_hot_label.index(1) if 1 in one_hot_label else 0
+
+         if offset == 24:
+             category = line.strip().split()[-1]
+
+         if offset == 25:
+             name = ' '.join(line.strip().split()[1:])
+
+         if offset == 26:
+             scene = line.strip().split()[-1]
+
+         if offset == 27:
+             place = line.strip().split()[-1]
+
+         if offset == 28:
+             time = line.strip().split()[-1]
+
+         if offset == 29:
+             cut_position = line.strip().split()[-1]
+             data_list.append((seg_sents, css, sent_char_lens, mention_poses, quote_idxes,
+                               cut_css, one_hot_label, true_index, category, name_list_index,
+                               name, scene, place, time, cut_position, candidates_list,
+                               instance_index))
+
+     # Split the data into train, validation, and test
+     train_data, test_data = train_test_split(
+         data_list, test_size=test_size, random_state=random_state)
+     train_data, val_data = train_test_split(
+         train_data, test_size=val_size, random_state=random_state)
+
+     # Build the train DataLoader
+     train_loader = DataLoader(ISDataset(train_data), batch_size=1, collate_fn=lambda x: x[0])
+
+     # Build the validation DataLoader
+     val_loader = DataLoader(ISDataset(val_data), batch_size=1, collate_fn=lambda x: x[0])
+
+     # Build the test DataLoader
+     test_loader = DataLoader(ISDataset(test_data), batch_size=1, collate_fn=lambda x: x[0])
+
+     if save_name is not None:
+         # Save each split
+         torch.save(train_data, save_name.replace(".pt", "_train.pt"))
+         torch.save(val_data, save_name.replace(".pt", "_val.pt"))
+         torch.save(test_data, save_name.replace(".pt", "_test.pt"))
+
+     return train_loader, val_loader, test_loader
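
A minimal sketch of how these pieces chain together (file names are the defaults from utils/arguments.py; `get_alias2id` lives in utils/fs_utils.py below):

    from utils.arguments import get_train_args
    from utils.fs_utils import get_alias2id
    from utils.data_prep import build_data_loader

    args = get_train_args()
    alias2id = get_alias2id(args.name_list_path)
    train_loader = build_data_loader(args.train_file, alias2id, args, save_name='train.pt')
    for instance in train_loader:  # batch_size=1; collate_fn yields the raw tuple
        seg_sents, css, *rest = instance
        break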
utils/fs_utils.py ADDED
@@ -0,0 +1,110 @@
+ """
+ Utility functions for the speaker-finding model.
+ """
+ class InputFeatures:
+     """
+     Inputs to the BERT model.
+     """
+     def __init__(self, tokens, input_ids, input_mask, input_type_ids):
+         self.tokens = tokens
+         self.input_ids = input_ids
+         self.input_mask = input_mask
+         self.input_type_ids = input_type_ids
+
+
+ def convert_examples_to_features(examples, tokenizer):
+     """
+     Convert text segments into word IDs.
+     """
+     features = []
+     tokens_list = []
+
+     for (ex_index, example) in enumerate(examples):
+         tokens = tokenizer.tokenize(example)
+         tokens_list.append(tokens)
+
+         new_tokens = []
+         input_type_ids = []
+
+         new_tokens.append("[CLS]")
+         input_type_ids.append(0)
+         new_tokens = new_tokens + tokens
+         input_type_ids = input_type_ids + [0] * len(tokens)
+         new_tokens.append("[SEP]")
+         input_type_ids.append(0)
+
+         input_ids = tokenizer.convert_tokens_to_ids(new_tokens)
+         input_mask = [1] * len(input_ids)
+
+         features.append(
+             InputFeatures(
+                 tokens=new_tokens,
+                 input_ids=input_ids,
+                 input_mask=input_mask,
+                 input_type_ids=input_type_ids))
+
+     return features, tokens_list
+
+
+ def get_alias2id(name_list_path) -> dict:
+     """
+     Build a dictionary mapping aliases to IDs from the given name list file.
+     """
+     with open(name_list_path, 'r', encoding='utf-8') as fin:
+         name_lines = fin.readlines()
+     alias2id = {}
+
+     for i, line in enumerate(name_lines):
+         for alias in line.strip().split()[1:]:
+             alias2id[alias] = i
+
+     return alias2id
+
+
+ def find_speak(fs_model, input_data, tokenizer, alias2id):
+     """
+     Find the speaker of each input using the given model and input data.
+     """
+     model = fs_model
+     check_data_iter = iter(input_data)
+
+     names = []
+
+     for _ in range(len(input_data)):
+
+         seg_sents, css, scl, mp, qi, cut_css, name_list_index = next(check_data_iter)
+         features, tokens_list = convert_examples_to_features(examples=css, tokenizer=tokenizer)
+
+         # Fall back to CPU when the GPU pass fails
+         try:
+             predictions = model(features, scl, mp, qi, 0, "cuda:0", tokens_list, cut_css)
+         except RuntimeError:
+             predictions = model(features, scl, mp, qi, 0, "cpu", tokens_list, cut_css)
+
+         scores, _, _ = predictions
+
+         # Post-processing
+         try:
+             scores_np = scores.detach().cpu().numpy()
+             scores_list = scores_np.tolist()
+             score_index = scores_list.index(max(scores_list))
+             name_index = name_list_index[score_index]
+
+             for key, val in alias2id.items():
+                 if val == name_index:
+                     result_key = key
+
+             names.append(result_key)
+         except AttributeError:
+             names.append('์•Œ ์ˆ˜ ์—†์Œ')  # "unknown"
+
+     return names
+
+
+ def making_script(text, speaker: list, instance_num: list) -> list:
+     """
+     Build a dialogue script from the given text, the speaker list, and the
+     corresponding line numbers.
+     """
+     lines = text.splitlines()
+     for num, people in zip(instance_num, speaker):
+         lines[num] = f'{people}: {lines[num]}'
+     return lines
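
For orientation, `get_alias2id` ignores the first token of each line and maps every remaining token to the 0-based line index. Since `seg_and_mention_location` looks the `&Cxx&` codes up in this dictionary, the codes must appear among those tokens. A hypothetical name_list.txt and the resulting mapping (the exact file layout is an assumption):

    # name_list.txt (hypothetical contents):
    #   0 &C00& ํ•œ๋‹ค์ • ๋‹ค์ •
    #   1 &C01& ๊น€์ฒ ์ˆ˜
    alias2id = get_alias2id('data/name_list.txt')
    # -> {'&C00&': 0, 'ํ•œ๋‹ค์ •': 0, '๋‹ค์ •': 0, '&C01&': 1, '๊น€์ฒ ์ˆ˜': 1}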
utils/input_process.py ADDED
@@ -0,0 +1,233 @@
+ """
+ Module that processes user input.
+ """
+ import copy
+ import re
+
+ from torch.utils.data import DataLoader, Dataset
+
+
+ class ISDataset(Dataset):
+     """
+     Dataset subclass for identifying speakers.
+     """
+     def __init__(self, data_list):
+         super(ISDataset, self).__init__()
+         self.data = data_list
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         return self.data[idx]
+
+
+ def make_instance_list(text: str, ws=10) -> tuple:
+     """
+     Turn the input text into basic instance lists.
+     """
+     lines = text.splitlines()
+     max_line = len(lines)
+
+     utterance = ['"', '“', '‘']
+     instance_num = []
+
+     for idx, line in enumerate(lines):
+         if any(u in line for u in utterance):
+             instance_num.append(idx)
+
+     instance = [[] for _ in range(len(instance_num))]
+
+     for i, num in enumerate(instance_num):
+         if num - ws <= 0 and num + ws + 1 < max_line:
+             instance[i] += [''] * (ws - num)
+             instance[i] += lines[:num + 1 + ws]
+         elif num - ws <= 0 and num + ws + 1 >= max_line:
+             instance[i] += [''] * (ws - num)
+             instance[i] += lines
+             instance[i] += [''] * (ws * 2 - len(instance[i]) + 1)
+         elif num + ws + 1 >= max_line:
+             instance[i] += lines[num - ws:max_line + 1]
+             instance[i] += [''] * (num + ws + 1 - max_line)
+         else:
+             instance[i] += lines[num - ws:num + ws + 1]
+
+     return instance, instance_num
+
+
+ def NML(seg_sents, mention_positions, ws):
+     """
+     Nearest Mention Location
+     """
+     def word_dist(pos):
+         """
+         The word-level distance between the quote and the mention position
+         """
+         if pos[0] == ws:
+             w_d = ws * 2
+         elif pos[0] < ws:
+             w_d = sum(len(
+                 sent) for sent in seg_sents[pos[0] + 1:ws]) + len(seg_sents[pos[0]][pos[1] + 1:])
+         else:
+             w_d = sum(
+                 len(sent) for sent in seg_sents[ws + 1:pos[0]]) + len(seg_sents[pos[0]][:pos[1]])
+         return w_d
+
+     sorted_positions = sorted(mention_positions, key=word_dist)
+
+     return sorted_positions[0]
+
+
+ def max_len_cut(seg_sents, mention_pos, max_len):
+     sent_char_lens = [sum(len(word) for word in sent) for sent in seg_sents]
+     sum_char_len = sum(sent_char_lens)
+
+     running_cut_idx = [len(sent) - 1 for sent in seg_sents]
+
+     while sum_char_len > max_len:
+         max_len_sent_idx = max(list(enumerate(sent_char_lens)), key=lambda x: x[1])[0]
+
+         if max_len_sent_idx == mention_pos[0] and running_cut_idx[max_len_sent_idx] == mention_pos[1]:
+             running_cut_idx[max_len_sent_idx] -= 1
+
+         if max_len_sent_idx == mention_pos[0] and running_cut_idx[max_len_sent_idx] < mention_pos[1]:
+             mention_pos[1] -= 1
+
+         reduced_char_len = len(
+             seg_sents[max_len_sent_idx][running_cut_idx[max_len_sent_idx]])
+         sent_char_lens[max_len_sent_idx] -= reduced_char_len
+         sum_char_len -= reduced_char_len
+
+         del seg_sents[max_len_sent_idx][running_cut_idx[max_len_sent_idx]]
+
+         running_cut_idx[max_len_sent_idx] -= 1
+
+     return seg_sents, mention_pos
+
+
+ def seg_and_mention_location(raw_sents_in_list, alias2id):
+     character_mention_poses = {}
+     seg_sents = []
+     id_pattern = ['&C{:02d}&'.format(i) for i in range(51)]
+
+     for sent_idx, sent in enumerate(raw_sents_in_list):
+         raw_sent_with_split = sent.split()
+
+         for word_idx, word in enumerate(raw_sent_with_split):
+             match = re.search(r'&C\d{1,2}&', word)
+
+             if match:
+                 result = match.group(0)
+
+                 if alias2id[result] in character_mention_poses:
+                     character_mention_poses[alias2id[result]].append([sent_idx, word_idx])
+                 else:
+                     character_mention_poses[alias2id[result]] = [[sent_idx, word_idx]]
+
+         seg_sents.append(raw_sent_with_split)
+
+     name_list_index = list(character_mention_poses.keys())
+
+     return seg_sents, character_mention_poses, name_list_index
+
+
+ def create_css(seg_sents, candidate_mention_poses, ws=10):
+     """
+     Create candidate-specific segments for each candidate in an instance.
+     """
+     # assert len(seg_sents) == ws * 2 + 1
+
+     many_css = []
+     many_sent_char_lens = []
+     many_mention_poses = []
+     many_quote_idxes = []
+     many_cut_css = []
+
+     for candidate_idx in candidate_mention_poses.keys():
+         nearest_pos = NML(seg_sents, candidate_mention_poses[candidate_idx], ws)
+
+         if nearest_pos[0] <= ws:
+             CSS = copy.deepcopy(seg_sents[nearest_pos[0]:ws + 1])
+             mention_pos = [0, nearest_pos[1]]
+             quote_idx = ws - nearest_pos[0]
+         else:
+             CSS = copy.deepcopy(seg_sents[ws:nearest_pos[0] + 1])
+             mention_pos = [nearest_pos[0] - ws, nearest_pos[1]]
+             quote_idx = 0
+
+         cut_CSS, mention_pos = max_len_cut(CSS, mention_pos, 510)
+         sent_char_lens = [sum(len(word) for word in sent) for sent in cut_CSS]
+
+         mention_pos_left = sum(sent_char_lens[:mention_pos[0]]) + sum(
+             len(x) for x in cut_CSS[mention_pos[0]][:mention_pos[1]])
+         mention_pos_right = mention_pos_left + len(cut_CSS[mention_pos[0]][mention_pos[1]])
+         mention_pos = (mention_pos[0], mention_pos_left, mention_pos_right, mention_pos[1])
+         cat_CSS = ' '.join([' '.join(sent) for sent in cut_CSS])
+
+         many_css.append(cat_CSS)
+         many_sent_char_lens.append(sent_char_lens)
+         many_mention_poses.append(mention_pos)
+         many_quote_idxes.append(quote_idx)
+         many_cut_css.append(cut_CSS)
+
+     return many_css, many_sent_char_lens, many_mention_poses, many_quote_idxes, many_cut_css
+
+
+ def input_data_loader(instances: list, alias2id) -> DataLoader:
+     """
+     Process the split data into the shape the model expects.
+     """
+     data_list = []
+
+     for instance in instances:
+         seg_sents, candidate_mention_poses, name_list_index = seg_and_mention_location(
+             instance, alias2id)
+         css, sent_char_lens, mention_poses, quote_idxes, cut_css = create_css(
+             seg_sents, candidate_mention_poses)
+
+         data_list.append((seg_sents, css, sent_char_lens, mention_poses, quote_idxes,
+                           cut_css, name_list_index))
+
+     data_loader = DataLoader(ISDataset(data_list), batch_size=1, collate_fn=lambda x: x[0])
+
+     return data_loader
+
+
+ def make_ner_input(text, chunk_size=500) -> list:
+     """
+     Split the text into chunks at newline boundaries.
+     If the text is longer than chunk_size, the final chunk is taken as the last
+     chunk_size characters counted from the end.
+     """
+     count_text = chunk_size
+     max_text = len(text)
+     newline_position = []
+
+     while count_text < max_text:
+         sentence = text[:count_text]
+         last_newline_position = sentence.rfind('\n')
+         newline_position.append(last_newline_position)
+         count_text = last_newline_position + chunk_size
+
+     split_sentences = []
+     start_num = 0
+
+     for _, num in enumerate(newline_position):
+         split_sentences.append(text[start_num:num])
+         start_num = num
+
+     if max_text % chunk_size != 0:
+         f_sentence = text[max_text - chunk_size:]
+         first_newline_position = max_text - chunk_size + f_sentence.find('\n')
+         split_sentences.append(text[first_newline_position:])
+
+     return split_sentences
+
+
+ def making_script(text, speaker: list, instance_num: list) -> list:
+     """
+     Build the script.
+     """
+     lines = text.splitlines()
+     for num, people in zip(instance_num, speaker):
+         lines[num] = f'{people}: {lines[num]}'
+     return lines
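
A small sketch of the windowing behaviour of `make_instance_list` (ws=2 here only for brevity; the real default is 10):

    text = '๋‚˜๋ ˆ์ด์…˜1\n"์ฒซ ๋Œ€์‚ฌ"\n๋‚˜๋ ˆ์ด์…˜2'
    instances, instance_num = make_instance_list(text, ws=2)
    # instance_num == [1]  -> line 1 contains a quote mark
    # each instance holds 2*ws + 1 = 5 lines, padded with '' at the edges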
utils/load_model.py ADDED
@@ -0,0 +1,29 @@
+ """
+ Module that loads the models.
+ """
+ import torch
+ # from .load_model import KCSN
+ # from .arguments import get_train_args
+
+
+ # args = get_train_args()
+
+ def load_ner(path='model/NER.pth'):
+     """
+     Load the NER model.
+     """
+     checkpoint = torch.load(path)
+     model = checkpoint['model']
+     model.load_state_dict(checkpoint['model_state_dict'])
+
+     return model, checkpoint
+
+
+ # def load_fs(path='model/FS.pth'):
+ #     """
+ #     Load the Find Speaker model.
+ #     """
+ #     model = KCSN(args)
+ #     checkpoint = torch.load(path)
+ #     model.load_state_dict(checkpoint['model_state_dict'])
+ #     return model, checkpoint
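
A minimal loading sketch (it assumes model/NER.pth was saved with both the full model object and its state_dict, exactly as `load_ner` expects; per utils/ner_utils.py the checkpoint also carries 'tokenizer' and 'tag2id'):

    from utils.load_model import load_ner

    ner_model, checkpoint = load_ner('model/NER.pth')
    ner_model.eval()
    tokenizer = checkpoint['tokenizer']  # read back out by utils/ner_utils.py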
utils/ner_utils.py ADDED
@@ -0,0 +1,311 @@
+ """
+ Code that runs tasks with the NER model.
+ """
+ import re
+ from collections import Counter
+
+ import torch
+ import numpy as np
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ def ner_tokenizer(text, max_seq_length, checkpoint):
+     """
+     Tokenize text for NER.
+     Args:
+         text: the text to process.
+         max_seq_length: BERT configs handle strings of at most 512 tokens. To stay
+             within the limit, text longer than 512 is split into several strings.
+             Because context information matters, chunking at the longest possible
+             length tends to give the best performance.
+         checkpoint: carries the information about the NER model.
+     Return:
+         ner_tokenizer_dict: a dictionary with the three elements below.
+             input_ids: the vocabulary ID of each token.
+             attention_mask: whether attention is active for each token.
+             token_type_ids: for tokens recognized as named entities, the ID
+                 (numeric code) of the entity type.
+     """
+     # Load the tokenizer of the saved model.
+     tokenizer = checkpoint['tokenizer']
+
+     # Get the IDs of the special tokens for padding, sentence start, and sentence end.
+     pad_token_id = tokenizer.pad_token_id
+     cls_token_id = tokenizer.cls_token_id
+     sep_token_id = tokenizer.sep_token_id
+
+     # Initialize the variable that stores the previous syllable.
+     pre_syllable = "_"
+
+     # Initialize the lists that hold the tokenized results.
+     input_ids = [pad_token_id] * (max_seq_length - 1)
+     attention_mask = [0] * (max_seq_length - 1)
+     token_type_ids = [0] * max_seq_length
+
+     # Trim the input text to the maximum sequence length.
+     text = text[:max_seq_length - 2]
+
+     # Iterate over each syllable of the text.
+     for i, syllable in enumerate(text):
+         if syllable == '_':
+             pre_syllable = syllable
+         if pre_syllable != "_":
+             syllable = '##' + syllable
+         pre_syllable = syllable
+
+         # Convert the token to its vocabulary ID and store it in input_ids.
+         input_ids[i] = tokenizer.convert_tokens_to_ids(syllable)
+         # Activate the attention mask at this position.
+         attention_mask[i] = 1
+
+     # Prepend cls_token_id and append sep_token_id to the input sequence.
+     input_ids = [cls_token_id] + input_ids[:-1] + [sep_token_id]
+     # Adjust the attention mask for the start and end tokens as well.
+     attention_mask = [1] + attention_mask[:-1] + [1]
+
+     ner_tokenizer_dict = {"input_ids": input_ids,
+                           "attention_mask": attention_mask,
+                           "token_type_ids": token_type_ids}
+
+     return ner_tokenizer_dict
+
+ def get_ner_predictions(text, checkpoint):
+     """
+     Build the tokenized sentence (tokenized_sent) and the predicted tags (pred_tags).
+     Args:
+         text: the text that needs NER predictions.
+         checkpoint: the saved model.
+     Returns:
+         tokenized_sent: tokenized sentence information for the model input.
+         pred_tags: the predicted tag of each token.
+     """
+     # Load the saved model.
+     model = checkpoint['model']
+     # Get the mapping between tags and tag IDs.
+     tag2id = checkpoint['tag2id']
+     model.to(device)
+     # Replace spaces in the input text with underscores (_).
+     text = text.replace(' ', '_')
+
+     # Create empty lists for the predictions and the true labels.
+     predictions, true_labels = [], []
+
+     # Tokenize the text with ner_tokenizer.
+     tokenized_sent = ner_tokenizer(text, len(text) + 2, checkpoint)
+
+     # Convert the tokenized results into tensors shaped for the model input.
+     input_ids = torch.tensor(
+         tokenized_sent['input_ids']).unsqueeze(0).to(device)
+     attention_mask = torch.tensor(
+         tokenized_sent['attention_mask']).unsqueeze(0).to(device)
+     token_type_ids = torch.tensor(
+         tokenized_sent['token_type_ids']).unsqueeze(0).to(device)
+
+     # Run inside torch.no_grad() so no gradients are computed
+     # (this is evaluation, not training).
+     with torch.no_grad():
+         outputs = model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids)
+
+     # Take the logits from the model output, detach them to NumPy,
+     # and bring the label IDs to the CPU as a NumPy array.
+     logits = outputs['logits']
+     logits = logits.detach().cpu().numpy()
+     label_ids = token_type_ids.cpu().numpy()
+
+     # Collect the predicted label IDs.
+     predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
+     # Collect the true labels.
+     true_labels.append(label_ids)
+
+     # Convert the predicted label IDs to the actual tags.
+     pred_tags = [list(tag2id.keys())[p_i] for p in predictions for p_i in p]
+
+     return tokenized_sent, pred_tags
+
+
+ def ner_inference(tokenized_sent, pred_tags, checkpoint, name_len=5) -> list:
+     """
+     Run NER and extract names along with time and place information.
+     Args:
+         tokenized_sent: list holding the tokenized sentence
+         pred_tags: the predicted tag of each token (the NER output)
+         checkpoint: the saved model
+         name_len: how many extra syllables to inspect around a name for more
+             accurate name recognition.
+     Returns:
+         name_list: list of extracted names (aliases included).
+         scene: dictionary of extracted places and times.
+     """
+     name_list = []
+     speaker = ''
+     tokenizer = checkpoint['tokenizer']
+     scene = {'์žฅ์†Œ': [], '์‹œ๊ฐ„': []}  # keys: place, time
+     target = ''
+     c_tag = None
+
+     for i, tag in enumerate(pred_tags):
+         token = tokenizer.convert_ids_to_tokens(tokenized_sent['input_ids'][i]).replace('#', '')
+         if 'PER' in tag:
+             if 'B' in tag and speaker != '':
+                 name_list.append(speaker)
+                 speaker = ''
+             speaker += token
+
+         elif speaker != '' and tag != pred_tags[i-1]:
+             if speaker in name_list:
+                 name_list.append(speaker)
+             else:
+                 # The name is uncertain; look ahead a few syllables to verify it.
+                 tmp = speaker
+                 found_name = False
+                 for j in range(name_len):
+                     if i + j < len(tokenized_sent['input_ids']):
+                         token = tokenizer.convert_ids_to_tokens(
+                             tokenized_sent['input_ids'][i+j]).replace('#', '')
+                         tmp += token
+                         if tmp in name_list:
+                             name_list.append(tmp)
+                             found_name = True
+                             break
+
+                 if not found_name:
+                     name_list.append(speaker)
+                 speaker = ''
+
+         elif tag != 'O':
+             if tag.startswith('B'):
+                 if c_tag in ['TIM', 'DAT']:
+                     scene['์‹œ๊ฐ„'].append(target)
+                 elif c_tag == 'LOC':
+                     scene['์žฅ์†Œ'].append(target)
+                 c_tag = tag[2:]
+                 target = token
+             else:
+                 target += token.replace('_', ' ')
+
+     return name_list, scene
+
+
+ def make_name_list(ner_inputs, checkpoint):
+     """
+     Run NER over the sentences and build the name list.
+     """
+     name_list = []
+     times = []
+     places = []
+
+     for ner_input in ner_inputs:
+         tokenized_sent, pred_tags = get_ner_predictions(ner_input, checkpoint)
+         names, scene = ner_inference(tokenized_sent, pred_tags, checkpoint)
+         name_list.extend(names)
+         times.extend(scene['์‹œ๊ฐ„'])
+         places.extend(scene['์žฅ์†Œ'])
+
+     return name_list, times, places
+
+
+ def show_name_list(name_list):
+     """
+     Show the name list in a user-friendly form.
+     Arg:
+         name_list: list of extracted names
+     Return:
+         name: each name together with how many times it appeared.
+     """
+     name = dict(Counter(name_list))
+
+     return name
+
+
+ def compare_strings(str1, str2):
+     """
+     Post-processing for person names extracted by NER.
+     When the two strings differ in length, check whether the shorter one is
+     contained in the longer one.
+     When they are the same length, treat them as the same name if they share
+     two or more characters.
+     Running this together with combine_similar_names below groups names such as
+     'ํ•œ๋‹ค์ •', '๋‹ค์ •์ด', and '๋‹ค์ •์ด๊ฐ€' into a single character.
+
+     Args: the two strings to compare
+     Return: True if the two strings are judged to be the same name, else False
+     """
+     if len(str1) != len(str2):
+         # Check whether the shorter string is contained in the longer one
+         shorter, longer = (str1, str2) if len(str1) < len(str2) else (str2, str1)
+         if shorter in longer:
+             return True
+     else:
+         same_part = []
+         for i in range(len(str1)):
+             if str1[i] in str2:
+                 same_part += str1[i]
+                 continue
+             else:
+                 break
+         if len(same_part) >= 2:
+             return True
+
+     return False
+
+ def combine_similar_names(names_dict):
+     """
+     Group similar names together based on compare_strings.
+     Two-character strings are very likely to be names, so they serve as anchors.
+     """
+     names = names_dict.keys()
+     similar_groups = [[name] for name in names if len(name) == 2]
+
+     for name in names:
+         found = False
+         for group in similar_groups:
+             for item in group:
+                 if compare_strings(name, item) and len(name) > 1:
+                     found = True
+                     cleaned_text = re.sub(r'(์•„|์ด)$', '', item)
+                     if len(name) == len(item):
+                         same_part = ''
+                         # Check for an exactly matching part
+                         for i in range(len(name)):
+                             if name[i] in item:
+                                 same_part += name[i]
+                         if same_part not in group and cleaned_text not in group:
+                             group.append(cleaned_text)
+                     else:
+                         group.append(name)
+                     break
+             if found:
+                 break
+         if not found:
+             similar_groups.append([name])
+
+     updated_names = {tuple(name for name in group if len(name) > 1): counts for group, counts in (
+         (group, sum(names_dict[name] for name in group if name != '')) for group in similar_groups)
+         if len([name for name in group if len(name) > 1]) > 0}
+
+     return updated_names
+
+ def convert_name2codename(codename2name, text):
+     """Replace names with codenames using regular expressions. Codename numbers
+     are assigned in descending order of frequency."""
+     for n_list in codename2name.values():
+         n_list.sort(key=lambda x: (len(x), x), reverse=True)
+
+     for codename, n_list in codename2name.items():
+         for subname in n_list:
+             text = re.sub(subname, codename, text)
+
+     return text
+
+
+ def convert_codename2name(codename2name, text):
+     """Convert codenames back to names."""
+     outputs = []
+     for i in text:
+         try:
+             outputs.append(codename2name[i][0])
+         except (KeyError, IndexError):
+             outputs.append('์•Œ ์ˆ˜ ์—†์Œ')  # "unknown"
+
+     return outputs
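
Putting the NER helpers together, a hedged end-to-end sketch (`novel_text` is a stand-in for the raw input string; everything else comes from the modules above):

    from utils.load_model import load_ner
    from utils.input_process import make_ner_input
    from utils.ner_utils import make_name_list, show_name_list, combine_similar_names

    model, checkpoint = load_ner('model/NER.pth')
    chunks = make_ner_input(novel_text)
    name_list, times, places = make_name_list(chunks, checkpoint)
    name_counts = show_name_list(name_list)    # e.g. {'ํ•œ๋‹ค์ •': 12, '๋‹ค์ •์ด': 3, ...}
    merged = combine_similar_names(name_counts)  # e.g. {('ํ•œ๋‹ค์ •', '๋‹ค์ •์ด'): 15, ...}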
utils/train_model.py ADDED
@@ -0,0 +1,264 @@
+ """
+ Author:
+ """
+ import re
+
+ import torch
+ import torch.nn as nn
+ from transformers import AutoModel
+
+ def get_nonlinear(nonlinear):
+     """
+     Activation function.
+     """
+     nonlinear_dict = {'relu': nn.ReLU(), 'tanh': nn.Tanh(),
+                       'sigmoid': nn.Sigmoid(), 'softmax': nn.Softmax(dim=-1)}
+     try:
+         return nonlinear_dict[nonlinear]
+     except KeyError as exc:
+         raise ValueError('not a valid nonlinear type!') from exc
+
+
+ class SeqPooling(nn.Module):
+     """
+     Sequence pooling module.
+
+     Can do max-pooling, mean-pooling, and attentive pooling on a list of
+     sequences of different lengths.
+     """
+     def __init__(self, pooling_type, hidden_dim):
+         super(SeqPooling, self).__init__()
+         self.pooling_type = pooling_type
+         self.hidden_dim = hidden_dim
+         if pooling_type == 'attentive_pooling':
+             self.query_vec = nn.parameter.Parameter(torch.randn(hidden_dim))
+
+     def max_pool(self, seq):
+         return seq.max(0)[0]
+
+     def mean_pool(self, seq):
+         return seq.mean(0)
+
+     def attn_pool(self, seq):
+         attn_score = torch.mm(seq, self.query_vec.view(-1, 1)).view(-1)
+         attn_w = nn.Softmax(dim=0)(attn_score)
+         weighted_sum = torch.mm(attn_w.view(1, -1), seq).view(-1)
+         return weighted_sum
+
+     def forward(self, batch_seq):
+         pooling_fn = {'max_pooling': self.max_pool,
+                       'mean_pooling': self.mean_pool,
+                       'attentive_pooling': self.attn_pool}
+         pooled_seq = [pooling_fn[self.pooling_type](seq) for seq in batch_seq]
+         return torch.stack(pooled_seq, dim=0)
+
+
+ class MLP_Scorer(nn.Module):
+     """
+     MLP scorer module.
+
+     A perceptron with two layers.
+     """
+     def __init__(self, args, classifier_input_size):
+         super(MLP_Scorer, self).__init__()
+         self.scorer = nn.ModuleList()
+         self.scorer.append(nn.Linear(classifier_input_size, args.classifier_intermediate_dim))
+         self.scorer.append(nn.Linear(args.classifier_intermediate_dim, 1))
+         self.nonlinear = get_nonlinear(args.nonlinear_type)
+
+     def forward(self, x):
+         for model in self.scorer:
+             x = self.nonlinear(model(x))
+         return x
+
+
+ class KCSN(nn.Module):
+     """
+     Candidate Scoring Network.
+
+     It's built on BERT with an MLP and other simple components.
+     """
+     def __init__(self, args):
+         super(KCSN, self).__init__()
+         self.args = args
+         self.bert_model = AutoModel.from_pretrained(args.bert_pretrained_dir)
+         self.pooling = SeqPooling(args.pooling_type, self.bert_model.config.hidden_size)
+         self.mlp_scorer = MLP_Scorer(args, self.bert_model.config.hidden_size * 3)
+         self.dropout = nn.Dropout(args.dropout)
+
+     def forward(self, features, sent_char_lens, mention_poses, quote_idxes, true_index, device, tokens_list, cut_css):
+         # encoding
+         qs_hid = []
+         ctx_hid = []
+         cdd_hid = []
+
+         unk_loc_li = []
+         unk_loc = 0
+
+         for i, (cdd_sent_char_lens, cdd_mention_pos, cdd_quote_idx) in enumerate(
+                 zip(sent_char_lens, mention_poses, quote_idxes)):
+             unk_loc = unk_loc + 1
+
+             bert_output = self.bert_model(
+                 torch.tensor([features[i].input_ids], dtype=torch.long).to(device),
+                 token_type_ids=None,
+                 attention_mask=torch.tensor([features[i].input_mask], dtype=torch.long).to(device)
+             )
+
+             modified_list = [s.replace('#', '') for s in tokens_list[i]]
+             cnt = 1
+             verify = 0
+             num_check = 0
+             num_vid = -999
+             accum_char_len = [0]
+
+             for idx, txt in enumerate(cut_css[i]):
+                 result_string = ''.join(txt)
+                 replace_dict = {']': r'\]', '[': r'\[', '?': r'\?', '-': r'\-', '!': r'\!'}
+                 string_processing = result_string[-7:].translate(str.maketrans(replace_dict))
+                 pattern = re.compile(rf'[{string_processing}]')
+                 cnt = 1
+
+                 if num_check == 1000:
+                     accum_char_len.append(num_vid)
+
+                 num_check = 1000
+
+                 for string in modified_list:
+                     string_nospace = string.replace(' ', '')
+                     if len(accum_char_len) > idx + 1:
+                         continue
+
+                     for letter in string_nospace:
+                         match_result = pattern.match(letter)
+                         if match_result:
+                             verify += 1
+                             if verify == len(result_string[-7:]):
+                                 if cnt > accum_char_len[-1]:
+                                     accum_char_len.append(cnt)
+                                 verify = 0
+                                 num_check = len(accum_char_len)
+                         else:
+                             verify = 0
+                         cnt = cnt + 1
+
+             if num_check == 1000:
+                 accum_char_len.append(num_vid)
+
+             if -999 in accum_char_len:
+                 unk_loc_li.append(unk_loc)
+                 continue
+
+             CSS_hid = bert_output['last_hidden_state'][0][1:sum(cdd_sent_char_lens) + 1].to(device)
+
+             qs_hid.append(CSS_hid[accum_char_len[cdd_quote_idx]:accum_char_len[cdd_quote_idx + 1]])
+
+             ## Locate the speaker span and index into the BERT-tokenized output
+             cnt = 1
+             cdd_mention_pos_bert_li = []
+             cdd_mention_pos_unk = []
+             name = cut_css[i][cdd_mention_pos[0]][cdd_mention_pos[3]]
+
+             # Extract only the name
+             cdd_pattern = re.compile(r'&C[0-5][0-9]&')
+             name_process = cdd_pattern.search(name)
+
+             # Find the candidate's location in the BERT output
+             pattern_unk = re.compile(r'[\[UNK\]]')
+
+             # Once the result is found, stop advancing past it.
+             if len(accum_char_len) < cdd_mention_pos[0] + 1:
+                 maxx_len = accum_char_len[len(accum_char_len) - 1]
+             elif len(accum_char_len) == cdd_mention_pos[0] + 1:
+                 maxx_len = accum_char_len[-1] + 1000
+             else:
+                 maxx_len = accum_char_len[cdd_mention_pos[0] + 1]
+
+             # Scan for the speaker contained in the span.
+             start_name = None
+             name_match = '&'
+             for string in modified_list:
+                 string_nospace = string.replace(' ', '')
+                 for letter in string_nospace:
+                     match_result_unk = pattern_unk.match(letter)
+                     if match_result_unk:
+                         cdd_mention_pos_unk.append(cnt)
+                     if start_name is True:
+                         name_match += letter
+                     if (name_match == name_process.group(0) or letter == '&') and len(
+                             cdd_mention_pos_bert_li) < 3 and maxx_len > cnt >= accum_char_len[
+                             cdd_mention_pos[0]]:  # if '&' is present, treat the span as a person
+                         start_name = True  # once matching starts, keep accumulating
+                         if len(cdd_mention_pos_bert_li) == 1 and name_match != name_process.group(0):  # a second '&' without a match
+                             start_name = None
+                             name_match = '&'
+                             cdd_mention_pos_bert_li = []
+                         elif name_match == name_process.group(0):  # second boundary found
+                             cdd_mention_pos_bert_li.append(cnt)
+                             start_name = None
+                             name_match = '&'
+                         else:
+                             cdd_mention_pos_bert_li.append(cnt - 1)
+                     cnt += 1
+
+             if len(cdd_mention_pos_bert_li) == 0 and len(cdd_mention_pos_unk) != 0:
+                 cdd_mention_pos_bert_li.extend([cdd_mention_pos_unk[0], cdd_mention_pos_unk[0] + 1])
+             elif len(cdd_mention_pos_bert_li) != 2:
+                 cdd_mention_pos_bert_li = []
+                 cdd_mention_pos_bert_li.extend([int(cdd_mention_pos[1] * accum_char_len[-1] / sum(
+                     cdd_sent_char_lens)), int(cdd_mention_pos[2] * accum_char_len[-1] / sum(
+                     cdd_sent_char_lens))])
+                 if cdd_mention_pos_bert_li[0] == cdd_mention_pos_bert_li[1]:
+                     cdd_mention_pos_bert_li[1] = cdd_mention_pos_bert_li[1] + 1
+
+             # Decide the context (ctx): information around the candidate.
+             # With a single sentence there is no separate context, so use zeros.
+             if len(cdd_sent_char_lens) == 1:
+                 ctx_hid.append(torch.zeros(1, CSS_hid.size(1)).to(device))
+
+             # If the mention comes first, take from the first sentence up to
+             # (but not including) the last one (the quote).
+             elif cdd_mention_pos[0] == 0:
+                 ctx_hid.append(CSS_hid[:accum_char_len[-2]])
+
+             # If the mention comes last, take from the second sentence to the end.
+             else:
+                 ctx_hid.append(CSS_hid[accum_char_len[1]:])
+
+             cdd_mention_pos_bert = (cdd_mention_pos[0], cdd_mention_pos_bert_li[0],
+                                     cdd_mention_pos_bert_li[1])
+             cdd_hid.append(CSS_hid[cdd_mention_pos_bert[1]:cdd_mention_pos_bert[2]])
+
+         # pooling
+         if not qs_hid:
+             scores = '1'
+             scores_false = 1
+             scores_true = 1
+             return scores, scores_false, scores_true
+
+         qs_rep = self.pooling(qs_hid).to(device)
+         ctx_rep = self.pooling(ctx_hid).to(device)
+         cdd_rep = self.pooling(cdd_hid).to(device)
+
+         # concatenate
+         feature_vector = torch.cat([qs_rep, ctx_rep, cdd_rep], dim=-1).to(device)
+
+         # dropout
+         feature_vector = self.dropout(feature_vector).to(device)
+
+         # scoring
+         scores = self.mlp_scorer(feature_vector).view(-1).to(device)
+
+         for i in unk_loc_li:
+             # Element to insert for candidates that were skipped above
+             new_element = torch.tensor([-0.9000], requires_grad=True).to(device)
+             # Use torch.cat() and slicing to insert the element at a specific index.
+             index_to_insert = i - 1
+             scores = torch.cat((scores[:index_to_insert], new_element, scores[index_to_insert:]),
+                                dim=0).to(device)
+
+         scores_false = [scores[i] for i in range(scores.size(0)) if i != true_index]
+         scores_true = [scores[true_index] for i in range(scores.size(0) - 1)]
+
+         return scores, scores_false, scores_true
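
This upload contains no training loop, so the following margin-ranking sketch is an assumption based only on the `--margin` argument and the `(scores, scores_false, scores_true)` return shape; `model`, `features`, `scl`, `mp`, `qi`, `true_index`, `tokens_list`, and `cut_css` stand in for values produced by the data pipeline above:

    import torch
    import torch.nn as nn

    loss_fn = nn.MarginRankingLoss(margin=args.margin)
    scores, scores_false, scores_true = model(
        features, scl, mp, qi, true_index, device, tokens_list, cut_css)
    target = torch.ones(len(scores_true)).to(device)  # true score should rank higher
    loss = loss_fn(torch.stack(scores_true), torch.stack(scores_false), target)
    loss.backward()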
web/confirm.html ADDED
@@ -0,0 +1,111 @@
+ <!DOCTYPE html>
+ <html lang="ko">
+
+ <head>
+     <meta name="viewport" content="width=device-width, initial-scale=1" />
+     <meta charset="utf-8">
+     <title>Speakers in Text</title>
+     <link rel="stylesheet" href="{{ url_for('static', path='css/put.css') }}">
+     <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
+         integrity="sha384-4bw+/aepP/YC94hEpVNVgiZdgIC5+VKNBQNGCHeKRQN+PtmoHDEXuppvnDJzQIu9" crossorigin="anonymous">
+ </head>
+
+ <body>
+     <!-- Load jQuery and the Bootstrap JavaScript -->
+     <script src="https://code.jquery.com/jquery-3.6.4.min.js"
+         integrity="sha256-oP6HI/tZ1aS9sz3Jr4+6zqbc9BE/l6fLx+Vz2I+H/GL4ZiI/Z5L3hMv8w3yXdBi"
+         crossorigin="anonymous"></script>
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"
+         integrity="sha384-3ziFjNlAXja/Yb0M7y2BmFvR3s09gRPbrCm0lF+SvL4uIboD5lv3U3BdD7dW7Y3"
+         crossorigin="anonymous"></script>
+
+     <!-- Modal display script -->
+     <script>
+         // Show the modal once the document is ready
+         $(document).ready(function () {
+             $('#exampleModal').modal("show");
+         });
+
+         // Open the add-item modal
+         function openAddItemModal() {
+             const addItemModal = createAddItemModal();
+             openModal(addItemModal);
+         }
+
+         // Add an item when the form is submitted
+         function addItem() {
+             event.preventDefault();
+             const newItem = document.getElementById('newItem').value;
+             itemList.push(newItem);
+             updateItemList();
+             closeModal('addItemModal');
+         }
+     </script>
+
+     <div class="background">
+         <div class="header"><a href="/">
+             <span class="title">Nouvel : Novel for you</span></a></div>
+         <div class="empty"></div>
+
+         <div class="box">
+             <div class="subtitle">
+                 ๊ฐ์ง€ํ•œ ๋“ฑ์žฅ์ธ๋ฌผ ๋ช…๋‹จ
+             </div>
+             <div class="explain_box">
+                 <span class="explain">์ž‘์—…์ด ์ง„ํ–‰ ์ค‘ ์ž…๋‹ˆ๋‹ค. ์ž ์‹œ๋งŒ ๊ธฐ๋‹ค๋ ค ์ฃผ์„ธ์š”.</span>
+             </div>
+             <div class="container mt-4">
+                 <ol id="itemList" class="list-group">
+                     <!-- The list is populated here dynamically. -->
+                 </ol>
+
+                 <!-- Add button
+                 <button class="btn btn-primary mt-3" data-toggle="modal" data-target="#addItemModal">์ถ”๊ฐ€</button> -->
+
+                 <form>
+                     <input type="text" class="form-control" id="newItem">
+                     <div class="modal-footer">
+                         <button type="button" class="btn btn-secondary" data-dismiss="modal">์ทจ์†Œ</button>
+                         <!-- Changed: the type attribute switched from "button" to "submit" -->
+                         <button type="submit" class="btn btn-primary" onclick="addItem()">์ถ”๊ฐ€</button>
+                     </div>
+                 </form>
+
+             </div>
+
+             <!-- Edit/delete modal -->
+             <div class="modal fade" id="editItemModal" tabindex="-1" role="dialog" aria-labelledby="editItemModalLabel"
+                 aria-hidden="true">
+                 <!-- Modal content is added here dynamically. -->
+             </div>
+
+             <!-- Add-item modal
+             <div class="modal fade" id="addItemModal" tabindex="-1" role="dialog" aria-labelledby="addItemModalLabel" aria-hidden="true">
+                 Modal content is added here dynamically.
+             </div>
+             -->
+
+             <div class="buttonbox">
+                 <div class="transformbox">
+                     <button onclick="handleButtonClick()" class="transformButton" type="submit">
+                         <span>์‹œ์ž‘ํ•˜๊ธฐ</span>
+                     </button>
+                 </div>
+             </div>
+             <div class="foot">
+                 <div class="footer-text">
+                     <span>๊ณ ๋ ค๋Œ€ํ•™๊ต ์ง€๋Šฅ์ •๋ณด SW ์•„์นด๋ฐ๋ฏธ 5์กฐ</span>
+                 </div>
+             </div>
+         </div>
+     </div>
+     <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js"></script>
+     <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.12.9/umd/popper.min.js"></script>
+     <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/js/bootstrap.min.js"></script>
+     <script type="text/javascript" src="../static/js/confirm.js"></script>
+     <script>
+         document.addEventListener('DOMContentLoaded', function () {handlePageLoad();});
+     </script>
+ </body>
+
+ </html>
web/final.html ADDED
@@ -0,0 +1,63 @@
+ <!DOCTYPE html>
+ <html lang="ko">
+
+ <head>
+     <meta name="viewport" content="width=device-width, initial-scale=1" />
+     <meta charset="utf-8">
+     <title>Speakers in Text</title>
+     <link rel="stylesheet" href="{{ url_for('static', path='css/finishs.css') }}">
+     <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
+         integrity="sha384-4bw+/aepP/YC94hEpVNVgiZdgIC5+VKNBQNGCHeKRQN+PtmoHDEXuppvnDJzQIu9" crossorigin="anonymous">
+ </head>
+
+ <body>
+     <!-- Load jQuery and the Bootstrap bundle (the bundle already includes Popper) -->
+     <script src="https://code.jquery.com/jquery-3.6.4.min.js"></script>
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"></script>
+
+     <div class="background">
+         <div class="header"><a href="/"><span class="title">Nouvel : Novel for you</span></a></div>
+         <div class="empty"></div>
+         <div class="box">
+             <div class="subtitle">txt ํŒŒ์ผ ๋ณ€ํ™˜ ๊ฒฐ๊ณผ</div>
+             <div class="explain_box">
+                 <span class="explain">๋ฐ‘์˜ ๊ฒฐ๊ณผ๋Š” ์†Œ์„ค ์† ์ธ์šฉ๋ฌธ๋ณ„๋กœ ๋ฐœํ™”์ž๋ฅผ ์ธ์‹ํ•œ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค.<br>
+                     ๋˜ํ•œ ์žฅ๋ฉด๋ณ„๋กœ ์žฅ์†Œ์™€ ์‹œ๊ฐ„์„ ์ธ์‹ํ•˜์˜€์Šต๋‹ˆ๋‹ค.</span>
+             </div>
+             <div class="TLContainer">
+                 <div class="timeContainer">
+                     <span class="timelocation">์‹œ๊ฐ„ : {{ time }}</span>
+                 </div>
+                 <div class="locationContainer">
+                     <span class="timelocation">์žฅ์†Œ : {{ place }}</span>
+                 </div>
+             </div>
+             <div class="itemContainer" id="resultContainer">
+                 {% for out in output %}
+                 <p>{{ out }}</p>
+                 {% endfor %}
+             </div>
+             <div class="buttonContainer">
+                 <div class="downloadButtonContainer">
+                     <button id="downloadButton">Download Text File</button>
+                 </div>
+                 <div class="homeButtonContainer">
+                     <button id="homeButton" onclick="location.href='/'">ํ™ˆ์œผ๋กœ ๋Œ์•„๊ฐ€๊ธฐ</button>
+                 </div>
+             </div>
+         </div>
+         <div class="foot">
+             <div class="footer-text">
+                 <span>๊ณ ๋ ค๋Œ€ํ•™๊ต ์ง€๋Šฅ์ •๋ณด SW ์•„์นด๋ฐ๋ฏธ 5์กฐ</span>
+             </div>
+         </div>
+     </div>
+     <script type="text/javascript" src="../static/js/finish.js"></script>
+ </body>
+
+ </html>
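final.html interpolates time, place, and an output list, and its stylesheet link uses url_for('static', path=...), which is the Starlette/FastAPI templating convention. A minimal sketch of how the page could be rendered server-side — the app module, directory names, route, and context values are assumptions for illustration, not part of this upload:

from fastapi import FastAPI, Request
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates

app = FastAPI()
# url_for('static', path=...) in the templates requires a mount named "static".
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="web")

@app.get("/final")  # hypothetical route
async def final_page(request: Request):
    # The keys mirror the variables final.html reads: time, place, output.
    context = {
        "request": request,
        "time": "example time",
        "place": "example place",
        "output": ["example line 1", "example line 2"],
    }
    return templates.TemplateResponse("final.html", context)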
web/index.html ADDED
@@ -0,0 +1,40 @@
+ <!DOCTYPE html>
+ <html lang="ko">
+
+ <head>
+     <meta name="viewport" content="width=device-width, initial-scale=1" />
+     <meta charset="utf-8">
+     <title>Speakers in Text</title>
+     <link rel="stylesheet" href="{{ url_for('static', path='css/indexs.css') }}">
+ </head>
+
+ <body>
+     <div class="background">
+         <div class="header">
+             <a href="/">
+                 <span class="title">Nouvel : Novel for you</span>
+             </a>
+         </div>
+         <div class="empty"></div>
+         <div class="box">
+             <p class="subtitle">์„œ๋น„์Šค ์†Œ๊ฐœ</p>
+             <div class="explain-box">
+                 <span class="explain">BERT ๋ชจ๋ธ์„ ํ™œ์šฉํ•˜์—ฌ ํ…์ŠคํŠธ ํŒŒ์ผ ๋‚ด์—์„œ ๋ฐœํ™”์ž๋ฅผ ์ธ์‹ํ•  ์ˆ˜ ์žˆ๋Š” ๋ชจ๋ธ์ž…๋‹ˆ๋‹ค.
+                     <br>์•„๋ž˜์˜ ์‹œ์ž‘ํ•˜๊ธฐ ๋ฒ„ํŠผ์„ ๋ˆŒ๋Ÿฌ ํ…์ŠคํŠธ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜๊ณ  ๋ชจ๋ธ์„
+                     ์ด์šฉํ•œ ๋ฐœํ™”์ž ์ธ์‹ ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.</span>
+             </div>
+             <div class="buttonbox">
+                 <button onclick="location.href='put.html'" class="start-button">
+                     <span>์‹œ์ž‘ํ•˜๊ธฐ</span>
+                 </button>
+             </div>
+         </div>
+         <div class="foot">
+             <div class="footer-text">
+                 <span>๊ณ ๋ ค๋Œ€ํ•™๊ต ์ง€๋Šฅ์ •๋ณด SW ์•„์นด๋ฐ๋ฏธ 5์กฐ</span>
+             </div>
+         </div>
+     </div>
+ </body>
+
+ </html>
web/put.html ADDED
@@ -0,0 +1,53 @@
+ <!DOCTYPE html>
+ <html lang="ko">
+
+ <head>
+     <meta name="viewport" content="width=device-width, initial-scale=1" />
+     <meta charset="utf-8">
+     <title>Speakers in Text</title>
+     <link rel="stylesheet" href="{{ url_for('static', path='css/put.css') }}">
+     <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet"
+         integrity="sha384-4bw+/aepP/YC94hEpVNVgiZdgIC5+VKNBQNGCHeKRQN+PtmoHDEXuppvnDJzQIu9" crossorigin="anonymous">
+ </head>
+
+ <body>
+     <div class="background">
+         <div class="header"><a href="/"><span class="title">Nouvel : Novel for you</span></a></div>
+         <div class="empty"></div>
+         <div class="box">
+             <div class="subtitle">๋ณ€ํ™˜ํ•  ํŒŒ์ผ ์—…๋กœ๋“œํ•˜๊ธฐ</div>
+             <div class="explain_box">
+                 <span class="explain">๋ฐœํ™”์ž๋ฅผ ์ฐพ๊ณ  ์‹ถ์€ txt ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜๊ณ  ๋ณ€ํ™˜ํ•˜๊ธฐ ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜์„ธ์š”.</span>
+             </div>
+             <form id="myForm" onsubmit="return validateForm()">
+                 <div class="button_box">
+                     <p>์•„๋ž˜์˜ ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜์—ฌ ์‚ฌ์šฉ์ž์˜ ์œ ํ˜•์„ ์„ ํƒํ•ด์ฃผ์„ธ์š”. <br><b><i>์ž‘๊ฐ€(์ „๋ฌธ๊ฐ€) ์‚ฌ์šฉ์ž๋Š” ๋“ฑ์žฅ์ธ๋ฌผ ๋ฆฌ์ŠคํŠธ๋ฅผ ๊ฒ€ํ† ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.</i></b></p>
+                     <input type="radio" class="btn-check" name="displayOption" id="option1" autocomplete="off" value="pro">
+                     <label class="btn btn-secondary" for="option1">์ž‘๊ฐ€(์ „๋ฌธ๊ฐ€)</label>
+
+                     <input type="radio" class="btn-check" name="displayOption" id="option2" autocomplete="off" value="reader">
+                     <label class="btn btn-secondary" for="option2">๋…์ž(์ด์šฉ์ž)</label>
+                 </div>
+                 <div class="formbox">
+                     <div class="mb-3">
+                         <label for="formFileSm" class="form-label"><b>์ฒจ๋ถ€ํŒŒ์ผ ์—…๋กœ๋“œ</b></label>
+                         <input class="form-control form-control-sm" id="formFileSm" type="file">
+                     </div>
+                 </div>
+                 <div class="transformbox">
+                     <button onclick="handleButtonClick()" class="transformButton" type="button">
+                         <span>๋ณ€ํ™˜ํ•˜๊ธฐ</span>
+                     </button>
+                 </div>
+             </form>
+         </div>
+         <div class="foot">
+             <div class="footer-text">
+                 <span>๊ณ ๋ ค๋Œ€ํ•™๊ต ์ง€๋Šฅ์ •๋ณด SW ์•„์นด๋ฐ๋ฏธ 5์กฐ</span>
+             </div>
+         </div>
+     </div>
+     <script src="../static/js/put.js"></script>
+ </body>
+
+ </html>
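put.html collects a user type (the displayOption radio group) and a txt file, and hands submission to handleButtonClick() in static/js/put.js (not shown in this upload), so the server presumably exposes an upload endpoint. A minimal sketch, assuming FastAPI's UploadFile; the route name, handler name, and redirect target are illustrative assumptions:

from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import RedirectResponse

app = FastAPI()

@app.post("/upload")  # hypothetical endpoint; put.js defines the real request
async def upload_novel(displayOption: str = Form(...), file: UploadFile = File(...)):
    # Read the uploaded novel as UTF-8 text for speaker identification.
    text = (await file.read()).decode("utf-8")
    # ... run the speaker-identification pipeline on text ...
    return RedirectResponse(url="/confirm", status_code=303)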
web/user.html ADDED
@@ -0,0 +1,34 @@
+ <!DOCTYPE html>
+ <html lang="ko">
+
+ <head>
+     <meta name="viewport" content="width=device-width, initial-scale=1" />
+     <meta charset="utf-8">
+     <title>Speakers in Text</title>
+     <link rel="stylesheet" href="{{ url_for('static', path='css/index.css') }}">
+ </head>
+
+ <body>
+     <div class="background">
+         <div class="header"><a href="/"><span class="title">Nouvel : Novel for you</span></a></div>
+         <div class="empty"></div>
+         <div class="box">
+             <div class="subtitle">๋“ฑ์žฅ์ธ๋ฌผ ๋ช…๋‹จ</div>
+             <div class="body">
+
+             </div>
+             <div class="transformbox">
+                 <button onclick="handleButtonClick()" class="transformButton"><span>์žฅ๋ฉด์œผ๋กœ ๋ณ€ํ™˜ํ•˜๊ธฐ</span></button>
+             </div>
+         </div>
+         <div class="foot">
+             <div class="footer-text">
+                 <span>๊ณ ๋ ค๋Œ€ํ•™๊ต ์ง€๋Šฅ์ •๋ณด SW ์•„์นด๋ฐ๋ฏธ 5์กฐ</span>
+             </div>
+         </div>
+     </div>
+     <script src="../static/js/user.js"></script>
+ </body>
+
+ </html>