from tqdm import tqdm
import torch

# NOTE(review): the wildcard imports are kept as-is because this module uses
# read_pdf, align_word_ids and convertTotext, which are presumably provided by
# them; replace with explicit imports once the exported names are confirmed.
from read_file import *
from align_word_ids import *
from convertTotext import *


def pdf_predict(model, tokenizer, file_path, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
    """Run token-classification inference over every sentence of a PDF.

    The text returned by ``read_pdf(file_path)`` is split on ``';'`` and each
    piece is tokenized, passed through ``model`` and converted into an
    extraction result by ``convertTotext``.

    Args:
        model: A ``torch.nn.Module`` whose forward call accepts
            ``(input_ids, attention_mask, labels)`` and whose first output
            element holds the per-token logits. Its weights are overwritten
            with the selected checkpoint, it is moved to the available device
            and it is left in evaluation mode.
        tokenizer: Tokenizer callable that returns ``input_ids`` and
            ``attention_mask`` tensors and provides ``convert_ids_to_tokens``.
        file_path: Location of the PDF, forwarded to ``read_pdf``.
        ids_to_labels: Lookup from predicted class index to label name.
        check_point: ``'IndoBERT (IndoLEM)'`` selects the IndoLEM checkpoint;
            any other value selects the IndoNLU checkpoint.

    Returns:
        A list with one entry per sentence for which ``convertTotext``
        returned a truthy result, in document order.
    """
    file_pdf = read_pdf(file_path)
    sentence_file = file_pdf.split(';')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    if check_point == 'IndoBERT (IndoLEM)':
        file_check_point = 'model/IndoLEM/model_fold_4.pth'
    else:
        file_check_point = 'model/IndoNLU/model_fold_4.pth'

    model_weights = torch.load(file_check_point, map_location=device)
    model.load_state_dict(model_weights)

    # Inference must not run in training mode: dropout (and similar layers)
    # would otherwise make the predictions non-deterministic.
    model.eval()

    special_tokens = ('[CLS]', '[SEP]', '[PAD]')
    label_extraction = []

    # No gradients are needed for prediction; disabling autograd avoids
    # building a graph for every 512-token forward pass.
    with torch.no_grad():
        for text in tqdm(sentence_file, desc="Prediction Sentence"):
            encoding = tokenizer(
                text,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )

            input_ids = encoding['input_ids'].to(device)
            mask = encoding['attention_mask'].to(device)

            logits = model(input_ids, mask, None)

            # Positions marked -100 by align_word_ids are excluded from the
            # prediction; the remaining rows are reduced to a class index.
            label_ids = torch.tensor(
                align_word_ids(text, tokenizer, True)
            ).unsqueeze(0).to(device)
            logits_clean = logits[0][label_ids != -100]

            predictions = logits_clean.argmax(dim=1).tolist()
            prediction_label = [ids_to_labels[i] for i in predictions]

            tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
            data_token = [word for word in tokens if word not in special_tokens]

            nerExtraction = convertTotext(data_token, prediction_label)
            if nerExtraction:
                label_extraction.append(nerExtraction)

    return label_extraction