Commit a450bc7
Parent(s): baf5acb
add files
Files changed:
- app.py +42 -0
- data/162_Pid.Sus_2023_PN_Bkl.pdf +0 -0
- data/164_Pid.Sus_2023_PN_Bkl.pdf +0 -0
- data/165_Pdt.P_2023_PN_Bkl.pdf +0 -0
- data/167_Pid.Sus_2023_PN_Bkl.pdf +0 -0
- data/168_Pid.Sus_2023_PN_Bkl.pdf +0 -0
- data/169_Pid.Sus_2023_PN_Bkl.pdf +0 -0
- model/IndoLEM/model_fold_1.pth +3 -0
- model/IndoLEM/model_fold_2.pth +3 -0
- model/IndoLEM/model_fold_3.pth +3 -0
- model/IndoLEM/model_fold_4.pth +3 -0
- model/IndoLEM/model_fold_5.pth +3 -0
- model/IndoNLU/model_fold_1.pth +3 -0
- model/IndoNLU/model_fold_2.pth +3 -0
- model/IndoNLU/model_fold_3.pth +3 -0
- model/IndoNLU/model_fold_4.pth +3 -0
- model/IndoNLU/model_fold_5.pth +3 -0
- reuqirements.txt +105 -0
- src/BertModel.py +14 -0
- src/__pycache__/BertModel.cpython-310.pyc +0 -0
- src/__pycache__/align_word_ids.cpython-310.pyc +0 -0
- src/__pycache__/clean_text.cpython-310.pyc +0 -0
- src/__pycache__/convertTotext.cpython-310.pyc +0 -0
- src/__pycache__/pdf_predict.cpython-310.pyc +0 -0
- src/__pycache__/read_file.cpython-310.pyc +0 -0
- src/align_word_ids.py +27 -0
- src/clean_text.py +15 -0
- src/convertTotext.py +22 -0
- src/pdf_predict.py +48 -0
- src/read_file.py +21 -0
app.py
ADDED
@@ -0,0 +1,42 @@
+import sys
+sys.path.append("/home/pstar7/Documents/gradio/src")
+
+from transformers import BertTokenizerFast
+from gradio_pdf import PDF
+from BertModel import *
+from pdf_predict import *
+import gradio as gr
+
+ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
+indolem = 'indolem/indobert-base-uncased'
+indonlu = 'indobenchmark/indobert-base-p2'
+model_indolem = BertModel(indolem, len(ids_to_labels))
+model_indonlu = BertModel(indonlu, len(ids_to_labels))
+tokenizer_indolem = BertTokenizerFast.from_pretrained(indolem)
+tokenizer_indonlu = BertTokenizerFast.from_pretrained(indonlu)
+
+def predict(doc : str, model : str) -> str:
+    if model == 'IndoBERT (IndoLEM)':
+        use_model = model_indolem
+        use_tokenizer = tokenizer_indolem
+
+    else:
+        use_model = model_indonlu
+        use_tokenizer = tokenizer_indonlu
+
+    result = pdf_predict(use_model, use_tokenizer, doc, ids_to_labels, model)
+
+    return result
+
+iface = gr.Interface(
+    fn=predict,
+    inputs=[PDF(label="Document"),
+            gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')],
+    outputs="textbox",
+    title="Legal NER",
+    description="Upload File PDF Putusan Pidana",
+    allow_flagging='never'
+)
+
+if __name__ == "__main__":
+    iface.launch()
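app.py wires the two fine-tuned checkpoints into a Gradio interface; the Indonesian dropdown info translates to "Choose the model you want to use. Default: IndoBERT (IndoLEM)" and the description to "Upload a PDF of a criminal court decision". A minimal smoke-test sketch, assuming this commit's model/ and data/ directories are on disk and the hard-coded sys.path entry is adjusted to the local src/ directory:

# Sketch: exercise predict() directly instead of launching the UI.
# Importing app runs its module-level setup, so both models and tokenizers load up front.
from app import predict

result = predict('data/162_Pid.Sus_2023_PN_Bkl.pdf', 'IndoBERT (IndoLEM)')
print(result)  # per-sentence {tag: text} extractions returned by pdf_predict()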
data/162_Pid.Sus_2023_PN_Bkl.pdf
ADDED
Binary file (142 kB).
data/164_Pid.Sus_2023_PN_Bkl.pdf
ADDED
Binary file (144 kB).
data/165_Pdt.P_2023_PN_Bkl.pdf
ADDED
Binary file (70.4 kB).
data/167_Pid.Sus_2023_PN_Bkl.pdf
ADDED
Binary file (142 kB).
data/168_Pid.Sus_2023_PN_Bkl.pdf
ADDED
Binary file (146 kB).
data/169_Pid.Sus_2023_PN_Bkl.pdf
ADDED
Binary file (128 kB).
model/IndoLEM/model_fold_1.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12944de7c2f9f3ac701c25cd8b72a1ae0b9264234e4b83318596727dc3f73e4c
+size 440031387
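This and the other nine .pth entries are Git LFS pointer files: the repository itself stores only the spec version, the SHA-256 object id, and the byte size (about 440 MB per IndoLEM fold, 496 MB per IndoNLU fold), while the real weights live in LFS storage. A hedged sketch for fetching a single checkpoint with huggingface_hub instead of cloning everything ("user/space-name" is a placeholder, not the actual repo id):

# Sketch: download one LFS-backed checkpoint from the Hub.
from huggingface_hub import hf_hub_download

weights_path = hf_hub_download(
    repo_id="user/space-name",   # hypothetical; substitute the actual Space id
    repo_type="space",
    filename="model/IndoLEM/model_fold_4.pth",  # the fold pdf_predict() loads
)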
model/IndoLEM/model_fold_2.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ce4f2b6fee4f615bcbe94e8e5c50f9f141083c816a3cf7e2c9df42bf6fadcd3
+size 440031387
model/IndoLEM/model_fold_3.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0a637e3d56c2df060f8212b8ee845bf861e69ef30d6204eeb864b9ef2127460
+size 440031387
model/IndoLEM/model_fold_4.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d378edcc860146fc622477232b07a3a3e5c618cb56b109c59c8af43b7cccd96
+size 440031387
model/IndoLEM/model_fold_5.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddb58de48b51fcec9d83456f903ecc35a147b1c5ee6685e371cd94aafea79338
+size 440031387
model/IndoNLU/model_fold_1.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d948732f22f46957d5e8423532e556b76718a8f3a129f9bcb3526cb8b098dbfb
+size 495563931
model/IndoNLU/model_fold_2.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3093ae2dbed5175d8fd5bb2e1b5ef31641ccc562038e158ed0b0a5403fa26092
+size 495563931
model/IndoNLU/model_fold_3.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60c7d6a726b2b10c19b035c91630264c1892cb741e5c788941c6fd90076d589e
+size 495563931
model/IndoNLU/model_fold_4.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e87a3141742561544df98e491338c11c7ff3d6ff7fcf67de5a76e44238f4222
+size 495563931
model/IndoNLU/model_fold_5.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aaad1b8c8c7c1591bd9ff71f7f9176a9b32dc37bd05d83bbc49ba710bdebca7e
+size 495563931
reuqirements.txt
ADDED
@@ -0,0 +1,105 @@
+aiofiles==23.2.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.3.0
+attrs==23.2.0
+blinker==1.7.0
+cachetools==5.3.3
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.0
+cycler==0.12.1
+exceptiongroup==1.2.0
+fastapi==0.110.0
+ffmpy==0.3.2
+filelock==3.13.1
+fonttools==4.50.0
+fsspec==2024.3.0
+gitdb==4.0.11
+GitPython==3.1.42
+gradio==4.21.0
+gradio_client==0.12.0
+gradio_pdf==0.0.5
+h11==0.14.0
+httpcore==1.0.4
+httpx==0.27.0
+huggingface-hub==0.21.4
+idna==3.6
+importlib_resources==6.3.0
+Jinja2==3.1.3
+joblib==1.3.2
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.8.3
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.19.3
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.1.105
+orjson==3.9.15
+packaging==23.2
+pandas==2.2.1
+pillow==10.2.0
+protobuf==4.25.3
+pyarrow==15.0.1
+pydantic==2.6.4
+pydantic_core==2.16.3
+pydeck==0.8.1b0
+pydub==0.25.1
+Pygments==2.17.2
+pyparsing==3.1.2
+PyPDF2==3.0.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.9
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.33.0
+regex==2023.12.25
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.0
+ruff==0.3.3
+safetensors==0.4.2
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+starlette==0.36.3
+streamlit==1.32.2
+sympy==1.12
+tenacity==8.2.3
+tokenizers==0.15.2
+toml==0.10.2
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.2.1
+tornado==6.4
+tqdm==4.66.2
+transformers==4.38.2
+triton==2.2.0
+typer==0.9.0
+typing_extensions==4.10.0
+tzdata==2024.1
+urllib3==2.2.1
+uvicorn==0.28.0
+watchdog==4.0.0
+websockets==11.0.3
+Werkzeug==3.0.1
src/BertModel.py
ADDED
@@ -0,0 +1,14 @@
+from transformers import BertForTokenClassification
+import torch
+
+class BertModel(torch.nn.Module):
+    def __init__(self, pretrained_model, num_labels):
+
+        super(BertModel, self).__init__()
+        self.bert = BertForTokenClassification.from_pretrained(pretrained_model, num_labels=num_labels)
+
+    def forward(self, input_id, mask, label):
+
+        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
+
+        return output
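BertModel is a thin wrapper around BertForTokenClassification that returns the raw tuple output; with label=None and return_dict=False that tuple is just (logits,). A quick sanity-check sketch, assuming src/ is importable:

# Sketch: one forward pass through the BertModel wrapper.
import torch
from transformers import BertTokenizerFast
from BertModel import BertModel

name = 'indolem/indobert-base-uncased'
tok = BertTokenizerFast.from_pretrained(name)
model = BertModel(name, num_labels=25)

enc = tok("Terdakwa dijatuhi pidana penjara.", return_tensors="pt")
with torch.no_grad():
    (logits,) = model(enc['input_ids'], enc['attention_mask'], None)
print(logits.shape)  # torch.Size([1, seq_len, 25])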
src/__pycache__/BertModel.cpython-310.pyc
ADDED
Binary file (883 Bytes).
src/__pycache__/align_word_ids.cpython-310.pyc
ADDED
Binary file (607 Bytes).
src/__pycache__/clean_text.cpython-310.pyc
ADDED
Binary file (1.77 kB).
src/__pycache__/convertTotext.cpython-310.pyc
ADDED
Binary file (526 Bytes).
src/__pycache__/pdf_predict.cpython-310.pyc
ADDED
Binary file (1.71 kB).
src/__pycache__/read_file.cpython-310.pyc
ADDED
Binary file (666 Bytes).
src/align_word_ids.py
ADDED
@@ -0,0 +1,27 @@
+def align_word_ids(texts, tokenizer, label_all_tokens):
+
+    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
+
+    word_ids = tokenized_inputs.word_ids()
+
+    previous_word_idx = None
+    label_ids = []
+
+    for word_idx in word_ids:
+
+        if word_idx is None:
+            label_ids.append(-100)
+
+        elif word_idx != previous_word_idx:
+            try:
+                label_ids.append(1)
+            except:
+                label_ids.append(-100)
+        else:
+            try:
+                label_ids.append(1 if label_all_tokens else -100)
+            except:
+                label_ids.append(-100)
+        previous_word_idx = word_idx
+
+    return label_ids
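align_word_ids marks each of the 512 token positions with 1 for a real word position and -100 for special tokens, padding, and (when label_all_tokens is False) subword continuations; pdf_predict uses this mask to keep one logit row per predicted token. A hedged sketch:

# Sketch: which positions carry predictions after padding to 512 tokens.
from transformers import BertTokenizerFast
from align_word_ids import align_word_ids

tok = BertTokenizerFast.from_pretrained('indolem/indobert-base-uncased')
mask = align_word_ids("Putusan Nomor 162", tok, True)
print(len(mask))   # 512
print(mask[:6])    # e.g. [-100, 1, 1, 1, 1, -100]: [CLS]/[SEP]/[PAD] are masked out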
src/clean_text.py
ADDED
@@ -0,0 +1,15 @@
+import re
+
+def clean_text(text):
+    text = text.replace("Mahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nDirektori Putusan Mahkamah Agung Republik Indonesia\nputusan.mahkamahagung.go.id\n", "")
+    text = text.replace("\nDisclaimer\nKepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\npelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\nDalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\nEmail : kepaniteraan@mahkamahagung.go.id", "")
+    text = text.replace("Telp : 021-384 3348 (ext.318)", "")
+    text = text.replace('P U T U S A N', 'PUTUSAN').replace('T erdakwa', 'Terdakwa').replace('T empat', 'Tempat').replace('T ahun', 'Tahun')
+    text = text.replace('P E N E T A P A N', 'PENETAPAN').replace('J u m l a h', 'Jumlah').replace('\n', '')
+    text = re.sub(r'\nHalaman \d+ dari \d+ .*', '', text)
+    text = re.sub(r'Halaman \d+ dari \d+ .*', '', text)
+    text = re.sub(r'\nHal. \d+ dari \d+ .*', '', text)
+    text = re.sub(r'Hal. \d+ dari \d+ .*', '', text)
+    text = re.sub(r' +|[\uf0fc\uf0a7\uf0a8\uf0b7]', ' ', text)
+    text = re.sub(r'[\u2026]+|\.{3,}', '', text)
+    return text.strip()
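clean_text strips the masthead and disclaimer that putusan.mahkamahagung.go.id stamps on every page of a decision PDF, removes "Halaman x dari y" page markers, and joins letter-spaced headings such as "P U T U S A N". A small sketch:

# Sketch: boilerplate removal and heading normalization.
from clean_text import clean_text

raw = "P U T U S A N Nomor 162/Pid.Sus/2023/PN Bkl\nHalaman 1 dari 20 Putusan"
print(clean_text(raw))  # -> "PUTUSAN Nomor 162/Pid.Sus/2023/PN Bkl"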
src/convertTotext.py
ADDED
@@ -0,0 +1,22 @@
+def convertTotext(data_token, prediction_label):
+    prev_tag = 'O'
+    result = {}
+    temp = ''
+    for i, word in enumerate(data_token):
+        if prediction_label[i] != 'O':
+            if prev_tag == 'O' and temp != '':
+                temp = ''
+
+            if '##' in word:
+                temp += word.replace('##', '')
+
+            else:
+                temp += ' ' + word
+        else:
+            if temp != "":
+                result[prev_tag.replace("I_", "B_")] = temp.strip()
+                temp = ""
+
+        prev_tag = prediction_label[i]
+
+    return result
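convertTotext stitches WordPiece tokens back into surface strings, keyed by their tag with I_ folded into B_; note that an entity still open at the end of the sequence is never flushed, and a repeated tag within a sentence overwrites the earlier value. A hedged sketch with made-up tokens and labels:

# Sketch: rebuild entity strings from tokens plus predicted labels.
from convertTotext import convertTotext

tokens = ['putusan', 'nomor', '162', 'terhadap', 'bud', '##i']
labels = ['B_REGI', 'I_REGI', 'I_REGI', 'O', 'B_DEFN', 'I_DEFN']
print(convertTotext(tokens, labels))
# -> {'B_REGI': 'putusan nomor 162'}; the trailing B_DEFN entity is dropped
#    because no 'O' label follows it to trigger the flush.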
src/pdf_predict.py
ADDED
@@ -0,0 +1,48 @@
+from tqdm import tqdm
+import torch
+from read_file import *
+from align_word_ids import *
+from convertTotext import *
+
+def pdf_predict(model, tokenizer, file_path, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
+    file_pdf = read_pdf(file_path)
+    sentence_file = file_pdf.split(';')
+
+    use_cuda = torch.cuda.is_available()
+    device = torch.device("cuda" if use_cuda else "cpu")
+    if use_cuda:
+        model = model.cuda()
+
+    file_check_point = 'model/IndoLEM/model_fold_4.pth' if check_point == 'IndoBERT (IndoLEM)' else 'model/IndoNLU/model_fold_4.pth'
+
+    model_weights = torch.load(file_check_point, map_location=torch.device(device))
+    model.load_state_dict(model_weights)
+
+    label_extraction = []
+    for text in tqdm(sentence_file, desc="Prediction Sentence"):
+        toknize = tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
+        input_ids = toknize['input_ids'].to(device)
+        mask = toknize['attention_mask'].to(device)
+
+        logits = model(input_ids, mask, None)
+        label_ids = torch.Tensor(align_word_ids(text, tokenizer, True)).unsqueeze(0).to(device)
+        logits_clean = logits[0][label_ids != -100]
+        predictions = logits_clean.argmax(dim=1).tolist()
+        prediction_label = [ids_to_labels[i] for i in predictions]
+
+        input_ids_conv = tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
+        data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
+        nerExtraction = convertTotext(data_token, prediction_label)
+
+        if nerExtraction:
+            label_extraction.append(nerExtraction)
+        # print(f"\nText : {text}")
+        # print(f"Predict Label : {prediction_label}")
+        # print()
+
+    # print(f"Hasil Ekstrak NER:")
+    # print(nerExtraction)
+    # print(f"Panjang Token : {len(data_token)}, Panjang Predict Label : {len(prediction_label)}")
+    # print()
+
+    return label_extraction
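pdf_predict reads and cleans the PDF, splits it on ';' into sentence-like chunks, loads the fold-4 checkpoint for the selected backbone, and runs token classification chunk by chunk. An end-to-end sketch, assuming it is run from the repo root so the relative model/ and data/ paths resolve:

# Sketch: full extraction over one bundled sample decision.
from transformers import BertTokenizerFast
from BertModel import BertModel
from pdf_predict import pdf_predict

tags = ['ADVO', 'ARTV', 'CRIA', 'DEFN', 'JUDG', 'JUDP', 'PENA', 'PROS', 'PUNI', 'REGI', 'TIMV', 'VERN']
ids_to_labels = dict(enumerate([f'B_{t}' for t in tags] + [f'I_{t}' for t in tags] + ['O']))  # same 25-label map as app.py

name = 'indolem/indobert-base-uncased'
entities = pdf_predict(BertModel(name, len(ids_to_labels)),
                       BertTokenizerFast.from_pretrained(name),
                       'data/162_Pid.Sus_2023_PN_Bkl.pdf', ids_to_labels)
print(entities)  # one {tag: text} dict per chunk that produced entities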
src/read_file.py
ADDED
@@ -0,0 +1,21 @@
+import PyPDF2
+from clean_text import *
+import requests
+
+def read_pdf(file_pdf):
+    try:
+        pdf_text = ''
+        pdf_file = open(file_pdf, 'rb')
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+
+        for page_num in range(len(pdf_reader.pages)):
+            page = pdf_reader.pages[page_num]
+            text = clean_text(page.extract_text())
+
+            pdf_text += text
+
+        pdf_file.close()
+        return pdf_text.strip()
+
+    except requests.exceptions.RequestException as e:
+        print("Error:", e)
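read_pdf concatenates the cleaned text of every page. Note the except clause only catches requests.exceptions.RequestException, so local errors (a missing file, an unparseable PDF) still propagate to the caller, and in the caught case the function implicitly returns None. A short sketch:

# Sketch: read and clean one of the bundled decisions.
from read_file import read_pdf

text = read_pdf('data/165_Pdt.P_2023_PN_Bkl.pdf')
print(text[:200])  # first characters of the cleaned decision text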