arosyihuddin committed
Commit a450bc7 · 1 Parent(s): baf5acb
app.py ADDED
@@ -0,0 +1,42 @@
+ import sys
+ sys.path.append("src")  # make the helper modules under src/ importable
+ 
+ from transformers import BertTokenizerFast
+ from gradio_pdf import PDF
+ from BertModel import *
+ from pdf_predict import *
+ import gradio as gr
+ 
+ ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
+ indolem = 'indolem/indobert-base-uncased'
+ indonlu = 'indobenchmark/indobert-base-p2'
+ model_indolem = BertModel(indolem, len(ids_to_labels))
+ model_indonlu = BertModel(indonlu, len(ids_to_labels))
+ tokenizer_indolem = BertTokenizerFast.from_pretrained(indolem)
+ tokenizer_indonlu = BertTokenizerFast.from_pretrained(indonlu)
+ 
+ def predict(doc: str, model: str) -> list:
+     if model == 'IndoBERT (IndoLEM)':
+         use_model = model_indolem
+         use_tokenizer = tokenizer_indolem
+ 
+     else:
+         use_model = model_indonlu
+         use_tokenizer = tokenizer_indonlu
+ 
+     result = pdf_predict(use_model, use_tokenizer, doc, ids_to_labels, model)
+ 
+     return result
+ 
+ iface = gr.Interface(
+     fn=predict,
+     inputs=[PDF(label="Document"),
+             gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', info='Choose the model to use (default: IndoBERT (IndoLEM))')],
+     outputs="textbox",
+     title="Legal NER",
+     description="Upload a PDF of an Indonesian criminal court ruling (putusan pidana)",
+     allow_flagging='never'
+ )
+ 
+ if __name__ == "__main__":
+     iface.launch()
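A quick local check of the callback behind the UI (a sketch only; it assumes the packages from requirements.txt are installed and the checkpoints under model/ have been fetched via git-lfs; the sample path is one of the PDFs added in this commit):

    from app import predict

    entities = predict("data/162_Pid.Sus_2023_PN_Bkl.pdf", "IndoBERT (IndoLEM)")
    print(entities)  # a list of dicts mapping entity tags such as 'B_JUDG' to extracted text spans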
data/162_Pid.Sus_2023_PN_Bkl.pdf ADDED
Binary file (142 kB)
data/164_Pid.Sus_2023_PN_Bkl.pdf ADDED
Binary file (144 kB)
data/165_Pdt.P_2023_PN_Bkl.pdf ADDED
Binary file (70.4 kB)
data/167_Pid.Sus_2023_PN_Bkl.pdf ADDED
Binary file (142 kB)
data/168_Pid.Sus_2023_PN_Bkl.pdf ADDED
Binary file (146 kB)
data/169_Pid.Sus_2023_PN_Bkl.pdf ADDED
Binary file (128 kB)
model/IndoLEM/model_fold_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12944de7c2f9f3ac701c25cd8b72a1ae0b9264234e4b83318596727dc3f73e4c
+ size 440031387
model/IndoLEM/model_fold_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ce4f2b6fee4f615bcbe94e8e5c50f9f141083c816a3cf7e2c9df42bf6fadcd3
+ size 440031387
model/IndoLEM/model_fold_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0a637e3d56c2df060f8212b8ee845bf861e69ef30d6204eeb864b9ef2127460
+ size 440031387
model/IndoLEM/model_fold_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d378edcc860146fc622477232b07a3a3e5c618cb56b109c59c8af43b7cccd96
+ size 440031387
model/IndoLEM/model_fold_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ddb58de48b51fcec9d83456f903ecc35a147b1c5ee6685e371cd94aafea79338
+ size 440031387
model/IndoNLU/model_fold_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d948732f22f46957d5e8423532e556b76718a8f3a129f9bcb3526cb8b098dbfb
+ size 495563931
model/IndoNLU/model_fold_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3093ae2dbed5175d8fd5bb2e1b5ef31641ccc562038e158ed0b0a5403fa26092
+ size 495563931
model/IndoNLU/model_fold_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60c7d6a726b2b10c19b035c91630264c1892cb741e5c788941c6fd90076d589e
+ size 495563931
model/IndoNLU/model_fold_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e87a3141742561544df98e491338c11c7ff3d6ff7fcf67de5a76e44238f4222
+ size 495563931
model/IndoNLU/model_fold_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aaad1b8c8c7c1591bd9ff71f7f9176a9b32dc37bd05d83bbc49ba710bdebca7e
+ size 495563931
requirements.txt ADDED
@@ -0,0 +1,105 @@
+ aiofiles==23.2.1
+ altair==5.2.0
+ annotated-types==0.6.0
+ anyio==4.3.0
+ attrs==23.2.0
+ blinker==1.7.0
+ cachetools==5.3.3
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ contourpy==1.2.0
+ cycler==0.12.1
+ exceptiongroup==1.2.0
+ fastapi==0.110.0
+ ffmpy==0.3.2
+ filelock==3.13.1
+ fonttools==4.50.0
+ fsspec==2024.3.0
+ gitdb==4.0.11
+ GitPython==3.1.42
+ gradio==4.21.0
+ gradio_client==0.12.0
+ gradio_pdf==0.0.5
+ h11==0.14.0
+ httpcore==1.0.4
+ httpx==0.27.0
+ huggingface-hub==0.21.4
+ idna==3.6
+ importlib_resources==6.3.0
+ Jinja2==3.1.3
+ joblib==1.3.2
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ kiwisolver==1.4.5
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.8.3
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.2.1
+ nltk==3.8.1
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.19.3
+ nvidia-nvjitlink-cu12==12.4.99
+ nvidia-nvtx-cu12==12.1.105
+ orjson==3.9.15
+ packaging==23.2
+ pandas==2.2.1
+ pillow==10.2.0
+ protobuf==4.25.3
+ pyarrow==15.0.1
+ pydantic==2.6.4
+ pydantic_core==2.16.3
+ pydeck==0.8.1b0
+ pydub==0.25.1
+ Pygments==2.17.2
+ pyparsing==3.1.2
+ PyPDF2==3.0.1
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.9
+ pytz==2024.1
+ PyYAML==6.0.1
+ referencing==0.33.0
+ regex==2023.12.25
+ requests==2.31.0
+ rich==13.7.1
+ rpds-py==0.18.0
+ ruff==0.3.3
+ safetensors==0.4.2
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ starlette==0.36.3
+ streamlit==1.32.2
+ sympy==1.12
+ tenacity==8.2.3
+ tokenizers==0.15.2
+ toml==0.10.2
+ tomlkit==0.12.0
+ toolz==0.12.1
+ torch==2.2.1
+ tornado==6.4
+ tqdm==4.66.2
+ transformers==4.38.2
+ triton==2.2.0
+ typer==0.9.0
+ typing_extensions==4.10.0
+ tzdata==2024.1
+ urllib3==2.2.1
+ uvicorn==0.28.0
+ watchdog==4.0.0
+ websockets==11.0.3
+ Werkzeug==3.0.1
src/BertModel.py ADDED
@@ -0,0 +1,15 @@
+ from transformers import BertForTokenClassification
+ import torch
+ 
+ class BertModel(torch.nn.Module):
+     """Thin wrapper around BertForTokenClassification for token-level NER."""
+ 
+     def __init__(self, pretrained_model, num_labels):
+         super(BertModel, self).__init__()
+         self.bert = BertForTokenClassification.from_pretrained(pretrained_model, num_labels=num_labels)
+ 
+     def forward(self, input_id, mask, label):
+         # With return_dict=False the underlying model returns (loss, logits, ...)
+         # when labels are given, and (logits, ...) when label is None.
+         output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
+         return output
src/__pycache__/BertModel.cpython-310.pyc ADDED
Binary file (883 Bytes)
src/__pycache__/align_word_ids.cpython-310.pyc ADDED
Binary file (607 Bytes)
src/__pycache__/clean_text.cpython-310.pyc ADDED
Binary file (1.77 kB)
src/__pycache__/convertTotext.cpython-310.pyc ADDED
Binary file (526 Bytes)
src/__pycache__/pdf_predict.cpython-310.pyc ADDED
Binary file (1.71 kB)
src/__pycache__/read_file.cpython-310.pyc ADDED
Binary file (666 Bytes)
src/align_word_ids.py ADDED
@@ -0,0 +1,20 @@
+ def align_word_ids(texts, tokenizer, label_all_tokens):
+     """Build a mask over the 512 tokenizer positions: -100 for special/padding
+     tokens, 1 for word tokens (only the first sub-token unless label_all_tokens)."""
+     tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
+     word_ids = tokenized_inputs.word_ids()
+ 
+     previous_word_idx = None
+     label_ids = []
+ 
+     for word_idx in word_ids:
+         if word_idx is None:
+             # [CLS], [SEP] and [PAD] positions are ignored downstream
+             label_ids.append(-100)
+         elif word_idx != previous_word_idx:
+             label_ids.append(1)
+         else:
+             label_ids.append(1 if label_all_tokens else -100)
+         previous_word_idx = word_idx
+ 
+     return label_ids
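For reference, a rough sketch of what the mask looks like (it assumes the IndoLEM tokenizer that app.py loads; the example sentence is arbitrary):

    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained('indolem/indobert-base-uncased')
    mask = align_word_ids("Terdakwa dihukum pidana penjara", tokenizer, label_all_tokens=True)
    # len(mask) == 512: -100 at [CLS]/[SEP]/[PAD] positions, 1 at every sub-token of a real word,
    # so downstream code can keep exactly one logit row per retained position.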
src/clean_text.py ADDED
@@ -0,0 +1,16 @@
+ import re
+ 
+ def clean_text(text):
+     # Strip the repeated Supreme Court header/footer boilerplate embedded in putusan PDFs.
+     text = text.replace("Mahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nDirektori Putusan Mahkamah Agung Republik Indonesia\nputusan.mahkamahagung.go.id\n", "")
+     text = text.replace("\nDisclaimer\nKepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\npelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\nDalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\nEmail : kepaniteraan@mahkamahagung.go.id", "")
+     text = text.replace("Telp : 021-384 3348 (ext.318)", "")
+     # Re-join words that the PDF text extraction split with spaces, then drop newlines.
+     text = text.replace('P U T U S A N', 'PUTUSAN').replace('T erdakwa', 'Terdakwa').replace('T empat', 'Tempat').replace('T ahun', 'Tahun')
+     text = text.replace('P E N E T A P A N', 'PENETAPAN').replace('J u m l a h', 'Jumlah').replace('\n', '')
+     # Drop "Halaman X dari Y" / "Hal. X dari Y" page footers and stray bullet/ellipsis glyphs.
+     text = re.sub(r'Halaman \d+ dari \d+ .*', '', text)
+     text = re.sub(r'Hal\. \d+ dari \d+ .*', '', text)
+     text = re.sub(r' +|[\uf0fc\uf0a7\uf0a8\uf0b7]', ' ', text)
+     text = re.sub(r'[\u2026]+|\.{3,}', '', text)
+     return text.strip()
src/convertTotext.py ADDED
@@ -0,0 +1,22 @@
+ def convertTotext(data_token, prediction_label):
+     """Merge WordPiece tokens back into surface strings, grouped by entity tag."""
+     prev_tag = 'O'
+     result = {}
+     temp = ''
+     for i, word in enumerate(data_token):
+         if prediction_label[i] != 'O':
+             # starting a new entity right after an 'O' token: discard any leftover buffer
+             if prev_tag == 'O' and temp != '':
+                 temp = ''
+             if '##' in word:
+                 temp += word.replace('##', '')  # glue sub-word pieces back together
+             else:
+                 temp += ' ' + word
+         else:
+             if temp != "":
+                 result[prev_tag.replace("I_", "B_")] = temp.strip()
+                 temp = ""
+ 
+         prev_tag = prediction_label[i]
+ 
+     return result
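A rough illustration of what convertTotext returns (hypothetical tokens and labels, not output from the committed models):

    tokens = ['hakim', 'budi', 'santo', '##so', ',', 'menyatakan']
    labels = ['O', 'B_JUDG', 'I_JUDG', 'I_JUDG', 'O', 'O']
    print(convertTotext(tokens, labels))  # {'B_JUDG': 'budi santoso'}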
src/pdf_predict.py ADDED
@@ -0,0 +1,50 @@
+ from tqdm import tqdm
+ import torch
+ from read_file import *
+ from align_word_ids import *
+ from convertTotext import *
+ 
+ def pdf_predict(model, tokenizer, file_path, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
+     file_pdf = read_pdf(file_path)
+     sentence_file = file_pdf.split(';')
+ 
+     use_cuda = torch.cuda.is_available()
+     device = torch.device("cuda" if use_cuda else "cpu")
+     if use_cuda:
+         model = model.cuda()
+ 
+     # The fold-4 checkpoint is hard-coded for inference for both backbones.
+     file_check_point = 'model/IndoLEM/model_fold_4.pth' if check_point == 'IndoBERT (IndoLEM)' else 'model/IndoNLU/model_fold_4.pth'
+ 
+     model_weights = torch.load(file_check_point, map_location=device)
+     model.load_state_dict(model_weights)
+     model.eval()  # disable dropout for inference
+ 
+     label_extraction = []
+     for text in tqdm(sentence_file, desc="Prediction Sentence"):
+         tokenized = tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
+         input_ids = tokenized['input_ids'].to(device)
+         mask = tokenized['attention_mask'].to(device)
+ 
+         with torch.no_grad():
+             output = model(input_ids, mask, None)
+ 
+         # Keep only positions that correspond to real (sub-)word tokens.
+         label_ids = torch.Tensor(align_word_ids(text, tokenizer, True)).unsqueeze(0).to(device)
+         logits_clean = output[0][label_ids != -100]
+         predictions = logits_clean.argmax(dim=1).tolist()
+         prediction_label = [ids_to_labels[i] for i in predictions]
+ 
+         input_ids_conv = tokenizer.convert_ids_to_tokens(tokenized['input_ids'][0])
+         data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
+         nerExtraction = convertTotext(data_token, prediction_label)
+ 
+         if nerExtraction:
+             label_extraction.append(nerExtraction)
+             # print(f"\nText : {text}")
+             # print(f"Predicted labels : {prediction_label}")
+             # print("NER extraction result:")
+             # print(nerExtraction)
+             # print(f"Token count : {len(data_token)}, predicted label count : {len(prediction_label)}")
+ 
+     return label_extraction
src/read_file.py ADDED
@@ -0,0 +1,18 @@
+ import PyPDF2
+ from PyPDF2.errors import PdfReadError
+ from clean_text import *
+ 
+ def read_pdf(file_pdf):
+     try:
+         pdf_text = ''
+         with open(file_pdf, 'rb') as pdf_file:
+             pdf_reader = PyPDF2.PdfReader(pdf_file)
+             for page in pdf_reader.pages:
+                 # clean each page so headers/footers are stripped before concatenation
+                 pdf_text += clean_text(page.extract_text())
+ 
+         return pdf_text.strip()
+ 
+     except (OSError, PdfReadError) as e:
+         print("Error:", e)
+         return ''  # keep the caller's .split(';') from failing on None