arosyihuddin committed
Commit a450bc7 · 1 Parent(s): baf5acb
app.py ADDED
@@ -0,0 +1,42 @@
+ import sys
+ sys.path.append("src")  # make the helper modules under src/ importable
+ 
+ from transformers import BertTokenizerFast
+ from gradio_pdf import PDF
+ from BertModel import *
+ from pdf_predict import *
+ import gradio as gr
+ 
+ ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
+ indolem = 'indolem/indobert-base-uncased'
+ indonlu = 'indobenchmark/indobert-base-p2'
+ model_indolem = BertModel(indolem, len(ids_to_labels))
+ model_indonlu = BertModel(indonlu, len(ids_to_labels))
+ tokenizer_indolem = BertTokenizerFast.from_pretrained(indolem)
+ tokenizer_indonlu = BertTokenizerFast.from_pretrained(indonlu)
+ 
+ def predict(doc: str, model: str) -> list:
+     if model == 'IndoBERT (IndoLEM)':
+         use_model = model_indolem
+         use_tokenizer = tokenizer_indolem
+ 
+     else:
+         use_model = model_indonlu
+         use_tokenizer = tokenizer_indonlu
+ 
+     result = pdf_predict(use_model, use_tokenizer, doc, ids_to_labels, model)
+ 
+     return result
+ 
+ iface = gr.Interface(
+     fn=predict,
+     inputs=[PDF(label="Document"),
+             gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', info='Choose the model to use (default: IndoBERT (IndoLEM))')],
+     outputs="textbox",
+     title="Legal NER",
+     description="Upload a PDF of an Indonesian criminal court ruling (putusan pidana)",
+     allow_flagging='never'
+ )
+ 
+ if __name__ == "__main__":
+     iface.launch()
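A quick local check of the callback behind the UI (a sketch only; it assumes the packages from requirements.txt are installed and the checkpoints under model/ have been fetched via git-lfs; the sample path is one of the PDFs added in this commit):

    from app import predict

    entities = predict("data/162_Pid.Sus_2023_PN_Bkl.pdf", "IndoBERT (IndoLEM)")
    print(entities)  # a list of dicts mapping entity tags such as 'B_JUDG' to extracted text spans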
data/162_Pid.Sus_2023_PN_Bkl.pdf ADDED
Binary file (142 kB)
data/164_Pid.Sus_2023_PN_Bkl.pdf ADDED
Binary file (144 kB)
data/165_Pdt.P_2023_PN_Bkl.pdf ADDED
Binary file (70.4 kB)
data/167_Pid.Sus_2023_PN_Bkl.pdf ADDED
Binary file (142 kB)
data/168_Pid.Sus_2023_PN_Bkl.pdf ADDED
Binary file (146 kB)
data/169_Pid.Sus_2023_PN_Bkl.pdf ADDED
Binary file (128 kB)
model/IndoLEM/model_fold_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12944de7c2f9f3ac701c25cd8b72a1ae0b9264234e4b83318596727dc3f73e4c
+ size 440031387
model/IndoLEM/model_fold_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ce4f2b6fee4f615bcbe94e8e5c50f9f141083c816a3cf7e2c9df42bf6fadcd3
+ size 440031387
model/IndoLEM/model_fold_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0a637e3d56c2df060f8212b8ee845bf861e69ef30d6204eeb864b9ef2127460
+ size 440031387
model/IndoLEM/model_fold_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d378edcc860146fc622477232b07a3a3e5c618cb56b109c59c8af43b7cccd96
+ size 440031387
model/IndoLEM/model_fold_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ddb58de48b51fcec9d83456f903ecc35a147b1c5ee6685e371cd94aafea79338
+ size 440031387
model/IndoNLU/model_fold_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d948732f22f46957d5e8423532e556b76718a8f3a129f9bcb3526cb8b098dbfb
+ size 495563931
model/IndoNLU/model_fold_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3093ae2dbed5175d8fd5bb2e1b5ef31641ccc562038e158ed0b0a5403fa26092
+ size 495563931
model/IndoNLU/model_fold_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60c7d6a726b2b10c19b035c91630264c1892cb741e5c788941c6fd90076d589e
+ size 495563931
model/IndoNLU/model_fold_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e87a3141742561544df98e491338c11c7ff3d6ff7fcf67de5a76e44238f4222
+ size 495563931
model/IndoNLU/model_fold_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aaad1b8c8c7c1591bd9ff71f7f9176a9b32dc37bd05d83bbc49ba710bdebca7e
+ size 495563931
requirements.txt ADDED
@@ -0,0 +1,105 @@
+ aiofiles==23.2.1
+ altair==5.2.0
+ annotated-types==0.6.0
+ anyio==4.3.0
+ attrs==23.2.0
+ blinker==1.7.0
+ cachetools==5.3.3
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ contourpy==1.2.0
+ cycler==0.12.1
+ exceptiongroup==1.2.0
+ fastapi==0.110.0
+ ffmpy==0.3.2
+ filelock==3.13.1
+ fonttools==4.50.0
+ fsspec==2024.3.0
+ gitdb==4.0.11
+ GitPython==3.1.42
+ gradio==4.21.0
+ gradio_client==0.12.0
+ gradio_pdf==0.0.5
+ h11==0.14.0
+ httpcore==1.0.4
+ httpx==0.27.0
+ huggingface-hub==0.21.4
+ idna==3.6
+ importlib_resources==6.3.0
+ Jinja2==3.1.3
+ joblib==1.3.2
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ kiwisolver==1.4.5
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.8.3
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.2.1
+ nltk==3.8.1
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.19.3
+ nvidia-nvjitlink-cu12==12.4.99
+ nvidia-nvtx-cu12==12.1.105
+ orjson==3.9.15
+ packaging==23.2
+ pandas==2.2.1
+ pillow==10.2.0
+ protobuf==4.25.3
+ pyarrow==15.0.1
+ pydantic==2.6.4
+ pydantic_core==2.16.3
+ pydeck==0.8.1b0
+ pydub==0.25.1
+ Pygments==2.17.2
+ pyparsing==3.1.2
+ PyPDF2==3.0.1
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.9
+ pytz==2024.1
+ PyYAML==6.0.1
+ referencing==0.33.0
+ regex==2023.12.25
+ requests==2.31.0
+ rich==13.7.1
+ rpds-py==0.18.0
+ ruff==0.3.3
+ safetensors==0.4.2
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ starlette==0.36.3
+ streamlit==1.32.2
+ sympy==1.12
+ tenacity==8.2.3
+ tokenizers==0.15.2
+ toml==0.10.2
+ tomlkit==0.12.0
+ toolz==0.12.1
+ torch==2.2.1
+ tornado==6.4
+ tqdm==4.66.2
+ transformers==4.38.2
+ triton==2.2.0
+ typer==0.9.0
+ typing_extensions==4.10.0
+ tzdata==2024.1
+ urllib3==2.2.1
+ uvicorn==0.28.0
+ watchdog==4.0.0
+ websockets==11.0.3
+ Werkzeug==3.0.1
src/BertModel.py ADDED
@@ -0,0 +1,15 @@
+ from transformers import BertForTokenClassification
+ import torch
+ 
+ class BertModel(torch.nn.Module):
+     """Thin wrapper around BertForTokenClassification for token-level NER."""
+ 
+     def __init__(self, pretrained_model, num_labels):
+         super(BertModel, self).__init__()
+         self.bert = BertForTokenClassification.from_pretrained(pretrained_model, num_labels=num_labels)
+ 
+     def forward(self, input_id, mask, label):
+         # With return_dict=False the underlying model returns (loss, logits, ...)
+         # when labels are given, and (logits, ...) when label is None.
+         output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
+         return output
src/__pycache__/BertModel.cpython-310.pyc ADDED
Binary file (883 Bytes)
src/__pycache__/align_word_ids.cpython-310.pyc ADDED
Binary file (607 Bytes)
src/__pycache__/clean_text.cpython-310.pyc ADDED
Binary file (1.77 kB)
src/__pycache__/convertTotext.cpython-310.pyc ADDED
Binary file (526 Bytes)
src/__pycache__/pdf_predict.cpython-310.pyc ADDED
Binary file (1.71 kB)
src/__pycache__/read_file.cpython-310.pyc ADDED
Binary file (666 Bytes)
src/align_word_ids.py ADDED
@@ -0,0 +1,20 @@
+ def align_word_ids(texts, tokenizer, label_all_tokens):
+     """Build a mask over the 512 tokenizer positions: -100 for special/padding
+     tokens, 1 for word tokens (only the first sub-token unless label_all_tokens)."""
+     tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
+     word_ids = tokenized_inputs.word_ids()
+ 
+     previous_word_idx = None
+     label_ids = []
+ 
+     for word_idx in word_ids:
+         if word_idx is None:
+             # [CLS], [SEP] and [PAD] positions are ignored downstream
+             label_ids.append(-100)
+         elif word_idx != previous_word_idx:
+             label_ids.append(1)
+         else:
+             label_ids.append(1 if label_all_tokens else -100)
+         previous_word_idx = word_idx
+ 
+     return label_ids
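For reference, a rough sketch of what the mask looks like (it assumes the IndoLEM tokenizer that app.py loads; the example sentence is arbitrary):

    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained('indolem/indobert-base-uncased')
    mask = align_word_ids("Terdakwa dihukum pidana penjara", tokenizer, label_all_tokens=True)
    # len(mask) == 512: -100 at [CLS]/[SEP]/[PAD] positions, 1 at every sub-token of a real word,
    # so downstream code can keep exactly one logit row per retained position.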
src/clean_text.py ADDED
@@ -0,0 +1,16 @@
+ import re
+ 
+ def clean_text(text):
+     # Strip the repeated Supreme Court header/footer boilerplate embedded in putusan PDFs.
+     text = text.replace("Mahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nMahkamah Agung Republik Indonesia\nDirektori Putusan Mahkamah Agung Republik Indonesia\nputusan.mahkamahagung.go.id\n", "")
+     text = text.replace("\nDisclaimer\nKepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\npelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\nDalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\nEmail : kepaniteraan@mahkamahagung.go.id", "")
+     text = text.replace("Telp : 021-384 3348 (ext.318)", "")
+     # Re-join words that the PDF text extraction split with spaces, then drop newlines.
+     text = text.replace('P U T U S A N', 'PUTUSAN').replace('T erdakwa', 'Terdakwa').replace('T empat', 'Tempat').replace('T ahun', 'Tahun')
+     text = text.replace('P E N E T A P A N', 'PENETAPAN').replace('J u m l a h', 'Jumlah').replace('\n', '')
+     # Drop "Halaman X dari Y" / "Hal. X dari Y" page footers and stray bullet/ellipsis glyphs.
+     text = re.sub(r'Halaman \d+ dari \d+ .*', '', text)
+     text = re.sub(r'Hal\. \d+ dari \d+ .*', '', text)
+     text = re.sub(r' +|[\uf0fc\uf0a7\uf0a8\uf0b7]', ' ', text)
+     text = re.sub(r'[\u2026]+|\.{3,}', '', text)
+     return text.strip()
src/convertTotext.py ADDED
@@ -0,0 +1,22 @@
+ def convertTotext(data_token, prediction_label):
+     """Merge WordPiece tokens back into surface strings, grouped by entity tag."""
+     prev_tag = 'O'
+     result = {}
+     temp = ''
+     for i, word in enumerate(data_token):
+         if prediction_label[i] != 'O':
+             # starting a new entity right after an 'O' token: discard any leftover buffer
+             if prev_tag == 'O' and temp != '':
+                 temp = ''
+             if '##' in word:
+                 temp += word.replace('##', '')  # glue sub-word pieces back together
+             else:
+                 temp += ' ' + word
+         else:
+             if temp != "":
+                 result[prev_tag.replace("I_", "B_")] = temp.strip()
+                 temp = ""
+ 
+         prev_tag = prediction_label[i]
+ 
+     return result
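A rough illustration of what convertTotext returns (hypothetical tokens and labels, not output from the committed models):

    tokens = ['hakim', 'budi', 'santo', '##so', ',', 'menyatakan']
    labels = ['O', 'B_JUDG', 'I_JUDG', 'I_JUDG', 'O', 'O']
    print(convertTotext(tokens, labels))  # {'B_JUDG': 'budi santoso'}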
src/pdf_predict.py ADDED
@@ -0,0 +1,50 @@
+ from tqdm import tqdm
+ import torch
+ from read_file import *
+ from align_word_ids import *
+ from convertTotext import *
+ 
+ def pdf_predict(model, tokenizer, file_path, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
+     file_pdf = read_pdf(file_path)
+     sentence_file = file_pdf.split(';')
+ 
+     use_cuda = torch.cuda.is_available()
+     device = torch.device("cuda" if use_cuda else "cpu")
+     if use_cuda:
+         model = model.cuda()
+ 
+     # The fold-4 checkpoint is hard-coded for inference for both backbones.
+     file_check_point = 'model/IndoLEM/model_fold_4.pth' if check_point == 'IndoBERT (IndoLEM)' else 'model/IndoNLU/model_fold_4.pth'
+ 
+     model_weights = torch.load(file_check_point, map_location=device)
+     model.load_state_dict(model_weights)
+     model.eval()  # disable dropout for inference
+ 
+     label_extraction = []
+     for text in tqdm(sentence_file, desc="Prediction Sentence"):
+         tokenized = tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
+         input_ids = tokenized['input_ids'].to(device)
+         mask = tokenized['attention_mask'].to(device)
+ 
+         with torch.no_grad():
+             output = model(input_ids, mask, None)
+ 
+         # Keep only positions that correspond to real (sub-)word tokens.
+         label_ids = torch.Tensor(align_word_ids(text, tokenizer, True)).unsqueeze(0).to(device)
+         logits_clean = output[0][label_ids != -100]
+         predictions = logits_clean.argmax(dim=1).tolist()
+         prediction_label = [ids_to_labels[i] for i in predictions]
+ 
+         input_ids_conv = tokenizer.convert_ids_to_tokens(tokenized['input_ids'][0])
+         data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
+         nerExtraction = convertTotext(data_token, prediction_label)
+ 
+         if nerExtraction:
+             label_extraction.append(nerExtraction)
+             # print(f"\nText : {text}")
+             # print(f"Predicted labels : {prediction_label}")
+             # print("NER extraction result:")
+             # print(nerExtraction)
+             # print(f"Token count : {len(data_token)}, predicted label count : {len(prediction_label)}")
+ 
+     return label_extraction
src/read_file.py ADDED
@@ -0,0 +1,18 @@
+ import PyPDF2
+ from PyPDF2.errors import PdfReadError
+ from clean_text import *
+ 
+ def read_pdf(file_pdf):
+     try:
+         pdf_text = ''
+         with open(file_pdf, 'rb') as pdf_file:
+             pdf_reader = PyPDF2.PdfReader(pdf_file)
+             for page in pdf_reader.pages:
+                 # clean each page so headers/footers are stripped before concatenation
+                 pdf_text += clean_text(page.extract_text())
+ 
+         return pdf_text.strip()
+ 
+     except (OSError, PdfReadError) as e:
+         print("Error:", e)
+         return ''  # keep the caller's .split(';') from failing on None