Commit
·
641ff3e
0
Parent(s):
Initial commit
Browse files- .dockerignore +11 -0
- .github/workflows/check_file_size.yml +16 -0
- .github/workflows/sync_to_hf.yml +20 -0
- .gitignore +11 -0
- Dockerfile +51 -0
- README.md +19 -0
- app.py +143 -0
- requirements.txt +11 -0
- tools/__init__.py +0 -0
- tools/aws_functions.py +165 -0
- tools/file_conversion.py +58 -0
- tools/file_redaction.py +203 -0
- tools/helper_functions.py +12 -0
- tools/load_spacy_model_custom_recognisers.py +158 -0
.dockerignore
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.csv
|
2 |
+
*.pdf
|
3 |
+
*.url
|
4 |
+
*.jpg
|
5 |
+
*.png
|
6 |
+
*.ipynb
|
7 |
+
examples/*
|
8 |
+
processing/*
|
9 |
+
output/*
|
10 |
+
tools/__pycache__/*
|
11 |
+
old_code/*
|
.github/workflows/check_file_size.yml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Check file size
|
2 |
+
on: # or directly `on: [push]` to run the action on every push on any branch
|
3 |
+
pull_request:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
# to run this workflow manually from the Actions tab
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
sync-to-hub:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- name: Check large files
|
14 |
+
uses: ActionsDesk/[email protected]
|
15 |
+
with:
|
16 |
+
filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
|
.github/workflows/sync_to_hf.yml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Sync to Hugging Face hub
|
2 |
+
on:
|
3 |
+
push:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
# to run this workflow manually from the Actions tab
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
sync-to-hub:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- uses: actions/checkout@v3
|
14 |
+
with:
|
15 |
+
fetch-depth: 0
|
16 |
+
lfs: true
|
17 |
+
- name: Push to hub
|
18 |
+
env:
|
19 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
20 |
+
run: git push https://seanpedrickcase:[email protected]/spaces/seanpedrickcase/document_redaction main
|
.gitignore
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.csv
|
2 |
+
*.pdf
|
3 |
+
*.url
|
4 |
+
*.jpg
|
5 |
+
*.png
|
6 |
+
*.ipynb
|
7 |
+
examples/*
|
8 |
+
processing/*
|
9 |
+
output/*
|
10 |
+
tools/__pycache__/*
|
11 |
+
old_code/*
|
Dockerfile
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
|
2 |
+
|
3 |
+
# Install system dependencies
|
4 |
+
RUN apt-get update \
|
5 |
+
&& apt-get install -y \
|
6 |
+
tesseract-ocr \
|
7 |
+
libtesseract-dev \
|
8 |
+
poppler-utils \
|
9 |
+
&& apt-get clean \
|
10 |
+
&& rm -rf /var/lib/apt/lists/*
|
11 |
+
|
12 |
+
WORKDIR /src
|
13 |
+
|
14 |
+
COPY requirements.txt .
|
15 |
+
|
16 |
+
RUN pip install -r requirements.txt
|
17 |
+
|
18 |
+
# Set up a new user named "user" with user ID 1000
|
19 |
+
#RUN useradd -m -u 1000 user
|
20 |
+
|
21 |
+
# Change ownership of /home/user directory
|
22 |
+
#RUN chown -R user:user /home/user
|
23 |
+
|
24 |
+
# Create the temp files directory and set its permissions
|
25 |
+
#RUN mkdir -p /home/user/tmp && chown -R user:user /home/user/tmp
|
26 |
+
|
27 |
+
# Switch to the "user" user
|
28 |
+
#USER user
|
29 |
+
|
30 |
+
# Set home to the user's home directory
|
31 |
+
ENV HOME=/home/user \
|
32 |
+
PATH=/home/user/.local/bin:$PATH \
|
33 |
+
PYTHONPATH=$HOME/app \
|
34 |
+
PYTHONUNBUFFERED=1 \
|
35 |
+
GRADIO_ALLOW_FLAGGING=never \
|
36 |
+
GRADIO_NUM_PORTS=1 \
|
37 |
+
GRADIO_SERVER_NAME=0.0.0.0 \
|
38 |
+
GRADIO_SERVER_PORT=7861 \
|
39 |
+
GRADIO_THEME=huggingface \
|
40 |
+
#GRADIO_TEMP_DIR=$HOME/tmp \
|
41 |
+
#GRADIO_ROOT_PATH=/address-match \
|
42 |
+
SYSTEM=spaces
|
43 |
+
|
44 |
+
# Set the working directory to the user's home directory
|
45 |
+
WORKDIR $HOME/app
|
46 |
+
|
47 |
+
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
48 |
+
#COPY --chown=user . $HOME/app
|
49 |
+
COPY . $HOME/app
|
50 |
+
|
51 |
+
CMD ["python", "app.py"]
|
README.md
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Document redaction
|
3 |
+
emoji: 🌍
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.27.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
# Introduction
|
14 |
+
Redact PDF files using image-based OCR or direct text analysis from pdfminer.six. Personal information identification performed using Microsoft Presidio.
|
15 |
+
|
16 |
+
Take an image-based or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to images and then identify text via OCR methods before redaction. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
|
17 |
+
|
18 |
+
WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
|
19 |
+
|
app.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tools.file_redaction import redact_text_pdf, redact_image_pdf
|
2 |
+
from tools.helper_functions import get_file_path_end
|
3 |
+
from tools.file_conversion import process_file, is_pdf
|
4 |
+
from tools.aws_functions import load_data_from_aws
|
5 |
+
|
6 |
+
from typing import List
|
7 |
+
import pandas as pd
|
8 |
+
import gradio as gr
|
9 |
+
import time
|
10 |
+
|
11 |
+
# Example input used during development. NOTE(review): not referenced by the UI
# wiring below — presumably for manual testing; confirm before removing.
file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #

# Default set of Presidio entity names pre-selected in the UI dropdown.
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
# Full set of entities offered by the dropdown (defaults plus Presidio built-ins).
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
# Only English is currently exposed in the UI.
language = 'en'
|
16 |
+
|
17 |
+
def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
    '''
    Redact personal information from a PDF using the selected analysis method.

    Args:
        file_path: Path to the input PDF file.
        language: Analysis language code (e.g. 'en').
        chosen_redact_entities: Presidio entity names to redact.
        in_redact_method: "Image analysis" (OCR on page images) or
            "Text analysis" (selectable text via pdfminer).
        in_allow_list: Rows from the allow-list gr.Dataframe (list of
            single-column rows); terms that must never be redacted. May be None.
        progress: Gradio progress tracker.

    Returns:
        Tuple of (status message, list of output file paths).
    '''

    tic = time.perf_counter()

    if not is_pdf(file_path):
        return "Please upload a PDF file.", None

    out_message = ''
    out_file_paths = []

    # The allow list arrives as a list of single-column rows from the UI
    # dataframe; flatten it. Guard against None — the declared default —
    # which previously raised TypeError here.
    in_allow_list_flat = [item for sublist in (in_allow_list or []) for item in sublist]

    if file_path:
        file_path_without_ext = get_file_path_end(file_path)
    else:
        out_message = "No file selected"
        print(out_message)
        return out_message, out_file_paths

    if in_redact_method == "Image analysis":
        # Analyse image-based pdf: render pages to images, OCR, paint boxes.
        pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
        out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
        pdf_images[0].save(out_image_file_path, "PDF", resolution=100.0, save_all=True, append_images=pdf_images[1:])

        out_file_paths.append(out_image_file_path)
        out_message = "Image-based PDF successfully redacted and saved to file."

    elif in_redact_method == "Text analysis":
        # Analyse text-based pdf: annotate entity bounding boxes in place.
        pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
        out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
        pdf_text.save(out_text_file_path)

        out_file_paths.append(out_text_file_path)

        # Convert annotated text pdf back to image to give genuine redactions
        # (annotations alone can be lifted off; rasterising burns them in).
        pdf_text_image_paths = process_file(out_text_file_path)
        out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
        pdf_text_image_paths[0].save(out_text_image_file_path, "PDF", resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])

        out_file_paths.append(out_text_image_file_path)

        out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."

    else:
        out_message = "No redaction method selected"
        print(out_message)
        return out_message, out_file_paths

    toc = time.perf_counter()
    out_time = f"Time taken: {toc - tic:0.1f} seconds."
    print(out_time)

    out_message = out_message + "\n\n" + out_time

    return out_message, out_file_paths
|
74 |
+
|
75 |
+
|
76 |
+
# Create the gradio interface

block = gr.Blocks(theme = gr.themes.Base())

with block:

    # Cross-event state holders. NOTE(review): these are not wired into the
    # redaction flow below — presumably carried over from a related matching
    # app; confirm before removing.
    data_state = gr.State(pd.DataFrame())
    ref_data_state = gr.State(pd.DataFrame())
    results_data_state = gr.State(pd.DataFrame())
    ref_results_data_state =gr.State(pd.DataFrame())

    # Intro text shown at the top of the app.
    gr.Markdown(
    """
    # Document redaction

    Take an image-based or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.

    WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
    """)

    # Main tab: upload a document, pick the redaction method/entities, run.
    with gr.Tab("Redact document"):

        with gr.Accordion("Input document", open = True):
            in_file = gr.File(label="Choose document file", file_count= "single")
            in_redaction_method = gr.Radio(label="Redaction method", value = "Image analysis", choices=["Image analysis", "Text analysis"])
            in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
            in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language", multiselect=False)
            in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=1, value=[[""]], type="array", column_widths=["50%"])

        redact_btn = gr.Button("Redact document")

        with gr.Row():
            output_summary = gr.Textbox(label="Output summary")
            output_file = gr.File(label="Output file")

    # Secondary tab: password-gated loading of example data from AWS S3.
    with gr.Tab(label="Advanced options"):
        with gr.Accordion(label = "AWS data access", open = False):
            aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
            with gr.Row():
                in_aws_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
                load_aws_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")

            aws_log_box = gr.Textbox(label="AWS data load status")


    ### Loading AWS data ###
    # Loaded file is fed straight into the document input component.
    load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])


    # Updates to components
    #in_file.change(fn = initial_data_load, inputs=[in_file], outputs=[output_summary, in_redact_entities, in_existing, data_state, results_data_state])
    #in_ref.change(fn = initial_data_load, inputs=[in_ref], outputs=[output_summary, in_refcol, in_joincol, ref_data_state, ref_results_data_state])

    # Main action: run the redactor; also exposed via the API as "redact".
    redact_btn.click(fn = choose_and_run_redactor, inputs=[in_file, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
                    outputs=[output_summary, output_file], api_name="redact")

# Simple run for HF spaces or local on your computer
#block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",

# Simple run for AWS server
block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861

# Download OpenSSL from here:
# Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
#block.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
#                     ssl_certfile="cert.pem", ssl_keyfile="key.pem")  # port 443 for https. Certificates currently not valid

# Running on local server without https
#block.queue().launch(server_name="0.0.0.0", server_port=7861, ssl_verify=False)
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pdfminer.six==20231228
|
2 |
+
pdf2image==1.17.0
|
3 |
+
#img2pdf==0.5.1
|
4 |
+
presidio_analyzer==2.2.351
|
5 |
+
presidio_anonymizer==2.2.351
|
6 |
+
presidio-image-redactor==0.0.52
|
7 |
+
pikepdf==8.15.1
|
8 |
+
pandas==2.2.2
|
9 |
+
spacy==3.7.4
|
10 |
+
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
|
11 |
+
gradio==4.27.0
|
tools/__init__.py
ADDED
File without changes
|
tools/aws_functions.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Type
|
2 |
+
import pandas as pd
|
3 |
+
import boto3
|
4 |
+
import tempfile
|
5 |
+
import os
|
6 |
+
|
7 |
+
# Type alias used for annotating DataFrame parameters/returns in this package.
PandasDataFrame = Type[pd.DataFrame]

# S3 bucket holding the example/keyword data for this app.
bucket_name = 'doc-redaction-data'

# Try to build a session from the local "default" profile. Failure is
# tolerated (and merely printed) so the app still imports on hosts that use
# role-based credentials instead of a profile.
try:
    session = boto3.Session(profile_name="default")
except Exception as e:
    print(e)


def get_assumed_role_info():
    """Return (arn, name) of the identity the current AWS credentials resolve to.

    Uses STS get_caller_identity; the role name is taken as the final
    '/'-separated segment of the ARN.

    Returns:
        Tuple of (assumed_role_arn, assumed_role_name).
    """
    sts = boto3.client('sts')
    response = sts.get_caller_identity()

    # Extract ARN of the assumed role
    assumed_role_arn = response['Arn']

    # Extract the name of the assumed role from the ARN
    assumed_role_name = assumed_role_arn.split('/')[-1]

    return assumed_role_arn, assumed_role_name

# Log the current AWS identity at import time. Best-effort only: any failure
# (no credentials, no network) is printed and ignored.
try:
    assumed_role_arn, assumed_role_name = get_assumed_role_info()

    print("Assumed Role ARN:", assumed_role_arn)
    print("Assumed Role Name:", assumed_role_name)
except Exception as e:
    print(e)
45 |
+
|
46 |
+
# Download direct from S3 - requires login credentials
|
47 |
+
def download_file_from_s3(bucket_name, key, local_file_path):
    """Fetch a single object from S3 and write it to local_file_path.

    Requires valid AWS login credentials to be available to boto3.
    """
    client = boto3.client('s3')
    client.download_file(bucket_name, key, local_file_path)
    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
+
|
53 |
+
#download_file_from_s3(bucket_name, object_key, local_file_loc)
|
54 |
+
|
55 |
+
def download_folder_from_s3(bucket_name, s3_folder, local_folder):
    """
    Download all files from an S3 folder to a local folder.

    Destination paths mirror the key structure relative to the prefix;
    per-file download failures are printed and skipped.
    """
    client = boto3.client('s3')

    # Enumerate every object stored under the prefix.
    listing = client.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)

    for entry in listing.get('Contents', []):
        key = entry['Key']
        destination = os.path.join(local_folder, os.path.relpath(key, s3_folder))

        # Ensure the destination directory hierarchy exists before writing.
        os.makedirs(os.path.dirname(destination), exist_ok=True)

        try:
            client.download_file(bucket_name, key, destination)
            print(f"Downloaded 's3://{bucket_name}/{key}' to '{destination}'")
        except Exception as e:
            print(f"Error downloading 's3://{bucket_name}/{key}':", e)
+
|
80 |
+
|
81 |
+
def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
    """
    Download specific files from an S3 folder to a local folder.

    Args:
        bucket_name: Name of the S3 bucket.
        s3_folder: Key prefix ("folder") inside the bucket.
        local_folder: Destination directory on the local filesystem.
        filenames: Iterable of file names to fetch, or the string '*' to
            fetch everything found under the prefix.

    Per-file download failures are printed and skipped.
    """
    s3 = boto3.client('s3')

    print("Trying to download file: ", filenames)

    if filenames == '*':
        # List all objects in the S3 folder
        print("Trying to download all files in AWS folder: ", s3_folder)
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)

        print("Found files in AWS folder: ", response.get('Contents', []))

        filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]

        print("Found filenames in AWS folder: ", filenames)

    for filename in filenames:
        # S3 object keys always use '/' separators; os.path.join (used
        # previously) would insert '\' on Windows and build a wrong key.
        object_key = s3_folder.rstrip('/') + '/' + filename if s3_folder else filename
        local_file_path = os.path.join(local_folder, filename)

        # Create directories if necessary
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

        # Download the object
        try:
            s3.download_file(bucket_name, object_key, local_file_path)
            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
        except Exception as e:
            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
|
114 |
+
|
115 |
+
|
116 |
+
def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
    """Load example keyword data from S3 into a temp folder, gated by a password.

    Args:
        in_aws_keyword_file: Name of the dataset chosen in the UI dropdown
            (currently only "Lambeth borough plan" is recognised).
        aws_password: Password entered by the user; compared against the
            LAMBETH_BOROUGH_PLAN_PASSWORD environment variable.
        bucket_name: S3 bucket to read from (defaults to the module constant).

    Returns:
        Tuple of (list of local file paths, status message). The file list is
        empty whenever access is denied or nothing was loaded.
    """

    temp_dir = tempfile.mkdtemp()
    local_address_stub = temp_dir + '/doc-redaction/'
    files = []

    # Without the env var we cannot verify any password, so bail out early.
    if not 'LAMBETH_BOROUGH_PLAN_PASSWORD' in os.environ:
        out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
        return files, out_message

    if aws_password:
        if "Lambeth borough plan" in in_aws_keyword_file and aws_password == os.environ['LAMBETH_BOROUGH_PLAN_PASSWORD']:

            s3_folder_stub = 'example-data/lambeth-borough-plan/latest/'

            local_folder_path = local_address_stub

            # Check if folder exists
            if not os.path.exists(local_folder_path):
                print(f"Folder {local_folder_path} does not exist! Making folder.")

                os.mkdir(local_folder_path)

            # Check if folder is empty; only hit S3 when nothing is cached.
            if len(os.listdir(local_folder_path)) == 0:
                print(f"Folder {local_folder_path} is empty")
                # Download data
                download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')

                print("AWS data downloaded")

            else:
                print(f"Folder {local_folder_path} is not empty")

            # Return only regular files (skip any sub-directories).
            files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]

            out_message = "Data successfully loaded from AWS"
            print(out_message)

        else:
            # Unknown dataset or wrong password.
            out_message = "Data not loaded from AWS"
            print(out_message)
    else:
        out_message = "No password provided. Please ask the data team for access if you need this."
        print(out_message)

    return files, out_message
|
tools/file_conversion.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pdf2image import convert_from_path
|
2 |
+
import os
|
3 |
+
|
4 |
+
def is_pdf(filename):
    """
    Report whether a file name refers to a PDF.

    Args:
        filename (str): The name of the file.

    Returns:
        bool: True if the name ends with ".pdf" (any letter case),
        False otherwise.
    """
    pdf_suffix = ".pdf"
    return filename.lower().endswith(pdf_suffix)
15 |
+
|
16 |
+
# %%
|
17 |
+
## Convert pdf to image if necessary
|
18 |
+
|
19 |
+
def convert_pdf_to_images(pdf_path):
    """
    Render each page of a PDF to an image using pdf2image (poppler).

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of PIL images, one per page.
    """
    rendered_pages = convert_from_path(pdf_path)

    print("PDF has been converted to images.")

    return rendered_pages
35 |
+
|
36 |
+
# %%
|
37 |
+
def process_file(file_path):
    """
    Route a file to the appropriate handler based on its extension.

    Args:
        file_path: Path to an image or PDF file.

    Returns:
        For an image: a one-element list containing the original path.
        For a PDF: the list of page images from convert_pdf_to_images.
        Otherwise: [''].
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension in ('.jpg', '.jpeg', '.png', '.gif'):
        print(f"{file_path} is an image file.")
        # Images need no conversion; pass the path straight through.
        return [file_path]

    if extension == '.pdf':
        print(f"{file_path} is a PDF file. Converting to image set")
        return convert_pdf_to_images(file_path)

    print(f"{file_path} is not an image or PDF file.")
    return ['']
58 |
+
|
tools/file_redaction.py
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from PIL import Image
|
2 |
+
from typing import List
|
3 |
+
import pandas as pd
|
4 |
+
from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
|
5 |
+
from pdfminer.high_level import extract_pages
|
6 |
+
from tools.file_conversion import process_file
|
7 |
+
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
|
8 |
+
from pikepdf import Pdf, Dictionary, Name
|
9 |
+
from gradio import Progress
|
10 |
+
|
11 |
+
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
12 |
+
|
13 |
+
def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
    '''
    Take a path to a document, render its pages to images, and run each image
    through the Presidio ImageRedactorEngine to paint black boxes over any
    detected entities.

    Args:
        file_path: Path to the input PDF (or image) file.
        language: Presidio analysis language code (e.g. 'en').
        chosen_redact_entities: Entity names to redact.
        allow_list: Terms that must never be redacted. May be None.
        progress: Gradio progress tracker.

    Returns:
        List of redacted PIL images, one per page.
    '''

    progress(0, desc="Converting pages to image")

    image_paths = process_file(file_path)

    images = []
    number_of_pages = len(image_paths)

    # Build the analyser/redactor once rather than once per page: they carry
    # no per-page state and are expensive to construct.
    image_analyser = ImageAnalyzerEngine(nlp_analyser)
    engine = ImageRedactorEngine(image_analyser)

    # Presidio's OCR layer (Tesseract) expects ISO 639-2 codes, e.g. 'eng'.
    ocr_lang = 'eng' if language == 'en' else language

    progress(0.1, desc="Redacting pages")

    for i in progress.tqdm(range(0, number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):

        # Get the image to redact using PIL lib (pillow)
        image = image_paths[i]

        # Redact the image, painting solid black boxes over matches.
        redacted_image = engine.redact(image,
                                       fill=(0, 0, 0),
                                       ocr_kwargs={"lang": ocr_lang},
                                       allow_list=allow_list,
                                       ad_hoc_recognizers=None,
                                       **{
                                           "language": language,
                                           "entities": chosen_redact_entities,
                                           "score_threshold": score_threshold
                                       },
                                       )

        images.append(redacted_image)

    return images
81 |
+
|
82 |
+
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress()):
    '''
    Redact chosen entities from a pdf that is made up of multiple pages that
    are not images.

    For each page, the selectable text is analysed with Presidio; highlight
    annotations (black, opaque) are attached over the characters of every
    match via pikepdf. A CSV summary of the annotations made is written to
    output/annotations_made.csv.

    Args:
        filename: Path to the text-based PDF.
        language: Presidio analysis language code.
        chosen_redact_entities: Entity names to redact.
        allow_list: Terms that must never be redacted. May be None.
        progress: Gradio progress tracker.

    Returns:
        The annotated pikepdf.Pdf object (caller is responsible for saving).
    '''

    combined_analyzer_results = []
    analyser_explanations = []
    annotations_all_pages = []
    analyzed_bounding_boxes_df = pd.DataFrame()

    pdf = Pdf.open(filename)

    for page_num, page in progress.tqdm(enumerate(pdf.pages), total=len(pdf.pages), unit="pages", desc="Redacting pages"):

        print("Page number is: ", page_num)

        annotations_on_page = []
        analyzed_bounding_boxes = []

        # pdfminer re-parses the file per page; page_numbers/maxpages restrict
        # it to the single page being processed.
        for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
            analyzer_results = []

            for text_container in page_layout:
                if isinstance(text_container, LTTextContainer):
                    text_to_analyze = text_container.get_text()

                    analyzer_results = []
                    characters = []

                    analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
                                                            language=language,
                                                            entities=chosen_redact_entities,
                                                            score_threshold=score_threshold,
                                                            return_decision_process=False,
                                                            allow_list=allow_list)

                    # Flatten the container into its characters so that result
                    # offsets (start/end into the analysed text) can be mapped
                    # back to character bounding boxes.
                    characters = [char  # This is what we want to include in the list
                                  for line in text_container  # Loop through each line in text_container
                                  if isinstance(line, LTTextLine)  # Check if the line is an instance of LTTextLine
                                  for char in line]  # Loop through each character in the line

                    # If any results found
                    print(analyzer_results)

                    if len(analyzer_results) > 0 and len(characters) > 0:
                        # One bounding box per (result, LTChar) pair; LTAnno
                        # entries (virtual whitespace) carry no bbox and are
                        # skipped. NOTE(review): offsets assume the character
                        # list lines up with the analysed text — confirm.
                        analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
                        combined_analyzer_results.extend(analyzer_results)

            if len(analyzer_results) > 0:
                # Create summary df of annotations to be made. NOTE(review):
                # this guard tests only the LAST container's results — verify
                # that is intended.
                analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
                # Parse the RecognizerResult repr string into typed columns.
                analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
                analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
                analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
                analyzed_bounding_boxes_df_new['page'] = page_num + 1
                analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0)

        # Turn each collected bounding box into a PDF highlight annotation.
        for analyzed_bounding_box in analyzed_bounding_boxes:
            bounding_box = analyzed_bounding_box["boundingBox"]
            annotation = Dictionary(
                Type=Name.Annot,
                Subtype=Name.Highlight,
                QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
                Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
                C=[0, 0, 0],  # black highlight
                CA=1,  # Transparency (1 = fully opaque)
                T=analyzed_bounding_box["result"].entity_type
            )
            annotations_on_page.append(annotation)

        annotations_all_pages.extend([annotations_on_page])

        print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
        page.Annots = pdf.make_indirect(annotations_on_page)

    # Persist the summary of annotations for human review of the redaction.
    analyzed_bounding_boxes_df.to_csv("output/annotations_made.csv")

    return pdf
|
tools/helper_functions.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

def get_file_path_end(file_path: str) -> str:
    """Return the final path component of *file_path* without its extension.

    E.g. "/path/to/example.txt" -> "example". Only the last suffix is
    stripped (os.path.splitext semantics), so "archive.tar.gz" -> "archive.tar".
    """
    # Strip the directory first, then the (single, final) extension.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return stem
|
tools/load_spacy_model_custom_recognisers.py
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
from typing import List
|
3 |
+
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
|
4 |
+
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
|
5 |
+
import spacy
|
6 |
+
import re
|
7 |
+
|
8 |
+
# %%
|
9 |
+
model_name = "en_core_web_lg" #"en_core_web_trf"
|
10 |
+
score_threshold = 0.001
|
11 |
+
|
12 |
+
# %% [markdown]
|
13 |
+
# #### Custom recognisers
|
14 |
+
|
15 |
+
# %%
|
16 |
+
# Custom title recogniser
|
17 |
+
import re
|
18 |
+
titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
|
19 |
+
titles_regex = '\\b' + ' \\b|\\b'.join(rf"{re.escape(street_type)}" for street_type in titles_list) + ' \\b'
|
20 |
+
titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
|
21 |
+
titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern])
|
22 |
+
|
23 |
+
# %%
|
24 |
+
# Custom postcode recogniser
|
25 |
+
|
26 |
+
# Define the regex pattern in a Presidio `Pattern` object:
|
27 |
+
ukpostcode_pattern = Pattern(name="ukpostcode_pattern",regex="\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b", score = 1)
|
28 |
+
|
29 |
+
# Define the recognizer with one or more patterns
|
30 |
+
ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])
|
31 |
+
|
32 |
+
# %%
|
33 |
+
# Examples for testing
|
34 |
+
|
35 |
+
#text = "I live in 510 Broad st SE5 9NG ."
|
36 |
+
|
37 |
+
#numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
|
38 |
+
#print("Result:")
|
39 |
+
#print(numbers_result)
|
40 |
+
|
41 |
+
# %%
|
# %%
def extract_street_name(text: str):
    """Find likely street addresses in *text*.

    A match is a word containing at least one digit (e.g. a house number)
    followed by one word and a recognised street type, e.g. "1234 Main Street".

    Returns a pair ``(start_positions, end_positions)`` of character offsets,
    one entry per match. (The previous ``-> str`` annotation was wrong: the
    function has always returned two lists.) Matches are also printed as a
    debugging aid.
    """

    street_types = [
        'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
        'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
        'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
        'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
        'Alley', 'Arcade', 'Avenue', 'Ave', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner', 'Cove',
        'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Drive', 'Dr', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
        'Mews', 'Parade', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
        'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
    ]

    # Dynamically construct the regex alternation with all possible street types.
    street_types_pattern = '|'.join(re.escape(street_type) for street_type in street_types)

    # Preceding word must contain at least one digit; then one word plus a
    # street type as a whole word.
    pattern = rf'(?P<preceding_word>\w*\d\w*)\s*'
    pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'

    start_positions = []
    end_positions = []

    for match in re.finditer(pattern, text, re.IGNORECASE):
        preceding_word = match.group('preceding_word').strip()
        street_name = match.group('street_name').strip()
        start_pos = match.start()
        end_pos = match.end()
        print(f"Start: {start_pos}, End: {end_pos}")
        print(f"Preceding words: {preceding_word}")
        print(f"Street name: {street_name}")
        print()

        start_positions.append(start_pos)
        end_positions.append(end_pos)

    return start_positions, end_positions
88 |
+
|
89 |
+
# %%
|
90 |
+
# Some examples for testing
|
91 |
+
|
92 |
+
#text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
|
93 |
+
#text = "Roberto lives in Five 10 Broad st in Oregon"
|
94 |
+
#text = "Roberto lives in 55 Oregon Square"
|
95 |
+
#text = "There is 51a no way I will do that"
|
96 |
+
#text = "I am writing to apply for"
|
97 |
+
|
98 |
+
#extract_street_name(text)
|
99 |
+
|
100 |
+
# %%
|
101 |
+
class StreetNameRecognizer(EntityRecognizer):
|
102 |
+
|
103 |
+
def load(self) -> None:
|
104 |
+
"""No loading is required."""
|
105 |
+
pass
|
106 |
+
|
107 |
+
def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
|
108 |
+
"""
|
109 |
+
Logic for detecting a specific PII
|
110 |
+
"""
|
111 |
+
|
112 |
+
start_pos, end_pos = extract_street_name(text)
|
113 |
+
|
114 |
+
results = []
|
115 |
+
|
116 |
+
for i in range(0, len(start_pos)):
|
117 |
+
|
118 |
+
result = RecognizerResult(
|
119 |
+
entity_type="STREETNAME",
|
120 |
+
start = start_pos[i],
|
121 |
+
end = end_pos[i],
|
122 |
+
score= 1
|
123 |
+
)
|
124 |
+
|
125 |
+
results.append(result)
|
126 |
+
|
127 |
+
return results
|
128 |
+
|
129 |
+
street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
|
130 |
+
|
131 |
+
# %%
|
132 |
+
# Create a class inheriting from SpacyNlpEngine
|
133 |
+
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
134 |
+
def __init__(self, loaded_spacy_model):
|
135 |
+
super().__init__()
|
136 |
+
self.nlp = {"en": loaded_spacy_model}
|
137 |
+
|
138 |
+
# %%
|
139 |
+
# Load a model a-priori
|
140 |
+
nlp = spacy.load(model_name)
|
141 |
+
|
142 |
+
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
143 |
+
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|
144 |
+
|
145 |
+
|
146 |
+
|
147 |
+
# %%
|
148 |
+
nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
|
149 |
+
default_score_threshold=score_threshold,
|
150 |
+
supported_languages=["en"],
|
151 |
+
log_decision_process=True,
|
152 |
+
)
|
153 |
+
|
154 |
+
# %%
|
155 |
+
nlp_analyser.registry.add_recognizer(street_recogniser)
|
156 |
+
nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
|
157 |
+
nlp_analyser.registry.add_recognizer(titles_recogniser)
|
158 |
+
|