import gradio as gr
# transformers, torch, numpy, faiss and random are not referenced directly in
# this file; they are presumably needed by the anonymizer code exec'd below.
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import faiss
import random
from tqdm import tqdm
import os

# The login password and the anonymizer source code are read from environment
# variables (e.g. Space secrets), so neither is stored in this repo.
passw = os.environ['passw']
runclass = os.environ['anonclass']

# Strip any surrounding double quotes from the secret (strip('"""') and
# strip('"') remove the same character set, so use the simpler form).
runclass_exc = runclass.strip('"')

# Execute the anonymizer source; it is expected to define `anonymizer`,
# which the functions below rely on.
exec(runclass_exc)
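# A minimal sketch of the interface the exec'd source is assumed to provide,
# inferred from how `anonymizer` is called below. The class name and body are
# hypothetical; only process_sentence's signature and its 4-tuple return
# value are implied by this file.
#
#   class Anonymizer:
#       def process_sentence(self, sentence: str, desired_length: int):
#           ...  # anonymize one Dutch sentence
#           return new_sentence, original_token_ids, new_token_ids, attention_masks
#
#   anonymizer = Anonymizer()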

batch_size = 4  # number of sentences handled per progress step

def process_batch(sentences, anonymizer, desired_length):
    """Anonymize a batch of sentences and collect the per-sentence outputs.

    Sentences are still handled one at a time; the batching in the caller
    only chunks the input for progress reporting.
    """
    batch_results = []
    batch_original_token_ids = []
    batch_new_token_ids = []
    batch_attention_masks = []
    for sentence in sentences:
        new_sentence, original_token_ids, new_token_ids, attention_masks = anonymizer.process_sentence(sentence, desired_length)
        batch_results.append(new_sentence)
        batch_original_token_ids.append(original_token_ids)
        batch_new_token_ids.append(new_token_ids)
        batch_attention_masks.append(attention_masks)
    return batch_results, batch_original_token_ids, batch_new_token_ids, batch_attention_masks

def anonymize_texts(text_list, desired_length, output_type):
    """Split the input into lines, anonymize them in batches, and format the
    result according to `output_type`."""
    desired_length = int(desired_length)
    sentences = text_list.split('\n')
    new_sentences, all_original_token_ids, all_new_token_ids, all_attention_masks = [], [], [], []

    for i in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
        batch = sentences[i:i + batch_size]
        batch_results, batch_original_ids, batch_new_ids, batch_attention_masks = process_batch(batch, anonymizer, desired_length)
        new_sentences.extend(batch_results)
        all_original_token_ids.extend(batch_original_ids)
        all_new_token_ids.extend(batch_new_ids)
        all_attention_masks.extend(batch_attention_masks)
        # Drop per-batch references so they can be garbage-collected early.
        del batch_results, batch_original_ids, batch_new_ids, batch_attention_masks

    if output_type == "New Sentences":
        return "\n".join(new_sentences)
    elif output_type == "Token IDs":
        return "\n".join([str(ids) for ids in all_new_token_ids])
    elif output_type == "Attention Masks":
        return "\n".join([str(masks) for masks in all_attention_masks])
    elif output_type == "Token IDs & Attention Masks":
        combined_output = []
        for token_ids, masks in zip(all_new_token_ids, all_attention_masks):
            combined_output.append(f"Token IDs: {token_ids}\nAttention Masks: {masks}\n")
        return "\n".join(combined_output)
    # Guard against an unselected dropdown (output_type is None) instead of
    # implicitly returning None.
    return ""

interface = gr.Interface(
    fn=anonymize_texts, 
    inputs=[
        gr.TextArea(label="Input Text"), 
        gr.Number(label="Desired Length"), 
        gr.Dropdown(choices=["New Sentences", "Token IDs", "Attention Masks", "Token IDs & Attention Masks"], label="Output Type")
    ], 
    outputs=gr.TextArea(label="Output Text"),
    title="Anonymizer",
    description="Enter multiple sentences (one per line), select the amount of tokens for anonymization, and choose the output type. Note: must be in Dutch."
)

# Protect the interface with HTTP basic auth; the same secret doubles as
# both username and password.
interface.launch(auth=(passw, passw))