import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import faiss
import random
from tqdm import tqdm
import os
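
# Secrets stored in the Space's environment: `passw` gates the Gradio UI with
# basic auth, and `anonclass` holds the private source code that is exec'd below.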
passw = os.environ['passw']
runclass = os.environ['anonclass']
# strip() treats its argument as a set of characters, so strip('"""') is
# equivalent to strip('"'): remove any leading/trailing quote characters.
runclass_exc = runclass.strip('"')
# The exec'd source is expected to define the `anonymizer` object used below.
exec(runclass_exc)
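
# Number of sentences fed to the anonymizer per batch.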
batch_size = 4

def process_batch(sentences, anonymizer, desired_length):
    """Run the anonymizer over a batch of sentences and collect its outputs."""
    batch_results = []
    batch_original_token_ids = []
    batch_new_token_ids = []
    batch_attention_masks = []
    for sentence in sentences:
        new_sentence, original_token_ids, new_token_ids, attention_masks = anonymizer.process_sentence(
            sentence, desired_length
        )
        batch_results.append(new_sentence)
        batch_original_token_ids.append(original_token_ids)
        batch_new_token_ids.append(new_token_ids)
        batch_attention_masks.append(attention_masks)
    return batch_results, batch_original_token_ids, batch_new_token_ids, batch_attention_masks

def anonymize_texts(text_list, desired_length, output_type):
    """Anonymize newline-separated sentences in batches and format the result."""
    desired_length = int(desired_length)
    sentences = text_list.split('\n')
    new_sentences, all_original_token_ids, all_new_token_ids, all_attention_masks = [], [], [], []
    for i in tqdm(range(0, len(sentences), batch_size), desc="Processing batches"):
        batch = sentences[i:i + batch_size]
        batch_results, batch_original_ids, batch_new_ids, batch_attention_masks = process_batch(
            batch, anonymizer, desired_length
        )
        new_sentences.extend(batch_results)
        all_original_token_ids.extend(batch_original_ids)
        all_new_token_ids.extend(batch_new_ids)
        all_attention_masks.extend(batch_attention_masks)
    if output_type == "New Sentences":
        return "\n".join(new_sentences)
    elif output_type == "Token IDs":
        return "\n".join(str(ids) for ids in all_new_token_ids)
    elif output_type == "Attention Masks":
        return "\n".join(str(masks) for masks in all_attention_masks)
    elif output_type == "Token IDs & Attention Masks":
        combined_output = []
        for token_ids, masks in zip(all_new_token_ids, all_attention_masks):
            combined_output.append(f"Token IDs: {token_ids}\nAttention Masks: {masks}\n")
        return "\n".join(combined_output)
    return ""  # unreachable via the dropdown choices, but avoids returning None

interface = gr.Interface(
    fn=anonymize_texts,
    inputs=[
        gr.TextArea(label="Input Text"),
        gr.Number(label="Desired Length"),
        gr.Dropdown(
            choices=["New Sentences", "Token IDs", "Attention Masks", "Token IDs & Attention Masks"],
            label="Output Type",
        ),
    ],
    outputs=gr.TextArea(label="Output Text"),
    title="Anonymizer",
    description="Enter multiple sentences (one per line), set the number of tokens for anonymization, and choose the output type. Note: input must be in Dutch.",
)

interface.launch(auth=(passw, passw))
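
# Hypothetical direct call for quick testing, assuming the exec'd source
# defined `anonymizer` with the `process_sentence(sentence, desired_length)`
# method used above. Commented out because interface.launch() blocks:
#
#     print(anonymize_texts("Jan de Vries woont in Amsterdam.", 10, "New Sentences"))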