Spaces:

zama-fhe
/

encrypted-anonymization

Running

App Files Community

kcelia committed on Mar 29, 2024

Commit

cf6aebf

unverified ·

1 Parent(s): 1a494e6

chore: add encryption

Browse files

Files changed (2) hide show

app.py +35 -11
utils_demo.py +5 -0

app.py CHANGED Viewed

@@ -3,7 +3,7 @@
 import os
 import re
 from typing import Dict, List
 import gradio as gr
 import pandas as pd
 from fhe_anonymizer import FHEAnonymizer
@@ -11,16 +11,23 @@ from openai import OpenAI
 from utils_demo import *
 from concrete.ml.deployment import FHEModelClient
 ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
 ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
 MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH)
 clean_directory()
 anonymizer = FHEAnonymizer()
 client = OpenAI(api_key=os.environ.get("openaikey"))
 def select_static_sentences_fn(selected_sentences: List):
@@ -39,11 +46,7 @@ def key_gen_fn() -> Dict:
     Returns:
         dict: A dictionary containing the generated keys and related information.
     """
-    print("Key Gen..")
-    # Generate a random user ID
-    user_id = np.random.randint(0, 2**32)
-    print(f"Your user ID is: {user_id}....")
     client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{user_id}")
     client.load()
@@ -74,16 +77,16 @@ def key_gen_fn() -> Dict:
 def encrypt_query_fn(query):
-    print(f"Query: {query}")
-    evaluation_key_path = KEYS_DIR / "evaluation_key"
     if not evaluation_key_path.is_file():
         error_message = "Error ❌: Please generate the key first!"
         return {output_encrypted_box: gr.update(value=error_message)}
     if is_user_query_valid(query):
-        # TODO: check if the query is related to our context
         error_msg = (
             "Unable to process ❌: The request exceeds the length limit or falls "
             "outside the scope of this document. Please refine your query."
@@ -91,9 +94,30 @@ def encrypt_query_fn(query):
         print(error_msg)
         return {query_box: gr.update(value=error_msg)}
-    anonymizer.encrypt_query(query)
-    encrypted_tokens = read_pickle(KEYS_DIR / "encrypted_quantized_query")
     encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]

 import os
 import re
 from typing import Dict, List
+import numpy
 import gradio as gr
 import pandas as pd
 from fhe_anonymizer import FHEAnonymizer
 from utils_demo import *
 from concrete.ml.deployment import FHEModelClient
 ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
 ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
 MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH)
+subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
+time.sleep(3)
 clean_directory()
 anonymizer = FHEAnonymizer()
 client = OpenAI(api_key=os.environ.get("openaikey"))
+# Generate a random user ID
+user_id = numpy.random.randint(0, 2**32)
+print(f"Your user ID is: {user_id}....")
 def select_static_sentences_fn(selected_sentences: List):
     Returns:
         dict: A dictionary containing the generated keys and related information.
     """
+    print("Step 1: Key Generation:")
     client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{user_id}")
     client.load()
 def encrypt_query_fn(query):
+    print(f"Step 2 Query encryption: {query=}")
+    evaluation_key_path = KEYS_DIR / f"{user_id}/evaluation_key"
     if not evaluation_key_path.is_file():
         error_message = "Error ❌: Please generate the key first!"
         return {output_encrypted_box: gr.update(value=error_message)}
     if is_user_query_valid(query):
         error_msg = (
             "Unable to process ❌: The request exceeds the length limit or falls "
             "outside the scope of this document. Please refine your query."
         print(error_msg)
         return {query_box: gr.update(value=error_msg)}
+    # Retrieve the client API
+    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{user_id}")
+    client.load()
+    # Pattern to identify words and non-words (including punctuation, spaces, etc.)
+    tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
+    encrypted_tokens = []
+    for token in tokens:
+        if bool(re.match(r"^\s+$", token)):
+            continue
+        # Directly append non-word tokens or whitespace to processed_tokens
+        # Prediction for each word
+        emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
+        encrypted_x = client.quantize_encrypt_serialize(emb_x)
+        assert isinstance(encrypted_x, bytes)
+        encrypted_tokens.append(encrypted_x)
+    write_pickle(KEYS_DIR / f"{user_id}/encrypted_input", encrypted_tokens)
+    #anonymizer.encrypt_query(query)
     encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]

utils_demo.py CHANGED Viewed

@@ -6,6 +6,7 @@ import shutil
 import string
 from collections import Counter
 from pathlib import Path
 import numpy as np
 import torch
@@ -35,6 +36,10 @@ PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
 ALL_DIRS = [KEYS_DIR]
 PUNCTUATION_LIST = list(string.punctuation)
 PUNCTUATION_LIST.remove("%")
 PUNCTUATION_LIST.remove("$")

 import string
 from collections import Counter
 from pathlib import Path
+from transformers import AutoModel, AutoTokenizer
 import numpy as np
 import torch
 ALL_DIRS = [KEYS_DIR]
+# Load tokenizer and model
+TOKENIZER =  AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
+EMBEDDINGS_MODEL = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
 PUNCTUATION_LIST = list(string.punctuation)
 PUNCTUATION_LIST.remove("%")
 PUNCTUATION_LIST.remove("$")