"""A Gradio app for anonymizing text data using FHE.""" import gradio as gr from fhe_anonymizer import FHEAnonymizer import pandas as pd from openai import OpenAI import os import json import re anonymizer = FHEAnonymizer() client = OpenAI( api_key=os.environ.get("openaikey"), ) def deidentify_text(input_text): anonymized_text, identified_words_with_prob = anonymizer(input_text) # Convert the list of identified words and probabilities into a DataFrame if identified_words_with_prob: identified_df = pd.DataFrame( identified_words_with_prob, columns=["Identified Words", "Probability"] ) else: identified_df = pd.DataFrame(columns=["Identified Words", "Probability"]) return anonymized_text, identified_df def query_chatgpt(anonymized_query): with open("files/anonymized_document.txt", "r") as file: anonymized_document = file.read() with open("files/chatgpt_prompt.txt", "r") as file: prompt = file.read() # Prepare prompt full_prompt = ( prompt + "\n" ) query = "Document content:\n```\n" + anonymized_document + "\n\n```" + "Query:\n```\n" + anonymized_query + "\n```" print(full_prompt) completion = client.chat.completions.create( model="gpt-4-1106-preview", # Replace with "gpt-4" if available messages=[ {"role": "system", "content": prompt}, {"role": "user", "content": query}, ], ) anonymized_response = completion.choices[0].message.content with open("original_document_uuid_mapping.json", "r") as file: uuid_map = json.load(file) inverse_uuid_map = {v: k for k, v in uuid_map.items()} # TODO load the inverse mapping from disk for efficiency # Pattern to identify words and non-words (including punctuation, spaces, etc.) token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)" tokens = re.findall(token_pattern, anonymized_response) processed_tokens = [] print(tokens) for token in tokens: # Directly append non-word tokens or whitespace to processed_tokens if not token.strip() or not re.match(r"\w+", token): processed_tokens.append(token) continue print(token) if token in inverse_uuid_map: processed_tokens.append(inverse_uuid_map[token]) else: processed_tokens.append(token) deanonymized_response = "".join(processed_tokens) return anonymized_response, deanonymized_response # Default demo text from the file with open("demo_text.txt", "r") as file: default_demo_text = file.read() with open("files/original_document.txt", "r") as file: original_document = file.read() with open("files/anonymized_document.txt", "r") as file: anonymized_document = file.read() demo = gr.Blocks() with demo: gr.Markdown( """
        <p align="center">
            <a href="https://github.com/zama-ai/concrete-ml">Concrete-ML</a>
            —
            <a href="https://docs.zama.ai/concrete-ml">Documentation</a>
            —
            <a href="https://community.zama.ai">Community</a>
            —
            <a href="https://twitter.com/zama_fhe">@zama_fhe</a>
        </p>