Spaces:

ahmedsalman82
/

TrustGuardian-Cybersecurity.Compliance.Agent

Sleeping

App Files Files Community

ahmedsalman82 committed on May 1

Commit

f6a0104

verified ·

1 Parent(s): f95e5b0

Create app.py

Browse files

Files changed (1) hide show

app.py +234 -0

app.py ADDED Viewed

	@@ -0,0 +1,234 @@

+import os
+import io
+import re
+import json
+import PyPDF2
+import gradio as gr
+import numpy as np
+from datetime import datetime
+from typing import Optional, Dict, List
+from dotenv import load_dotenv
+import tiktoken
+from langchain_groq import ChatGroq
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.memory import ConversationSummaryBufferMemory
+from langchain.chains import RetrievalQA
+from langchain.schema import Document
+from langchain_astradb import AstraDBVectorStore
+from langchain_huggingface import HuggingFaceEmbeddings
+# Load environment variables (python-dotenv; presumably reads a local .env file if one exists)
+load_dotenv()
+# System constants
+DEBUG_MODE = False  # log_debug() prints only when this is truthy
+MAX_RETRIES = 3  # NOTE(review): not referenced in the visible part of this file
+# Token budgets. NOTE(review): none of the four values below is referenced in the visible
+# code; the prompt builder slices the document by characters ([:2500]), not by tokens.
+MODEL_TOKEN_LIMIT = 6000
+DOC_TOKENS = 2500
+REG_TOKENS = 1500
+MEMORY_TOKENS = 1000
+def log_debug(message: str) -> None:
+    """Print a timestamped debug line to stdout; do nothing unless DEBUG_MODE is truthy."""
+    if not DEBUG_MODE:
+        return
+    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    print(f"[DEBUG {stamp}] {message}")
+# Load API keys
+# All three settings are mandatory; fail fast at import time and name the ones that are
+# missing so a misconfigured deployment can be fixed without reading the code.
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
+ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
+_missing_env_vars = [
+    name
+    for name, value in (
+        ("GROQ_API_KEY", GROQ_API_KEY),
+        ("ASTRA_DB_API_ENDPOINT", ASTRA_DB_API_ENDPOINT),
+        ("ASTRA_DB_APPLICATION_TOKEN", ASTRA_DB_APPLICATION_TOKEN),
+    )
+    if not value  # unset and empty values are both treated as missing
+]
+if _missing_env_vars:
+    # Same exception type and message prefix as before; the variable names are appended.
+    raise ValueError(f"Failed to load API keys: Missing API keys ({', '.join(_missing_env_vars)})")
+log_debug("API keys loaded")
+# Initialize embedding model
+try:
+    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    log_debug("Embedding model initialized")
+except Exception as exc:
+    # "from exc" records the original failure as the direct cause in the traceback.
+    raise ValueError(f"Failed to initialize embedding model: {exc}") from exc
+# Initialize vector store
+try:
+    astra_vectorstore = AstraDBVectorStore(
+        embedding=embedding_model,
+        collection_name="trustguardian_kb",
+        api_endpoint=ASTRA_DB_API_ENDPOINT,
+        token=ASTRA_DB_APPLICATION_TOKEN
+    )
+    # MMR search; presumably fetches 12 candidates and returns 6 of them - confirm the
+    # meaning of k / fetch_k / lambda_mult against the LangChain retriever documentation.
+    retriever = astra_vectorstore.as_retriever(
+        search_type="mmr",
+        search_kwargs={"k": 6, "fetch_k": 12, "lambda_mult": 0.6}
+    )
+    log_debug("Vector store initialized")
+except Exception as exc:
+    raise ValueError(f"Failed to initialize vector store: {exc}") from exc
+# Initialize LLM
+try:
+    llm = ChatGroq(groq_api_key=GROQ_API_KEY, model_name="mistral-saba-24b")
+    log_debug("LLM initialized")
+except Exception as exc:
+    raise ValueError(f"Failed to initialize LLM: {exc}") from exc
+# Initialize memory
+try:
+    # NOTE(review): max_token_limit=8000 is larger than MODEL_TOKEN_LIMIT (6000) and ignores
+    # MEMORY_TOKENS (1000); neither `memory` nor `doc_memory` is read in the visible code.
+    memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=8000, return_messages=True)
+    doc_memory = {"latest_doc": ""}
+    log_debug("Memory initialized")
+except Exception as exc:
+    raise ValueError(f"Failed to initialize memory: {exc}") from exc
+# Document processing
+class DocumentProcessor:
+    """Stateless helpers that turn uploaded PDF bytes into cleaned, sanity-checked text."""
+    @staticmethod
+    def clean_text(text: str) -> str:
+        """Return `text` reduced to printable ASCII words separated by single spaces.
+        PDF structural keywords (obj, endobj, stream, endstream, xref, trailer, startxref)
+        and the "%PDF-x.y" header are removed. All whitespace, including newlines, is
+        collapsed to one space; only a literal backslash-n sequence becomes a real newline.
+        """
+        # \b restricts the keywords to whole words, so "objective" or "upstream" survive.
+        # A stand-alone English word such as "stream" is still removed.
+        text = re.sub(r'%PDF-\d+\.\d+|\b(?:endobj|obj|endstream|stream|startxref|xref|trailer)\b', '', text)
+        # Map every whitespace character (tab, CR, LF, NBSP, ...) to a space BEFORE dropping
+        # non-printables; otherwise tab- or NBSP-separated words would be glued together.
+        text = re.sub(r'\s', ' ', text)
+        text = re.sub(r'[^\x20-\x7E]', '', text)
+        text = re.sub(r' {2,}', ' ', text)
+        # Literal two-character "\n" sequences left in the extracted text become newlines.
+        text = text.replace('\\n', '\n')
+        return text.strip()
+    @staticmethod
+    def test_text_quality(text: str) -> tuple:
+        """Return (ok, message): ok is False for blank text, fewer than 10 words or fewer than 5 distinct words."""
+        if not text.strip():
+            return False, "Empty text"
+        words = text.split()
+        unique_words = set(words)
+        if len(words) < 10:
+            return False, f"Too few words: {len(words)}"
+        if len(unique_words) < 5:
+            return False, f"Too little variety: {len(unique_words)} unique words"
+        return True, f"Text quality good: {len(words)} words"
+    @staticmethod
+    def extract_text_from_pdf(file_data: bytes) -> str:
+        """Return the text of every non-blank page of the PDF in `file_data`, joined by newlines.
+        Raises:
+            ValueError: if the PDF cannot be parsed or a page cannot be extracted.
+        """
+        try:
+            reader = PyPDF2.PdfReader(io.BytesIO(file_data))
+            text_parts = []
+            for page in reader.pages:
+                # Extract once per page (it was called twice before) and tolerate a None result.
+                page_text = page.extract_text() or ""
+                if page_text.strip():
+                    text_parts.append(page_text)
+            return "\n".join(text_parts)
+        except Exception as exc:
+            raise ValueError(f"PDF extraction failed: {exc}") from exc
+def extract_text_from_uploaded_file(uploaded_file) -> str:
+    """Return cleaned text extracted from an uploaded PDF.
+    Args:
+        uploaded_file: an object with a read() method, or the PDF content itself
+            (anything without read() is handed to the PDF extractor unchanged).
+    Raises:
+        ValueError: if extraction fails or the cleaned text fails the quality check.
+    """
+    try:
+        file_data = uploaded_file.read() if hasattr(uploaded_file, 'read') else uploaded_file
+        raw_text = DocumentProcessor.extract_text_from_pdf(file_data)
+        cleaned_text = DocumentProcessor.clean_text(raw_text)
+        is_usable, reason = DocumentProcessor.test_text_quality(cleaned_text)
+        if not is_usable:
+            raise ValueError(f"Poor text quality: {reason}")
+        return cleaned_text
+    except Exception as exc:
+        # One uniform, user-facing error for every failure; the original exception is
+        # kept as the explicit cause for debugging.
+        raise ValueError(f"Document processing failed: {exc}\nEnsure valid PDF with text content.") from exc
+# Token management
+class TokenManager:
+    """Counts and truncates text in tokens of the tiktoken "cl100k_base" encoding."""
+    def __init__(self):
+        self.encoding = tiktoken.get_encoding("cl100k_base")
+    def _encode(self, text: str) -> list:
+        """Encode `text`, treating special-token literals as ordinary text."""
+        # By default tiktoken raises ValueError when the text contains a special token such
+        # as "<|endoftext|>"; uploaded documents are untrusted, so disable that check.
+        return self.encoding.encode(text, disallowed_special=())
+    def count_tokens(self, text: str) -> int:
+        """Return the number of tokens in `text`."""
+        return len(self._encode(text))
+    def truncate_to_limit(self, text: str, max_tokens: int) -> str:
+        """Return `text` cut to at most `max_tokens` tokens; an empty string when the limit is <= 0."""
+        if max_tokens <= 0:
+            # A negative limit used to slice tokens[:-n] and silently drop only the tail.
+            return ""
+        tokens = self._encode(text)
+        if len(tokens) <= max_tokens:
+            return text  # already fits; skip the decode round-trip
+        # NOTE: cutting inside a multi-token character may leave a replacement character at the end.
+        return self.encoding.decode(tokens[:max_tokens])
+# Module-level TokenManager instance.
+# NOTE(review): not referenced elsewhere in the visible code; TrustGuardian.__init__ builds its own.
+token_manager = TokenManager()
+# Text analysis helpers
+def analyze_document_structure(text: str) -> Dict:
+    """Return basic size statistics for `text`: characters, words, lines and distinct words."""
+    tokens = text.split()
+    stats = {'total_chars': len(text)}
+    stats['total_words'] = len(tokens)
+    # Splitting on '\n' always yields one more piece than there are newline characters.
+    stats['total_lines'] = text.count('\n') + 1
+    stats['unique_words'] = len(frozenset(tokens))
+    return stats
+def extract_key_sections(text: str) -> List[str]:
+    """Return the stripped lines of `text` that look like section headings.
+    A line qualifies when it starts with an upper-case label followed by ':' or '-',
+    a numbered upper-case title ("1. SCOPE"), or a roman-numeral title ("IV. Overview").
+    """
+    heading_patterns = (
+        r'^[A-Z][^a-z\n]{2,}[:\-]',
+        r'^\d+\.\s+[A-Z][^a-z]{2,}',
+        r'^[IVX]+\.\s+[A-Z]',
+    )
+    headings = []
+    for raw_line in text.split('\n'):
+        candidate = raw_line.strip()
+        for pattern in heading_patterns:
+            if re.match(pattern, candidate):
+                headings.append(candidate)
+                break  # one matching pattern is enough for this line
+    return headings
+# Main processing logic
+class TrustGuardian:
+    """Compliance assistant: combines an optional uploaded PDF, optional retrieved regulatory context and the LLM.
+    NOTE(review): the Gradio UI uses one shared instance, so conversation_history is shared
+    by all requests and grows without bound; nothing in the visible code reads it back.
+    """
+    # Whole messages (compared case-insensitively) that get a canned greeting, not an LLM call.
+    GREETINGS = ("hi", "hello", "hey", "salaam", "salam", "hola")
+    # Substrings of the lower-cased query that trigger a knowledge-base lookup.
+    REGULATORY_TERMS = ('compliance', 'regulation', 'requirement', 'law', 'standard')
+    def __init__(self):
+        self.token_manager = TokenManager()
+        self.conversation_history = []
+    def generate_response_prompt(self, doc_text: str, user_query: str, reg_context: str = "") -> str:
+        """Build the LLM prompt from the task, the first 2500 characters of the document (if any) and the regulatory context (if any)."""
+        return f"""
+        You are TrustGuardian, an expert compliance analyst. Provide precise, clear responses with exact references (e.g., "GDPR Article 32(1)(b)") where applicable.
+        TASK: {user_query}
+        {'DOCUMENT CONTENT: ' + doc_text[:2500] if doc_text else 'NO DOCUMENT'}
+        {'REGULATORY CONTEXT: ' + reg_context if reg_context else ''}
+        INSTRUCTIONS:
+        - For documents: Analyze relevant sections, cite document parts (e.g., "Section 3.2") and standards (e.g., "SOC 2 TSC CC6.1").
+        - For regulations: Cite specific sections (e.g., "HIPAA §164.308"), explain clearly, provide examples.
+        - For general queries: Explain compliance aspects, suggest best practices, note sources.
+        - If no reference exists, state "No specific reference available" and use general knowledge.
+        - Format with headings, bullets, and citations.
+        - Suggest next steps if relevant.
+        """
+    def process_regulatory_context(self, query: str) -> tuple:
+        """Run `query` through the retrieval chain and return (answer_text, citations).
+        Best effort: any failure is logged via log_debug and reported as ("", []).
+        """
+        try:
+            rag_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
+            result = rag_chain.invoke({"query": query})
+            context = result["result"]
+            citations = []
+            for doc in result.get("source_documents", []):
+                source = doc.metadata.get('source', 'Unknown')
+                # Built outside the f-string: a backslash inside an f-string expression is a
+                # SyntaxError before Python 3.12 and would stop the whole module from importing.
+                snippet = doc.page_content[:150].replace('\n', ' ').strip()
+                citations.append(f'{source}: "{snippet}..."')
+            return context, citations
+        except Exception as e:
+            log_debug(f"Regulatory context error: {str(e)}")
+            return "", []
+    def handle_user_input(self, upload, user_query: str) -> str:
+        """Gradio callback: answer `user_query`, optionally grounded in the uploaded PDF.
+        Args:
+            upload: the uploaded PDF (or a falsy value when nothing was uploaded).
+            user_query: the user's question; None is treated as an empty string.
+        Returns:
+            Markdown text: the answer, a canned greeting, or an error message. Never raises.
+        """
+        try:
+            user_query = user_query or ""
+            normalized_query = user_query.lower().strip()
+            if not normalized_query and not upload:
+                # Nothing to work with; do not spend an LLM call on an empty prompt.
+                return "⚠️ Please upload a PDF or enter a question."
+            if normalized_query in self.GREETINGS:
+                return "👋 Hello! I'm TrustGuardian. Upload a PDF or ask about compliance (e.g., 'HIPAA requirements')."
+            doc_text = ""
+            if upload:
+                doc_text = extract_text_from_uploaded_file(upload)
+                if DEBUG_MODE:
+                    # These results used to be computed and thrown away; they are only useful as diagnostics.
+                    log_debug(f"Document structure: {analyze_document_structure(doc_text)}")
+                    log_debug(f"Section headings found: {len(extract_key_sections(doc_text))}")
+            reg_context, citations = "", []
+            if any(term in normalized_query for term in self.REGULATORY_TERMS):
+                reg_context, citations = self.process_regulatory_context(user_query)
+            prompt = self.generate_response_prompt(doc_text, user_query, reg_context)
+            response = llm.invoke(prompt).content.strip()
+            final_response = response + ("\n\nSources:\n" + "\n".join(citations) if citations else "")
+            self.conversation_history.append({"user": user_query, "assistant": final_response, "timestamp": datetime.now().isoformat()})
+            return final_response
+        except Exception as e:
+            # UI boundary: show the failure to the user instead of crashing the request.
+            return f"⚠️ Error: {str(e)}\nTry rephrasing or check file format."
+# Initialize and run
+# One module-level assistant; its handle_user_input method is the Gradio callback, so its
+# conversation_history list is shared by every request this process serves.
+guardian = TrustGuardian()
+ui = gr.Interface(
+    fn=guardian.handle_user_input,
+    # Input order matches handle_user_input(upload, user_query).
+    inputs=[
+        # NOTE(review): type="binary" presumably passes the upload as raw bytes, which
+        # extract_text_from_uploaded_file accepts - confirm against the Gradio File docs.
+        gr.File(label="📄 Upload PDF", type="binary", file_types=[".pdf"]),
+        gr.Textbox(label="💭 Ask a Question", placeholder="E.g., 'Summarize document' or 'GDPR requirements'", lines=2)
+    ],
+    outputs=gr.Markdown(label="📝 Analysis"),
+    title="🛡️ TrustGuardian – Compliance Assistant",
+    description="Upload a PDF or ask about compliance regulations. Get precise answers with exact references.",
+    # Each example row is [upload, question]; both examples run without a file.
+    examples=[[None, "What are HIPAA requirements?"], [None, "Explain GDPR basics"]],
+    theme=gr.themes.Soft()
+)
+# Start the web server only when run as a script; listens on 0.0.0.0, port 7860.
+if __name__ == "__main__":
+    ui.launch(server_name="0.0.0.0", server_port=7860)