Update app.py
app.py
CHANGED
@@ -1,234 +1,296 @@

-import numpy as np
from datetime import datetime
-from typing import Optional, Dict, List
-from dotenv import load_dotenv
-import tiktoken
from langchain_groq import ChatGroq
-from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationSummaryBufferMemory
-from langchain.chains import RetrievalQA
from langchain.schema import Document
-from langchain_astradb import AstraDBVectorStore
from langchain_huggingface import HuggingFaceEmbeddings

-#
-# System constants
-DEBUG_MODE = False
MAX_RETRIES = 3

def log_debug(message: str) -> None:
    if DEBUG_MODE:

try:
except Exception as e:

-#
try:
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    log_debug("Embedding model initialized")
except Exception as e:

-# Initialize
try:
-        token=ASTRA_DB_APPLICATION_TOKEN
-    )
-    retriever = astra_vectorstore.as_retriever(
-        search_type="mmr",
-        search_kwargs={"k": 6, "fetch_k": 12, "lambda_mult": 0.6}
    )
except Exception as e:

-# Initialize
try:
except Exception as e:

-# Initialize
try:
except Exception as e:

-# Document
class DocumentProcessor:
    @staticmethod
-    def clean_text(
-        text = re.sub(r'%PDF-\d+\.\d+|obj|endobj|stream|endstream|xref|trailer|startxref', '', text)
-        text = re.sub(r'[^\x20-\x7E\n]', '', text)
-        text = re.sub(r'\s+', ' ', text)
-        text = re.sub(r'\\n', '\n', text)
-        return text.strip()

    @staticmethod
-    def test_text_quality(
-        if
-        if len(unique_words) < 5:
-            return False, f"Too little variety: {len(unique_words)} unique words"
-        return True, f"Text quality good: {len(words)} words"

    @staticmethod
-    def extract_text_from_pdf(
        try:

-    def extract_text_from_uploaded_file(
        try:
-            quality,
-            if not quality:
-            #

-            return self.encoding.decode(tokens)

-token_manager = TokenManager()

-# Text analysis helpers
-def analyze_document_structure(text: str) -> Dict:
-    words = text.split()
-    lines = text.split('\n')
-    return {
-        'total_chars': len(text),
-        'total_words': len(words),
-        'total_lines': len(lines),
-        'unique_words': len(set(words))
-    }

-def extract_key_sections(text: str) -> List[str]:
-    section_patterns = [
-        r'^[A-Z][^a-z\n]{2,}[:\-]',
-        r'^\d+\.\s+[A-Z][^a-z]{2,}',
-        r'^[IVX]+\.\s+[A-Z]'
-    ]
-    return [line.strip() for line in text.split('\n') if any(re.match(p, line.strip()) for p in section_patterns)]

-# Main processing logic
class TrustGuardian:
-    def __init__(self):

-    def generate_response_prompt(self, doc_text: str, user_query: str, reg_context: str = "") -> str:
-        return f"""
-        You are TrustGuardian, an expert compliance analyst. Provide precise, clear responses with exact references (e.g., "GDPR Article 32(1)(b)") where applicable.

-        TASK: {user_query}
-        {'DOCUMENT CONTENT: ' + doc_text[:2500] if doc_text else 'NO DOCUMENT'}
-        {'REGULATORY CONTEXT: ' + reg_context if reg_context else ''}

-        INSTRUCTIONS:
-        - For documents: Analyze relevant sections, cite document parts (e.g., "Section 3.2") and standards (e.g., "SOC 2 TSC CC6.1").
-        - For regulations: Cite specific sections (e.g., "HIPAA §164.308"), explain clearly, provide examples.
-        - For general queries: Explain compliance aspects, suggest best practices, note sources.
-        - If no reference exists, state "No specific reference available" and use general knowledge.
-        - Format with headings, bullets, and citations.
-        - Suggest next steps if relevant.
-        """

-    def process_regulatory_context(self, query: str) -> tuple:
-        try:
-            rag_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
-            result = rag_chain.invoke({"query": query})
-            context = result["result"]
-            sources = result.get("source_documents", [])
-            citations = [f"{doc.metadata.get('source', 'Unknown')}: \"{doc.page_content[:150].replace('\n', ' ').strip()}...\"" for doc in sources]
-            return context, citations
-        except Exception as e:
-            log_debug(f"Regulatory context error: {str(e)}")
-            return "", []

-    def handle_user_input(self, upload, user_query: str) -> str:
        try:
            normalized_query = user_query.lower().strip()
-            if normalized_query in ["hi",
guardian = TrustGuardian()
ui = gr.Interface(
-    fn=
-    inputs=[
-    title="🛡️ TrustGuardian – Compliance Assistant",
-    description="Upload a PDF or ask about compliance regulations. Get precise answers with exact references.",
-    examples=[[None, "What are HIPAA requirements?"], [None, "Explain GDPR basics"]],
-    theme=gr.themes.Soft()
)
if __name__ == "__main__":
+# app.py for TrustGuardian Hugging Face Space
+
+print("Starting TrustGuardian Application...")
+
+# --- Import Libraries ---
+print("Importing libraries...")
+import os, io, re, sys, json, numpy as np, time, fitz, tiktoken, gradio as gr, traceback
from datetime import datetime
+from typing import Optional, Dict, List, Any
from langchain_groq import ChatGroq
from langchain.memory import ConversationSummaryBufferMemory
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_core.prompts import PromptTemplate
+from pinecone import Pinecone
+from langchain_pinecone import PineconeVectorStore
+from langchain_core.messages import HumanMessage, AIMessage
+from langchain.chains import ConversationalRetrievalChain
+print("✅ Libraries imported.")

+# --- System Configuration & Globals ---
+print("\nConfiguring system settings...")
MAX_RETRIES = 3
+DEBUG_MODE = True  # Kept True as requested
+VERSION = "2.0"
+MEMORY_TOKENS = 2000
+MAX_HISTORY_TOKENS = 4000
+MAX_DOC_TOKENS_DIRECT = 3000  # Aggressive truncation for doc-only queries
+MAX_RAG_TOKENS = 4000

+# --- Logger ---
def log_debug(message: str) -> None:
+    """Debug logger function"""
    if DEBUG_MODE:
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        print(f"[DEBUG {timestamp}] {message}")

+log_debug("Debug logging enabled.")
+
+# --- Tokenizer Setup (with robust fallback) ---
+print("\nSetting up tokenizer functions...")
+# Define Fallback Functions FIRST
+def count_tokens_fallback(text: str) -> int: log_debug("Using estimated token count"); return len(text) // 4
+def truncate_to_limit_fallback(text: str, max_tokens: int) -> str: log_debug("Using estimated truncation"); return text[:max_tokens*4]
+# Assign default functions
+count_tokens = count_tokens_fallback
+truncate_to_limit = truncate_to_limit_fallback
+# Try to get real Tiktoken functions
try:
+    token_manager = tiktoken.get_encoding("cl100k_base")
+    def count_tokens_real(text: str) -> int:
+        try: return len(token_manager.encode(text))
+        except Exception as e: log_debug(f"Tiktoken count error: {e}. Falling back."); return count_tokens_fallback(text)
+    def truncate_to_limit_real(text: str, max_tokens: int) -> str:
+        try: tokens=token_manager.encode(text); T=tokens[:max_tokens] if len(tokens)>max_tokens else tokens; log_debug(f"Truncated tokens: {len(T)}/{len(tokens)}"); return token_manager.decode(T)
+        except Exception as e: log_debug(f"Tiktoken truncate error: {e}. Falling back."); return truncate_to_limit_fallback(text, max_tokens)
+    # Overwrite the globals with the real functions
+    count_tokens = count_tokens_real
+    truncate_to_limit = truncate_to_limit_real
+    print("✅ Tiktoken tokenizer functions ready.")
except Exception as e:
+    print(f"⚠️ Warning: Failed tiktoken init: {e}. Using estimated token functions.")
+# --- End Tokenizer Setup ---
+

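
The fallback-first pattern above means callers can use count_tokens and truncate_to_limit without caring whether tiktoken actually loaded. A minimal sketch of how the two helpers behave (editor's illustration, not part of the commit; the sample text and token budget are arbitrary):

# Sketch only: exercising the tokenizer helpers defined above.
sample = "GDPR Article 32 requires appropriate technical and organisational security measures."
log_debug(f"sample counts as ~{count_tokens(sample)} tokens")    # exact via tiktoken, or len(text)//4 under the fallback
log_debug(f"first 8 tokens: {truncate_to_limit(sample, 8)!r}")   # roughly 32 characters if the fallback is active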
+# --- Load API Keys from Environment Variables (Hugging Face Secrets) ---
+print("\nLoading API keys from environment variables...")
try:
+    GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
+
+    if not GROQ_API_KEY: raise ValueError("Secret 'GROQ_API_KEY' not found in environment variables.")
+    if not PINECONE_API_KEY: raise ValueError("Secret 'PINECONE_API_KEY' not found in environment variables.")
+
+    # IMPORTANT: Set Pinecone key in environment for Langchain wrapper if needed
+    os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY
+    log_debug("API Keys retrieved from environment variables.")
+    print("✅ API keys ready.")
+except Exception as e:
+    log_debug(f"Error loading API keys: {e}")
+    # Raising SystemExit might cause issues on HF, better to print and exit gracefully?
+    print(f"FATAL ERROR: Could not load API keys from Secrets. Please check Space settings. Error: {e}")
+    sys.exit(1)  # Exit if keys are missing
+
+# --- Initialize Embedding Model ---
+print("\nInitializing embedding model...")
+try:
+    # Consider adding cache_folder='./models' for HF persistence if needed
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    log_debug("Embedding model initialized.")
+    print("✅ Embedding model ready.")
except Exception as e:
+    log_debug(f"Embedding init error: {e}\n{traceback.format_exc()}")
+    print(f"FATAL ERROR: Could not initialize embedding model: {e}")
+    sys.exit(1)

+# --- Initialize Pinecone Vector Store ---
+print("\nSetting up Pinecone vector store...")
+PINECONE_INDEX_NAME = "trustguardian"  # Make sure this matches your index name
try:
+    # Uses PINECONE_API_KEY from environment variable set earlier
+    vectorstore = PineconeVectorStore.from_existing_index(
+        index_name=PINECONE_INDEX_NAME,
+        embedding=embedding_model
    )
+    # Add a simple check to confirm connection (optional but recommended)
+    log_debug(f"Attempting connection to Pinecone index '{PINECONE_INDEX_NAME}'...")
+    # Try a dummy search or fetch stats if possible with vectorstore object
+    # Example: vectorstore.similarity_search("test connection", k=1)
+    log_debug(f"Successfully initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'.")
+    print("✅ Pinecone vector store ready.")
+except Exception as e:
+    log_debug(f"Pinecone init error: {e}\n{traceback.format_exc()}")
+    print(f"FATAL ERROR: Could not connect to Pinecone index '{PINECONE_INDEX_NAME}': {e}")
+    sys.exit(1)
+
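
The commented-out similarity_search call above is only a hint; a concrete version of that optional connection check might look like this (editor's sketch, not part of the commit - it issues one throwaway query against the index):

# Sketch only: probe the Pinecone index the same way the hint suggests.
try:
    _probe = vectorstore.similarity_search("test connection", k=1)
    log_debug(f"Pinecone probe returned {len(_probe)} document(s).")
except Exception as probe_err:
    log_debug(f"Pinecone probe failed: {probe_err}")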
+# --- Initialize LLM ---
+print("\nInitializing LLM...")
+try:
+    # Using llama-3.1-8b-instant
+    llm = ChatGroq(groq_api_key=GROQ_API_KEY, model_name="llama-3.1-8b-instant")
+    log_debug(f"LLM initialized with model: {llm.model_name}.")
+    print(f"✅ LLM ready ({llm.model_name}).")
except Exception as e:
+    log_debug(f"LLM init error: {e}\n{traceback.format_exc()}")
+    print(f"FATAL ERROR: Could not initialize LLM: {e}")
+    sys.exit(1)

+# --- Initialize Memory ---
+print("\nSetting up conversation memory...")
try:
+    memory = ConversationSummaryBufferMemory(
+        llm=llm,
+        max_token_limit=MEMORY_TOKENS,
+        return_messages=True,
+        memory_key="chat_history",
+        output_key='answer'  # Matches chain output key
+    )
+    log_debug("Memory system initialized.")
+    print("✅ Memory systems ready.")
except Exception as e:
+    log_debug(f"Memory init error: {e}\n{traceback.format_exc()}")
+    print(f"FATAL ERROR: Could not initialize memory: {e}")
+    sys.exit(1)

+# --- Initialize Conversational Retrieval Chain ---
+print("\nInitializing ConversationalRetrievalChain...")
try:
+    retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 4, "fetch_k": 8, "lambda_mult": 0.5})
+    qa_chain = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        retriever=retriever,
+        memory=memory,  # Pass the memory object here
+        return_source_documents=True,  # To display sources
+        verbose=DEBUG_MODE  # Chain will log intermediate steps if True
+    )
+    print("✅ ConversationalRetrievalChain ready.")
except Exception as e:
+    log_debug(f"Chain init error: {e}\n{traceback.format_exc()}")
+    print(f"FATAL ERROR: Could not initialize qa_chain: {e}")
+    sys.exit(1)

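
Because the memory object is attached to the chain, a caller only has to supply the question; the chain reads and updates chat_history itself and returns "answer" plus "source_documents". A quick sketch (editor's illustration, not part of the commit; running it would issue a real Groq and Pinecone call):

# Sketch only: minimal invocation of the chain defined above.
_demo = qa_chain.invoke({"question": "What does GDPR Article 32 require?"})
log_debug(_demo["answer"][:200])
log_debug(f"{len(_demo.get('source_documents', []))} source documents retrieved")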
+# --- Document Processing Functions ---
+print("\nSetting up document processing functions...")
class DocumentProcessor:
    @staticmethod
+    def clean_text(t): log_debug("Cleaning(simplified)...");t=re.sub(r'\b(obj|endobj|stream|endstream|xref|trailer|startxref)\b','',t,flags=re.IGNORECASE);t=re.sub(r'\s+',' ',t).strip();return t
    @staticmethod
+    def test_text_quality(t): # Verified Multi-line formatting
+        log_debug(f"Testing quality(len:{len(t)})...");
+        if not t or not t.strip():log_debug("Fail:Empty");return False,"Empty text"
+        w=t.split();wc=len(w);uc=len(set(w));log_debug(f"W:{wc},U:{uc}")
+        if wc<10:log_debug("Fail:W<10");return False,f"Too few words:{wc}"
+        if uc<5:log_debug("Fail:U<5");return False,f"Too little variety:{uc}"
+        log_debug("Pass.");return True,f"Quality OK:{wc} words"
    @staticmethod
+    def extract_text_from_pdf(d): # Using PyMuPDF
+        log_debug("Extracting(PyMuPDF)...");tp=[];doc=None
        try:
+            doc=fitz.open(stream=d,filetype="pdf");[tp.append(p.get_text("text",sort=True))for i in range(len(doc))if(p:=doc.load_page(i))and p.get_text("text")]
+            full_text="\\n".join(filter(None, tp));log_debug(f"Extracted len:{len(full_text)}")
+            if not full_text: log_debug("Warning: PyMuPDF extracted no text.")
+            return full_text
+        except Exception as e: log_debug(f"PyMuPDF error:{e}"); raise ValueError(f"PyMuPDF failed:{e}")
+        finally:
+            if doc: doc.close() # Ensure document is closed

+def extract_text_from_uploaded_file(b):
+    log_debug("\\nProcessing upload...");t="";ct=""
    try:
+        if not isinstance(b,bytes):raise ValueError("Expected bytes.")
+        t=DocumentProcessor.extract_text_from_pdf(b);ct=DocumentProcessor.clean_text(t)
+        log_debug(f"Cleaned length:{len(ct)}")
+        quality,msg=DocumentProcessor.test_text_quality(ct);log_debug(f"Quality check:{msg}")
+        if not quality:raise ValueError(f"Poor quality:{msg}")
+        return ct
+    except Exception as e:err=f"Doc processing fail:{e}";log_debug(err);raise ValueError(err)
+print("✅ Document processing functions ready.")
+
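
The quality gate can be exercised on its own without a PDF, since test_text_quality only looks at word count and vocabulary variety. A small sketch (editor's illustration, not part of the commit; the strings are made up):

# Sketch only: the quality gate in isolation.
ok, msg = DocumentProcessor.test_text_quality("short text")
log_debug(f"{ok} - {msg}")   # False: fewer than 10 words
ok, msg = DocumentProcessor.test_text_quality(
    "GDPR Article 32 requires controllers to implement appropriate technical and organisational security measures."
)
log_debug(f"{ok} - {msg}")   # True: enough words and enough variety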
+# --- Text Analysis Helpers ---
+# (Keep as is - can be removed if not called in final logic)
+def analyze_document_structure(t): log_debug("Analyzing doc structure (optional)..."); return {} # Dummy implementation if not used
+def extract_key_sections(t): log_debug("Extracting key sections (optional)..."); return [] # Dummy implementation if not used
+print("✅ Text analysis helpers ready.")
+
+
+# --- Helper for Conditional Logic ---
+def query_seems_doc_specific(query: str) -> bool: # ... (Definition as before) ...
+    query_lower=query.lower();dk=["this document","this file","uploaded document","uploaded file","summarize","summarise","analyze this","analyse this","extract from"]; is_s=any(k in query_lower for k in dk);log_debug(f"Query doc-specific check: {is_s}");return is_s
+
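
This keyword list is what routes a request into Mode 1 (a direct LLM call on the upload) or Mode 2 (the retrieval chain) below; two illustrative calls (editor's sketch, not part of the commit):

# Sketch only: how the keyword check classifies queries.
query_seems_doc_specific("Summarize this document for me")   # True  -> Mode 1, doc-specific
query_seems_doc_specific("What are HIPAA requirements?")     # False -> Mode 2, KB/chat query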
+# --- Main Application Class & Logic (Approach 1 - Conditional) ---
+print("\nSetting up main application logic...")
class TrustGuardian:
+    def __init__(self): log_debug("TrustGuardian initialized (uses global components)")
+    def handle_user_input(self, upload_data: Optional[bytes], user_query: str) -> str:
+        log_debug(f"\\nProcessing Request: '{user_query[:100]}...'"); text_to_return=""
        try:
            normalized_query = user_query.lower().strip()
+            if normalized_query in ["hi","hello","hey","salaam","salam","hola"]: return "Hello! ..."
+
+            doc_is_uploaded = upload_data is not None
+            is_doc_query = doc_is_uploaded and query_seems_doc_specific(user_query)
+
+            if is_doc_query: # Mode 1: Doc-specific Query
+                log_debug("Mode: Doc Query - Direct LLM Call")
+                try:
+                    doc_text=extract_text_from_uploaded_file(upload_data)
+                    truncated_doc = truncate_to_limit(doc_text, MAX_DOC_TOKENS_DIRECT) # Use constant
+                    prompt=f"User Query:{user_query}\n\nDocument Content(Truncated):\n{truncated_doc}\n\nInstructions:Answer based ONLY on doc."
+                    log_debug(f"Doc-only prompt (~{count_tokens(prompt)} tokens)")
+                    # Use global llm object
+                    response_message = llm.invoke(prompt)
+                    text_to_return = response_message.content.strip(); log_debug("Generated doc-specific response.")
+                    log_debug("Skipping memory update for doc-specific query.")
+                except Exception as e: log_debug(f"Error during doc processing/query: {e}"); text_to_return = f"⚠️ Doc Error: {e}"
+            else: # Mode 2: KB/Chat Query
+                log_debug("Mode: KB/Chat Query - Using ConversationalRetrievalChain")
+                # Use global qa_chain object (which includes memory)
+                chat_history_messages = memory.chat_memory.messages # Get history in correct format
+                log_debug(f"Passing {len(chat_history_messages)} history messages to chain.")
+                chain_input = {"question": user_query, "chat_history": chat_history_messages}
+                result = qa_chain.invoke(chain_input) # Memory is updated by the chain
+                log_debug(f"qa_chain completed.")
+                text_to_return = result.get("answer", "Sorry, I couldn't generate response.")
+                if result.get("source_documents"): # Append sources
+                    citations=[f"{doc.metadata.get('source',f'Src{i+1}')}" for i,doc in enumerate(result["source_documents"])]
+                    if citations: text_to_return += "\n\n---\nSources Consulted:\n" + "\n".join(list(set(citations)))
+        except Exception as e: error_msg=f"Request error: {e}"; log_debug(f"Error: {error_msg}\n{traceback.format_exc()}"); text_to_return=f"⚠️ Error: {error_msg}"
+        return text_to_return if text_to_return else "Unexpected issue."
+
+# --- Initialize Guardian Instance ---
guardian = TrustGuardian()
+print("✅ Main application logic ready.")
+
+
+# --- Gradio Interface Definition ---
+print("\nSetting up Gradio user interface...")
+def ui_handler(upload_file_input, query):
+    """Wrapper function for Gradio interface."""
+    try:
+        upload_bytes=None
+        if upload_file_input is not None:
+            if isinstance(upload_file_input, bytes): upload_bytes = upload_file_input; log_debug(f"Received {len(upload_bytes)} bytes.")
+            else: log_debug(f"Warning: Received unexpected type: {type(upload_file_input)}"); raise ValueError("Unexpected file data type.")
+        else: log_debug("No file uploaded.")
+        if not isinstance(query,str): query=str(query) if query is not None else ""
+        # Call main handler in the guardian instance
+        response_markdown = guardian.handle_user_input(upload_bytes, query)
+        return response_markdown
+    except Exception as e:
+        log_debug(f"Gradio Handler Error: {e}\n{traceback.format_exc()}")
+        return f"⚠️ System Error in UI Handler: {str(e)}"
+
+# Define Gradio components
+file_input = gr.File(label="Upload Document (PDF)", type="binary", file_types=[".pdf"])
+text_input = gr.Textbox(label="Ask a Question", placeholder="E.g., 'Summarize doc' or 'HIPAA requirements?'", lines=3)
+markdown_output = gr.Markdown(label="Analysis & Response")
+
+# Define the Interface
ui = gr.Interface(
+    fn=ui_handler,
+    inputs=[file_input, text_input],
+    outputs=[markdown_output],
+    title="🛡️ TrustGuardian – Compliance Analysis Assistant (v" + VERSION + ")",
+    description="Upload a PDF document for analysis (summary/Q&A based on first ~3000 tokens) or ask a general compliance question about standards like GDPR, HIPAA, NIST, ISO 27001, SOC 2, PCI DSS.",
+    allow_flagging="never"
)
+print("✅ User interface defined.")
+

+# --- Launch Gradio App ---
if __name__ == "__main__":
+    print("\nLaunching Gradio UI...")
+    # Set server_name for HF Spaces compatibility
+    ui.launch(server_name="0.0.0.0", server_port=7860, debug=DEBUG_MODE) # Use port 7860 common for HF
+    print("Gradio launch initiated. App should be running.")
|