ahmedsalman82 committed on
Commit
1227a81
·
verified ·
1 Parent(s): 2473557

Update app.py

Files changed (1)
  1. app.py +253 -191
app.py CHANGED
@@ -1,234 +1,296 @@
- import os
- import io
- import re
- import json
- import PyPDF2
- import gradio as gr
- import numpy as np
  from datetime import datetime
- from typing import Optional, Dict, List
- from dotenv import load_dotenv
- import tiktoken
  from langchain_groq import ChatGroq
- from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain.memory import ConversationSummaryBufferMemory
- from langchain.chains import RetrievalQA
  from langchain.schema import Document
- from langchain_astradb import AstraDBVectorStore
  from langchain_huggingface import HuggingFaceEmbeddings

- # Load environment variables
- load_dotenv()
-
- # System constants
- DEBUG_MODE = False
  MAX_RETRIES = 3
- MODEL_TOKEN_LIMIT = 6000
- DOC_TOKENS = 2500
- REG_TOKENS = 1500
- MEMORY_TOKENS = 1000

  def log_debug(message: str) -> None:
 
      if DEBUG_MODE:
-         print(f"[DEBUG {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {message}")

- # Load API keys
  try:
-     GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-     ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
-     ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
-     if not all([GROQ_API_KEY, ASTRA_DB_API_ENDPOINT, ASTRA_DB_APPLICATION_TOKEN]):
-         raise ValueError("Missing API keys")
-     log_debug("API keys loaded")
  except Exception as e:
-     raise ValueError(f"Failed to load API keys: {str(e)}")

- # Initialize embedding model
  try:
      embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-     log_debug("Embedding model initialized")
  except Exception as e:
-     raise ValueError(f"Failed to initialize embedding model: {str(e)}")

- # Initialize vector store
  try:
-     astra_vectorstore = AstraDBVectorStore(
-         embedding=embedding_model,
-         collection_name="trustguardian_kb",
-         api_endpoint=ASTRA_DB_API_ENDPOINT,
-         token=ASTRA_DB_APPLICATION_TOKEN
-     )
-     retriever = astra_vectorstore.as_retriever(
-         search_type="mmr",
-         search_kwargs={"k": 6, "fetch_k": 12, "lambda_mult": 0.6}
      )
-     log_debug("Vector store initialized")
  except Exception as e:
-     raise ValueError(f"Failed to initialize vector store: {str(e)}")

- # Initialize LLM
  try:
-     llm = ChatGroq(groq_api_key=GROQ_API_KEY, model_name="mistral-saba-24b")
-     log_debug("LLM initialized")
  except Exception as e:
-     raise ValueError(f"Failed to initialize LLM: {str(e)}")

- # Initialize memory
  try:
-     memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=8000, return_messages=True)
-     doc_memory = {"latest_doc": ""}
-     log_debug("Memory initialized")
  except Exception as e:
-     raise ValueError(f"Failed to initialize memory: {str(e)}")

- # Document processing
  class DocumentProcessor:
      @staticmethod
-     def clean_text(text: str) -> str:
-         text = re.sub(r'%PDF-\d+\.\d+|obj|endobj|stream|endstream|xref|trailer|startxref', '', text)
-         text = re.sub(r'[^\x20-\x7E\n]', '', text)
-         text = re.sub(r'\s+', ' ', text)
-         text = re.sub(r'\\n', '\n', text)
-         return text.strip()
-
      @staticmethod
-     def test_text_quality(text: str) -> tuple:
-         if not text.strip():
-             return False, "Empty text"
-         words = text.split()
-         unique_words = set(words)
-         if len(words) < 10:
-             return False, f"Too few words: {len(words)}"
-         if len(unique_words) < 5:
-             return False, f"Too little variety: {len(unique_words)} unique words"
-         return True, f"Text quality good: {len(words)} words"
-
      @staticmethod
-     def extract_text_from_pdf(file_data: bytes) -> str:
          try:
-             reader = PyPDF2.PdfReader(io.BytesIO(file_data))
-             text_parts = [page.extract_text() for page in reader.pages if page.extract_text().strip()]
-             return "\n".join(text_parts)
-         except Exception as e:
-             raise ValueError(f"PDF extraction failed: {str(e)}")

- def extract_text_from_uploaded_file(uploaded_file) -> str:
      try:
-         file_data = uploaded_file.read() if hasattr(uploaded_file, 'read') else uploaded_file
-         text = DocumentProcessor.extract_text_from_pdf(file_data)
-         cleaned_text = DocumentProcessor.clean_text(text)
-         quality, msg = DocumentProcessor.test_text_quality(cleaned_text)
-         if not quality:
-             raise ValueError(f"Poor text quality: {msg}")
-         return cleaned_text
-     except Exception as e:
-         raise ValueError(f"Document processing failed: {str(e)}\nEnsure valid PDF with text content.")
-
- # Token management
- class TokenManager:
-     def __init__(self):
-         self.encoding = tiktoken.get_encoding("cl100k_base")
-
-     def count_tokens(self, text: str) -> int:
-         return len(self.encoding.encode(text))
-
-     def truncate_to_limit(self, text: str, max_tokens: int) -> str:
-         tokens = self.encoding.encode(text)
-         if len(tokens) > max_tokens:
-             tokens = tokens[:max_tokens]
-         return self.encoding.decode(tokens)
-
- token_manager = TokenManager()
-
- # Text analysis helpers
- def analyze_document_structure(text: str) -> Dict:
-     words = text.split()
-     lines = text.split('\n')
-     return {
-         'total_chars': len(text),
-         'total_words': len(words),
-         'total_lines': len(lines),
-         'unique_words': len(set(words))
-     }
-
- def extract_key_sections(text: str) -> List[str]:
-     section_patterns = [
-         r'^[A-Z][^a-z\n]{2,}[:\-]',
-         r'^\d+\.\s+[A-Z][^a-z]{2,}',
-         r'^[IVX]+\.\s+[A-Z]'
-     ]
-     return [line.strip() for line in text.split('\n') if any(re.match(p, line.strip()) for p in section_patterns)]
-
- # Main processing logic
  class TrustGuardian:
-     def __init__(self):
-         self.token_manager = TokenManager()
-         self.conversation_history = []
-
-     def generate_response_prompt(self, doc_text: str, user_query: str, reg_context: str = "") -> str:
-         return f"""
-         You are TrustGuardian, an expert compliance analyst. Provide precise, clear responses with exact references (e.g., "GDPR Article 32(1)(b)") where applicable.
-
-         TASK: {user_query}
-         {'DOCUMENT CONTENT: ' + doc_text[:2500] if doc_text else 'NO DOCUMENT'}
-         {'REGULATORY CONTEXT: ' + reg_context if reg_context else ''}
-
-         INSTRUCTIONS:
-         - For documents: Analyze relevant sections, cite document parts (e.g., "Section 3.2") and standards (e.g., "SOC 2 TSC CC6.1").
-         - For regulations: Cite specific sections (e.g., "HIPAA §164.308"), explain clearly, provide examples.
-         - For general queries: Explain compliance aspects, suggest best practices, note sources.
-         - If no reference exists, state "No specific reference available" and use general knowledge.
-         - Format with headings, bullets, and citations.
-         - Suggest next steps if relevant.
-         """
-
-     def process_regulatory_context(self, query: str) -> tuple:
-         try:
-             rag_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
-             result = rag_chain.invoke({"query": query})
-             context = result["result"]
-             sources = result.get("source_documents", [])
-             citations = [f"{doc.metadata.get('source', 'Unknown')}: \"{doc.page_content[:150].replace('\n', ' ').strip()}...\"" for doc in sources]
-             return context, citations
-         except Exception as e:
-             log_debug(f"Regulatory context error: {str(e)}")
-             return "", []
-
-     def handle_user_input(self, upload, user_query: str) -> str:
          try:
              normalized_query = user_query.lower().strip()
-             if normalized_query in ["hi", "hello", "hey", "salaam", "salam", "hola"]:
-                 return "👋 Hello! I'm TrustGuardian. Upload a PDF or ask about compliance (e.g., 'HIPAA requirements')."
-
-             doc_text = ""
-             if upload:
-                 doc_text = extract_text_from_uploaded_file(upload)
-                 analyze_document_structure(doc_text)
-                 extract_key_sections(doc_text)
-
-             reg_context, citations = ("", []) if not any(term in normalized_query for term in ['compliance', 'regulation', 'requirement', 'law', 'standard']) else self.process_regulatory_context(user_query)
-             prompt = self.generate_response_prompt(doc_text, user_query, reg_context)
-             response = llm.invoke(prompt).content.strip()
-             final_response = response + ("\n\nSources:\n" + "\n".join(citations) if citations else "")
-             self.conversation_history.append({"user": user_query, "assistant": final_response, "timestamp": datetime.now().isoformat()})
-             return final_response
-         except Exception as e:
-             return f"⚠️ Error: {str(e)}\nTry rephrasing or check file format."
-
- # Initialize and run
  guardian = TrustGuardian()

  ui = gr.Interface(
-     fn=guardian.handle_user_input,
-     inputs=[
-         gr.File(label="📄 Upload PDF", type="binary", file_types=[".pdf"]),
-         gr.Textbox(label="💭 Ask a Question", placeholder="E.g., 'Summarize document' or 'GDPR requirements'", lines=2)
-     ],
-     outputs=gr.Markdown(label="📝 Analysis"),
-     title="🛡️ TrustGuardian – Compliance Assistant",
-     description="Upload a PDF or ask about compliance regulations. Get precise answers with exact references.",
-     examples=[[None, "What are HIPAA requirements?"], [None, "Explain GDPR basics"]],
-     theme=gr.themes.Soft()
  )

  if __name__ == "__main__":
-     ui.launch(server_name="0.0.0.0", server_port=7860)
+ # app.py for TrustGuardian Hugging Face Space
+
+ print("Starting TrustGuardian Application...")
+
+ # --- 🔥 Import Libraries ---
+ print("📚 Importing libraries...")
+ import os, io, re, sys, json, numpy as np, time, fitz, tiktoken, gradio as gr, traceback
  from datetime import datetime
+ from typing import Optional, Dict, List, Any
  from langchain_groq import ChatGroq
  from langchain.memory import ConversationSummaryBufferMemory
  from langchain.schema import Document
  from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_core.prompts import PromptTemplate
+ from pinecone import Pinecone
+ from langchain_pinecone import PineconeVectorStore
+ from langchain_core.messages import HumanMessage, AIMessage
+ from langchain.chains import ConversationalRetrievalChain
+ print("✅ Libraries imported.")
+
+ # --- ⚙️ System Configuration & Globals ---
+ print("\n⚙️ Configuring system settings...")
  MAX_RETRIES = 3
+ DEBUG_MODE = True  # Kept True as requested
+ VERSION = "2.0"
+ MEMORY_TOKENS = 2000
+ MAX_HISTORY_TOKENS = 4000
+ MAX_DOC_TOKENS_DIRECT = 3000  # Aggressive truncation for doc-only queries
+ MAX_RAG_TOKENS = 4000
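+ # At the fallback tokenizer's ~4-chars-per-token estimate (see below), 3000 tokens is roughly 12,000 characters of text.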
 
+ # --- Logger ---
  def log_debug(message: str) -> None:
+     """Debug logger function"""
      if DEBUG_MODE:
+         timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+         print(f"[DEBUG {timestamp}] {message}")
+
+ log_debug("Debug logging enabled.")
+
+ # --- Tokenizer Setup (with robust fallback) ---
+ print("\n⚙️ Setting up tokenizer functions...")
+ # Define fallback functions FIRST
+ def count_tokens_fallback(text: str) -> int:
+     log_debug("Using estimated token count")
+     return len(text) // 4
+
+ def truncate_to_limit_fallback(text: str, max_tokens: int) -> str:
+     log_debug("Using estimated truncation")
+     return text[:max_tokens * 4]
+
+ # Assign default functions
+ count_tokens = count_tokens_fallback
+ truncate_to_limit = truncate_to_limit_fallback
+ # Try to get the real tiktoken functions
  try:
+     token_manager = tiktoken.get_encoding("cl100k_base")
+
+     def count_tokens_real(text: str) -> int:
+         try:
+             return len(token_manager.encode(text))
+         except Exception as e:
+             log_debug(f"Tiktoken count error: {e}. Falling back.")
+             return count_tokens_fallback(text)
+
+     def truncate_to_limit_real(text: str, max_tokens: int) -> str:
+         try:
+             tokens = token_manager.encode(text)
+             truncated = tokens[:max_tokens] if len(tokens) > max_tokens else tokens
+             log_debug(f"Truncated tokens: {len(truncated)}/{len(tokens)}")
+             return token_manager.decode(truncated)
+         except Exception as e:
+             log_debug(f"Tiktoken truncate error: {e}. Falling back.")
+             return truncate_to_limit_fallback(text, max_tokens)
+
+     # Overwrite the globals with the real functions
+     count_tokens = count_tokens_real
+     truncate_to_limit = truncate_to_limit_real
+     print("✅ Tiktoken tokenizer functions ready.")
  except Exception as e:
+     print(f"⚠️ Warning: Failed tiktoken init: {e}. Using estimated token functions.")
+ # --- End Tokenizer Setup ---
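+ # Quick check, e.g.: count_tokens("hello world") gives 2 via tiktoken, while the fallback estimates len("hello world") // 4 = 2.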
 
+ # --- 🔐 Load API Keys from Environment Variables (Hugging Face Secrets) ---
+ print("\n🔐 Loading API keys from environment variables...")
  try:
+     GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+     PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
+
+     if not GROQ_API_KEY:
+         raise ValueError("Secret 'GROQ_API_KEY' not found in environment variables.")
+     if not PINECONE_API_KEY:
+         raise ValueError("Secret 'PINECONE_API_KEY' not found in environment variables.")
+
+     # IMPORTANT: Set the Pinecone key in the environment for the LangChain wrapper if needed
+     os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY
+     log_debug("API keys retrieved from environment variables.")
+     print("✅ API keys ready.")
+ except Exception as e:
+     log_debug(f"Error loading API keys: {e}")
+     # Raising SystemExit abruptly can be confusing on HF; print a clear message, then exit
+     print(f"FATAL ERROR: Could not load API keys from Secrets. Please check Space settings. Error: {e}")
+     sys.exit(1)  # Exit if keys are missing
+
+ # --- 📚 Initialize Embedding Model ---
+ print("\n🧠 Initializing embedding model...")
+ try:
+     # Consider adding cache_folder='./models' for HF persistence if needed
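+     # e.g.: HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", cache_folder="./models")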
      embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     log_debug("Embedding model initialized.")
+     print("✅ Embedding model ready.")
  except Exception as e:
+     log_debug(f"Embedding init error: {e}\n{traceback.format_exc()}")
+     print(f"FATAL ERROR: Could not initialize embedding model: {e}")
+     sys.exit(1)
 
+ # --- 🌲 Initialize Pinecone Vector Store ---
+ print("\n🔄 Setting up Pinecone vector store...")
+ PINECONE_INDEX_NAME = "trustguardian"  # Make sure this matches your index name
  try:
+     # Uses PINECONE_API_KEY from the environment variable set earlier
+     vectorstore = PineconeVectorStore.from_existing_index(
+         index_name=PINECONE_INDEX_NAME,
+         embedding=embedding_model
      )
+     # Add a simple check to confirm the connection (optional but recommended)
+     log_debug(f"Attempting connection to Pinecone index '{PINECONE_INDEX_NAME}'...")
+     # Try a dummy search or fetch stats if possible with the vectorstore object
+     # Example: vectorstore.similarity_search("test connection", k=1)
+     log_debug(f"Successfully initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'.")
+     print("✅ Pinecone vector store ready.")
+ except Exception as e:
+     log_debug(f"Pinecone init error: {e}\n{traceback.format_exc()}")
+     print(f"FATAL ERROR: Could not connect to Pinecone index '{PINECONE_INDEX_NAME}': {e}")
+     sys.exit(1)
+
+ # --- 🤖 Initialize LLM ---
+ print("\n🤖 Initializing LLM...")
+ try:
+     # Using llama-3.1-8b-instant
+     llm = ChatGroq(groq_api_key=GROQ_API_KEY, model_name="llama-3.1-8b-instant")
+     log_debug(f"LLM initialized with model: {llm.model_name}.")
+     print(f"✅ LLM ready ({llm.model_name}).")
  except Exception as e:
+     log_debug(f"LLM init error: {e}\n{traceback.format_exc()}")
+     print(f"FATAL ERROR: Could not initialize LLM: {e}")
+     sys.exit(1)
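+ # e.g. llm.invoke("Say hi") returns an AIMessage; the reply text is in its .content attribute.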
 
+ # --- 🧠 Initialize Memory ---
+ print("\n💭 Setting up conversation memory...")
  try:
+     memory = ConversationSummaryBufferMemory(
+         llm=llm,
+         max_token_limit=MEMORY_TOKENS,
+         return_messages=True,
+         memory_key="chat_history",
+         output_key='answer'  # Matches chain output key
+     )
+     log_debug("Memory system initialized.")
+     print("✅ Memory systems ready.")
  except Exception as e:
+     log_debug(f"Memory init error: {e}\n{traceback.format_exc()}")
+     print(f"FATAL ERROR: Could not initialize memory: {e}")
+     sys.exit(1)
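+ # Note: once history exceeds MEMORY_TOKENS, ConversationSummaryBufferMemory condenses older turns into a running summary via the llm rather than dropping them outright.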
 
+ # --- 🔗 Initialize Conversational Retrieval Chain ---
+ print("\n🔗 Initializing ConversationalRetrievalChain...")
  try:
+     retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 4, "fetch_k": 8, "lambda_mult": 0.5})
+     qa_chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=retriever,
+         memory=memory,  # Pass the memory object here
+         return_source_documents=True,  # To display sources
+         verbose=DEBUG_MODE  # Chain will log intermediate steps if True
+     )
+     print("✅ ConversationalRetrievalChain ready.")
  except Exception as e:
+     log_debug(f"Chain init error: {e}\n{traceback.format_exc()}")
+     print(f"FATAL ERROR: Could not initialize qa_chain: {e}")
+     sys.exit(1)
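+ # Optional smoke test (left commented; assumes the index already holds documents):
+ # test = qa_chain.invoke({"question": "What is GDPR?", "chat_history": []})
+ # log_debug(f"Smoke test answer: {test['answer'][:80]}")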
 
+ # --- 📄 Document Processing Functions ---
+ print("\n📄 Setting up document processing functions...")
  class DocumentProcessor:
      @staticmethod
+     def clean_text(t):
+         log_debug("Cleaning (simplified)...")
+         t = re.sub(r'\b(obj|endobj|stream|endstream|xref|trailer|startxref)\b', '', t, flags=re.IGNORECASE)
+         t = re.sub(r'\s+', ' ', t).strip()
+         return t

      @staticmethod
+     def test_text_quality(t):
+         log_debug(f"Testing quality (len: {len(t)})...")
+         if not t or not t.strip():
+             log_debug("Fail: Empty")
+             return False, "Empty text"
+         w = t.split(); wc = len(w); uc = len(set(w))
+         log_debug(f"W: {wc}, U: {uc}")
+         if wc < 10:
+             log_debug("Fail: W < 10")
+             return False, f"Too few words: {wc}"
+         if uc < 5:
+             log_debug("Fail: U < 5")
+             return False, f"Too little variety: {uc}"
+         log_debug("Pass.")
+         return True, f"Quality OK: {wc} words"

      @staticmethod
+     def extract_text_from_pdf(d):  # Using PyMuPDF
+         log_debug("Extracting (PyMuPDF)...")
+         tp = []
+         doc = None
          try:
+             doc = fitz.open(stream=d, filetype="pdf")
+             for page in doc:
+                 page_text = page.get_text("text", sort=True)
+                 if page_text:
+                     tp.append(page_text)
+             full_text = "\n".join(tp)
+             log_debug(f"Extracted len: {len(full_text)}")
+             if not full_text:
+                 log_debug("Warning: PyMuPDF extracted no text.")
+             return full_text
+         except Exception as e:
+             log_debug(f"PyMuPDF error: {e}")
+             raise ValueError(f"PyMuPDF failed: {e}")
+         finally:
+             if doc:
+                 doc.close()  # Ensure document is closed

+ def extract_text_from_uploaded_file(b):
+     log_debug("\n🔍 Processing upload...")
      try:
+         if not isinstance(b, bytes):
+             raise ValueError("Expected bytes.")
+         t = DocumentProcessor.extract_text_from_pdf(b)
+         ct = DocumentProcessor.clean_text(t)
+         log_debug(f"Cleaned length: {len(ct)}")
+         quality, msg = DocumentProcessor.test_text_quality(ct)
+         log_debug(f"Quality check: {msg}")
+         if not quality:
+             raise ValueError(f"Poor quality: {msg}")
+         return ct
+     except Exception as e:
+         err = f"Doc processing fail: {e}"
+         log_debug(err)
+         raise ValueError(err)

+ print("✅ Document processing functions ready.")
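+ # Example usage (assuming pdf_bytes holds a PDF's raw bytes):
+ # cleaned = extract_text_from_uploaded_file(pdf_bytes)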
+
+ # --- Text Analysis Helpers ---
+ # (Kept as stubs - can be removed if not called in the final logic)
+ def analyze_document_structure(t):
+     log_debug("Analyzing doc structure (optional)...")
+     return {}  # Dummy implementation if not used
+
+ def extract_key_sections(t):
+     log_debug("Extracting key sections (optional)...")
+     return []  # Dummy implementation if not used
+ print("✅ Text analysis helpers ready.")
+
+
+ # --- Helper for Conditional Logic ---
+ def query_seems_doc_specific(query: str) -> bool:
+     query_lower = query.lower()
+     dk = ["this document", "this file", "uploaded document", "uploaded file",
+           "summarize", "summarise", "analyze this", "analyse this", "extract from"]
+     is_s = any(k in query_lower for k in dk)
+     log_debug(f"Query doc-specific check: {is_s}")
+     return is_s
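+ # e.g. "Summarize this document" -> True; "What does GDPR require?" -> False.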
+
+ # --- 🧠 Main Application Class & Logic (Approach 1 - Conditional) ---
+ print("\n🔄 Setting up main application logic...")
  class TrustGuardian:
+     def __init__(self):
+         log_debug("TrustGuardian initialized (uses global components)")
+
+     def handle_user_input(self, upload_data: Optional[bytes], user_query: str) -> str:
+         log_debug(f"\n🔄 Processing Request: '{user_query[:100]}...'")
+         text_to_return = ""
          try:
              normalized_query = user_query.lower().strip()
+             if normalized_query in ["hi", "hello", "hey", "salaam", "salam", "hola"]:
+                 return "👋 Hello! ..."
+
+             doc_is_uploaded = upload_data is not None
+             is_doc_query = doc_is_uploaded and query_seems_doc_specific(user_query)
+
+             if is_doc_query:  # Mode 1: Doc-specific Query
+                 log_debug("Mode: Doc Query - Direct LLM Call")
+                 try:
+                     doc_text = extract_text_from_uploaded_file(upload_data)
+                     truncated_doc = truncate_to_limit(doc_text, MAX_DOC_TOKENS_DIRECT)  # Use constant
+                     prompt = f"User Query: {user_query}\n\nDocument Content (Truncated):\n{truncated_doc}\n\nInstructions: Answer based ONLY on the doc."
+                     log_debug(f"Doc-only prompt (~{count_tokens(prompt)} tokens)")
+                     # Use global llm object
+                     response_message = llm.invoke(prompt)
+                     text_to_return = response_message.content.strip()
+                     log_debug("Generated doc-specific response.")
+                     log_debug("Skipping memory update for doc-specific query.")
+                 except Exception as e:
+                     log_debug(f"Error during doc processing/query: {e}")
+                     text_to_return = f"⚠️ Doc Error: {e}"
+             else:  # Mode 2: KB/Chat Query
+                 log_debug("Mode: KB/Chat Query - Using ConversationalRetrievalChain")
+                 # Use global qa_chain object (which includes memory)
+                 chat_history_messages = memory.chat_memory.messages  # Get history in the correct format
+                 log_debug(f"Passing {len(chat_history_messages)} history messages to chain.")
+                 chain_input = {"question": user_query, "chat_history": chat_history_messages}
+                 result = qa_chain.invoke(chain_input)  # Memory is updated by the chain
+                 log_debug("qa_chain completed.")
+                 text_to_return = result.get("answer", "Sorry, I couldn't generate a response.")
+                 if result.get("source_documents"):  # Append sources
+                     citations = [f"📚 {doc.metadata.get('source', f'Src{i+1}')}" for i, doc in enumerate(result["source_documents"])]
+                     if citations:
+                         text_to_return += "\n\n---\n📚 Sources Consulted:\n" + "\n".join(list(set(citations)))
+         except Exception as e:
+             error_msg = f"Request error: {e}"
+             log_debug(f"Error: {error_msg}\n{traceback.format_exc()}")
+             text_to_return = f"⚠️ Error: {error_msg}"
+         return text_to_return if text_to_return else "Unexpected issue."
+
+ # --- Initialize Guardian Instance ---
  guardian = TrustGuardian()
+ print("✅ Main application logic ready.")
+
+ # --- 🎨 Gradio Interface Definition ---
+ print("\n🎨 Setting up Gradio user interface...")
+ def ui_handler(upload_file_input, query):
+     """Wrapper function for the Gradio interface."""
+     try:
+         upload_bytes = None
+         if upload_file_input is not None:
+             if isinstance(upload_file_input, bytes):
+                 upload_bytes = upload_file_input
+                 log_debug(f"Received {len(upload_bytes)} bytes.")
+             else:
+                 log_debug(f"Warning: Received unexpected type: {type(upload_file_input)}")
+                 raise ValueError("Unexpected file data type.")
+         else:
+             log_debug("No file uploaded.")
+         if not isinstance(query, str):
+             query = str(query) if query is not None else ""
+         # Call the main handler on the guardian instance
+         response_markdown = guardian.handle_user_input(upload_bytes, query)
+         return response_markdown
+     except Exception as e:
+         log_debug(f"Gradio Handler Error: {e}\n{traceback.format_exc()}")
+         return f"⚠️ System Error in UI Handler: {str(e)}"
+
+ # Define Gradio components
+ file_input = gr.File(label="📄 Upload Document (PDF)", type="binary", file_types=[".pdf"])
+ text_input = gr.Textbox(label="💭 Ask a Question", placeholder="E.g., 'Summarize doc' or 'HIPAA requirements?'", lines=3)
+ markdown_output = gr.Markdown(label="📝 Analysis & Response")
+
+ # Define the Interface
  ui = gr.Interface(
+     fn=ui_handler,
+     inputs=[file_input, text_input],
+     outputs=[markdown_output],
+     title="🛡️ TrustGuardian – Compliance Analysis Assistant (v" + VERSION + ")",
+     description="Upload a PDF document for analysis (summary/Q&A based on the first ~3000 tokens) or ask a general compliance question about standards like GDPR, HIPAA, NIST, ISO 27001, SOC 2, PCI DSS.",
+     allow_flagging="never"
  )
+ print("✅ User interface defined.")
+
+ # --- Launch Gradio App ---
  if __name__ == "__main__":
+     print("\n🚀 Launching Gradio UI...")
+     # Set server_name for HF Spaces compatibility; port 7860 is the HF Spaces default
+     ui.launch(server_name="0.0.0.0", server_port=7860, debug=DEBUG_MODE)
+     print("Gradio launch initiated. App should be running.")