AsiminaM committed
Commit 4a4f632 · 1 Parent(s): 2e6c7f7

restructuring of the repo

Files changed (4)
  1. app.py +10 -5
  2. file_processing.py +149 -0
  3. knowledge.py +517 -0
  4. responses.py +112 -0
app.py CHANGED
@@ -15,6 +15,11 @@ import matplotlib.pyplot as plt
 from matplotlib.backends.backend_agg import FigureCanvasAgg
 import plotly.graph_objects as go
 import plotly.express as px
+from file_processing import handle_file_upload as fp_handle_file_upload
+from knowledge import show_graph_contents as kb_show_graph_contents
+from knowledge import visualize_knowledge_graph as kb_visualize_knowledge_graph
+from knowledge import import_knowledge_from_json_file as kb_import_json
+from responses import respond as rqa_respond
 
 # ==========================================================
 # 🧠 1. Global Knowledge Graph with Persistent Storage
@@ -2127,7 +2132,7 @@ with gr.Blocks(title="Research Brain") as demo:
     with gr.Column(scale=1):
         gr.Markdown("### Research Assistant")
         chatbot = gr.ChatInterface(
-            fn=lambda message, history: respond(message, history),
+            fn=lambda message, history: rqa_respond(message, history),
             title="Query Knowledge Base",
             description="Ask questions about your research data. Explore findings, relationships, and insights.",
             examples=[
@@ -2177,19 +2182,19 @@ with gr.Blocks(title="Research Brain") as demo:
     )
 
     upload_file_button.click(
-        fn=handle_file_upload,
+        fn=fp_handle_file_upload,
         inputs=file_upload,
         outputs=graph_info
     )
 
     show_button.click(
-        fn=show_graph_contents,
+        fn=kb_show_graph_contents,
         inputs=None,
         outputs=graph_view
    )
 
    visualize_button.click(
-        fn=visualize_knowledge_graph,
+        fn=kb_visualize_knowledge_graph,
        inputs=None,
        outputs=graph_plot
    )
@@ -2200,7 +2205,7 @@ with gr.Blocks(title="Research Brain") as demo:
    )
 
    import_json_button.click(
-        fn=handle_import_json,
+        fn=kb_import_json,
        inputs=json_upload,
        outputs=graph_info
    )
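The sketch below illustrates how the modules extracted in this commit are meant to plug into the Gradio UI. It is not part of the commit itself; the component and button names (file_upload, graph_info, upload_file_button, and so on) are assumed to match the ones already defined in app.py.

# Minimal wiring sketch (assumes Gradio is installed; names are assumed to mirror app.py).
import gradio as gr

from file_processing import handle_file_upload as fp_handle_file_upload
from knowledge import show_graph_contents as kb_show_graph_contents
from knowledge import visualize_knowledge_graph as kb_visualize_knowledge_graph
from knowledge import import_knowledge_from_json_file as kb_import_json
from responses import respond as rqa_respond

with gr.Blocks(title="Research Brain") as demo:
    gr.ChatInterface(fn=lambda message, history: rqa_respond(message, history),
                     title="Query Knowledge Base")
    file_upload = gr.File(file_count="multiple", label="Upload files")
    json_upload = gr.File(label="Import JSON backup")
    graph_info = gr.Markdown()
    graph_view = gr.Markdown()
    graph_plot = gr.HTML()
    upload_file_button = gr.Button("Process Files")
    show_button = gr.Button("Show Graph Contents")
    visualize_button = gr.Button("Visualize Graph")
    import_json_button = gr.Button("Import JSON")

    # app.py now only wires events; the handlers live in the new modules.
    upload_file_button.click(fn=fp_handle_file_upload, inputs=file_upload, outputs=graph_info)
    show_button.click(fn=kb_show_graph_contents, inputs=None, outputs=graph_view)
    visualize_button.click(fn=kb_visualize_knowledge_graph, inputs=None, outputs=graph_plot)
    import_json_button.click(fn=kb_import_json, inputs=json_upload, outputs=graph_info)

if __name__ == "__main__":
    demo.launch()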
file_processing.py ADDED
@@ -0,0 +1,149 @@
+import os
+from datetime import datetime
+import pandas as pd
+import PyPDF2
+from docx import Document
+from knowledge import add_to_graph
+
+last_extracted_text = ""
+processed_files = []
+
+def extract_text_from_pdf(file_path):
+    try:
+        with open(file_path, 'rb') as file:
+            pdf_reader = PyPDF2.PdfReader(file)
+            text = ""
+            for page in pdf_reader.pages:
+                page_text = page.extract_text()
+                text += (page_text or "") + "\n"
+            return text.strip()
+    except Exception as e:
+        return f"Error reading PDF: {e}"
+
+def extract_text_from_docx(file_path):
+    try:
+        doc = Document(file_path)
+        text = "".join(p.text + "\n" for p in doc.paragraphs)
+        return text.strip()
+    except Exception as e:
+        return f"Error reading DOCX: {e}"
+
+def extract_text_from_txt(file_path):
+    try:
+        with open(file_path, 'r', encoding='utf-8') as file:
+            return file.read().strip()
+    except Exception as e:
+        return f"Error reading TXT: {e}"
+
+def extract_text_from_csv(file_path):
+    try:
+        df = pd.read_csv(file_path)
+        text = f"CSV Data with {len(df)} rows and {len(df.columns)} columns:\n\n"
+        text += f"Columns: {', '.join(df.columns)}\n\n"
+        text += "Sample data:\n"
+        for i, row in df.head(5).iterrows():
+            text += f"Row {i+1}: {dict(row)}\n"
+        return text.strip()
+    except Exception as e:
+        return f"Error reading CSV: {e}"
+
+def update_extracted_text(text):
+    global last_extracted_text
+    last_extracted_text = text
+
+def show_extracted_text():
+    global last_extracted_text
+    if not last_extracted_text:
+        return " No file has been processed yet.\n\nUpload a file and process it to see the extracted text here."
+    preview = last_extracted_text[:1000]
+    if len(last_extracted_text) > 1000:
+        preview += "\n\n... (truncated, showing first 1000 characters)"
+    return f" **Last Extracted Text:**\n\n{preview}"
+
+def process_uploaded_file(file):
+    if file is None:
+        return "No file uploaded."
+    # Accept both Gradio file objects (exposing .name) and plain path strings.
+    file_path = file if isinstance(file, str) else file.name
+    file_extension = os.path.splitext(file_path)[1].lower()
+    if file_extension == '.pdf':
+        extracted_text = extract_text_from_pdf(file_path)
+    elif file_extension == '.docx':
+        extracted_text = extract_text_from_docx(file_path)
+    elif file_extension == '.txt':
+        extracted_text = extract_text_from_txt(file_path)
+    elif file_extension == '.csv':
+        extracted_text = extract_text_from_csv(file_path)
+    else:
+        return f" Unsupported file type: {file_extension}\n\nSupported formats: PDF, DOCX, TXT, CSV"
+    if extracted_text.startswith("Error"):
+        return f" {extracted_text}"
+    update_extracted_text(extracted_text)
+    preview = extracted_text[:300] + "..." if len(extracted_text) > 300 else extracted_text
+    result = add_to_graph(extracted_text)
+    file_size = len(extracted_text)
+    return f" Successfully processed {os.path.basename(file_path)}!\n\n📊 File stats:\n• Size: {file_size:,} characters\n• Type: {file_extension.upper()}\n\n Text preview:\n{preview}\n\n{result}"
+
+def handle_file_upload(files):
+    global processed_files
+    if not files or len(files) == 0:
+        return "Please select at least one file to process."
+    results = []
+    new_processed = []
+    for file in files:
+        if file is None:
+            continue
+        try:
+            if isinstance(file, str):
+                file_path = file
+                file_name = os.path.basename(file)
+            else:
+                file_path = file.name
+                file_name = os.path.basename(file.name)
+            if any(f['name'] == file_name for f in processed_files):
+                results.append(f"SKIP: {file_name} - Already processed, skipping")
+                continue
+            result = process_uploaded_file(file)
+            results.append(f"SUCCESS: {file_name} - {result}")
+            new_processed.append({
+                'name': file_name,
+                'size': os.path.getsize(file_path) if os.path.exists(file_path) else 0,
+                'processed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            })
+        except Exception as e:
+            file_name = os.path.basename(file) if isinstance(file, str) else os.path.basename(file.name) if hasattr(file, 'name') else str(file)
+            results.append(f"ERROR: {file_name} - Error: {e}")
+    processed_files.extend(new_processed)
+    total_files = len(files)
+    successful = len([r for r in results if r.startswith("SUCCESS")])
+    skipped = len([r for r in results if r.startswith("SKIP")])
+    failed = len([r for r in results if r.startswith("ERROR")])
+    summary = f"**Upload Summary:**\n"
+    summary += f"• Total files: {total_files}\n"
+    summary += f"• Successfully processed: {successful}\n"
+    summary += f"• Already processed: {skipped}\n"
+    summary += f"• Failed: {failed}\n\n"
+    summary += "**File Results:**\n"
+    for result in results:
+        summary += f"{result}\n"
+    return summary
+
+def show_processed_files():
+    global processed_files
+    if not processed_files:
+        return "**No files processed yet.**\n\n**Start building your knowledge base:**\n1. Select one or more files (PDF, DOCX, TXT, CSV)\n2. Click 'Process Files' to extract knowledge\n3. View your processed files here\n4. Upload more files to expand your knowledge base!"
+    result = f"**Processed Files ({len(processed_files)}):**\n\n"
+    for i, file_info in enumerate(processed_files, 1):
+        result += f"**{i}. {file_info['name']}**\n"
+        result += f" • Size: {file_info['size']:,} bytes\n"
+        result += f" • Processed: {file_info['processed_at']}\n\n"
+    return result
+
+def clear_processed_files():
+    global processed_files
+    processed_files = []
+    return "Processed files list cleared. You can now re-upload previously processed files."
+
+def simple_test():
+    return " Event handler is working! Button clicked successfully!"
+
+
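Outside the UI, the new module can be exercised directly. The following is a hypothetical smoke-test sketch, not part of the commit; it assumes the project dependencies are installed and that notes.txt is a placeholder path for some local text file.

# Hypothetical smoke test for file_processing.py, run from the repo root.
from file_processing import handle_file_upload, show_processed_files, clear_processed_files

print(handle_file_upload(["notes.txt"]))  # per-file SUCCESS / SKIP / ERROR summary
print(show_processed_files())             # names, sizes and timestamps of processed files
print(clear_processed_files())            # lets the same files be re-processed later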
knowledge.py ADDED
@@ -0,0 +1,517 @@
+import os
+import json
+import pickle
+from datetime import datetime
+import rdflib
+import re
+import networkx as nx
+
+# Storage file paths
+KNOWLEDGE_FILE = "knowledge_graph.pkl"
+BACKUP_FILE = "knowledge_backup.json"
+
+# Global RDF graph
+graph = rdflib.Graph()
+
+# Mapping of fact IDs to triples for editing operations
+fact_index = {}
+
+def save_knowledge_graph():
+    try:
+        with open(KNOWLEDGE_FILE, 'wb') as f:
+            pickle.dump(graph, f)
+        backup_data = {
+            "timestamp": datetime.now().isoformat(),
+            "total_facts": len(graph),
+            "facts": []
+        }
+        for i, (s, p, o) in enumerate(graph):
+            backup_data["facts"].append({
+                "id": i+1,
+                "subject": str(s),
+                "predicate": str(p),
+                "object": str(o)
+            })
+        with open(BACKUP_FILE, 'w', encoding='utf-8') as f:
+            json.dump(backup_data, f, indent=2, ensure_ascii=False)
+        return f" Saved {len(graph)} facts to storage"
+    except Exception as e:
+        return f" Error saving knowledge: {e}"
+
+def load_knowledge_graph():
+    global graph
+    try:
+        if os.path.exists(KNOWLEDGE_FILE):
+            with open(KNOWLEDGE_FILE, 'rb') as f:
+                graph = pickle.load(f)
+            return f"📂 Loaded {len(graph)} facts from storage"
+        else:
+            return "📂 No existing knowledge file found, starting fresh"
+    except Exception as e:
+        return f" Error loading knowledge: {e}"
+
+def create_comprehensive_backup():
+    try:
+        backup_data = {
+            "metadata": {
+                "timestamp": datetime.now().isoformat(),
+                "total_facts": len(graph),
+                "backup_type": "comprehensive_knowledge_base",
+                "graph_size": len(graph)
+            },
+            "facts": []
+        }
+        for i, (s, p, o) in enumerate(graph):
+            subject = str(s).split(':')[-1] if ':' in str(s) else str(s)
+            predicate = str(p).split(':')[-1] if ':' in str(p) else str(p)
+            object_val = str(o)
+            backup_data["facts"].append({
+                "id": i + 1,
+                "subject": subject,
+                "predicate": predicate,
+                "object": object_val,
+                "full_subject": str(s),
+                "full_predicate": str(p),
+                "full_object": str(o)
+            })
+        with open(BACKUP_FILE, 'w', encoding='utf-8') as f:
+            json.dump(backup_data, f, indent=2, ensure_ascii=False)
+    except Exception:
+        create_error_backup("unknown")
+
+def create_error_backup(error_message):
+    try:
+        backup_data = {
+            "metadata": {
+                "timestamp": datetime.now().isoformat(),
+                "total_facts": 0,
+                "backup_type": "error_backup",
+                "error": error_message
+            },
+            "facts": []
+        }
+        with open(BACKUP_FILE, 'w', encoding='utf-8') as f:
+            json.dump(backup_data, f, indent=2, ensure_ascii=False)
+    except Exception:
+        pass
+
+def extract_entities(text):
+    entities = []
+    capitalized_words = re.findall(r'\b[A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b', text)
+    entities.extend(capitalized_words)
+    org_patterns = [
+        r'([A-Z][a-zA-Z\s]+)\s+(Inc|Ltd|LLC|Corp|Corporation|Company|Co\.|Ltd\.)',
+        r'([A-Z][a-zA-Z\s]+)\s+(University|Institute|Lab|Laboratory)',
+    ]
+    for pattern in org_patterns:
+        matches = re.findall(pattern, text)
+        entities.extend([m[0].strip() for m in matches])
+    location_keywords = ['in ', 'at ', 'near ', 'from ']
+    for keyword in location_keywords:
+        pattern = f'{keyword}([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)'
+        matches = re.findall(pattern, text)
+        entities.extend(matches)
+    dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}\b', text)
+    entities.extend(dates)
+    entities = list(set([e.strip() for e in entities if len(e.strip()) > 3]))
+    return entities[:50]
+
+def extract_regular_triples_improved(text, entities):
+    triples = []
+    sentences = re.split(r'[.!?\n]+', text)
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if len(sentence) < 15:
+            continue
+        improved_patterns = [
+            (r'([A-Z][a-zA-Z\s]+(?:,\s+[A-Z][a-zA-Z\s]+)*)\s+(is|are|was|were|becomes|represents|means|refers to|denotes)\s+(.+)', 'relates to'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(uses|employs|utilizes|applies)\s+(.+)', 'uses'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(develops|created|designed|implemented)\s+(.+)', 'creates'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(requires|needs|demands)\s+(.+)', 'requires'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(enables|allows|permits)\s+(.+)', 'enables'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(affects|impacts|influences|affects)\s+(.+)', 'affects'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(found|discovered|identified|observed|detected)\s+(.+)', 'discovered'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(studies|analyzes|examines|investigates)\s+(.+)', 'studies'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(proposes|suggests|recommends)\s+(.+)', 'proposes'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(results in|leads to|causes)\s+(.+)', 'causes'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(works with|collaborates with|partnered with)\s+(.+)', 'works with'),
+            (r'([A-Z][a-zA-Z\s]+)\s+(located in|based in|situated in)\s+(.+)', 'located in'),
+        ]
+        for pattern, predicate in improved_patterns:
+            match = re.search(pattern, sentence, re.IGNORECASE)
+            if match:
+                groups = match.groups()
+                subject = groups[0].strip() if len(groups) > 0 else ''
+                object_val = groups[-1].strip() if len(groups) > 1 else ''
+                subject = re.sub(r'^(the|a|an)\s+', '', subject, flags=re.IGNORECASE).strip()
+                object_val = re.sub(r'^(the|a|an)\s+', '', object_val, flags=re.IGNORECASE).strip()
+                if subject and object_val and len(subject) > 3 and len(object_val) > 3:
+                    triples.append((subject, predicate, object_val))
+                break
+        clause_patterns = [
+            r'([A-Z][a-zA-Z\s]+)\s+which\s+(.+)',
+            r'([A-Z][a-zA-Z\s]+)\s+that\s+(.+)',
+            r'([A-Z][a-zA-Z\s]+)\s+who\s+(.+)',
+        ]
+        for pattern in clause_patterns:
+            match = re.search(pattern, sentence)
+            if match:
+                subject = match.group(1).strip()
+                description = match.group(2).strip()
+                if subject and description and len(subject) > 3 and len(description) > 3:
+                    triples.append((subject, 'has property', description[:150]))
+    return triples
+
+def extract_structured_triples(text):
+    triples = []
+    lines = text.split('\n')
+    patterns = [
+        (r'date\s*:?\s*([0-9\/\-\.]+)', 'date', 'is'),
+        (r'time\s*:?\s*([0-9:]+)', 'time', 'is'),
+        (r'created\s*:?\s*([0-9\/\-\.]+)', 'created_date', 'is'),
+        (r'modified\s*:?\s*([0-9\/\-\.]+)', 'modified_date', 'is'),
+        (r'id\s*:?\s*([A-Z0-9\-]+)', 'id', 'is'),
+        (r'number\s*:?\s*([A-Z0-9\-]+)', 'number', 'is'),
+        (r'code\s*:?\s*([A-Z0-9\-]+)', 'code', 'is'),
+        (r'reference\s*:?\s*([A-Z0-9\-]+)', 'reference', 'is'),
+        (r'name\s*:?\s*([A-Za-z\s&.,]+)', 'name', 'is'),
+        (r'title\s*:?\s*([A-Za-z\s&.,]+)', 'title', 'is'),
+        (r'company\s*:?\s*([A-Za-z\s&.,]+)', 'company', 'is'),
+        (r'organization\s*:?\s*([A-Za-z\s&.,]+)', 'organization', 'is'),
+        (r'email\s*:?\s*([A-Za-z0-9@\.\-]+)', 'email', 'is'),
+        (r'phone\s*:?\s*([0-9\s\-\+\(\)]+)', 'phone', 'is'),
+        (r'address\s*:?\s*([A-Za-z0-9\s\-\.,]+)', 'address', 'is'),
+        (r'description\s*:?\s*([A-Za-z0-9\s\-\.,]+)', 'description', 'is'),
+        (r'type\s*:?\s*([A-Za-z0-9\s\-\.,]+)', 'type', 'is'),
+        (r'category\s*:?\s*([A-Za-z0-9\s\-\.,]+)', 'category', 'is'),
+        (r'status\s*:?\s*([A-Za-z0-9\s\-\.,]+)', 'status', 'is'),
+        (r'location\s*:?\s*([A-Za-z0-9\s\-\.,]+)', 'location', 'is'),
+        (r'department\s*:?\s*([A-Za-z0-9\s\-\.,]+)', 'department', 'is'),
+        (r'section\s*:?\s*([A-Za-z0-9\s\-\.,]+)', 'section', 'is'),
+        (r'amount\s*:?\s*\$?([0-9,]+\.?[0-9]*)', 'amount', 'is'),
+        (r'total\s*:?\s*\$?([0-9,]+\.?[0-9]*)', 'total', 'is'),
+        (r'price\s*:?\s*\$?([0-9,]+\.?[0-9]*)', 'price', 'is'),
+        (r'cost\s*:?\s*\$?([0-9,]+\.?[0-9]*)', 'cost', 'is'),
+    ]
+    for line in lines:
+        line = line.strip()
+        if len(line) < 5:
+            continue
+        for pattern, subject, predicate in patterns:
+            match = re.search(pattern, line, re.IGNORECASE)
+            if match:
+                value = match.group(1).strip()
+                if value and len(value) > 1:
+                    triples.append((subject, predicate, value))
+                break
+    kv_patterns = [
+        r'([A-Za-z\s]+):\s*([A-Za-z0-9\s\$\-\.\/,]+)',
+        r'([A-Za-z\s]+)\s*=\s*([A-Za-z0-9\s\$\-\.\/,]+)',
+        r'([A-Za-z\s]+)\s*-\s*([A-Za-z0-9\s\$\-\.\/,]+)',
+    ]
+    for line in lines:
+        for pattern in kv_patterns:
+            match = re.search(pattern, line)
+            if match:
+                key = match.group(1).strip().lower().replace(' ', '_')
+                value = match.group(2).strip()
+                if len(key) > 2 and len(value) > 1:
+                    triples.append((key, 'is', value))
+    return triples
+
+def extract_regular_triples(text):
+    triples = []
+    sentences = re.split(r"[.?!\n]", text)
+    patterns = [
+        r"\s+(is|are|was|were)\s+",
+        r"\s+(has|have|had)\s+",
+        r"\s+(uses|used|using)\s+",
+        r"\s+(creates|created|creating)\s+",
+        r"\s+(develops|developed|developing)\s+",
+        r"\s+(leads|led|leading)\s+",
+        r"\s+(affects|affected|affecting)\s+",
+        r"\s+(contains|contained|containing)\s+",
+        r"\s+(includes|included|including)\s+",
+        r"\s+(requires|required|requiring)\s+",
+        r"\s+(causes|caused|causing)\s+",
+        r"\s+(results|resulted|resulting)\s+",
+        r"\s+(enables|enabled|enabling)\s+",
+        r"\s+(provides|provided|providing)\s+",
+        r"\s+(supports|supported|supporting)\s+",
+        r"\s+(located|situated|found)\s+",
+        r"\s+(connects|links|relates)\s+",
+        r"\s+(depends|relies|based)\s+",
+        r"\s+(represents|symbolizes|stands)\s+",
+        r"\s+(describes|explains|defines)\s+",
+        r"\s+(refers|referring|referenced)\s+",
+        r"\s+(concerns|concerning|concerned)\s+",
+        r"\s+(relates|relating|related)\s+",
+    ]
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if len(sentence) < 10:
+            continue
+        for pattern in patterns:
+            parts = re.split(pattern, sentence, maxsplit=1)
+            if len(parts) == 3:
+                subj, pred, obj = parts
+                subj = re.sub(r'^(the|a|an)\s+', '', subj.strip(), flags=re.IGNORECASE)
+                obj = re.sub(r'^(the|a|an)\s+', '', obj.strip(), flags=re.IGNORECASE)
+                if subj and pred and obj and len(subj) > 2 and len(obj) > 2:
+                    triples.append((subj, pred.strip(), obj))
+                break
+    return triples
+
+def extract_triples(text):
+    triples = []
+    entities = extract_entities(text)
+    for entity in entities:
+        triples.append((entity, 'type', 'entity'))
+    triples.extend(extract_structured_triples(text))
+    triples.extend(extract_regular_triples_improved(text, entities))
+    triples.extend(extract_regular_triples(text))
+    unique_triples = []
+    for s, p, o in triples:
+        if s and p and o and len(s) > 2 and len(p) > 1 and len(o) > 2:
+            s = s.strip()[:100]
+            p = p.strip()[:50]
+            o = o.strip()[:200]
+            if (s, p, o) not in unique_triples:
+                unique_triples.append((s, p, o))
+    return unique_triples
+
+def add_to_graph(text):
+    new_triples = extract_triples(text)
+    for s, p, o in new_triples:
+        graph.add((rdflib.URIRef(f"urn:{s}"), rdflib.URIRef(f"urn:{p}"), rdflib.Literal(o)))
+    save_knowledge_graph()
+    return f" Added {len(new_triples)} new triples. Total facts stored: {len(graph)}.\n Saved"
+
+def retrieve_context(question, limit=10):
+    matches = []
+    qwords = [w for w in question.lower().split() if w not in {
+        'the','a','an','and','or','but','in','on','at','to','for','of','with','by','is','are','was','were','be','been','have','has','had','do','does','did','will','would','could','should','may','might','can','what','how','when','where','why','who'
+    } and len(w) > 2]
+    scored_matches = []
+    for s, p, o in graph:
+        subject = str(s).split(':')[-1] if ':' in str(s) else str(s)
+        predicate = str(p).split(':')[-1] if ':' in str(p) else str(p)
+        object_val = str(o)
+        fact_text = f"{subject} {predicate} {object_val}".lower()
+        score = 0
+        for word in qwords:
+            if word in fact_text:
+                score += 1
+            if word == subject.lower() or word == predicate.lower():
+                score += 2
+        if score > 0:
+            scored_matches.append((score, f"{subject} {predicate} {object_val}"))
+    scored_matches.sort(key=lambda x: x[0], reverse=True)
+    matches = [m[1] for m in scored_matches[:limit]]
+    if matches:
+        result = "**Relevant Knowledge:**\n"
+        for i, match in enumerate(matches, 1):
+            result += f"{i}. {match}\n"
+        return result
+    return "**No directly relevant facts found.**\n\nTry asking about topics that might be in your knowledge base, or add more knowledge first!"
+
+def show_graph_contents():
+    if len(graph) == 0:
+        return "**Knowledge Graph Status: EMPTY**\n\n**How to build your knowledge base:**\n1. **Add text directly** - Paste any text in the 'Add Knowledge from Text' box above\n2. **Upload documents** - Use the file upload to process PDF, DOCX, TXT, CSV files\n3. **Extract facts** - The system will automatically extract knowledge from your content\n4. **Build knowledge** - Add more text or files to expand your knowledge base\n5. **Save knowledge** - Use 'Save Knowledge' to persist your data\n\n**Start by adding some text or uploading a document!**"
+    facts_by_subject = {}
+    all_facts = []
+    for s, p, o in graph:
+        subject = str(s).split(':')[-1] if ':' in str(s) else str(s)
+        predicate = str(p).split(':')[-1] if ':' in str(p) else str(p)
+        object_val = str(o)
+        fact_text = f"{subject} {predicate} {object_val}"
+        all_facts.append(fact_text)
+        facts_by_subject.setdefault(subject, []).append(f"{predicate} {object_val}")
+    result = f"**Knowledge Graph Overview**\n"
+    result += f"**Total Facts:** {len(graph)}\n"
+    result += f"**Unique Subjects:** {len(facts_by_subject)}\n\n"
+    result += "## **Knowledge by Subject:**\n\n"
+    for i, (subject, facts) in enumerate(facts_by_subject.items()):
+        if i >= 10:
+            remaining = len(facts_by_subject) - 10
+            result += f"... and {remaining} more subjects\n"
+            break
+        result += f"**{subject}:**\n"
+        for fact in facts:
+            result += f" • {fact}\n"
+        result += "\n"
+    result += "## **All Facts:**\n\n"
+    for i, fact in enumerate(all_facts[:20]):
+        result += f"{i+1}. {fact}\n"
+    if len(all_facts) > 20:
+        result += f"\n... and {len(all_facts) - 20} more facts"
+    return result
+
+def visualize_knowledge_graph():
+    if len(graph) == 0:
+        return "<p>No knowledge in graph. Add some text or upload a document first!</p>"
+    try:
+        G = nx.Graph()
+        fact_data = {}
+        for s, p, o in graph:
+            subject = str(s).split(':')[-1] if ':' in str(s) else str(s)
+            predicate = str(p).split(':')[-1] if ':' in str(p) else str(p)
+            object_val = str(o)
+            subject_short = (subject[:30] + "...") if len(subject) > 30 else subject
+            object_short = (object_val[:30] + "...") if len(object_val) > 30 else object_val
+            if subject not in G:
+                G.add_node(subject, display=subject_short, node_type='subject')
+            if object_val not in G:
+                G.add_node(object_val, display=object_short, node_type='object')
+            G.add_edge(subject, object_val, label=predicate)
+            fact_data[(subject, object_val)] = f"{subject} {predicate} {object_val}"
+        pos = nx.spring_layout(G, k=2, iterations=100, seed=42)
+        import numpy as np
+        x_positions = [pos[n][0] for n in G.nodes()]
+        y_positions = [pos[n][1] for n in G.nodes()]
+        x_min, x_max = min(x_positions), max(x_positions)
+        y_min, y_max = min(y_positions), max(y_positions)
+        scale = min(500 / (x_max - x_min), 400 / (y_max - y_min)) if (x_max - x_min) > 0 and (y_max - y_min) > 0 else 50
+        offset_x = 350
+        offset_y = 300
+        svg_elements = []
+        for edge in G.edges():
+            x1 = pos[edge[0]][0] * scale + offset_x
+            y1 = pos[edge[0]][1] * scale + offset_y
+            x2 = pos[edge[1]][0] * scale + offset_x
+            y2 = pos[edge[1]][1] * scale + offset_y
+            edge_data = G[edge[0]][edge[1]]
+            label = edge_data.get('label', 'has')
+            svg_elements.append(f"""
+                <line x1="{x1}" y1="{y1}" x2="{x2}" y2="{y2}"
+                      stroke="#999" stroke-width="2" opacity="0.5">
+                    <title>{label}</title>
+                </line>
+            """)
+        node_info = []
+        for i, node in enumerate(G.nodes()):
+            x = pos[node][0] * scale + offset_x
+            y = pos[node][1] * scale + offset_y
+            display_name = G.nodes[node].get('display', node)
+            node_type = G.nodes[node].get('node_type', 'unknown')
+            color = '#4CAF50' if node_type == 'subject' else ('#2196F3' if node_type == 'object' else '#546E7A')
+            neighbors = list(G.neighbors(node))
+            neighbor_count = len(neighbors)
+            node_info.append(f"""
+                <circle cx="{x}" cy="{y}" r="{max(40, min(30, neighbor_count * 2 + 20))}"
+                        fill="{color}" stroke="#fff" stroke-width="2">
+                    <title>{display_name} ({neighbor_count} connections)</title>
+                </circle>
+                <text x="{x}" y="{y+6}" text-anchor="middle" font-size="15" font-weight="bold" fill="#000"
+                      pointer-events="none">{display_name[:15]}</text>
+            """)
+        svg_content = '\n'.join(svg_elements + node_info)
+        html = f"""
+        <div style="width: 100%; min-height: 700px; max-height: 800px; background: white; border: 2px solid #ddd; border-radius: 10px; padding: 20px; position: relative; overflow: auto;">
+            <svg width="100%" height="550" style="border: 1px solid #ddd; border-radius: 5px; background: #f9f9f9; display: block;">
+                {svg_content}
+            </svg>
+        </div>
+        """
+        return html
+    except Exception as e:
+        return f"<p style='color: red; padding: 20px;'>Error creating visualization: {e}</p>"
+
+def delete_all_knowledge():
+    global graph
+    count = len(graph)
+    graph = rdflib.Graph()
+    save_knowledge_graph()
+    return f"🗑️ Deleted all {count} facts from the knowledge graph. Graph is now empty."
+
+def delete_knowledge_by_keyword(keyword):
+    global graph
+    if not keyword or keyword.strip() == "":
+        return "⚠️ Please enter a keyword to search for."
+    keyword = keyword.strip().lower()
+    deleted_count = 0
+    facts_to_remove = []
+    for s, p, o in graph:
+        fact_text = f"{s} {p} {o}".lower()
+        if keyword in fact_text:
+            facts_to_remove.append((s, p, o))
+    for fact in facts_to_remove:
+        graph.remove(fact)
+        deleted_count += 1
+    if deleted_count > 0:
+        save_knowledge_graph()
+        return f"🗑️ Deleted {deleted_count} facts containing '{keyword}'"
+    else:
+        return f"ℹ️ No facts found containing '{keyword}'"
+
+def delete_recent_knowledge(count=5):
+    global graph
+    if len(graph) == 0:
+        return "ℹ️ Knowledge graph is already empty."
+    facts = list(graph)
+    facts_to_remove = facts[-count:] if count < len(facts) else facts
+    for fact in facts_to_remove:
+        graph.remove(fact)
+    save_knowledge_graph()
+    return f"🗑️ Deleted {len(facts_to_remove)} most recent facts"
+
+def list_facts_for_editing():
+    global fact_index
+    fact_index = {}
+    options = []
+    for i, (s, p, o) in enumerate(list(graph), start=1):
+        subject = str(s).split(':')[-1] if ':' in str(s) else str(s)
+        predicate = str(p).split(':')[-1] if ':' in str(p) else str(p)
+        object_val = str(o)
+        label = f"{i}. {subject} {predicate} {object_val}"
+        options.append(label)
+        fact_index[i] = (s, p, o)
+    return options
+
+def load_fact_by_label(fact_label):
+    if not fact_label:
+        return None
+    try:
+        fact_id = int(fact_label.split('.', 1)[0].strip())
+        return fact_index.get(fact_id)
+    except Exception:
+        return None
+
+def import_knowledge_from_json_file(file):
+    try:
+        if file is None:
+            return "⚠️ No file selected."
+        file_path = file.name if hasattr(file, 'name') else str(file)
+        if not os.path.exists(file_path):
+            return f"⚠️ File not found: {file_path}"
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        if isinstance(data, dict) and 'facts' in data:
+            facts = data['facts']
+        elif isinstance(data, list):
+            facts = data
+        else:
+            return "❌ Unsupported JSON structure. Expect an object with 'facts' or a list of facts."
+        added = 0
+        skipped = 0
+        for fact in facts:
+            try:
+                subject = fact.get('subject') or fact.get('full_subject')
+                predicate = fact.get('predicate') or fact.get('full_predicate')
+                obj = fact.get('object') or fact.get('full_object')
+                if not subject or not predicate or obj is None:
+                    skipped += 1
+                    continue
+                s_ref = rdflib.URIRef(subject if str(subject).startswith('urn:') else f"urn:{subject}")
+                p_ref = rdflib.URIRef(predicate if str(predicate).startswith('urn:') else f"urn:{predicate}")
+                o_lit = rdflib.Literal(obj)
+                graph.add((s_ref, p_ref, o_lit))
+                added += 1
+            except Exception:
+                skipped += 1
+        save_knowledge_graph()
+        return f"✅ Imported {added} facts. Skipped {skipped}. Total facts: {len(graph)}."
+    except Exception as e:
+        return f"❌ Import failed: {e}"
+
+
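A quick way to check the extraction and retrieval path in knowledge.py without the UI is a round trip like the one below. This is an illustrative sketch only (the sentence and question are arbitrary); note that add_to_graph writes knowledge_graph.pkl and knowledge_backup.json into the working directory.

# Hypothetical round trip: extract triples from text, then query them.
from knowledge import add_to_graph, retrieve_context, show_graph_contents

print(add_to_graph("Acme Corporation develops research tools for universities."))
print(retrieve_context("Who develops research tools?"))  # keyword-scored list of matching facts
print(show_graph_contents())                             # facts grouped by subject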
responses.py ADDED
@@ -0,0 +1,112 @@
+from knowledge import retrieve_context
+
+def generate_document_summary(context):
+    if not context or "No directly relevant facts found" in context:
+        return "I don't have enough information about this document to provide a summary. Please add more knowledge to the knowledge base first."
+    facts = []
+    for line in context.split('\n'):
+        if line.strip() and not line.startswith('**'):
+            facts.append(line.strip())
+    document_type = "document"
+    key_info = []
+    for fact in facts:
+        fact_lower = fact.lower()
+        if 'invoice' in fact_lower or 'bill' in fact_lower:
+            document_type = "invoice"
+        elif 'contract' in fact_lower or 'agreement' in fact_lower:
+            document_type = "contract"
+        elif 'report' in fact_lower or 'analysis' in fact_lower:
+            document_type = "report"
+        elif any(k in fact_lower for k in ['company','organization','name','amount','total','cost','price','date','time','address','location','description','type','id','number','code']):
+            key_info.append(fact)
+    summary = f"Based on the information in my knowledge base, this appears to be a **{document_type}** document. "
+    if key_info:
+        summary += "Here are the key details I found:\n\n"
+        for info in key_info[:5]:
+            summary += f"• {info}\n"
+    else:
+        summary += "However, I don't have enough specific details to provide a comprehensive summary."
+    return summary
+
+def _facts_from_context(context):
+    facts = []
+    for line in context.split('\n'):
+        if line.strip() and not line.startswith('**'):
+            facts.append(line.strip())
+    return facts
+
+def generate_what_response(message, context):
+    facts = _facts_from_context(context)
+    if not facts:
+        return "I don't have specific information about that in my knowledge base."
+    response = "Based on my knowledge base, here's what I can tell you:\n\n"
+    for fact in facts[:3]:
+        response += f"• {fact}\n"
+    if len(facts) > 3:
+        response += f"\nI have {len(facts)} total facts about this topic in my knowledge base."
+    return response
+
+def generate_who_response(message, context):
+    facts = _facts_from_context(context)
+    facts = [f for f in facts if any(k in f.lower() for k in ['company','name','person','επωνυμία','εταιρεία'])]
+    if not facts:
+        return "I don't have specific information about people or companies in my knowledge base."
+    return "Here's what I know about people/entities:\n\n" + "\n".join(f"• {f}" for f in facts)
+
+def generate_when_response(message, context):
+    facts = _facts_from_context(context)
+    facts = [f for f in facts if any(k in f.lower() for k in ['date','ημερομηνία','due','προθεσμία'])]
+    if not facts:
+        return "I don't have specific date information in my knowledge base."
+    return "Here's the date information I have:\n\n" + "\n".join(f"• {f}" for f in facts)
+
+def generate_where_response(message, context):
+    facts = _facts_from_context(context)
+    facts = [f for f in facts if any(k in f.lower() for k in ['address','διεύθυνση','location','place'])]
+    if not facts:
+        return "I don't have specific location information in my knowledge base."
+    return "Here's the location information I have:\n\n" + "\n".join(f"• {f}" for f in facts)
+
+def generate_amount_response(message, context):
+    facts = _facts_from_context(context)
+    facts = [f for f in facts if any(k in f.lower() for k in ['amount','total','price','cost','σύνολο','φόρος','€','$'])]
+    if not facts:
+        return "I don't have specific financial information in my knowledge base."
+    return "Here's the financial information I have:\n\n" + "\n".join(f"• {f}" for f in facts)
+
+def generate_general_response(message, context):
+    facts = _facts_from_context(context)
+    if not facts:
+        return "I don't have relevant information about that in my knowledge base."
+    response = "Based on my knowledge base, here's what I can tell you:\n\n"
+    for fact in facts[:4]:
+        response += f"• {fact}\n"
+    if len(facts) > 4:
+        response += f"\nI have {len(facts)} total relevant facts about this topic."
+    return response
+
+def generate_intelligent_response(message, context, system_message):
+    message_lower = message.lower()
+    if any(phrase in message_lower for phrase in [
+        'what is the document about', 'whats the document about', 'what is this about', 'whats this about',
+        'describe the document', 'summarize the document', 'what does this contain', 'what is this about'
+    ]):
+        return generate_document_summary(context)
+    elif message_lower.startswith('what'):
+        return generate_what_response(message, context)
+    elif message_lower.startswith('who'):
+        return generate_who_response(message, context)
+    elif message_lower.startswith('when'):
+        return generate_when_response(message, context)
+    elif message_lower.startswith('where'):
+        return generate_where_response(message, context)
+    elif any(phrase in message_lower for phrase in ['how much','amount','total','cost','price']):
+        return generate_amount_response(message, context)
+    else:
+        return generate_general_response(message, context)
+
+def respond(message, history, system_message="You are an intelligent assistant that answers questions based on factual information from a knowledge base. You provide clear, accurate, and helpful responses. When you have relevant information, you share it directly. When you don't have enough information, you clearly state this limitation. You always stay grounded in the facts provided and never hallucinate information."):
+    context = retrieve_context(message)
+    return generate_intelligent_response(message, context, system_message)
+
+
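responses.respond is the entry point the chat interface calls, and it can also be driven directly. A minimal sketch, assuming some knowledge has already been added via knowledge.add_to_graph (the invoice-style text below is illustrative only):

# Hypothetical direct call to the chat entry point, bypassing Gradio.
from knowledge import add_to_graph
from responses import respond

add_to_graph("Company: Acme Corporation\nTotal: $1,250\nDue date: 15/03/2024")
print(respond("How much is the total?", history=[]))  # routed to generate_amount_response
print(respond("Who is the company?", history=[]))     # routed to generate_who_response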