# src/semantic_retriever.py

from typing import List, Dict, Any, Optional

from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document

from src.ontology_manager import OntologyManager
class SemanticRetriever:
    """
    Enhanced retrieval system that combines vector search with ontology awareness.
    """

    def __init__(
        self,
        ontology_manager: OntologyManager,
        embeddings_model=None,
        text_chunks: Optional[List[str]] = None,
    ):
        """
        Initialize the semantic retriever.

        Args:
            ontology_manager: The ontology manager instance
            embeddings_model: The embeddings model to use (defaults to OpenAIEmbeddings)
            text_chunks: Optional list of text chunks to add to the vector store
        """
        self.ontology_manager = ontology_manager
        self.embeddings = embeddings_model or OpenAIEmbeddings()

        # Build the corpus: the ontology's text representation, chunked for
        # embedding, plus any caller-supplied text chunks.
        ontology_text = ontology_manager.get_text_representation()
        self.ontology_chunks = self._split_text(ontology_text)

        if text_chunks:
            self.text_chunks = text_chunks
            all_chunks = self.ontology_chunks + text_chunks
        else:
            self.text_chunks = []
            all_chunks = self.ontology_chunks

        # Wrap each chunk in a Document, tagging its origin so retrieval
        # results can be traced back to the ontology or the free text.
        documents = [
            Document(
                page_content=chunk,
                metadata={"source": "ontology" if i < len(self.ontology_chunks) else "text"},
            )
            for i, chunk in enumerate(all_chunks)
        ]

        # Create the vector store
        self.vector_store = FAISS.from_documents(documents, self.embeddings)
    def _split_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split text into overlapping fixed-size chunks for embedding."""
        chunks = []
        text_length = len(text)

        # Slide a window of `chunk_size` characters forward in strides of
        # `chunk_size - overlap`, so consecutive chunks share `overlap` characters.
        for i in range(0, text_length, chunk_size - overlap):
            chunk = text[i:i + chunk_size]
            if len(chunk) < 50:  # Skip very small trailing chunks
                continue
            chunks.append(chunk)

        return chunks
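
    # Worked example of the window arithmetic above (illustrative numbers,
    # not part of the pipeline): with chunk_size=500 and overlap=50 the
    # window advances in strides of 450, so a 1,000-character text yields
    # chunks starting at offsets 0, 450, and 900; the final chunk is only
    # 100 characters long but is kept because it clears the 50-character
    # minimum.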
    def retrieve(self, query: str, k: int = 4, include_ontology_context: bool = True) -> List[Document]:
        """
        Retrieve relevant documents using a hybrid approach.

        Args:
            query: The query string
            k: Number of documents to retrieve
            include_ontology_context: Whether to include additional ontology context

        Returns:
            A list of retrieved documents
        """
        # Get semantic context from the ontology
        if include_ontology_context:
            ontology_context = self.ontology_manager.get_semantic_context(query)
        else:
            ontology_context = []

        # Perform vector similarity search
        vector_results = self.vector_store.similarity_search(query, k=k)

        # Copy the vector hits so appending context below does not mutate
        # the list returned by the vector store.
        combined_results = list(vector_results)

        # Add ontology context as additional documents
        for i, context in enumerate(ontology_context):
            combined_results.append(Document(
                page_content=context,
                metadata={"source": "ontology_context", "context_id": i}
            ))

        return combined_results
    def retrieve_with_paths(self, query: str, k: int = 4) -> Dict[str, Any]:
        """
        Enhanced retrieval that includes semantic paths between entities.

        Args:
            query: The query string
            k: Number of documents to retrieve

        Returns:
            A dictionary containing retrieved documents and semantic paths
        """
        # Basic retrieval
        basic_results = self.retrieve(query, k)

        # Extract potential entities from the query by matching class names
        # against query tokens (a simplified approach; a more sophisticated
        # one would use NER or entity linking).
        entity_types = ["Product", "Department", "Employee", "Manager", "Customer", "Feedback"]
        query_words = query.lower().split()

        potential_entities = []
        for entity_type in entity_types:
            if entity_type.lower() in query_words:
                # Get instances of this type
                instances = self.ontology_manager.get_instances_of_class(entity_type)
                if instances:
                    # Just take the first few for demonstration
                    potential_entities.extend(instances[:2])

        # Find paths between each pair of potential entities
        paths = []
        if len(potential_entities) >= 2:
            for i in range(len(potential_entities)):
                for j in range(i + 1, len(potential_entities)):
                    source = potential_entities[i]
                    target = potential_entities[j]

                    entity_paths = self.ontology_manager.find_paths(source, target, max_length=3)
                    if entity_paths:
                        for path in entity_paths:
                            # Convert the edge sequence to a readable description
                            path_text = self._path_to_text(path)
                            paths.append({
                                "source": source,
                                "target": target,
                                "path": path,
                                "text": path_text
                            })

        # Convert paths to documents so they travel alongside the vector results
        path_documents = []
        for i, path_info in enumerate(paths):
            path_documents.append(Document(
                page_content=path_info["text"],
                metadata={
                    "source": "semantic_path",
                    "path_id": i,
                    "source_entity": path_info["source"],
                    "target_entity": path_info["target"]
                }
            ))

        return {
            "documents": basic_results + path_documents,
            "paths": paths
        }
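
    # Illustrative return shape for retrieve_with_paths (the entity IDs and
    # path text are hypothetical; real values come from the ontology):
    # {
    #     "documents": [<k vector/context Documents>, <path Documents>],
    #     "paths": [{"source": "employee1", "target": "department1",
    #                "path": [...edge dicts...],
    #                "text": "Alice works_in Engineering"}],
    # }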
    def _path_to_text(self, path: List[Dict]) -> str:
        """Convert a path (a list of edges) to a text description."""
        if not path:
            return ""

        text_parts = []
        for edge in path:
            source = edge["source"]
            target = edge["target"]
            relation = edge["type"]

            # Look up entity information so we can use human-readable names
            source_info = self.ontology_manager.get_entity_info(source)
            target_info = self.ontology_manager.get_entity_info(target)

            # Fall back to the raw IDs when no name property is available
            source_name = source
            if "properties" in source_info and "name" in source_info["properties"]:
                source_name = source_info["properties"]["name"]

            target_name = target
            if "properties" in target_info and "name" in target_info["properties"]:
                target_name = target_info["properties"]["name"]

            # Describe the relationship
            text_parts.append(f"{source_name} {relation} {target_name}")

        return " -> ".join(text_parts)
    def search_by_property(self, class_type: str, property_name: str, property_value: str) -> List[Document]:
        """
        Search for instances of a class with a specific property value.

        Args:
            class_type: The class to search in
            property_name: The property name to match
            property_value: The property value to match

        Returns:
            A list of matched entities as documents
        """
        instances = self.ontology_manager.get_instances_of_class(class_type)

        results = []
        for instance_id in instances:
            entity_info = self.ontology_manager.get_entity_info(instance_id)
            properties = entity_info.get("properties", {})
            if property_name not in properties:
                continue

            # Simple case-insensitive string matching (could be enhanced
            # with fuzzy matching)
            if str(properties[property_name]).lower() != property_value.lower():
                continue

            # Render the matched instance as a document
            doc_content = f"Instance: {instance_id}\n"
            doc_content += f"Type: {class_type}\n"
            doc_content += "Properties:\n"
            for prop_name, prop_value in properties.items():
                doc_content += f"- {prop_name}: {prop_value}\n"

            results.append(Document(
                page_content=doc_content,
                metadata={
                    "source": "property_search",
                    "instance_id": instance_id,
                    "class_type": class_type
                }
            ))

        return results
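

# Minimal usage sketch (illustrative, not part of the module's API). It
# assumes OPENAI_API_KEY is set in the environment and that OntologyManager
# can be constructed from an ontology file; "ontology.json" is a placeholder
# file name.
if __name__ == "__main__":
    manager = OntologyManager("ontology.json")
    retriever = SemanticRetriever(manager)

    # Hybrid retrieval: vector hits plus ontology context documents
    for doc in retriever.retrieve("Which department develops the product?", k=4):
        print(doc.metadata["source"], "->", doc.page_content[:80])

    # Path-aware retrieval: also surfaces relationship chains between entities
    result = retriever.retrieve_with_paths("employee feedback on product", k=4)
    for path_info in result["paths"]:
        print(path_info["text"])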