Spaces:

akashlives
/

aie4-rag-w2

Sleeping

App Files Files Community

aie4-rag-w2 / aimakerspace /semantic_chunking.py

akashlives

Support txt and pdf

f37291b verified 7 months ago

raw

history blame contribute delete

2.06 kB

	from langchain_experimental.text_splitter import SemanticChunker
	from typing import List


	class SemanticChunking:
	def __init__(self, embedding_model, breakpoint_threshold_type="percentile"):
	"""
	Initialize the SemanticChunking class with a specified breakpoint threshold type and an embedding model.

	Args:
	embedding_model: An instance of the EmbeddingModel class to generate embeddings.
	breakpoint_threshold_type (str): The type of breakpoint threshold to use for chunking.
	Options include 'percentile', 'standard_deviation', 'interquartile'.
	"""
	self.text_splitter = SemanticChunker(
	embedding_model, breakpoint_threshold_type=breakpoint_threshold_type
	)

	def split_text(self, text: str) -> List:
	"""
	Split the provided text into semantic chunks.

	Args:
	text (str): The text to be split into chunks.

	Returns:
	list: A list of documents (chunks) obtained from the text.
	"""
	docs = self.text_splitter.create_documents([text])
	return docs

	def get_chunk(self, docs: List, chunk_index: int) -> str:
	"""
	Get a specific chunk from the list of documents.

	Args:
	docs (list): The list of documents (chunks).
	chunk_index (int): The index of the chunk to retrieve.

	Returns:
	str: The content of the specified chunk.
	"""
	if chunk_index < 0 or chunk_index >= len(docs):
	raise ValueError(
	f"Chunk index out of range. Please choose a value between 0 and {len(docs) - 1}."
	)

	return docs[chunk_index].page_content

	def get_total_chunks(self, docs: List) -> int:
	"""
	Get the total number of chunks in the list of documents.

	Args:
	docs (list): The list of documents (chunks).

	Returns:
	int: The total number of chunks.
	"""
	return len(docs)