|
from abc import ABC, abstractmethod |
|
import re |
|
from collections import Counter |
|
import string |
|
from .model_loader import load_nltk_punkt |
|
from .utils import * |
|
|
|
|
|
class ChunkingStrategy(ABC):
    """
    Common interface shared by every text chunking strategy.

    Concrete strategies subclass this class and override :meth:`chunk`.
    Because ``chunk`` is abstract, this base class cannot be instantiated
    directly.
    """

    @abstractmethod
    def chunk(self, text: str) -> list:
        """
        Split the given text into chunks.

        Args:
            text (str): The text to chunk.

        Returns:
            list: The chunks produced from ``text``.
        """
        ...
|
|
|
|
|
class IdentityChunking(ChunkingStrategy):
    """
    Pass-through strategy: the whole input becomes the one and only chunk.
    """

    def chunk(self, text: str) -> list:
        """
        Wrap ``text`` in a single-element list without modifying it.

        Args:
            text (str): The text to chunk.

        Returns:
            list: ``[text]``.
        """
        single_chunk = [text]
        return single_chunk
|
|
|
|
|
class RegexChunking(ChunkingStrategy):
    """
    Chunking strategy that cuts text wherever a regular expression matches.

    The patterns are applied one after another: the pieces produced by one
    pattern are split again by the next.
    """

    def __init__(self, patterns=None, **kwargs):
        """
        Set up the regex chunker.

        Args:
            patterns (list): Regular expression patterns used as separators.
                When omitted, text is split on blank lines (``\\n\\n``).
            **kwargs: Accepted and ignored.
        """
        self.patterns = [r'\n\n'] if patterns is None else patterns

    def chunk(self, text: str) -> list:
        """
        Split ``text`` on every configured pattern in turn.

        Args:
            text (str): The text to chunk.

        Returns:
            list: The resulting pieces, exactly as ``re.split`` yields them
            (empty strings between adjacent separators are kept).
        """
        pieces = [text]
        for separator in self.patterns:
            # Re-split every piece from the previous pass and flatten.
            pieces = [
                fragment
                for piece in pieces
                for fragment in re.split(separator, piece)
            ]
        return pieces
|
|
|
|
|
class NlpSentenceChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.

    Duplicate sentences are returned only once, and the sentences keep the
    order in which they first appear in the text.
    """

    def __init__(self, **kwargs):
        """
        Initialize the NlpSentenceChunking object.

        Args:
            **kwargs: Accepted and ignored.
        """
        # NOTE(review): presumably makes the NLTK "punkt" sentence model
        # available before the first call to chunk() -- confirm in model_loader.
        load_nltk_punkt()

    def chunk(self, text: str) -> list:
        """
        Split ``text`` into unique, whitespace-stripped sentences.

        Args:
            text (str): The text to chunk.

        Returns:
            list: The distinct sentences of ``text`` in first-occurrence order.
        """
        # Imported lazily so nltk is only needed when this method is called.
        from nltk.tokenize import sent_tokenize

        stripped = (sentence.strip() for sentence in sent_tokenize(text))
        # A set would also remove duplicates but scrambles the order (and the
        # order would vary between runs); dict keys keep insertion order.
        return list(dict.fromkeys(stripped))
|
|
|
|
|
class TopicSegmentationChunking(ChunkingStrategy):
    """
    Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer.

    How it works:
    1. Segment the text into topics using TextTilingTokenizer
    2. Optionally extract the most frequent keywords of each topic segment
       (see :meth:`chunk_with_topics`)
    """

    def __init__(self, num_keywords=3, **kwargs):
        """
        Initialize the TopicSegmentationChunking object.

        Args:
            num_keywords (int): The number of keywords to extract for each topic segment.
            **kwargs: Accepted and ignored.
        """
        import nltk as nl

        self.tokenizer = nl.tokenize.TextTilingTokenizer()
        self.num_keywords = num_keywords

    def chunk(self, text: str) -> list:
        """
        Segment ``text`` into topical sections.

        Args:
            text (str): The text to chunk.

        Returns:
            list: The segments produced by the TextTiling tokenizer.
        """
        # NOTE(review): TextTilingTokenizer may reject inputs without
        # blank-line paragraph breaks -- confirm against the NLTK docs.
        return self.tokenizer.tokenize(text)

    def extract_keywords(self, text: str) -> list:
        """
        Return the most frequent non-stop-word tokens of ``text``.

        Tokens are lower-cased before filtering so that capitalised stop words
        ("The", "And", ...) are removed as well. Tokens made up solely of
        punctuation characters are dropped.

        Args:
            text (str): The text (typically one topic segment) to analyse.

        Returns:
            list: Up to ``num_keywords`` lower-cased keywords, most frequent first.
        """
        import nltk as nl

        # Build the lookup sets once instead of re-reading the stop-word
        # corpus for every token.
        stop_words = set(nl.corpus.stopwords.words('english'))
        punctuation = set(string.punctuation)

        tokens = []
        for raw_token in nl.tokenize.word_tokenize(text):
            token = raw_token.lower()
            if token in stop_words:
                continue
            # Catches multi-character punctuation tokens such as "..." too.
            if all(char in punctuation for char in token):
                continue
            tokens.append(token)

        freq_dist = Counter(tokens)
        return [word for word, _ in freq_dist.most_common(self.num_keywords)]

    def chunk_with_topics(self, text: str) -> list:
        """
        Segment ``text`` and attach keywords to every segment.

        Args:
            text (str): The text to chunk.

        Returns:
            list: ``(segment, keywords)`` tuples, one per topic segment.
        """
        return [
            (segment, self.extract_keywords(segment))
            for segment in self.chunk(text)
        ]
|
|
|
|
|
class FixedLengthWordChunking(ChunkingStrategy):
    """
    Chunking strategy that groups words into consecutive, non-overlapping
    chunks of a fixed word count.

    How it works:
    1. Split the text on whitespace
    2. Take ``chunk_size`` words at a time
    3. Join each group back together with single spaces
    """

    def __init__(self, chunk_size=100, **kwargs):
        """
        Set up the fixed-length word chunker.

        Args:
            chunk_size (int): The size of each chunk in words.
            **kwargs: Accepted and ignored.
        """
        self.chunk_size = chunk_size

    def chunk(self, text: str) -> list:
        """
        Cut ``text`` into chunks of ``chunk_size`` words.

        Args:
            text (str): The text to chunk.

        Returns:
            list: Space-joined word groups; the last one may be shorter.
            Empty when ``text`` contains no words.
        """
        tokens = text.split()
        size = self.chunk_size
        pieces = []
        for offset in range(0, len(tokens), size):
            group = tokens[offset:offset + size]
            pieces.append(' '.join(group))
        return pieces
|
|
|
|
|
class SlidingWindowChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into overlapping word chunks.

    How it works:
    1. Split the text into words
    2. Emit a chunk of ``window_size`` words, then move the window forward
       by ``step`` words and repeat while a full window still fits
    3. If words remain after the last full window, add one final chunk
       holding the last ``window_size`` words so the tail is not lost
    4. Return the list of chunks
    """

    def __init__(self, window_size=100, step=50, **kwargs):
        """
        Initialize the sliding window chunking strategy with the given window size and
        step size.

        Args:
            window_size (int): The size of the sliding window in words.
            step (int): The step size for sliding the window in words.
            **kwargs: Accepted and ignored.
        """
        self.window_size = window_size
        self.step = step

    def chunk(self, text: str) -> list:
        """
        Split ``text`` into windows of ``window_size`` words.

        Args:
            text (str): The text to chunk.

        Returns:
            list: The chunks. A text with at most ``window_size`` words is
            returned as ``[text]`` with its whitespace untouched.

        Raises:
            ValueError: If the text needs windowing and ``window_size`` or
                ``step`` is not positive.
        """
        words = text.split()
        total = len(words)
        size = self.window_size

        # Short input fits in a single window: hand it back untouched.
        if total <= size:
            return [text]

        # Validate only once windowing is actually needed, so short texts
        # keep working exactly as before.
        if size <= 0:
            raise ValueError(
                "window_size must be positive, got {!r}".format(size)
            )
        if self.step <= 0:
            raise ValueError(
                "step must be positive, got {!r}".format(self.step)
            )

        # total > size and step > 0, so there is always at least one start.
        starts = range(0, total - size + 1, self.step)
        chunks = [' '.join(words[start:start + size]) for start in starts]

        # The last full window may stop short of the end; cover the tail.
        if starts[-1] + size < total:
            chunks.append(' '.join(words[-size:]))

        return chunks
|
|
|
class OverlappingWindowChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into overlapping word chunks.

    How it works:
    1. Split the text into words using whitespace
    2. Create chunks of fixed length equal to the window size
    3. Advance the window by ``window_size - overlap`` words, so consecutive
       chunks share ``overlap`` words
    4. Return the list of chunks
    """

    def __init__(self, window_size=1000, overlap=100, **kwargs):
        """
        Initialize the overlapping window chunking strategy with the given window size and
        overlap size.

        Args:
            window_size (int): The size of the window in words.
            overlap (int): The size of the overlap between consecutive chunks in words.
                Must be smaller than ``window_size`` for texts that need
                more than one chunk.
            **kwargs: Accepted and ignored.
        """
        self.window_size = window_size
        self.overlap = overlap

    def chunk(self, text: str) -> list:
        """
        Split ``text`` into windows that overlap by ``overlap`` words.

        Args:
            text (str): The text to chunk.

        Returns:
            list: The chunks; the last one may be shorter than the window.
            A text with at most ``window_size`` words is returned as
            ``[text]`` with its whitespace untouched.

        Raises:
            ValueError: If the text needs windowing and ``window_size`` is
                not positive or ``overlap`` is not smaller than ``window_size``.
        """
        words = text.split()
        total = len(words)
        size = self.window_size

        # Short input fits in a single window: hand it back untouched.
        if total <= size:
            return [text]

        if size <= 0:
            raise ValueError(
                "window_size must be positive, got {!r}".format(size)
            )

        # Distance between the starts of two consecutive windows.
        stride = size - self.overlap
        if stride <= 0:
            # Without this guard the window would never move forward and the
            # loop below would never terminate.
            raise ValueError(
                "overlap ({!r}) must be smaller than window_size ({!r})".format(
                    self.overlap, size
                )
            )

        chunks = []
        for start in range(0, total, stride):
            end = start + size
            chunks.append(' '.join(words[start:end]))
            # Stop once a window has reached the final word.
            if end >= total:
                break

        return chunks