File size: 3,683 Bytes
502c3a8
 
 
 
 
 
 
 
9afc33a
 
502c3a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9afc33a
 
 
 
 
 
 
 
502c3a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5c02af
502c3a8
 
 
 
 
 
 
 
 
e8aaad2
502c3a8
 
 
 
46e0b3f
502c3a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from typing import List, TypedDict
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings

import tiktoken

def load_web_documents(urls: List[str]) -> List[Document]:
    """
    Fetch and parse web pages into LangChain Document objects.

    Args:
        urls: Web addresses to download and parse.

    Returns:
        One Document per loaded page.
    """
    return WebBaseLoader(urls).load()

def create_rag_pipeline(collection_name: str = "rag_collection"):
    """
    Build the core RAG components: embedding-backed vector store, token-aware
    text splitter, and a top-k retriever.

    Args:
        collection_name: Name of the Qdrant collection to create.

    Returns:
        Dict with keys "vector_store", "text_splitter", and "retriever".
    """
    # OpenAI embedding model (kept for reference)
    # embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
    # Fine-tuned embedding model
    embedding_model = HuggingFaceEmbeddings(
        model_name="ric9176/cjo-ft-v0",
    )
    # embedding_dim = 1536  # Dimension for text-embedding-3-small
    # NOTE(review): 1024 is the dimension of Snowflake/snowflake-arctic-embed-l;
    # this assumes "ric9176/cjo-ft-v0" is a fine-tune of that model — confirm.
    embedding_dim = 1024

    # In-memory Qdrant: all vectors are lost when the process exits
    # (development/testing only).
    client = QdrantClient(":memory:")

    # Create collection for vectors
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
    )

    # Create vector store
    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embedding_model,
    )

    # Hoist the tokenizer lookup: the original resolved
    # tiktoken.encoding_for_model(...) inside the length function, i.e. once
    # per chunk the splitter measures. Resolve it once here instead.
    encoding = tiktoken.encoding_for_model("gpt-4o-mini")

    def tiktoken_len(text: str) -> int:
        # Token-based length so chunk_size/chunk_overlap are measured in
        # model tokens rather than characters.
        return len(encoding.encode(text))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # tokens per chunk; adjust based on your needs
        chunk_overlap=50,
        length_function=tiktoken_len,
    )

    # Top-5 similarity-search retriever over the (initially empty) store.
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    return {
        "vector_store": vector_store,
        "text_splitter": text_splitter,
        "retriever": retriever,
    }

def add_documents(vector_store, text_splitter, documents: List[Document]):
    """
    Chunk documents and add them to the vector store.

    Args:
        vector_store: The initialized vector store
        text_splitter: The text splitter for chunking
        documents: List of Document objects to add

    Fix: the original split each document's raw page_content with
    split_text() and stored bare strings via add_texts(), which silently
    dropped every document's metadata (e.g. the source URL) — losing
    provenance for retrieved chunks. split_documents() performs the same
    chunking but carries each parent document's metadata onto its chunks,
    and add_documents() stores it alongside the text.
    """
    # Split documents into chunks, preserving per-document metadata.
    chunks = text_splitter.split_documents(documents)

    # Add chunks (text + metadata) to the vector store.
    vector_store.add_documents(chunks)

def add_urls_to_vectorstore(vector_store, text_splitter, urls: List[str]):
    """
    Fetch the given URLs and index their contents in the vector store.

    Args:
        vector_store: The initialized vector store
        text_splitter: The text splitter for chunking
        urls: List of URLs to load and add
    """
    # Download and parse the pages, then chunk and index them.
    docs = load_web_documents(urls)
    add_documents(vector_store, text_splitter, docs)

def get_relevant_context(retriever, question: str) -> List[Document]:
    """
    Get relevant context for a question.

    Args:
        retriever: The initialized retriever
        question: The question to find context for

    Returns:
        List of relevant documents

    Fix: retriever.get_relevant_documents() is deprecated in langchain-core
    (>=0.1.46) in favor of the Runnable interface; this file already uses the
    modern split packages (langchain_qdrant, langchain_openai), so invoke()
    is available and returns the same List[Document].
    """
    return retriever.invoke(question)