# File size: 1,894 Bytes
# 94b1e32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import os

from config import PERSIST_DIRECTORY

def _join_highlighted_words(highlighted_words):
    """Flatten the highlighted-words field into one comma-separated string.

    Args:
        highlighted_words: The raw ``snippet_highlighted_words`` value.
            Usually a list, but any value is tolerated.

    Returns:
        str: The entries joined with ``", "``; an empty string for ``None``;
        ``str(value)`` for any other non-list value.
    """
    if highlighted_words is None:
        # A JSON null must not leak into the text as the literal "None".
        return ""
    if isinstance(highlighted_words, (list, tuple)):
        # str() each entry so a stray number or null cannot break the join.
        return ", ".join(str(word) for word in highlighted_words)
    return str(highlighted_words)


def _snippet_to_document(item):
    """Convert one structured search-result entry into a ``Document``.

    Args:
        item: A dictionary with a ``snippet`` and optional descriptive
            fields (``position``, ``title``, ``link``, ``source``,
            ``displayed_link``, ``snippet_highlighted_words``).

    Returns:
        Document | None: The document to index, or ``None`` when the entry
        is not a dictionary or carries no non-empty string snippet.
    """
    if not isinstance(item, dict):
        return None

    snippet = item.get("snippet", "")
    if not isinstance(snippet, str) or not snippet:
        return None

    highlighted = _join_highlighted_words(
        item.get("snippet_highlighted_words", [])
    )

    metadata = {
        "position": item.get("position"),
        "title": item.get("title"),
        "link": item.get("link"),
        "source": item.get("source"),
        "displayed_link": item.get("displayed_link"),
        # Stored flattened: the vector store expects scalar metadata values.
        "highlighted_words": highlighted,
    }
    # Drop absent fields instead of storing None, which Chroma's metadata
    # validation may reject (NOTE(review): depends on the chromadb version).
    metadata = {key: value for key, value in metadata.items() if value is not None}

    if highlighted:
        snippet += f" Highlighted words: {highlighted}"
    return Document(page_content=snippet, metadata=metadata)


def process_safety_with_chroma(data):
    """
    Processes and stores the given structured JSON data into ChromaDB.

    Each entry with a non-empty ``snippet`` becomes one document. Its text is
    the snippet, followed by the highlighted words when there are any. The
    remaining descriptive fields are stored as metadata.

    Args:
        data (list): A list of dictionaries containing structured JSON data.
            ``None`` or an empty list is treated as "nothing to index".
    Returns:
        Chroma: The Chroma vector store object persisted under
        ``PERSIST_DIRECTORY``.
    """
    documents = []
    for item in data or []:
        document = _snippet_to_document(item)
        if document is not None:
            documents.append(document)

    # Initialize embeddings and Chroma store
    embeddings = OpenAIEmbeddings()
    if not documents:
        # Nothing to embed: open the persisted store directly rather than
        # asking the embedding backend to process an empty batch.
        return Chroma(
            embedding_function=embeddings,
            persist_directory=PERSIST_DIRECTORY,
        )

    return Chroma.from_documents(
        documents, embeddings, persist_directory=PERSIST_DIRECTORY
    )