"""Build/refresh the website Pinecone index.

Parses every file under WEBSITE_FOLDER with HTMLParser, wraps each parsed
chunk in a Document tagged with its source path, wipes the existing index
content, and upserts fresh embeddings into the Pinecone index INDEX_NAME
(creating the index on first run).
"""
import os

import pinecone
from pydantic import Field  # NOTE(review): unused here — confirm before removing
from vector_db import Document
from html_parser import HTMLParser
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings

from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME
from config import (
    EMBEDDING_API_BASE,
    EMBEDDING_API_KEY,
    OPENAI_API_TYPE,
    OPENAI_API_VERSION,
    EMBEDDING_DEPLOYMENT_ID,
)

WEBSITE_FOLDER = 'website'

parser = HTMLParser()

# Initialize the Pinecone client.
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENVIRONMENT,  # next to api key in console
)

# Azure OpenAI embedding model; chunk_size=16 caps texts per embedding request.
embeddings = OpenAIEmbeddings(
    deployment=EMBEDDING_DEPLOYMENT_ID,
    openai_api_key=EMBEDDING_API_KEY,
    openai_api_base=EMBEDDING_API_BASE,
    openai_api_type=OPENAI_API_TYPE,
    openai_api_version=OPENAI_API_VERSION,
    chunk_size=16,
)

# Create the index on first run; dimension 1536 matches the embedding model output.
if INDEX_NAME and INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        INDEX_NAME,
        metric="cosine",
        dimension=1536
    )
    print(f"Index {INDEX_NAME} created successfully")

# Wipe all existing vectors so this run fully replaces previous content.
index = pinecone.Index(INDEX_NAME)
index.delete(delete_all=True)

# Parse every file in the website folder into Document chunks,
# tagging each chunk with the path it came from.
documents = []
for file in os.listdir(WEBSITE_FOLDER):
    filepath = os.path.join(WEBSITE_FOLDER, file)
    for chunk in parser.parse_file(filepath):
        documents.append(Document(page_content=chunk, metadata={"source": filepath}))

print(len(documents))

if documents:
    # BUG FIX: metadata only carries a "source" key (set in the loop above);
    # the original read metadata['document_id'], which raised KeyError on
    # every non-empty run. Build stable unique ids from the source path plus
    # the chunk's position instead (same "<base>_<idx>" shape as intended).
    document_id = [f"{d.metadata['source']}_{idx}" for idx, d in enumerate(documents)]
    Pinecone.from_documents(documents, embeddings, ids=document_id, index_name=INDEX_NAME)

message = f"Add website to {INDEX_NAME} successfully"