"""Load scraped posts from ``data.json`` and index them in Pinecone.

Running this script:

1. initialises the Pinecone client and the Azure OpenAI embedding model,
2. creates the target index when it does not exist yet,
3. reads ``data.json`` and wraps every entry in a ``Document``,
4. embeds the documents and uploads them to the index.
"""
import ast
import json
import os

import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from pydantic import Field

from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME
from config import EMBEDDING_API_BASE, EMBEDDING_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID
from html_parser import HTMLParser
from vector_db import Document

# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENVIRONMENT,  # next to api key in console
)

# Azure embedding model definition
embeddings = OpenAIEmbeddings(
    deployment=EMBEDDING_DEPLOYMENT_ID,
    openai_api_key=EMBEDDING_API_KEY,
    openai_api_base=EMBEDDING_API_BASE,
    openai_api_type=OPENAI_API_TYPE,
    openai_api_version=OPENAI_API_VERSION,
    # NOTE(review): presumably kept small to respect the Azure per-request
    # batch limit for embedding inputs -- confirm against the deployment.
    chunk_size=16,
)

# Create the index on first run only; an empty INDEX_NAME skips creation.
if INDEX_NAME and INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        INDEX_NAME,
        metric="cosine",
        # NOTE(review): must equal the embedding vector size of the deployed
        # model (1536 presumably matches the configured deployment -- confirm).
        dimension=1536,
    )
    print(f"Index {INDEX_NAME} created successfully")

index = pinecone.Index(INDEX_NAME)

# JSON text is UTF-8 by specification, so do not rely on the locale default.
with open('data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# The file may hold either a JSON object or a JSON *string* that contains a
# Python dict literal. ast.literal_eval only accepts strings (it raises
# ValueError on an already-decoded dict), so evaluate only in the string case.
datas = ast.literal_eval(data) if isinstance(data, str) else data

# One Document per entry; the mapping keys are not needed for indexing.
texts = [
    Document(page_content=v["content"], metadata={"source": v["post_url"]})
    for v in datas.values()
]

# Skip the embedding/upload round-trip entirely when there is nothing to add.
if texts:
    Pinecone.from_documents(texts, embeddings, index_name=INDEX_NAME)
    message = f"Add files to {INDEX_NAME} sucessfully"
    print(message)