# Chat_QnA_v2 / process_fb.py
import json
import ast
import os
import pinecone
from pydantic import Field
from vector_db import Document
from html_parser import HTMLParser
from langchain.vectorstores import Pinecone
from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME
from config import EMBEDDING_API_BASE, EMBEDDING_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID
from langchain.embeddings import OpenAIEmbeddings
# Initialize the Pinecone client
pinecone.init(
    api_key=PINECONE_API_KEY,          # find at app.pinecone.io
    environment=PINECONE_ENVIRONMENT,  # next to api key in console
)
# Azure embedding model definition
embeddings = OpenAIEmbeddings(
    deployment=EMBEDDING_DEPLOYMENT_ID,
    openai_api_key=EMBEDDING_API_KEY,
    openai_api_base=EMBEDDING_API_BASE,
    openai_api_type=OPENAI_API_TYPE,
    openai_api_version=OPENAI_API_VERSION,
    chunk_size=16,
)
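# Note: chunk_size=16 keeps embedding batches small; Azure OpenAI deployments
# commonly cap the number of inputs per embedding request, so a small batch
# size keeps each call within that limit (an assumption about this deployment).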
# Create the index if it does not exist yet (1536 dims matches the embedding model)
if INDEX_NAME and INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        INDEX_NAME,
        metric="cosine",
        dimension=1536,
    )
    print(f"Index {INDEX_NAME} created successfully")
index = pinecone.Index(INDEX_NAME)
with open('data.json') as json_file:
    data = json.load(json_file)
datas = ast.literal_eval(data)
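# Assumed shape of data.json (not shown here): a JSON-encoded string whose
# contents are a dict literal keyed by post id, e.g.
#   "{'123': {'content': 'post text', 'post_url': 'https://...'}}"
# which is why the file is json.load-ed first and then ast.literal_eval-ed.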
texts = []
for k, v in datas.items():
    content = v["content"]
    post_url = v["post_url"]
    texts.append(Document(page_content=content, metadata={"source": post_url}))
if len(texts) > 0:
    # Embed the documents and upsert them into the Pinecone index
    Pinecone.from_documents(texts, embeddings, index_name=INDEX_NAME)
    message = f"Added documents to {INDEX_NAME} successfully"
    print(message)
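# A minimal sketch of querying the populated index (not part of the original
# script; assumes Pinecone.from_existing_index and similarity_search are
# available in the installed LangChain version):
#
#   docsearch = Pinecone.from_existing_index(INDEX_NAME, embeddings)
#   results = docsearch.similarity_search("example question about a post", k=3)
#   for doc in results:
#       print(doc.metadata["source"], doc.page_content[:100])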