# Chat_QnA_v2 / process_html.py
import os
import pinecone
from vector_db import Document
from html_parser import HTMLParser
from langchain.vectorstores import Pinecone
from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME
from config import EMBEDDING_API_BASE, EMBEDDING_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID
from langchain.embeddings import OpenAIEmbeddings
WEBSITE_FOLDER = 'website'
parser = HTMLParser()

# Initialize the Pinecone client
pinecone.init(
    api_key=PINECONE_API_KEY,          # find at app.pinecone.io
    environment=PINECONE_ENVIRONMENT,  # next to the API key in the console
)

# Azure OpenAI embedding model definition
embeddings = OpenAIEmbeddings(
    deployment=EMBEDDING_DEPLOYMENT_ID,
    openai_api_key=EMBEDDING_API_KEY,
    openai_api_base=EMBEDDING_API_BASE,
    openai_api_type=OPENAI_API_TYPE,
    openai_api_version=OPENAI_API_VERSION,
    chunk_size=16,  # Azure's embedding endpoint caps batches at 16 inputs
)
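
# Optional sanity check (sketch): confirm the deployment is reachable and that
# its vectors match the index dimension used below (1536 for text-embedding-ada-002).
# vec = embeddings.embed_query("hello")
# assert len(vec) == 1536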

# Create the index on first run
if INDEX_NAME and INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        INDEX_NAME,
        metric="cosine",
        dimension=1536,  # must match the embedding model's output dimension
    )
    print(f"Index {INDEX_NAME} created successfully")

index = pinecone.Index(INDEX_NAME)
# Wipe any existing vectors so the index holds only the documents added below
index.delete(delete_all=True)

# Parse every file in the website folder into per-chunk Documents
files_src = os.listdir(WEBSITE_FOLDER)
documents = []
for file in files_src:
    filepath = os.path.join(WEBSITE_FOLDER, file)
    filename = os.path.basename(filepath)
    data = parser.parse_file(filepath)
    texts = []
    for d in data:
        texts.append(Document(page_content=d, metadata={"source": filepath, "document_id": filename}))
    documents.extend(texts)

# Embed and upload the parsed documents, giving each chunk an id derived from
# its source document
if len(documents) > 0:
    document_ids = [d.metadata['document_id'] + f"_{idx}" for (idx, d) in enumerate(documents)]
    Pinecone.from_documents(documents, embeddings, ids=document_ids, index_name=INDEX_NAME)
    print(f"Added {len(documents)} documents to {INDEX_NAME} successfully")
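
# Example follow-up (sketch, hypothetical query string): search the populated
# index through langchain's Pinecone wrapper.
# docsearch = Pinecone.from_existing_index(INDEX_NAME, embeddings)
# results = docsearch.similarity_search("How do I reset my password?", k=4)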