Chat_QnA_v2 / scrab_fb.py
binh99's picture
update cosmos db
a4b89be
# import json
# from facebook_page_scraper import Facebook_scraper
# from facebook_page_scraper import Facebook_scraper
# from config import *
# #instantiate the Facebook_scraper class
# page_name = "fptsoftware.official"
# posts_count = 15
# browser = "firefox"
# timeout = 600 #600 seconds
# headless = True
# meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
# json_data = meta_ai.scrap_to_json()
# with open('data.json', 'w') as f:
# json.dump(json_data, f)
import json
from pydantic import Field
from langchain.load.serializable import Serializable
import pinecone
# from langchain.vectorstores import Pinecone
from custom_vectordb import Pinecone
from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME, CONNECTION_STRING, CONTAINER_NAME, NAME_SPACE_1, NAME_SPACE_2
from config import EMBEDDING_API_BASE, EMBEDDING_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID
from langchain.embeddings import OpenAIEmbeddings
import ast
# Load the scraped Facebook data from disk. The encoding is pinned to UTF-8
# (the standard encoding for JSON text) so decoding does not depend on the
# platform's default locale encoding.
with open('data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
class Document(Serializable):
    """A piece of text bundled with the metadata that describes it.

    Attributes:
        page_content: The text itself.
        metadata: Arbitrary information about the text (e.g. its source or
            its relationships to other documents). Each instance gets its
            own new dict when no value is supplied.
    """

    # The text held by this document.
    page_content: str
    # Free-form metadata; default_factory creates a separate dict per instance.
    metadata: dict = Field(default_factory=dict)
# `data` is whatever json.load() returned. When it is a string, it is a
# serialized payload that still needs parsing: try JSON first, because
# ast.literal_eval rejects JSON-only tokens such as null/true/false, and fall
# back to literal_eval only for payloads written as Python literals.
# When it is already a parsed container, use it as-is (literal_eval would
# raise on a non-string argument).
if isinstance(data, str):
    try:
        datas = json.loads(data)
    except json.JSONDecodeError:
        datas = ast.literal_eval(data)
else:
    datas = data
# Connect to Pinecone. The API key is found at app.pinecone.io and the
# environment name is shown next to it in the console.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

# Remove every vector currently stored in the target namespace.
# NOTE(review): this wipe runs before the new documents are built and
# uploaded further down; if a later step fails, the namespace stays empty —
# confirm that this is acceptable.
index = pinecone.Index(INDEX_NAME)
index.delete(namespace=NAME_SPACE_2, delete_all=True)

# Embedding client; every connection setting comes from config.
embeddings = OpenAIEmbeddings(
    chunk_size=16,
    openai_api_version=OPENAI_API_VERSION,
    openai_api_type=OPENAI_API_TYPE,
    openai_api_base=EMBEDDING_API_BASE,
    openai_api_key=EMBEDDING_API_KEY,
    deployment=EMBEDDING_DEPLOYMENT_ID,
)
# Convert each scraped post into a Document: keep only the text before the
# first "-----" separator, append the post's image value, and record the post
# URL as the document source.
texts = [
    Document(
        page_content=post["content"].split("-----")[0] + "\nimage_link: " + str(post["image"]),
        metadata={"source": post["post_url"]},
    )
    for post in datas.values()
]
print(len(texts))

# Upload only when there is at least one document to store.
if texts:
    Pinecone.from_documents(texts, embeddings, index_name=INDEX_NAME, namespace=NAME_SPACE_2)
    message = f"Add facebook data to space {NAME_SPACE_2} in {INDEX_NAME} sucessfully"
    print(message)