"""Index scraped Facebook posts from ``data.json`` into a Pinecone namespace.

The script reads ``data.json``, turns every post into a ``Document`` (post text
plus its image link, with the post URL as the source), clears the
``NAME_SPACE_2`` namespace of the ``INDEX_NAME`` index and uploads the freshly
embedded documents into it.
"""

# Reference only: the (disabled) snippet that produced data.json.
# Note that it json.dump()s the value returned by scrap_to_json().
#
# import json
# from facebook_page_scraper import Facebook_scraper
# from config import *
# # instantiate the Facebook_scraper class
# page_name = "fptsoftware.official"
# posts_count = 15
# browser = "firefox"
# timeout = 600  # 600 seconds
# headless = True
# meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
# json_data = meta_ai.scrap_to_json()
# with open('data.json', 'w') as f:
#     json.dump(json_data, f)

import ast
import json

import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.load.serializable import Serializable
from pydantic import Field

# from langchain.vectorstores import Pinecone
from custom_vectordb import Pinecone
from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME, CONNECTION_STRING, CONTAINER_NAME, NAME_SPACE_1, NAME_SPACE_2
from config import EMBEDDING_API_BASE, EMBEDDING_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID


class Document(Serializable):
    """Class for storing a piece of text and associated metadata."""

    page_content: str
    """String text."""
    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata about the page content (e.g., source, relationships to other
    documents, etc.).
    """


def _parse_posts(raw):
    """Return the decoded posts mapping from the value loaded out of data.json.

    Args:
        raw: The object returned by ``json.load``. It is either an already
            decoded mapping or a string that itself holds the serialized posts.

    Returns:
        The decoded posts object.

    Raises:
        ValueError, SyntaxError: If ``raw`` is a string that is neither valid
            JSON nor a valid Python literal.
    """
    if not isinstance(raw, str):
        # Already decoded by json.load; nothing left to parse.
        return raw
    try:
        # JSON is the right grammar here: ast.literal_eval rejects the JSON
        # literals true / false / null.
        return json.loads(raw)
    except json.JSONDecodeError:
        # Backward compatibility with files that stored a Python-literal repr.
        return ast.literal_eval(raw)


def _build_documents(posts):
    """Convert the posts mapping into a list of ``Document`` objects.

    Each value of ``posts`` must provide the keys ``content``, ``image`` and
    ``post_url``. Only the part of ``content`` before the first ``-----``
    separator is kept, followed by an ``image_link`` line.
    """
    return [
        Document(
            page_content=post["content"].split("-----")[0] + "\nimage_link: " + str(post["image"]),
            metadata={"source": post["post_url"]},
        )
        for post in posts.values()
    ]


with open('data.json', encoding="utf-8") as json_file:
    data = json.load(json_file)

# Parse and build everything locally *before* touching the index, so that a
# malformed data.json cannot wipe the namespace and then crash.
datas = _parse_posts(data)
texts = _build_documents(datas)

embeddings = OpenAIEmbeddings(
    deployment=EMBEDDING_DEPLOYMENT_ID,
    openai_api_key=EMBEDDING_API_KEY,
    openai_api_base=EMBEDDING_API_BASE,
    openai_api_type=OPENAI_API_TYPE,
    openai_api_version=OPENAI_API_VERSION,
    chunk_size=16,
)

# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENVIRONMENT,  # next to api key in console
)
index = pinecone.Index(INDEX_NAME)
# Drop the previous content of the namespace so it only holds the new posts.
index.delete(delete_all=True, namespace=NAME_SPACE_2)

print(len(texts))
if texts:
    Pinecone.from_documents(texts, embeddings, index_name=INDEX_NAME, namespace=NAME_SPACE_2)
    # Report success only when documents were actually uploaded.
    message = f"Add facebook data to space {NAME_SPACE_2} in {INDEX_NAME} sucessfully"
    print(message)