Spaces:
Runtime error
Runtime error
# Disabled scraping step, kept for reference: it writes the data.json file
# that the script below reads.
# import json
# from facebook_page_scraper import Facebook_scraper
# from config import *
# # Instantiate the Facebook_scraper class
# page_name = "fptsoftware.official"
# posts_count = 15
# browser = "firefox"
# timeout = 600  # 600 seconds
# headless = True
# meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
# json_data = meta_ai.scrap_to_json()
# with open('data.json', 'w') as f:
#     json.dump(json_data, f)
import json | |
from pydantic import Field | |
from langchain.load.serializable import Serializable | |
import pinecone | |
# from langchain.vectorstores import Pinecone | |
from custom_vectordb import Pinecone | |
from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME, CONNECTION_STRING, CONTAINER_NAME, NAME_SPACE_1, NAME_SPACE_2 | |
from config import EMBEDDING_API_BASE, EMBEDDING_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID | |
from langchain.embeddings import OpenAIEmbeddings | |
import ast | |
# Read the scraped posts from data.json. The encoding is pinned to UTF-8 so
# decoding does not depend on the platform's default locale encoding.
with open('data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
class Document(Serializable):
    """A piece of text together with the metadata that describes it."""

    # The text itself.
    page_content: str

    # Free-form metadata about the text (e.g. its source or its relationships
    # to other documents). Each instance gets its own empty dict by default.
    metadata: dict = Field(default_factory=dict)
def _decode_posts(raw):
    """Return the scraped posts as a dict, decoding ``raw`` when it is text.

    ``raw`` is whatever ``json.load`` produced from data.json: either an
    already-decoded dict, or a string that itself holds the serialized posts.

    Raises:
        TypeError: if the decoded value is not a dict.
        ValueError / SyntaxError: if the text is neither valid JSON nor a
            Python literal.
    """
    if isinstance(raw, str):
        try:
            # JSON is tried first: ast.literal_eval cannot parse the JSON
            # literals null / true / false.
            raw = json.loads(raw)
        except json.JSONDecodeError:
            # Backward compatibility: still accept text written as a Python
            # literal (e.g. single-quoted keys), as the old code did.
            raw = ast.literal_eval(raw)
    if not isinstance(raw, dict):
        raise TypeError(
            f"expected a mapping of posts, got {type(raw).__name__}"
        )
    return raw


def _post_to_document(post):
    """Build a ``Document`` from one post's "content", "image" and "post_url"."""
    # Keep only the text that precedes the first "-----" in the content.
    body = post["content"].split("-----")[0]
    content = body + "\nimage_link: " + str(post["image"])
    return Document(page_content=content, metadata={"source": post["post_url"]})


datas = _decode_posts(data)

# Build every document before touching the index, so malformed post data
# raises before the namespace has been wiped.
texts = [_post_to_document(post) for post in datas.values()]
print(len(texts))

# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENVIRONMENT,  # next to api key in console
)
index = pinecone.Index(INDEX_NAME)

# Clear the target namespace so the upload below replaces its contents.
index.delete(delete_all=True, namespace=NAME_SPACE_2)

embeddings = OpenAIEmbeddings(
    deployment=EMBEDDING_DEPLOYMENT_ID,
    openai_api_key=EMBEDDING_API_KEY,
    openai_api_base=EMBEDDING_API_BASE,
    openai_api_type=OPENAI_API_TYPE,
    openai_api_version=OPENAI_API_VERSION,
    chunk_size=16,
)

if texts:
    Pinecone.from_documents(
        texts, embeddings, index_name=INDEX_NAME, namespace=NAME_SPACE_2
    )
    # Report success only when an upload actually ran.
    message = f"Add facebook data to space {NAME_SPACE_2} in {INDEX_NAME} sucessfully"
    print(message)