Spaces:

QAI
/

Chat_QnA_v2

Runtime error

App Files Files Community

Chat_QnA_v2 / scrab_fb.py

binh99

update cosmos db

a4b89be over 1 year ago

raw

history blame contribute delete

2.4 kB

	# Disabled scraping step (commented out): writes the scraped posts to
	# data.json, the file the code below reads.
	# import json
	# from facebook_page_scraper import Facebook_scraper
	# from config import *
	# # instantiate the Facebook_scraper class
	# page_name = "fptsoftware.official"
	# posts_count = 15
	# browser = "firefox"

	# timeout = 600  # 600 seconds
	# headless = True
	# meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
	# json_data = meta_ai.scrap_to_json()

	# with open('data.json', 'w') as f:
	#     json.dump(json_data, f)
	import json
	from pydantic import Field
	from langchain.load.serializable import Serializable
	import pinecone
	# from langchain.vectorstores import Pinecone
	from custom_vectordb import Pinecone
	from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME, CONNECTION_STRING, CONTAINER_NAME, NAME_SPACE_1, NAME_SPACE_2
	from config import EMBEDDING_API_BASE, EMBEDDING_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID
	from langchain.embeddings import OpenAIEmbeddings
	import ast
	# Load the previously scraped payload from data.json. The encoding is pinned
	# to UTF-8 so decoding does not depend on the platform's locale default.
	with open('data.json', encoding='utf-8') as json_file:
	    data = json.load(json_file)
	class Document(Serializable):
	    """A piece of text bundled with optional metadata describing it."""

	    # The text body of the document.
	    page_content: str
	    # Arbitrary metadata about the page content (e.g. source, relationships to
	    # other documents). Defaults to a new empty dict via `default_factory`.
	    metadata: dict = Field(default_factory=dict)
	# Re-index the scraped posts held in `data` into the NAME_SPACE_2 namespace
	# of the Pinecone index INDEX_NAME.
	#
	# Everything that can fail on bad input or bad configuration (parsing,
	# document building, embedding-client construction) runs BEFORE the namespace
	# is wiped, so such a failure no longer leaves the namespace empty.

	# Decode the payload into a mapping of posts. JSON is tried first because
	# ast.literal_eval rejects the JSON literals null/true/false; literal_eval is
	# kept as a fallback for payloads saved as a Python-literal repr.
	# NOTE(review): presumably `data` is JSON text produced by the scraper -
	# confirm against the code that writes data.json.
	if isinstance(data, str):
	    try:
	        datas = json.loads(data)
	    except json.JSONDecodeError:
	        datas = ast.literal_eval(data)
	else:
	    # Already-decoded mapping: use it as is.
	    datas = data

	# One Document per post: the text before the first "-----" separator plus
	# the post's image value, with the post URL recorded as the source.
	texts = [
	    Document(
	        page_content=(
	            post["content"].split("-----")[0]
	            + "\nimage_link: "
	            + str(post["image"])
	        ),
	        metadata={"source": post["post_url"]},
	    )
	    for post in datas.values()
	]
	print(len(texts))

	embeddings = OpenAIEmbeddings(
	    deployment=EMBEDDING_DEPLOYMENT_ID,
	    openai_api_key=EMBEDDING_API_KEY,
	    openai_api_base=EMBEDDING_API_BASE,
	    openai_api_type=OPENAI_API_TYPE,
	    openai_api_version=OPENAI_API_VERSION,
	    chunk_size=16,
	)

	# initialize pinecone
	pinecone.init(
	    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
	    environment=PINECONE_ENVIRONMENT,  # next to api key in console
	)
	index = pinecone.Index(INDEX_NAME)

	# Destructive step: clear the namespace before the upload below. As in the
	# original, this runs even when there are no posts to upload.
	index.delete(delete_all=True, namespace=NAME_SPACE_2)

	if texts:
	    Pinecone.from_documents(texts, embeddings, index_name=INDEX_NAME, namespace=NAME_SPACE_2)
	    message = f"Add facebook data to space {NAME_SPACE_2} in {INDEX_NAME} sucessfully"
	    print(message)