File size: 2,401 Bytes
a4b89be
 
 
 
 
 
 
 
d037cdf
a4b89be
 
 
 
d037cdf
a4b89be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d037cdf
a4b89be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Disabled scraper snippet, kept for reference: it scrapes the Facebook page
# and dumps the result to data.json, which the code below reads.
# import json
# from facebook_page_scraper import Facebook_scraper
# from config import *
# # instantiate the Facebook_scraper class
# page_name = "fptsoftware.official"
# posts_count = 15
# browser = "firefox"

# timeout = 600  # 600 seconds
# headless = True
# meta_ai = Facebook_scraper(page_name, posts_count, browser,  timeout=timeout, headless=headless)
# json_data = meta_ai.scrap_to_json()

# with open('data.json', 'w') as f:
#     json.dump(json_data, f)
import json
from pydantic import Field
from langchain.load.serializable import Serializable
import pinecone
# from langchain.vectorstores import Pinecone
from custom_vectordb import Pinecone
from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME, CONNECTION_STRING, CONTAINER_NAME, NAME_SPACE_1, NAME_SPACE_2
from config import EMBEDDING_API_BASE, EMBEDDING_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID
from langchain.embeddings import OpenAIEmbeddings
import ast
# Load the scraped posts from data.json. The encoding is pinned to UTF-8 (the
# JSON interchange encoding) so the read does not depend on the platform's
# locale default.
with open('data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
class Document(Serializable):
    """A text passage paired with free-form metadata describing it."""

    # The text of the passage.
    page_content: str
    # Free-form metadata about the passage (e.g. its source or relationships
    # to other documents); a fresh empty dict is created when omitted.
    metadata: dict = Field(default_factory=dict)
def _parse_posts(raw):
    """Return the scraped posts as a dict.

    ``raw`` is whatever ``json.load`` produced from data.json: either an
    already-decoded dict, or a string that itself holds the serialized posts.
    A string is parsed as JSON first, because ``ast.literal_eval`` rejects the
    JSON tokens ``null``/``true``/``false``; ``ast.literal_eval`` is kept as a
    fallback for payloads that were stored as a Python literal.
    """
    if isinstance(raw, dict):
        return raw
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return ast.literal_eval(raw)


datas = _parse_posts(data)

# Build every document before touching the index, so that a malformed post
# (missing "content", "image" or "post_url") raises before the namespace is wiped.
# Each document holds the post text up to the first "-----" separator, followed
# by the post's image value; the post URL is recorded as the source.
texts = [
    Document(
        page_content=post["content"].split("-----")[0] + "\nimage_link: " + str(post["image"]),
        metadata={"source": post["post_url"]},
    )
    for post in datas.values()
]
print(len(texts))

# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENVIRONMENT,  # next to api key in console
)
index = pinecone.Index(INDEX_NAME)
# Remove everything currently stored in the target namespace before re-uploading.
index.delete(delete_all=True, namespace=NAME_SPACE_2)
embeddings = OpenAIEmbeddings(
    deployment=EMBEDDING_DEPLOYMENT_ID,
    openai_api_key=EMBEDDING_API_KEY,
    openai_api_base=EMBEDDING_API_BASE,
    openai_api_type=OPENAI_API_TYPE,
    openai_api_version=OPENAI_API_VERSION,
    chunk_size=16,
)
if texts:
    Pinecone.from_documents(texts, embeddings, index_name=INDEX_NAME, namespace=NAME_SPACE_2)
    message = f"Add facebook data to space {NAME_SPACE_2} in {INDEX_NAME} sucessfully"
    print(message)