Update scrape.py
scrape.py
CHANGED
@@ -1,22 +1,22 @@
 import os
 
 from apify_client import ApifyClient
-from dotenv import load_dotenv
 from langchain.document_loaders import ApifyDatasetLoader
 from langchain.document_loaders.base import Document
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 
-#
-
+# Access variables and secrets as environment variables
+WEBSITE_URL = os.environ.get('WEBSITE_URL')
+OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
+APIFY_API_TOKEN = os.environ.get('APIFY_API_TOKEN')
 
 if __name__ == '__main__':
-    apify_client = ApifyClient(
-
-    print(f'Extracting data from "{website_url}". Please wait...')
+    apify_client = ApifyClient(APIFY_API_TOKEN)
+    print(f'Extracting data from "{WEBSITE_URL}". Please wait...')
     actor_run_info = apify_client.actor('apify/website-content-crawler').call(
-        run_input={'startUrls': [{'url':
+        run_input={'startUrls': [{'url': WEBSITE_URL}]}
     )
     print('Saving data into the vector database. Please wait...')
     loader = ApifyDatasetLoader(
@@ -29,7 +29,8 @@ if __name__ == '__main__':
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
     docs = text_splitter.split_documents(documents)
 
-
+    # Ensure the OPENAI_API_KEY is used correctly in OpenAIEmbeddings
+    embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
 
     vectordb = Chroma.from_documents(
         documents=docs,
@@ -37,4 +38,4 @@ if __name__ == '__main__':
         persist_directory='db2',
     )
     vectordb.persist()
-    print('All done!')
+    print('All done!')
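The new configuration block relies on `os.environ.get`, which returns `None` when a variable is unset, so a missing Space secret would only surface later as a confusing authentication error. A minimal fail-fast guard, not part of this commit and shown only as a sketch, could look like this:

```python
import os

# Hypothetical guard (not in the commit): fail fast when a required
# variable or secret is missing, instead of letting a None value
# surface later as an Apify or OpenAI authentication error.
REQUIRED_VARS = ('WEBSITE_URL', 'OPENAI_API_KEY', 'APIFY_API_TOKEN')
missing = [name for name in REQUIRED_VARS if not os.environ.get(name)]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
```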
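The hunks elide lines 23-28, which hold the body of the `ApifyDatasetLoader` call. Based on the standard LangChain/Apify integration pattern, that section plausibly reads the crawler's default dataset and maps each item to a `Document`; the sketch below is a reconstruction under that assumption, not the file's actual code:

```python
# Plausible shape of the elided loader body (lines 23-28), assuming the
# standard LangChain/Apify pattern: read the crawler run's default
# dataset and map each item to a Document of page text plus source URL.
loader = ApifyDatasetLoader(
    dataset_id=actor_run_info['defaultDatasetId'],
    dataset_mapping_function=lambda item: Document(
        page_content=item['text'] or '',
        metadata={'source': item['url']},
    ),
)
documents = loader.load()
```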
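Once the script has run, the embeddings persist in the `db2` directory, so a separate process can reopen the store without re-crawling the site. A rough usage sketch, assuming `OPENAI_API_KEY` is set in the environment:

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Sketch (not part of the commit): reopen the persisted Chroma store
# and run a similarity search against the crawled content.
embedding = OpenAIEmbeddings()  # reads OPENAI_API_KEY from the environment
vectordb = Chroma(persist_directory='db2', embedding_function=embedding)
for doc in vectordb.similarity_search('What is this website about?', k=3):
    print(doc.metadata.get('source'), doc.page_content[:100])
```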