	Delete scrape.py
scrape.py
DELETED
@@ -1,41 +0,0 @@
-import os
-
-from apify_client import ApifyClient
-from langchain.document_loaders import ApifyDatasetLoader
-from langchain.document_loaders.base import Document
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import Chroma
-
-# Access variables and secrets as environment variables
-WEBSITE_URL = os.environ.get('WEBSITE_URL')
-OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
-APIFY_API_TOKEN = os.environ.get('APIFY_API_TOKEN')
-
-if __name__ == '__main__':
-    apify_client = ApifyClient(APIFY_API_TOKEN)
-    print(f'Extracting data from "{WEBSITE_URL}". Please wait...')
-    actor_run_info = apify_client.actor('apify/website-content-crawler').call(
-        run_input={'startUrls': [{'url': WEBSITE_URL}]}
-    )
-    print('Saving data into the vector database. Please wait...')
-    loader = ApifyDatasetLoader(
-        dataset_id=actor_run_info['defaultDatasetId'],
-        dataset_mapping_function=lambda item: Document(
-            page_content=item['text'] or '', metadata={'source': item['url']}
-        ),
-    )
-    documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
-    docs = text_splitter.split_documents(documents)
-
-    # Ensure the OPENAI_API_KEY is used correctly in OpenAIEmbeddings
-    embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
-
-    vectordb = Chroma.from_documents(
-        documents=docs,
-        embedding=embedding,
-        persist_directory='db2',
-    )
-    vectordb.persist()
-    print('All done!')
