#!/usr/bin/env python3
"""
Cleanup script to remove all web crawler collections from MongoDB
and to list and remove files related to simple_crawler
"""
import os
import sys
import logging
import shutil

from pymongo import MongoClient

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
)
logger = logging.getLogger("cleanup")

def cleanup_mongodb():
    """Remove all web crawler data from MongoDB"""
    try:
        # Connect to MongoDB
        logger.info("Connecting to MongoDB...")
        client = MongoClient("mongodb://localhost:27017/")

        # Access crawler database
        db = client["crawler"]

        # List and drop all collections
        collections = db.list_collection_names()
        if not collections:
            logger.info("No collections found in the crawler database")
        else:
            logger.info(f"Found {len(collections)} collections to drop: {collections}")
            for collection in collections:
                logger.info(f"Dropping collection: {collection}")
                db[collection].drop()
            logger.info("All crawler collections dropped successfully")

        # Optional: Drop the entire database
        # client.drop_database("crawler")
        # logger.info("Dropped entire crawler database")

        logger.info("MongoDB cleanup completed")
    except Exception as e:
        logger.error(f"Error cleaning up MongoDB: {e}")
        return False
    return True
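
# A narrower alternative (sketch): drop only collections whose names start
# with a given prefix rather than everything in the database. The "crawl_"
# prefix is a hypothetical example; this helper is not called by the script.
def cleanup_mongodb_by_prefix(prefix="crawl_"):
    """Drop only crawler-database collections matching a name prefix."""
    client = MongoClient("mongodb://localhost:27017/")
    db = client["crawler"]
    for name in db.list_collection_names():
        if name.startswith(prefix):
            logger.info(f"Dropping collection: {name}")
            db[name].drop()
    client.close()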

def cleanup_files():
    """List and remove files related to simple_crawler"""
    try:
        crawler_dir = os.path.dirname(os.path.abspath(__file__))

        # Files directly related to simple_crawler
        simple_crawler_files = [
            os.path.join(crawler_dir, "simple_crawler.py"),
            os.path.join(crawler_dir, "README_SIMPLE.md"),
            os.path.join(crawler_dir, "simple_crawler.log")
        ]

        # Check storage directories
        storage_dir = os.path.join(crawler_dir, "storage")
        if os.path.exists(storage_dir):
            logger.info(f"Will remove storage directory: {storage_dir}")
            simple_crawler_files.append(storage_dir)

        # List all files that will be removed
        logger.info("The following files will be removed:")
        for file_path in simple_crawler_files:
            if os.path.exists(file_path):
                logger.info(f" - {file_path}")
            else:
                logger.info(f" - {file_path} (not found)")

        # Confirm removal
        confirm = input("Do you want to proceed with removal? (y/n): ")
        if confirm.lower() != 'y':
            logger.info("File removal cancelled")
            return False

        # Remove files and directories
        for file_path in simple_crawler_files:
            if os.path.exists(file_path):
                if os.path.isdir(file_path):
                    logger.info(f"Removing directory: {file_path}")
                    shutil.rmtree(file_path)
                else:
                    logger.info(f"Removing file: {file_path}")
                    os.remove(file_path)

        logger.info("File cleanup completed")
    except Exception as e:
        logger.error(f"Error cleaning up files: {e}")
        return False
    return True
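
# Optional verification helper (a sketch, not part of the original script;
# assumes the same local MongoDB instance). Not called automatically; run it
# manually after cleanup if desired.
def verify_mongodb_cleanup():
    """Return True if the crawler database contains no collections."""
    client = MongoClient("mongodb://localhost:27017/")
    remaining = client["crawler"].list_collection_names()
    client.close()
    if remaining:
        logger.warning(f"Collections still present after cleanup: {remaining}")
        return False
    logger.info("Verified: no collections remain in the crawler database")
    return True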

if __name__ == "__main__":
    print("Web Crawler Cleanup Utility")
    print("---------------------------")
    print("This script will:")
    print("1. Remove all web crawler collections from MongoDB")
    print("2. List and remove files related to simple_crawler")
    print()

    proceed = input("Do you want to proceed? (y/n): ")
    if proceed.lower() != 'y':
        print("Cleanup cancelled")
        sys.exit(0)

    # Clean up MongoDB
    print("\nStep 1: Cleaning up MongoDB...")
    mongo_success = cleanup_mongodb()

    # Clean up files
    print("\nStep 2: Cleaning up files...")
    files_success = cleanup_files()

    # Summary
    print("\nCleanup Summary:")
    print(f"MongoDB cleanup: {'Completed' if mongo_success else 'Failed'}")
    print(f"File cleanup: {'Completed' if files_success else 'Failed'}")