"""Tests for DomainCrawler and WebsiteSpider in backend.app.crawler."""
import os
from unittest.mock import Mock, patch

import pytest
from scrapy.http import Request, TextResponse

from backend.app.crawler import DomainCrawler, WebsiteSpider
@pytest.fixture
def sample_html():
    """A minimal HTML document with a <main> element for content-extraction tests."""
    return """
    <html>
        <head><title>Test Page</title></head>
        <body>
            <main>
                <h1>Main Content</h1>
                <p>This is the main content of the page.</p>
            </main>
        </body>
    </html>
    """
@pytest.fixture
def crawler():
    """A DomainCrawler writing to a throwaway test_output directory."""
    return DomainCrawler("https://example.com", output_dir="test_output")
@pytest.fixture
def spider():
    """A WebsiteSpider writing to the same throwaway directory."""
    return WebsiteSpider(start_url="https://example.com", output_dir="test_output")
# def test_crawler_initialization(crawler):
#     assert crawler.start_url == "https://example.com"
#     assert crawler.domain == "example.com"
#     assert crawler.output_dir == "test_output"
#     assert os.path.exists("test_output")
#
#     # Test Scrapy settings
#     assert crawler.settings.get('BOT_NAME') == 'website_crawler'
#     assert crawler.settings.get('ROBOTSTXT_OBEY') is True
#     assert crawler.settings.get('DOWNLOAD_DELAY') == 1
def create_response(url, body):
    """Build a Scrapy TextResponse with an attached Request for parse tests."""
    request = Request(url=url)
    return TextResponse(
        url=url, body=body.encode("utf-8"), encoding="utf-8", request=request
    )
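# A small live sanity check for the helper above -- a sketch, not part of the
# original suite. It assumes only the public scrapy TextResponse API (.url and
# .css()) and the sample_html fixture defined earlier; the test name is ours.
def test_create_response_builds_text_response(sample_html):
    response = create_response("https://example.com/test", sample_html)
    assert response.url == "https://example.com/test"
    # TextResponse supports CSS selectors, so the parsed title should round-trip.
    assert response.css("title::text").get() == "Test Page"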
# def test_spider_parse_with_main_content(spider, sample_html):
#     url = "https://example.com/test"
#     response = create_response(url, sample_html)
#
#     # Process the page
#     list(spider.parse_item(response))
#
#     # Check if file was created
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1
#
#     # Read the saved file
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()
#
#     # Verify content
#     assert "URL: https://example.com/test" in content
#     assert "Title: Test Page" in content
#     assert "Main Content" in content
#     assert "This is the main content of the page." in content
# def test_spider_parse_without_main_content(spider):
#     html_without_main = """
#     <html>
#         <head><title>No Main</title></head>
#         <body>
#             <div>Some body content</div>
#         </body>
#     </html>
#     """
#     url = "https://example.com/no-main"
#     response = create_response(url, html_without_main)
#
#     # Process the page
#     list(spider.parse_item(response))
#
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1
#
#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
#         content = f.read()
#
#     assert "URL: https://example.com/no-main" in content
#     assert "Title: No Main" in content
#     assert "Some body content" in content
# def test_spider_parse_with_invalid_html(spider):
#     invalid_html = "<invalid><<html>"
#     url = "https://example.com/invalid"
#     response = create_response(url, invalid_html)
#
#     # Process should not raise an exception
#     list(spider.parse_item(response))
#
#     # Should still create a file
#     files = os.listdir(spider.output_dir)
#     assert len(files) == 1
# @patch('scrapy.crawler.CrawlerProcess')
# def test_start_crawling(mock_crawler_process_class, crawler):
#     # Configure the mock
#     mock_process = Mock()
#     mock_crawler_process_class.return_value = mock_process
#
#     # Start crawling
#     crawler.start()
#
#     # Verify process was created with correct settings
#     mock_crawler_process_class.assert_called_once_with(crawler.settings)
#
#     # Verify crawl method was called
#     mock_process.crawl.assert_called_once()
#     mock_process.start.assert_called_once()
@pytest.fixture(autouse=True)
def cleanup():
    """Remove the throwaway output directory after each test."""
    # Setup - nothing needed
    yield
    # Cleanup after each test
    if os.path.exists("test_output"):
        for file in os.listdir("test_output"):
            os.remove(os.path.join("test_output", file))
        os.rmdir("test_output")
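# Alternative sketch, not part of the original suite: pytest's built-in
# tmp_path fixture hands each test an isolated directory that pytest itself
# cleans up, which would make the manual cleanup fixture above unnecessary.
# Assumes DomainCrawler accepts any string path as output_dir.
#
# @pytest.fixture
# def tmp_crawler(tmp_path):
#     return DomainCrawler("https://example.com", output_dir=str(tmp_path))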