Technologic101 committed on
Commit
a4bb19d
·
1 Parent(s): ec2933c

task: init scraper

Browse files
Files changed (4) hide show
  1. poetry.lock +0 -0
  2. pyproject.toml +5 -14
  3. scraper.py +112 -0
  4. test_scraper.ipynb +103 -0
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -13,6 +13,7 @@ dependencies = [
13
  "beautifulsoup4>=4.12.0",
14
  "scrapy>=2.11.0",
15
  "selenium>=4.18.0",
 
16
  "playwright>=1.42.0",
17
  "pandas>=2.2.0",
18
  "numpy>=1.26.0",
@@ -28,15 +29,9 @@ dependencies = [
28
  ]
29
  requires-python = ">=3.11,<3.12"
30
 
31
- [build-system]
32
- requires = ["hatchling"]
33
- build-backend = "hatchling.build"
34
-
35
- [tool.hatch.build.targets.wheel]
36
- packages = ["src/build"]
37
-
38
- [tool.hatch.metadata]
39
- allow-direct-references = true
40
 
41
  [tool.ruff]
42
  line-length = 88
@@ -60,8 +55,4 @@ testpaths = ["tests"]
60
  python_version = "3.11"
61
  warn_return_any = true
62
  warn_unused_configs = true
63
- check_untyped_defs = true
64
-
65
- [tool.black]
66
- line-length = 88
67
- target-version = ['py311']
 
13
  "beautifulsoup4>=4.12.0",
14
  "scrapy>=2.11.0",
15
  "selenium>=4.18.0",
16
+ "selenium-wire>=5.1.0", # Added for scraper
17
  "playwright>=1.42.0",
18
  "pandas>=2.2.0",
19
  "numpy>=1.26.0",
 
29
  ]
30
  requires-python = ">=3.11,<3.12"
31
 
32
+ [tool.black]
33
+ line-length = 88
34
+ target-version = ['py311']
 
 
 
 
 
 
35
 
36
  [tool.ruff]
37
  line-length = 88
 
55
  python_version = "3.11"
56
  warn_return_any = true
57
  warn_unused_configs = true
58
+ check_untyped_defs = true
 
 
 
 
scraper.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import json
5
+ from selenium import webdriver
6
+ from selenium.webdriver.chrome.options import Options
7
+
8
+ def create_design_directory(design_id):
9
+ """Create a directory for the design if it doesn't exist"""
10
+ directory = f"designs/{design_id}"
11
+ if not os.path.exists(directory):
12
+ os.makedirs(directory)
13
+ return directory
14
+
15
def save_css(url, directory):
    """Download the stylesheet at *url* and save it as <directory>/style.css.

    Args:
        url: Absolute URL of the CSS file.
        directory: Existing directory to write the file into.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the download takes longer than 30 seconds.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly instead of silently saving an HTML error page as CSS.
    response.raise_for_status()
    css_path = f"{directory}/style.css"
    with open(css_path, "w", encoding="utf-8") as f:
        f.write(response.text)
21
+
22
def save_metadata(metadata, directory):
    """Serialize *metadata* to <directory>/metadata.json as pretty-printed JSON.

    Args:
        metadata: JSON-serializable dict describing the design.
        directory: Existing directory to write the file into.
    """
    metadata_path = f"{directory}/metadata.json"
    with open(metadata_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False writes real UTF-8 (the file is opened as utf-8)
        # instead of \uXXXX escapes — matters for non-ASCII author names.
        json.dump(metadata, f, indent=4, ensure_ascii=False)
27
+
28
def take_screenshot(url, directory):
    """Capture full-page screenshots of *url* at desktop and mobile widths.

    Writes screenshot_desktop.png (1920px wide) and screenshot_mobile.png
    (480px wide) into *directory* using headless Chrome.

    Args:
        url: Page to screenshot.
        directory: Existing directory to write the PNG files into.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    driver = webdriver.Chrome(options=chrome_options)
    try:
        for width, suffix in ((1920, "desktop"), (480, "mobile")):
            driver.set_window_size(width, 1080)
            driver.get(url)
            # Resize the window to the document height so the screenshot
            # covers the whole page, not just the initial viewport.
            total_height = driver.execute_script(
                "return document.body.scrollHeight"
            )
            driver.set_window_size(width, total_height)
            driver.save_screenshot(f"{directory}/screenshot_{suffix}.png")
    finally:
        # Always release the browser process, even if navigation or the
        # screenshot fails — otherwise headless Chrome instances leak.
        driver.quit()
52
+
53
def scrape_design(design_id):
    """Scrape one CSS Zen Garden design: its CSS, metadata, and screenshots.

    Args:
        design_id: Numeric design identifier as a string (e.g. "221").

    Raises:
        requests.HTTPError: If the design page cannot be fetched.
        requests.Timeout: If the page request takes longer than 30 seconds.
    """
    # Create base URLs
    design_url = f"https://www.csszengarden.com/{design_id}"
    css_url = f"https://www.csszengarden.com/{design_id}/{design_id}.css"

    # Create directory for this design
    directory = create_design_directory(design_id)

    # Get design page
    response = requests.get(design_url, timeout=30)
    print(f"Response status: {response.status_code}")
    # Abort early on 4xx/5xx rather than parsing an error page.
    response.raise_for_status()

    # Debug HTML content
    print("\nFirst 500 characters of response:")
    print(response.text[:500])

    soup = BeautifulSoup(response.text, "html.parser")

    # Query once and reuse; a backslash inside an f-string expression
    # (f"{soup.select_one('meta[name=\"author\"]')}") is a SyntaxError on
    # Python 3.11, which this project pins.
    author_meta = soup.select_one('meta[name="author"]')

    # Debug found elements
    print("\nFound elements:")
    print(f"h1: {soup.select_one('h1')}")
    print(f"author: {author_meta}")

    # Extract metadata with error handling
    try:
        metadata = {
            "id": design_id,
            "author": author_meta["content"] if author_meta else "Unknown Author",
            "url": design_url,
            "css_url": css_url,
        }
    except Exception as e:
        print(f"\nError extracting metadata: {str(e)}")
        raise

    # Save everything
    save_css(css_url, directory)
    save_metadata(metadata, directory)
    take_screenshot(design_url, directory)
93
+
94
def main():
    """Scrape a fixed list of designs, continuing past per-design failures."""
    # Create designs directory if it doesn't exist; exist_ok avoids the
    # check-then-create race of exists() + makedirs().
    os.makedirs("designs", exist_ok=True)

    # List of design IDs to scrape
    design_ids = ["221", "220", "219"]  # Add more IDs as needed

    for design_id in design_ids:
        try:
            print(f"Scraping design {design_id}...")
            scrape_design(design_id)
            print(f"Successfully scraped design {design_id}")
        except Exception as e:
            # Best-effort boundary: log the failure and move on so one bad
            # design doesn't abort the whole run.
            print(f"Error scraping design {design_id}: {str(e)}")

if __name__ == "__main__":
    main()
test_scraper.ipynb ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Test CSS Zen Garden Scraper\n",
8
+ "\n",
9
+ "This notebook tests the functionality of our CSS Zen Garden scraper."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "name": "stdout",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "Testing scraper with design 221...\n",
22
+ "Response status: 200\n",
23
+ "\n",
24
+ "First 500 characters of response:\n",
25
+ "<!DOCTYPE html>\n",
26
+ "<html lang=\"en\">\n",
27
+ "<head>\n",
28
+ "\t<meta charset=\"utf-8\">\n",
29
+ "\t<title>CSS Zen Garden: The Beauty of CSS Design</title>\n",
30
+ "\n",
31
+ "\t<link rel=\"stylesheet\" media=\"screen\" href=\"/221/221.css?v=8may2013\">\n",
32
+ "\t<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS\" href=\"http://www.csszengarden.com/zengarden.xml\">\n",
33
+ "\n",
34
+ "\t<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n",
35
+ "\t<meta name=\"author\" content=\"Dave Shea\">\n",
36
+ "\t<meta name=\"description\" content=\"A demonstration of what can be accomplished v\n",
37
+ "\n",
38
+ "Found elements:\n",
39
+ "h1: <h1>CSS Zen Garden</h1>\n",
40
+ "author: None\n",
41
+ "author link: None\n",
42
+ "Success!\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "from scraper import create_design_directory, save_css, save_metadata, take_screenshot, scrape_design\n",
48
+ "\n",
49
+ "# Test with a single design first\n",
50
+ "test_design_id = \"221\"\n",
51
+ "\n",
52
+ "try:\n",
53
+ " print(f\"Testing scraper with design {test_design_id}...\")\n",
54
+ " scrape_design(test_design_id)\n",
55
+ " print(\"Success!\")\n",
56
+ "except Exception as e:\n",
57
+ " print(f\"Error: {str(e)}\")"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "# If successful, let's check what we got\n",
67
+ "import json\n",
68
+ "import os\n",
69
+ "\n",
70
+ "design_dir = f\"designs/{test_design_id}\"\n",
71
+ "print(\"Files created:\")\n",
72
+ "print(os.listdir(design_dir))\n",
73
+ "\n",
74
+ "# Display metadata\n",
75
+ "with open(f\"{design_dir}/metadata.json\") as f:\n",
76
+ " metadata = json.load(f)\n",
77
+ "print(\"\\nMetadata:\")\n",
78
+ "print(json.dumps(metadata, indent=2))"
79
+ ]
80
+ }
81
+ ],
82
+ "metadata": {
83
+ "kernelspec": {
84
+ "display_name": ".venv",
85
+ "language": "python",
86
+ "name": "python3"
87
+ },
88
+ "language_info": {
89
+ "codemirror_mode": {
90
+ "name": "ipython",
91
+ "version": 3
92
+ },
93
+ "file_extension": ".py",
94
+ "mimetype": "text/x-python",
95
+ "name": "python",
96
+ "nbconvert_exporter": "python",
97
+ "pygments_lexer": "ipython3",
98
+ "version": "3.11.11"
99
+ }
100
+ },
101
+ "nbformat": 4,
102
+ "nbformat_minor": 2
103
+ }