Commit a4bb19d (1 parent: ec2933c)

task: init scraper

Files changed:
- poetry.lock (+0 -0)
- pyproject.toml (+5 -14)
- scraper.py (+112 -0)
- test_scraper.ipynb (+103 -0)
poetry.lock
ADDED
The diff for this file is too large to render. See raw diff.
pyproject.toml
CHANGED

```diff
@@ -13,6 +13,7 @@ dependencies = [
     "beautifulsoup4>=4.12.0",
     "scrapy>=2.11.0",
     "selenium>=4.18.0",
+    "selenium-wire>=5.1.0",  # Added for scraper
     "playwright>=1.42.0",
     "pandas>=2.2.0",
     "numpy>=1.26.0",
@@ -28,15 +29,9 @@ dependencies = [
 ]
 requires-python = ">=3.11,<3.12"

-[
-
-
-
-[tool.hatch.build.targets.wheel]
-packages = ["src/build"]
-
-[tool.hatch.metadata]
-allow-direct-references = true
+[tool.black]
+line-length = 88
+target-version = ['py311']

 [tool.ruff]
 line-length = 88
@@ -60,8 +55,4 @@ testpaths = ["tests"]
 python_version = "3.11"
 warn_return_any = true
 warn_unused_configs = true
-check_untyped_defs = true
-
-[tool.black]
-line-length = 88
-target-version = ['py311']
+check_untyped_defs = true
```
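The new `selenium-wire` pin is commented "Added for scraper", but the committed scraper.py below still imports plain `selenium`. For reference, a minimal sketch of how selenium-wire would slot in as a drop-in webdriver replacement; the example URL and the `.css` filter are illustrative, not part of this commit:

```python
# Sketch only (not in this commit): selenium-wire wraps Selenium's webdriver
# and additionally records the network requests the page makes.
from selenium.webdriver.chrome.options import Options
from seleniumwire import webdriver  # installed by the selenium-wire pin

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

driver.get("https://www.csszengarden.com/221")  # example design page

# The main thing selenium-wire adds over plain selenium: captured traffic
for request in driver.requests:
    if request.response and request.url.endswith(".css"):
        print(request.url, request.response.status_code)

driver.quit()
```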
scraper.py
ADDED

```python
import os
import requests
from bs4 import BeautifulSoup
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def create_design_directory(design_id):
    """Create a directory for the design if it doesn't exist"""
    directory = f"designs/{design_id}"
    if not os.path.exists(directory):
        os.makedirs(directory)
    return directory

def save_css(url, directory):
    """Download and save CSS file"""
    response = requests.get(url)
    css_path = f"{directory}/style.css"
    with open(css_path, "w", encoding="utf-8") as f:
        f.write(response.text)

def save_metadata(metadata, directory):
    """Save design metadata as JSON"""
    metadata_path = f"{directory}/metadata.json"
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4)

def take_screenshot(url, directory):
    """Take screenshots of the design at desktop and mobile widths"""
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    driver = webdriver.Chrome(options=chrome_options)

    # Desktop screenshot (1920px width)
    driver.set_window_size(1920, 1080)
    driver.get(url)
    # Measure full page height (no explicit wait for load here)
    total_height = driver.execute_script("return document.body.scrollHeight")
    driver.set_window_size(1920, total_height)
    driver.save_screenshot(f"{directory}/screenshot_desktop.png")

    # Mobile screenshot (480px width)
    driver.set_window_size(480, 1080)
    driver.get(url)
    # Measure full page height (no explicit wait for load here)
    total_height = driver.execute_script("return document.body.scrollHeight")
    driver.set_window_size(480, total_height)
    driver.save_screenshot(f"{directory}/screenshot_mobile.png")

    driver.quit()

def scrape_design(design_id):
    """Scrape a single design"""
    # Create base URLs
    design_url = f"https://www.csszengarden.com/{design_id}"
    css_url = f"https://www.csszengarden.com/{design_id}/{design_id}.css"

    # Create directory for this design
    directory = create_design_directory(design_id)

    # Get design page
    response = requests.get(design_url)
    print(f"Response status: {response.status_code}")

    # Debug HTML content
    print("\nFirst 500 characters of response:")
    print(response.text[:500])

    soup = BeautifulSoup(response.text, "html.parser")

    # Debug found elements (selector bound outside the f-string: a backslash
    # inside an f-string expression is a SyntaxError on Python 3.11)
    author_meta = soup.select_one('meta[name="author"]')
    print("\nFound elements:")
    print(f"h1: {soup.select_one('h1')}")
    print(f"author: {author_meta}")

    # Extract metadata with error handling
    try:
        metadata = {
            "id": design_id,
            "author": author_meta["content"] if author_meta else "Unknown Author",
            "url": design_url,
            "css_url": css_url
        }
    except Exception as e:
        print(f"\nError extracting metadata: {str(e)}")
        raise

    # Save everything
    save_css(css_url, directory)
    save_metadata(metadata, directory)
    take_screenshot(design_url, directory)

def main():
    """Main function to scrape multiple designs"""
    # Create designs directory if it doesn't exist
    if not os.path.exists("designs"):
        os.makedirs("designs")

    # List of design IDs to scrape
    design_ids = ["221", "220", "219"]  # Add more IDs as needed

    for design_id in design_ids:
        try:
            print(f"Scraping design {design_id}...")
            scrape_design(design_id)
            print(f"Successfully scraped design {design_id}")
        except Exception as e:
            print(f"Error scraping design {design_id}: {str(e)}")

if __name__ == "__main__":
    main()
```
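The comments in `take_screenshot` mention waiting for the page to load, but the committed code reads `scrollHeight` immediately after `driver.get()`. A minimal sketch of an explicit wait using Selenium's standard `WebDriverWait` API; the 10-second timeout and the `<body>` locator are illustrative choices, not from the commit:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_body(driver, timeout=10):
    """Block until <body> is present before measuring page height."""
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )
```

Calling such a helper between `driver.get(url)` and the `execute_script` height measurement would make the surrounding comments accurate.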
test_scraper.ipynb
ADDED

```json
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test CSS Zen Garden Scraper\n",
    "\n",
    "This notebook tests the functionality of our CSS Zen Garden scraper."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing scraper with design 221...\n",
      "Response status: 200\n",
      "\n",
      "First 500 characters of response:\n",
      "<!DOCTYPE html>\n",
      "<html lang=\"en\">\n",
      "<head>\n",
      "\t<meta charset=\"utf-8\">\n",
      "\t<title>CSS Zen Garden: The Beauty of CSS Design</title>\n",
      "\n",
      "\t<link rel=\"stylesheet\" media=\"screen\" href=\"/221/221.css?v=8may2013\">\n",
      "\t<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS\" href=\"http://www.csszengarden.com/zengarden.xml\">\n",
      "\n",
      "\t<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n",
      "\t<meta name=\"author\" content=\"Dave Shea\">\n",
      "\t<meta name=\"description\" content=\"A demonstration of what can be accomplished v\n",
      "\n",
      "Found elements:\n",
      "h1: <h1>CSS Zen Garden</h1>\n",
      "author: None\n",
      "author link: None\n",
      "Success!\n"
     ]
    }
   ],
   "source": [
    "from scraper import create_design_directory, save_css, save_metadata, take_screenshot, scrape_design\n",
    "\n",
    "# Test with a single design first\n",
    "test_design_id = \"221\"\n",
    "\n",
    "try:\n",
    "    print(f\"Testing scraper with design {test_design_id}...\")\n",
    "    scrape_design(test_design_id)\n",
    "    print(\"Success!\")\n",
    "except Exception as e:\n",
    "    print(f\"Error: {str(e)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If successful, let's check what we got\n",
    "import json\n",
    "import os\n",
    "\n",
    "design_dir = f\"designs/{test_design_id}\"\n",
    "print(\"Files created:\")\n",
    "print(os.listdir(design_dir))\n",
    "\n",
    "# Display metadata\n",
    "with open(f\"{design_dir}/metadata.json\") as f:\n",
    "    metadata = json.load(f)\n",
    "print(\"\\nMetadata:\")\n",
    "print(json.dumps(metadata, indent=2))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
```
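One more observation from the test run: every `requests.get` in the commit runs without a timeout or a status check, so a 404 on the CSS URL would silently write an error page to `style.css`. A small hardening sketch; the helper name `fetch` and the 30-second timeout are made up for illustration:

```python
import requests

def fetch(url, timeout=30):
    """GET with a timeout and an explicit status check (illustrative helper)."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # raise on 4xx/5xx instead of saving error pages
    return response
```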