Technologic101 committed on
Commit
d63acef
·
1 Parent(s): 91237b8

task: async scraper with delay

Browse files
Files changed (2) hide show
  1. scraper.py +7 -3
  2. test_scraper.ipynb +24 -17
scraper.py CHANGED
@@ -33,6 +33,8 @@ async def take_screenshot(url, directory):
33
  # Desktop screenshot (1920px width)
34
  page = await browser.new_page(viewport={'width': 1920, 'height': 1080})
35
  await page.goto(url)
 
 
36
  # Get full height
37
  height = await page.evaluate('document.body.scrollHeight')
38
  await page.set_viewport_size({'width': 1920, 'height': int(height)})
@@ -41,6 +43,8 @@ async def take_screenshot(url, directory):
41
  # Mobile screenshot (480px width)
42
  page = await browser.new_page(viewport={'width': 480, 'height': 1080})
43
  await page.goto(url)
 
 
44
  # Get full height
45
  height = await page.evaluate('document.body.scrollHeight')
46
  await page.set_viewport_size({'width': 480, 'height': int(height)})
@@ -59,14 +63,14 @@ async def scrape_design(design_id):
59
 
60
  # Get design page
61
  response = requests.get(design_url)
62
- print(f"Response status: {response.status_code}")
63
 
64
  soup = BeautifulSoup(response.text, "html.parser")
65
  author_meta = soup.select_one('meta[name="author"]')
66
 
67
  # Debug found elements
68
- print("\nFound elements:")
69
- print(f"h1: {soup.select_one('h1')['content']}")
70
  print(f"author: {author_meta['content']}")
71
 
72
  # Extract metadata with error handling
 
33
  # Desktop screenshot (1920px width)
34
  page = await browser.new_page(viewport={'width': 1920, 'height': 1080})
35
  await page.goto(url)
36
+ # Wait for fade transitions
37
+ await page.wait_for_timeout(1500)
38
  # Get full height
39
  height = await page.evaluate('document.body.scrollHeight')
40
  await page.set_viewport_size({'width': 1920, 'height': int(height)})
 
43
  # Mobile screenshot (480px width)
44
  page = await browser.new_page(viewport={'width': 480, 'height': 1080})
45
  await page.goto(url)
46
+ # Wait for fade transitions
47
+ await page.wait_for_timeout(1500)
48
  # Get full height
49
  height = await page.evaluate('document.body.scrollHeight')
50
  await page.set_viewport_size({'width': 480, 'height': int(height)})
 
63
 
64
  # Get design page
65
  response = requests.get(design_url)
66
+ print(f"{design_id}: Response status: {response.status_code}")
67
 
68
  soup = BeautifulSoup(response.text, "html.parser")
69
  author_meta = soup.select_one('meta[name="author"]')
70
 
71
  # Debug found elements
72
+ print(f"{design_id}: \nFound elements:")
73
+ print(f"h1: {soup.select_one('h1').text}")
74
  print(f"author: {author_meta['content']}")
75
 
76
  # Extract metadata with error handling
test_scraper.ipynb CHANGED
@@ -27,32 +27,39 @@
27
  "name": "stdout",
28
  "output_type": "stream",
29
  "text": [
 
 
 
 
 
 
 
30
  "Testing scraper with design 221...\n",
 
 
 
 
 
31
  "Success!\n"
32
  ]
33
- },
34
- {
35
- "name": "stderr",
36
- "output_type": "stream",
37
- "text": [
38
- "/var/folders/02/z250w46j5_514v22h_ct_zq40000gn/T/ipykernel_37704/2179274543.py:8: RuntimeWarning: coroutine 'scrape_design' was never awaited\n",
39
- " scrape_design(test_design_id)\n",
40
- "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
41
- ]
42
  }
43
  ],
44
  "source": [
45
  "from scraper import scrape_design\n",
 
 
 
46
  "\n",
47
- "# Test with a single design first\n",
48
- "test_design_id = \"221\"\n",
 
 
 
 
 
 
49
  "\n",
50
- "try:\n",
51
- " print(f\"Testing scraper with design {test_design_id}...\")\n",
52
- " await scrape_design(test_design_id)\n",
53
- " print(\"Success!\")\n",
54
- "except Exception as e:\n",
55
- " print(f\"Error: {str(e)}\")"
56
  ]
57
  },
58
  {
 
27
  "name": "stdout",
28
  "output_type": "stream",
29
  "text": [
30
+ "Testing scraper with design 220...\n",
31
+ "Response status: 200\n",
32
+ "\n",
33
+ "Found elements:\n",
34
+ "h1: CSS Zen Garden\n",
35
+ "author: Dave Shea\n",
36
+ "Success!\n",
37
  "Testing scraper with design 221...\n",
38
+ "Response status: 200\n",
39
+ "\n",
40
+ "Found elements:\n",
41
+ "h1: CSS Zen Garden\n",
42
+ "author: Dave Shea\n",
43
  "Success!\n"
44
  ]
 
 
 
 
 
 
 
 
 
45
  }
46
  ],
47
  "source": [
48
  "from scraper import scrape_design\n",
49
+ "import asyncio\n",
50
+ "\n",
51
+ "test_design_ids = [\"220\", \"221\"]\n",
52
  "\n",
53
+ "async def test_scraper(ids):\n",
54
+ " for test_design_id in ids:\n",
55
+ " try:\n",
56
+ " print(f\"Testing scraper with design {test_design_id}...\")\n",
57
+ " await scrape_design(test_design_id)\n",
58
+ " print(\"Success!\")\n",
59
+ " except Exception as e:\n",
60
+ " print(f\"Error: {str(e)}\")\n",
61
  "\n",
62
+ "asyncio.run(test_scraper(test_design_ids))"
 
 
 
 
 
63
  ]
64
  },
65
  {