Spaces:
Runtime error
Runtime error
Commit
·
d63acef
1
Parent(s):
91237b8
task: async scraper with delay
Browse files
- scraper.py +7 -3
- test_scraper.ipynb +24 -17
scraper.py
CHANGED
@@ -33,6 +33,8 @@ async def take_screenshot(url, directory):
|
|
33 |
# Desktop screenshot (1920px width)
|
34 |
page = await browser.new_page(viewport={'width': 1920, 'height': 1080})
|
35 |
await page.goto(url)
|
|
|
|
|
36 |
# Get full height
|
37 |
height = await page.evaluate('document.body.scrollHeight')
|
38 |
await page.set_viewport_size({'width': 1920, 'height': int(height)})
|
@@ -41,6 +43,8 @@ async def take_screenshot(url, directory):
|
|
41 |
# Mobile screenshot (480px width)
|
42 |
page = await browser.new_page(viewport={'width': 480, 'height': 1080})
|
43 |
await page.goto(url)
|
|
|
|
|
44 |
# Get full height
|
45 |
height = await page.evaluate('document.body.scrollHeight')
|
46 |
await page.set_viewport_size({'width': 480, 'height': int(height)})
|
@@ -59,14 +63,14 @@ async def scrape_design(design_id):
|
|
59 |
|
60 |
# Get design page
|
61 |
response = requests.get(design_url)
|
62 |
-
print(f"Response status: {response.status_code}")
|
63 |
|
64 |
soup = BeautifulSoup(response.text, "html.parser")
|
65 |
author_meta = soup.select_one('meta[name="author"]')
|
66 |
|
67 |
# Debug found elements
|
68 |
-
print("\nFound elements:")
|
69 |
-
print(f"h1: {soup.select_one('h1')
|
70 |
print(f"author: {author_meta['content']}")
|
71 |
|
72 |
# Extract metadata with error handling
|
|
|
33 |
# Desktop screenshot (1920px width)
|
34 |
page = await browser.new_page(viewport={'width': 1920, 'height': 1080})
|
35 |
await page.goto(url)
|
36 |
+
# Wait for fade transitions
|
37 |
+
await page.wait_for_timeout(1500)
|
38 |
# Get full height
|
39 |
height = await page.evaluate('document.body.scrollHeight')
|
40 |
await page.set_viewport_size({'width': 1920, 'height': int(height)})
|
|
|
43 |
# Mobile screenshot (480px width)
|
44 |
page = await browser.new_page(viewport={'width': 480, 'height': 1080})
|
45 |
await page.goto(url)
|
46 |
+
# Wait for fade transitions
|
47 |
+
await page.wait_for_timeout(1500)
|
48 |
# Get full height
|
49 |
height = await page.evaluate('document.body.scrollHeight')
|
50 |
await page.set_viewport_size({'width': 480, 'height': int(height)})
|
|
|
63 |
|
64 |
# Get design page
|
65 |
response = requests.get(design_url)
|
66 |
+
print(f"{design_id}: Response status: {response.status_code}")
|
67 |
|
68 |
soup = BeautifulSoup(response.text, "html.parser")
|
69 |
author_meta = soup.select_one('meta[name="author"]')
|
70 |
|
71 |
# Debug found elements
|
72 |
+
print(f"{design_id}: \nFound elements:")
|
73 |
+
print(f"h1: {soup.select_one('h1').text}")
|
74 |
print(f"author: {author_meta['content']}")
|
75 |
|
76 |
# Extract metadata with error handling
|
test_scraper.ipynb
CHANGED
@@ -27,32 +27,39 @@
|
|
27 |
"name": "stdout",
|
28 |
"output_type": "stream",
|
29 |
"text": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
"Testing scraper with design 221...\n",
|
|
|
|
|
|
|
|
|
|
|
31 |
"Success!\n"
|
32 |
]
|
33 |
-
},
|
34 |
-
{
|
35 |
-
"name": "stderr",
|
36 |
-
"output_type": "stream",
|
37 |
-
"text": [
|
38 |
-
"/var/folders/02/z250w46j5_514v22h_ct_zq40000gn/T/ipykernel_37704/2179274543.py:8: RuntimeWarning: coroutine 'scrape_design' was never awaited\n",
|
39 |
-
" scrape_design(test_design_id)\n",
|
40 |
-
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
|
41 |
-
]
|
42 |
}
|
43 |
],
|
44 |
"source": [
|
45 |
"from scraper import scrape_design\n",
|
|
|
|
|
|
|
46 |
"\n",
|
47 |
-
"
|
48 |
-
"test_design_id
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
"\n",
|
50 |
-
"try:\n",
|
51 |
-
" print(f\"Testing scraper with design {test_design_id}...\")\n",
|
52 |
-
" await scrape_design(test_design_id)\n",
|
53 |
-
" print(\"Success!\")\n",
|
54 |
-
"except Exception as e:\n",
|
55 |
-
" print(f\"Error: {str(e)}\")"
|
56 |
]
|
57 |
},
|
58 |
{
|
|
|
27 |
"name": "stdout",
|
28 |
"output_type": "stream",
|
29 |
"text": [
|
30 |
+
"Testing scraper with design 220...\n",
|
31 |
+
"Response status: 200\n",
|
32 |
+
"\n",
|
33 |
+
"Found elements:\n",
|
34 |
+
"h1: CSS Zen Garden\n",
|
35 |
+
"author: Dave Shea\n",
|
36 |
+
"Success!\n",
|
37 |
"Testing scraper with design 221...\n",
|
38 |
+
"Response status: 200\n",
|
39 |
+
"\n",
|
40 |
+
"Found elements:\n",
|
41 |
+
"h1: CSS Zen Garden\n",
|
42 |
+
"author: Dave Shea\n",
|
43 |
"Success!\n"
|
44 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
}
|
46 |
],
|
47 |
"source": [
|
48 |
"from scraper import scrape_design\n",
|
49 |
+
"import asyncio\n",
|
50 |
+
"\n",
|
51 |
+
"test_design_ids = [\"220\", \"221\"]\n",
|
52 |
"\n",
|
53 |
+
"async def test_scraper(ids):\n",
|
54 |
+
" for test_design_id in ids:\n",
|
55 |
+
" try:\n",
|
56 |
+
" print(f\"Testing scraper with design {test_design_id}...\")\n",
|
57 |
+
" await scrape_design(test_design_id)\n",
|
58 |
+
" print(\"Success!\")\n",
|
59 |
+
" except Exception as e:\n",
|
60 |
+
" print(f\"Error: {str(e)}\")\n",
|
61 |
"\n",
|
62 |
+
"asyncio.run(test_scraper(test_design_ids))"
|
|
|
|
|
|
|
|
|
|
|
63 |
]
|
64 |
},
|
65 |
{
|