NitinBot001 committed on
Commit
a6c126b
Β·
verified Β·
1 Parent(s): 1f2ebfc

Upload app3.py

Browse files
Files changed (1) hide show
  1. app3.py +691 -0
app3.py ADDED
@@ -0,0 +1,691 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Standard library
import json
import logging
import re
import time
from urllib.parse import quote, urljoin

# Third-party
import requests
import urllib3
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Silence the InsecureRequestWarning spam caused by sessions created with
# verify=False (see PhoneDBScraper.__init__).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Module-level logger, configured once at import time.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class PhoneDBScraper:
    """Scrape phone specifications from phonedb.net.

    Wraps a retrying ``requests.Session`` with browser-like headers.
    TLS certificate verification is disabled (see ``__init__``), so this
    scraper should only be pointed at non-sensitive public pages.
    """

    def __init__(self):
        self.base_url = "https://phonedb.net"
        self.session = requests.Session()

        # Browser-like headers lower the chance of the site rejecting
        # automated requests outright.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

        # Retry transient failures (rate limiting and 5xx responses) with
        # exponential backoff; only idempotent methods are retried.
        retry_strategy = Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],  # urllib3 >= 1.26 parameter name
            backoff_factor=1,
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        # WARNING: disables TLS certificate validation for every request
        # made through this session -- susceptible to MITM. Use with caution.
        self.session.verify = False

    def search_phone(self, phone_name):
        """Search phonedb.net for *phone_name*.

        Tries several known search endpoints in turn and returns a list of
        ``{'title': ..., 'url': ...}`` dicts (at most 10 candidates per
        endpoint, filtered to titles sharing a word with the query).
        Returns an empty list when every endpoint fails.
        """
        search_urls = [
            f"{self.base_url}/index.php?m=device&s=query&q={quote(phone_name)}",
            f"{self.base_url}/search?q={quote(phone_name)}",
            f"{self.base_url}/index.php?m=device&s=list&q={quote(phone_name)}"
        ]

        for search_url in search_urls:
            try:
                logger.info(f"Trying search URL: {search_url}")
                response = self.session.get(search_url, timeout=30)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                results = []

                # The site's markup is unstable, so probe a list of likely
                # result containers and stop at the first that matches.
                selectors = [
                    'div.device-item',
                    'div.device',
                    'div.phone-item',
                    'tr[onclick*="device"]',
                    'a[href*="device"]',
                    'a[href*="phone"]',
                    'td a[href*="index.php"]'
                ]

                search_results = []
                for selector in selectors:
                    found = soup.select(selector)
                    if found:
                        search_results.extend(found)
                        break

                # Fallback: any anchor whose href looks like a device link.
                if not search_results:
                    search_results = soup.find_all('a', href=re.compile(r'(device|phone|id=\d+)'))

                for result in search_results[:10]:  # limit to first 10 results
                    title = ""
                    link = ""

                    if result.name == 'a':
                        link = result.get('href', '')
                        title = result.get_text(strip=True) or result.get('title', '')
                    elif result.name in ['div', 'tr']:
                        link_elem = result.find('a')
                        if link_elem:
                            link = link_elem.get('href', '')
                            title = link_elem.get_text(strip=True) or result.get_text(strip=True)
                        else:
                            # Some rows carry the device id in an onclick
                            # handler instead of an anchor.
                            onclick = result.get('onclick', '')
                            if 'device' in onclick:
                                device_match = re.search(r'id=(\d+)', onclick)
                                if device_match:
                                    link = f"/index.php?m=device&id={device_match.group(1)}"
                                    title = result.get_text(strip=True)

                    if link and title:
                        # Collapse runs of whitespace in the title.
                        title = re.sub(r'\s+', ' ', title).strip()

                        # Normalise to an absolute URL.
                        if link.startswith('/'):
                            link = self.base_url + link
                        elif not link.startswith('http'):
                            link = f"{self.base_url}/{link}"

                        # Keep only hits sharing at least one word with the query.
                        if any(word.lower() in title.lower() for word in phone_name.split()):
                            results.append({
                                'title': title,
                                'url': link
                            })

                if results:
                    logger.info(f"Found {len(results)} results using URL: {search_url}")
                    return results

            except Exception as e:
                logger.warning(f"Search URL failed {search_url}: {e}")
                continue

        logger.error(f"All search methods failed for: {phone_name}")
        return []

    def get_phone_specs(self, phone_url):
        """Fetch *phone_url* and extract name, brand, images and specs.

        Returns a dict with keys ``name``, ``brand``, ``images``,
        ``specifications`` and ``source_url``, or ``None`` on any error.
        """
        try:
            logger.info(f"Fetching specs from: {phone_url}")
            response = self.session.get(phone_url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            phone_data = {
                'name': '',
                'brand': '',
                'images': [],
                'specifications': {},
                'source_url': phone_url
            }

            # The device name can live in several places; take the first
            # plausible (> 3 chars) candidate.
            title_candidates = [
                soup.find('h1'),
                soup.find('h2'),
                soup.find('title'),
                soup.find('div', class_=re.compile(r'title|name|header')),
                soup.find('td', string=re.compile(r'Model|Name', re.I))
            ]

            for candidate in title_candidates:
                if candidate:
                    title = candidate.get_text(strip=True)
                    if title and len(title) > 3:
                        phone_data['name'] = title
                        break

            # Heuristic: assume the brand is the first word of the name.
            if phone_data['name']:
                phone_data['brand'] = phone_data['name'].split()[0]

            images = []

            # Probe several selectors since the image markup varies by page.
            img_selectors = [
                'img[src*="phone"]',
                'img[src*="device"]',
                'img[src*="mobile"]',
                'img[alt*="phone"]',
                'img[alt*="device"]',
                '.device-image img',
                '.phone-image img',
                'td img',
                'div img'
            ]

            for selector in img_selectors:
                imgs = soup.select(selector)
                for img in imgs:
                    src = img.get('src', '')
                    if src:
                        # Convert relative URLs to absolute.
                        if src.startswith('/'):
                            img_url = self.base_url + src
                        elif not src.startswith('http'):
                            img_url = f"{self.base_url}/{src}"
                        else:
                            img_url = src

                        # Skip duplicates and obvious UI chrome.
                        if img_url not in images and not any(x in src.lower() for x in ['icon', 'logo', 'button', 'spacer']):
                            images.append(img_url)

            phone_data['images'] = images[:5]  # keep at most 5 images

            specs = {}

            # Method 1: key/value rows in spec tables.
            spec_tables = soup.find_all('table')
            for table in spec_tables:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        key = cells[0].get_text(strip=True)
                        value = cells[1].get_text(strip=True)

                        key = re.sub(r'[^\w\s]', '', key).strip()
                        value = re.sub(r'\s+', ' ', value).strip()

                        # Length bounds keep junk rows out of the result.
                        if key and value and len(key) < 100 and len(value) < 500:
                            specs[key] = value

            # Method 2: "Label: value" pairs in emphasised elements.
            labeled_specs = soup.find_all(['dt', 'label', 'b', 'strong'])
            for label in labeled_specs:
                label_text = label.get_text(strip=True)
                if ':' in label_text:
                    key, value = label_text.split(':', 1)
                    specs[key.strip()] = value.strip()
                else:
                    # Value may live in the next sibling element.
                    sibling = label.find_next_sibling()
                    if sibling:
                        value = sibling.get_text(strip=True)
                        if value:
                            specs[label_text] = value

            # Method 3: regex sweep over the whole page text for common
            # spec shapes; never overrides values found above.
            text_content = soup.get_text()

            spec_patterns = {
                'Display Size': r'(\d+\.?\d*)\s*(?:inch|"|β€³)',
                'Display Resolution': r'(\d+)\s*[xΓ—]\s*(\d+)',
                'RAM': r'(\d+)\s*GB\s*(?:RAM|Memory)',
                'Storage': r'(\d+)\s*GB\s*(?:storage|internal|ROM)',
                'Battery': r'(\d+)\s*mAh',
                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP(?:\s+main|\s+primary|\s+rear)?',
                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:front|selfie|secondary)',
                'Operating System': r'(Android|iOS)\s*[\d\.]*',
                'Processor': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*[\w\d\s]*',
                'Network': r'(2G|3G|4G|5G|LTE)',
                'Weight': r'(\d+)\s*(?:g|gram)',
                'Dimensions': r'(\d+\.?\d*)\s*[xΓ—]\s*(\d+\.?\d*)\s*[xΓ—]\s*(\d+\.?\d*)\s*mm'
            }

            for spec_name, pattern in spec_patterns.items():
                if spec_name not in specs:  # don't override existing specs
                    matches = re.findall(pattern, text_content, re.IGNORECASE)
                    if matches:
                        if spec_name == 'Display Resolution':
                            specs[spec_name] = f"{matches[0][0]}x{matches[0][1]}"
                        elif spec_name == 'Dimensions':
                            specs[spec_name] = f"{matches[0][0]}Γ—{matches[0][1]}Γ—{matches[0][2]} mm"
                        else:
                            specs[spec_name] = matches[0] if isinstance(matches[0], str) else str(matches[0])

            phone_data['specifications'] = specs

            logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")
            return phone_data

        except Exception as e:
            logger.error(f"Error extracting specs from {phone_url}: {e}")
            return None

    def scrape_phone_by_name(self, phone_name, get_first_result=True):
        """Search for *phone_name* and scrape its spec page(s).

        Returns a single phone dict when *get_first_result* is true,
        otherwise a list of dicts; ``None`` when the search finds nothing.
        """
        logger.info(f"Searching for: {phone_name}")

        search_results = self.search_phone(phone_name)

        if not search_results:
            logger.warning(f"No results found for: {phone_name}")
            return None

        results = []

        targets = [search_results[0]] if get_first_result else search_results

        for result in targets:
            logger.info(f"Scraping: {result['title']}")

            phone_data = self.get_phone_specs(result['url'])
            if phone_data:
                results.append(phone_data)

            time.sleep(1)  # be respectful with requests

        return results[0] if get_first_result and results else results

    def scrape_multiple_phones(self, phone_names):
        """Scrape each name in *phone_names*, skipping names that fail."""
        all_phones = []

        for phone_name in phone_names:
            try:
                phone_data = self.scrape_phone_by_name(phone_name)
                if phone_data:
                    all_phones.append(phone_data)
                time.sleep(2)  # be respectful between requests
            except Exception as e:
                logger.error(f"Error scraping {phone_name}: {e}")
                continue

        return all_phones

    def save_to_json(self, data, filename):
        """Serialise *data* to *filename* as pretty-printed UTF-8 JSON."""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            # BUG FIX: previously logged the literal text "(unknown)"
            # instead of the destination path.
            logger.info(f"Data saved to {filename}")
        except Exception as e:
            logger.error(f"Error saving to JSON: {e}")
343
# Example usage with error handling and alternative sites
# NOTE(review): this `main` is redefined later in the file; the later
# definition shadows this one, so this demo is effectively dead code.
def main():
    """Demo: scrape one phone, then a small batch, from PhoneDB."""
    scraper = PhoneDBScraper()

    # Example 1: a single phone
    phone_name = "iPhone 15 Pro"
    print(f"Attempting to scrape: {phone_name}")

    single = scraper.scrape_phone_by_name(phone_name)

    if single:
        print(f"βœ… Successfully scraped {single['name']}")
        print(f"Found {len(single['specifications'])} specifications")
        print(f"Found {len(single['images'])} images")
        print(json.dumps(single, indent=2))
        scraper.save_to_json(single, f"{phone_name.replace(' ', '_')}_specs.json")
    else:
        print(f"❌ Failed to scrape {phone_name}")
        for line in (
            "This might be due to:",
            "1. PhoneDB.net blocking automated requests",
            "2. Phone not found in their database",
            "3. Site structure changes",
            "\nAlternative solutions:",
            "- Try with a different phone name",
            "- Use a VPN if blocked by IP",
            "- Consider using alternative sites like GSMArena",
        ):
            print(line)

    # Example 2: a batch of phones
    phone_list = [
        "Samsung Galaxy S24",
        "Google Pixel 8",
        "OnePlus 12"
    ]

    print(f"\nTesting multiple phones: {phone_list}")
    batch = scraper.scrape_multiple_phones(phone_list)

    if not batch:
        print("❌ No phones were successfully scraped")
        return

    scraper.save_to_json(batch, "multiple_phones_specs.json")
    print(f"βœ… Successfully scraped {len(batch)}/{len(phone_list)} phones")

    for phone in batch:
        print(f"- {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
389
# Enhanced GSMArena scraper as main alternative
class GSMArenaScraperAlternative:
    """Enhanced GSMArena scraper with full functionality.

    Fallback for :class:`PhoneDBScraper`; same output shape (dicts with
    ``name``, ``brand``, ``images``, ``specifications``, ``source_url``).
    """

    def __init__(self):
        self.base_url = "https://www.gsmarena.com"
        self.session = requests.Session()
        # Browser-like headers lower the chance of being blocked.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })

    def search_phone(self, phone_name):
        """Query GSMArena's quick-search for *phone_name*.

        Returns up to 5 matches per makers block as
        ``{'title': ..., 'url': ...}`` dicts; empty list on failure.
        """
        search_url = f"{self.base_url}/results.php3"
        params = {'sQuickSearch': 'yes', 'sName': phone_name}

        try:
            logger.info(f"Searching GSMArena for: {phone_name}")
            response = self.session.get(search_url, params=params, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            results = []

            # GSMArena renders search hits inside "makers" divs.
            makers = soup.find_all('div', class_='makers')
            for maker in makers:
                links = maker.find_all('a')
                for link in links[:5]:  # limit results
                    href = link.get('href', '')
                    title = link.get_text(strip=True)

                    # Whitespace-insensitive containment match on the query.
                    if href and title and phone_name.lower().replace(' ', '') in title.lower().replace(' ', ''):
                        full_url = self.base_url + '/' + href if not href.startswith('http') else href
                        results.append({
                            'title': title,
                            'url': full_url
                        })

            logger.info(f"Found {len(results)} results on GSMArena")
            return results

        except Exception as e:
            logger.error(f"GSMArena search failed: {e}")
            return []

    def get_phone_specs(self, phone_url):
        """Fetch a GSMArena spec page and extract name, images and specs.

        Returns a phone dict, or ``None`` on any error.
        """
        try:
            logger.info(f"Fetching specs from GSMArena: {phone_url}")
            response = self.session.get(phone_url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            phone_data = {
                'name': '',
                'brand': '',
                'images': [],
                'specifications': {},
                'source_url': phone_url
            }

            # Device name: prefer GSMArena's dedicated title element.
            title_elem = soup.find('h1', class_='specs-phone-name-title')
            if not title_elem:
                title_elem = soup.find('h1') or soup.find('title')

            if title_elem:
                phone_data['name'] = title_elem.get_text(strip=True)
                # Heuristic: brand is the first word of the name.
                phone_data['brand'] = phone_data['name'].split()[0] if phone_data['name'] else ''

            images = []

            # Main phone image.
            main_img_container = soup.find('div', class_='specs-photo-main')
            if main_img_container:
                img = main_img_container.find('img')
                if img and img.get('src'):
                    img_url = urljoin(phone_url, img['src'])
                    images.append(img_url)

            # Additional images from carousel or gallery.
            carousel = soup.find('div', class_='carousel-item') or soup.find('div', class_='specs-photos')
            if carousel:
                for img in carousel.find_all('img'):
                    src = img.get('src', '')
                    if src:
                        img_url = urljoin(phone_url, src)
                        if img_url not in images:
                            images.append(img_url)

            phone_data['images'] = images[:5]  # keep at most 5 images

            specs = {}

            # GSMArena lays out specs in cellspacing="0" tables.
            spec_tables = soup.find_all('table', cellspacing='0')

            for table in spec_tables:
                # Category header precedes the table (th or h2).
                category = ''
                category_elem = table.find_previous('th') or table.find_previous('h2')
                if category_elem:
                    category = category_elem.get_text(strip=True)

                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        key = cells[0].get_text(strip=True)
                        value = cells[1].get_text(strip=True)

                        key = re.sub(r'[^\w\s]', '', key).strip()
                        value = re.sub(r'\s+', ' ', value).strip()

                        if key and value and len(key) < 100:
                            # Prefix with category when it is short enough
                            # to be a real section name.
                            final_key = f"{category} - {key}" if category and len(category) < 30 else key
                            specs[final_key] = value

            # Also extract "Label: value" items from spec/detail lists.
            detail_lists = soup.find_all(['ul', 'li'], class_=re.compile(r'spec|detail'))
            for detail_list in detail_lists:
                items = detail_list.find_all('li') if detail_list.name == 'ul' else [detail_list]
                for item in items:
                    text = item.get_text(strip=True)
                    if ':' in text:
                        parts = text.split(':', 1)
                        if len(parts) == 2:
                            key, value = parts
                            specs[key.strip()] = value.strip()

            # Regex sweep over the page text; never overrides table values.
            page_text = soup.get_text()

            key_patterns = {
                'Display Size': r'(\d+\.?\d*)\s*(?:inch|")\s*display',
                'Display Resolution': r'(\d+)\s*[xΓ—]\s*(\d+)\s*pixels',
                'RAM': r'(\d+)\s*GB\s*RAM',
                'Storage': r'(\d+)\s*GB\s*(?:storage|internal)',
                'Battery Capacity': r'(\d+)\s*mAh',
                'Main Camera': r'(\d+(?:\.\d+)?)\s*MP\s*(?:main|primary|rear)',
                'Front Camera': r'(\d+(?:\.\d+)?)\s*MP\s*front',
                'Operating System': r'(Android|iOS)\s*([\d\.]+)?',
                'Chipset': r'(Snapdragon|Exynos|A\d+|Kirin|MediaTek|Dimensity)\s*([\w\d\s]+)?',
                'Weight': r'(\d+)\s*g\s*weight',
                'Launch Date': r'(January|February|March|April|May|June|July|August|September|October|November|December)\s*(\d{4})'
            }

            for spec_name, pattern in key_patterns.items():
                if spec_name not in specs:
                    match = re.search(pattern, page_text, re.IGNORECASE)
                    if match:
                        if spec_name == 'Display Resolution':
                            specs[spec_name] = f"{match.group(1)}Γ—{match.group(2)}"
                        elif spec_name == 'Launch Date':
                            specs[spec_name] = f"{match.group(1)} {match.group(2)}"
                        else:
                            specs[spec_name] = match.group(0)

            phone_data['specifications'] = specs
            logger.info(f"Extracted {len(specs)} specifications for {phone_data.get('name', 'Unknown')}")

            return phone_data

        except Exception as e:
            logger.error(f"Error extracting GSMArena specs from {phone_url}: {e}")
            return None

    def scrape_phone_by_name(self, phone_name, get_first_result=True):
        """Search GSMArena for *phone_name* and scrape its spec page(s).

        Returns one phone dict when *get_first_result* is true, otherwise a
        list; ``None`` when the search finds nothing.
        """
        search_results = self.search_phone(phone_name)

        if not search_results:
            logger.warning(f"No results found for: {phone_name}")
            return None

        results = []
        targets = [search_results[0]] if get_first_result else search_results

        for result in targets:
            logger.info(f"Scraping: {result['title']}")
            phone_data = self.get_phone_specs(result['url'])
            if phone_data:
                results.append(phone_data)
            time.sleep(2)  # be respectful

        return results[0] if get_first_result and results else results

    def scrape_multiple_phones(self, phone_names):
        """Scrape each name in *phone_names*, skipping names that fail."""
        all_phones = []

        for phone_name in phone_names:
            try:
                phone_data = self.scrape_phone_by_name(phone_name)
                if phone_data:
                    all_phones.append(phone_data)
                time.sleep(3)  # be respectful between requests
            except Exception as e:
                logger.error(f"Error scraping {phone_name}: {e}")
                continue

        return all_phones

    def save_to_json(self, data, filename):
        """Serialise *data* to *filename* as pretty-printed UTF-8 JSON."""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            # BUG FIX: previously logged the literal text "(unknown)"
            # instead of the destination path.
            logger.info(f"Data saved to {filename}")
        except Exception as e:
            logger.error(f"Error saving to JSON: {e}")
611
def test_alternative_scraper():
    """Exercise the GSMArena scraper on one phone, then on a batch."""
    print("\n" + "="*50)
    print("Testing Enhanced GSMArena Scraper")
    print("="*50)

    scraper = GSMArenaScraperAlternative()

    # Single-phone run.
    phone_name = "iPhone 15 Pro"
    print(f"Testing single phone: {phone_name}")

    result = scraper.scrape_phone_by_name(phone_name)

    if result:
        print(f"βœ… Successfully scraped: {result['name']}")
        print(f"πŸ“± Found {len(result['specifications'])} specifications")
        print(f"πŸ–ΌοΈ Found {len(result['images'])} images")

        # Show some key specs: first matching entry per category.
        key_specs = ['Display Size', 'RAM', 'Storage', 'Battery Capacity', 'Main Camera']
        print("\nπŸ“‹ Key Specifications:")
        for wanted in key_specs:
            for key, value in result['specifications'].items():
                if wanted.lower() in key.lower():
                    print(f" β€’ {key}: {value}")
                    break

        scraper.save_to_json(result, f"{phone_name.replace(' ', '_')}_gsmarena_specs.json")
    else:
        print(f"❌ Failed to scrape {phone_name}")

    # Batch run.
    print(f"\n" + "-"*40)
    print("Testing Multiple Phones")
    print("-"*40)

    phone_list = ["Samsung Galaxy S24", "Google Pixel 8"]
    batch = scraper.scrape_multiple_phones(phone_list)

    if not batch:
        print("❌ No phones were successfully scraped")
        return

    print(f"βœ… Successfully scraped {len(batch)}/{len(phone_list)} phones")
    scraper.save_to_json(batch, "multiple_phones_gsmarena_specs.json")

    for phone in batch:
        print(f"πŸ“± {phone['name']}: {len(phone['specifications'])} specs, {len(phone['images'])} images")
662
# Main function with both scrapers
def main():
    """Try PhoneDB first; fall back to the GSMArena scraper on failure."""
    print("πŸš€ Phone Specifications Scraper")
    print("="*50)

    try:
        print("Attempting PhoneDB scraper...")
        scraper = PhoneDBScraper()
        phone_name = "iPhone 15 Pro"
        phonedb_result = scraper.scrape_phone_by_name(phone_name)

        if phonedb_result:
            print(f"βœ… PhoneDB: Successfully scraped {phonedb_result['name']}")
            scraper.save_to_json(phonedb_result, f"{phone_name.replace(' ', '_')}_phonedb_specs.json")
            return  # success -- no fallback needed

        print("❌ PhoneDB scraper failed, trying GSMArena...")

    except Exception as e:
        print(f"❌ PhoneDB initialization failed: {str(e)}")
        print("πŸ”„ Switching to GSMArena scraper...")

    # Use GSMArena as fallback
    test_alternative_scraper()
688
if __name__ == "__main__":
    # main()
    # Uncomment the line above to run the PhoneDB-first flow; by default
    # only the GSMArena alternative is exercised.
    test_alternative_scraper()